Example 1
def evaluate_crubadan_odin(filename):
  '''
  Trains models from the Crubadan data and runs them on Odin.
  '''
  # Project-internal imports, as used in the other examples below;
  # the `odin` corpus reader is assumed to be importable in this module.
  from collections import Counter
  from extractfeature import sentence2ngrams, get_features
  from multinomialnaivebayes import SGT

  print "Loading character features..."
  trainsetchar = get_features('crubadan', option='char')
  print "Loading word features..."
  trainsetword = get_features('crubadan', option='word')
  print "Loading test data..."
  labels = [x[0] for x in odin.source_sents()]
  test = [(Counter(sentence2ngrams(x[1], with_word_boundary=True,
                                   option='allgrams')),
           Counter(x[1].split()))
          for x in odin.source_sents()]
  
  print "Calculating results..."
  with open(filename,'w') as f:
    f.write(' '.join(labels)+'\n')
    labels = None
    for lang in sorted(trainsetchar.keys()):
      print lang
      charresult = lang
      wordresult = lang
      modelchar = SGT(trainsetchar.pop(lang))
      modelword = SGT(trainsetword.pop(lang))
      for sentence in test:
        # Hex-encode each estimate so it round-trips exactly via float.fromhex().
        charresult += ' ' + modelchar.estimate(sentence[0]).hex()
        wordresult += ' ' + modelword.estimate(sentence[1]).hex()
      f.write(charresult+'\n')
      f.write(wordresult+'\n')
  print "Done!"
Example 2
def featurize(text, all_features, option="3gram"):
    """ Converts a sentence string into an np.array() of n-gram counts,
    one entry per feature in all_features. """
    import numpy as np
    from collections import Counter
    from extractfeature import sentence2ngrams
    counts = Counter(sentence2ngrams(text, option=option))
    return np.array([counts[j] for j in all_features])
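
A hedged usage sketch: featurize only counts n-grams that appear in all_features, so the inventory must be fixed in advance. Here it is built from two placeholder training texts:

# Hypothetical usage: fix an n-gram inventory, then vectorize new sentences.
from extractfeature import sentence2ngrams
train_texts = ["guten morgen", "good morning"]
all_features = sorted(set(g for t in train_texts
                          for g in sentence2ngrams(t, option="3gram")))
vec = featurize("guten tag", all_features)
print vec.shape  # one count per n-gram in all_features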
Example 3
def sugarlid_cosine(text, option='3gram', data_source='crubadan'):
  """ Cosine Vector based sugarlid. """
  from cosine import cosine_similarity
  char_ngrams = get_features(data_source, option=option)
  ##for i in char_ngrams:
  ##  print char_ngrams[i]
  #print sentence2ngrams(text, option=option)
  try:
    query_vector = " ".join(sentence2ngrams(text, option=option))
  except TypeError:
    # Some n-gram options yield tuples rather than strings, so join them first.
    query_vector = " ".join(["_".join(i) for i in
                             sentence2ngrams(text, option=option)])
    print query_vector
  results = []
  for lang in char_ngrams:
    # Repeat each n-gram by its count to build a bag-of-ngrams string.
    lang_vector = " ".join([(j + " ") * char_ngrams[lang][j]
                            for j in char_ngrams[lang]])
    score = cosine_similarity(query_vector, lang_vector)
    if score > 0:
      results.append((score, lang))
  return sorted(results, reverse=True)
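
A short usage sketch: sugarlid_cosine returns (score, language) pairs sorted best-first, so the head of the list is the prediction (the test sentence reappears in Example 8 below):

# Hypothetical usage: print the five best-matching languages.
for score, lang in sugarlid_cosine("ich bin schwanger", option='3gram')[:5]:
  print lang, score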
Example 4
def classify_odin(sentence, verbose=True):
  '''
  Given an input string, classifies it based on Odin character n-grams.
  Effectively an informal test.
  '''
  test = Counter(sentence2ngrams(sentence, with_word_boundary=True,
                                 option='allgrams'))
  trainset = get_features('odin', option='char')
  sgt_results = []
  for lang in trainset:
    train = trainset[lang]
    sgt_results.append((SGT(train, min=6000).estimate(test), lang))
  sgt_results.sort(reverse=True)
  if verbose:
    for i in sgt_results[:10]:
      print i
  return sgt_results
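
classify_odin already prints the ten best (estimate, language) pairs when verbose; a hedged sketch of silent use:

# Hypothetical usage: take the single best guess from the full ranking.
ranking = classify_odin("ich bin schwanger", verbose=False)
best_estimate, best_lang = ranking[0]
print best_lang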
Example 5
def train_nbc(train=True):
  '''
  data_source = {'odin':'../../data/odin/odin-cleaner.tar',
                'udhr':'../../data/udhr/udhr-unicode.tar',
                'omniglotphrase':'../../data/omniglot/omniglotphrases.tar'}
  '''
  data_source = {'udhr':'../../data/udhr/udhr-unicode.tar'}
  featuresets = []
  for s in data_source:
    for lang, sent in extract_features_from_tarfile(data_source[s]):
      if lang in ISO2LANG:
        featuresets += [({'3gram': i}, lang) for i in sentence2ngrams(sent)]
        print len(featuresets)
  
  if train:
    return nbc.train(featuresets)
  else:
    # Plain binary open suffices for pickling; codecs adds nothing here.
    with open('3grams-featuresets.pk', 'wb') as fout:
      pickle.dump(featuresets, fout)
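
A brief usage sketch (nbc is this module's Naive Bayes trainer; its exact type is not shown in this excerpt):

# Hypothetical usage: either train directly or dump the featuresets to disk.
classifier = train_nbc(train=True)   # returns the trained classifier
train_nbc(train=False)               # writes 3grams-featuresets.pk instead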
Example 6
def test(test_sentence, classifier, option=''):
  classes = defaultdict(list)
  test_features = [{'3gram':i} for i in sentence2ngrams(test_sentence)]
  
  # Classify each feature from test_sentence separately.
  for i in test_features:
    result = classifier.prob_classify(i)
    best = result.max()
    classes[best].append(result.prob(best))
  
  # Calculate the scores of the classified features from the test_sentence.
  answers = {}
  if option[:3] == 'geo': # geometric mean
    for i in classes:
      answers[i] = geometric_mean(classes[i])
  elif option[:3] == 'ari': # arithmetic mean
    for i in classes:
      answers[i] = arithmetic_mean(classes[i])
  else: # use the arithmetic-geometric mean (see the sketch after this example)
    for i in classes:
      answers[i] = arigeo_mean(classes[i])
  return max(answers.iteritems(), key=operator.itemgetter(1))[0], answers
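
The three mean helpers are not defined in this excerpt; a minimal sketch of plausible definitions, assuming arigeo_mean is the classical arithmetic-geometric mean (iterate a ← (a+g)/2, g ← sqrt(a·g) until the two agree):

# Hypothetical helpers; only their names appear in the original module.
from math import sqrt, exp, log

def arithmetic_mean(xs):
  return sum(xs) / float(len(xs))

def geometric_mean(xs):
  # Log space avoids underflow when multiplying many small probabilities.
  return exp(sum(log(x) for x in xs) / len(xs))

def arigeo_mean(xs, tol=1e-12):
  # Classical AGM iteration on the two means above; since the arithmetic
  # mean dominates the geometric mean, the pair converges to a common limit.
  a, g = arithmetic_mean(xs), geometric_mean(xs)
  while abs(a - g) > tol:
    a, g = (a + g) / 2.0, sqrt(a * g)
  return a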
Example 7
def evaluator(data_source,
              option="all",
              model="cosine",
              tfidf=False,
              with_word_boundary=True,
              seed=0,
              warnings=False,
              weight=None):
    """
  Segments the data into 90-10 portions using tenfold(), 
  then trains a model using 90% of the data and evaluates on the remaining 10%.
  """
    from universalcorpus.miniethnologue import ISO2LANG, MACRO2LANG
    from extractfeature import sentence2ngrams
    from collections import defaultdict, Counter
    from multinomialnaivebayes import SGT
    from time import time

    ### Choose the data structure that records features, and the function
    ### that will be called when identifying a sentence
    if model == "cosine":
        DataStr = Counter
        # The sentence feature vectors are not normalised, to save time;
        # this does not affect classification.
        identify = dot_product

    elif model == "cosine-combined":
        DataStr = MultiCounter
        identify = sum_cosine
        option = "separate"
        if not weight:
            weight = [1, 1, 1, 1, 1, 1]

    else:
        print "Sorry, the model '{}' isn't available!".format(model)
        return None

    ### Get ready to record these statistics
    ten_fold_accuracy = []
    ten_fold_mrr = []
    ten_fold_precision = []
    ten_fold_recall = []
    ten_fold_fscore = []

    fold_counter = 0

    ### Set up the tenfold cross-validation, then evaluate on each fold
    for train, test in tenfold(data_source, randseed=seed):
        fold_counter += 1
        print "Loading fold {}...".format(fold_counter)
        start = time()

        ### Extract the features
        featureset = defaultdict(DataStr)
        for lang, trainsent in train:
            if lang in ISO2LANG or lang in MACRO2LANG:
                trainsentcount = DataStr(
                    sentence2ngrams(trainsent,
                                    option=option,
                                    with_word_boundary=with_word_boundary))
                if len(trainsentcount) > 0:
                    featureset[lang].update(trainsentcount)
                elif warnings:
                    print("*** No features for: {}".format(trainsent))
            elif warnings:
                print("*** {} not recognised!".format(lang))

        ### Process the features to produce weights
        if model == "cosine":
            print "Normalising to unit length..."
            for lang in featureset:
                normalise(featureset[lang])  # Updates featureset.
            if tfidf:
                print "Calculating tf-idf..."
                tfidfize(featureset)  # Updates featureset.

        elif model == "cosine-combined":
            print "Normalising and re-weighting components..."
            for lang in featureset:
                for i in range(6):
                    normalise(featureset[lang][i],
                              weight[i])  # Updates featureset.

        print "Evaluating..."
        fold_results = Counter(
        )  # Records the number of times the correct language is at a specific rank
        macro_true = defaultdict(
            int
        )  # These three are to calculate precision, recall, and f-score for each language
        macro_fpos = defaultdict(int)
        macro_fneg = defaultdict(int)

        ### Identify each sentence in the test data
        for lang, testsent in test:
            ### Extract features
            sentfeat = DataStr(
                sentence2ngrams(testsent,
                                option=option,
                                with_word_boundary=with_word_boundary))
            if len(sentfeat) == 0:
                print "*** No features for: {}".format(testsent)
                continue

            ### Predict the language
            results = identify(featureset, sentfeat)
            result_list = [code for score, code
                           in sorted(results, reverse=True)]
            try:
                # Compare the prediction with the answer
                rank = result_list.index(lang) + 1
            except ValueError:  # The language was not seen in training
                rank = float('inf')
            #print rank

            ### Note the result
            fold_results[rank] += 1
            if rank == 1:
                macro_true[lang] += 1
            else:
                macro_fneg[lang] += 1
                macro_fpos[result_list[0]] += 1

        ### Calculate statistics for this fold
        # float() guards against Python 2 integer division on the raw counts.
        total = float(sum(fold_results.values()))
        accuracy = fold_results[1] / total
        print "Accuracy: {}".format(accuracy)
        ten_fold_accuracy.append(accuracy)

        mrr = sum(count / float(rank)
                  for rank, count in fold_results.items()) / total
        print "Mean Reciprocal Rank: {}".format(mrr)
        ten_fold_mrr.append(mrr)

        langset = (set(macro_true.keys()) & set(macro_fpos.keys())
                   & set(macro_fneg.keys()))
        # float() again avoids integer division on the counts.
        precision = {
            lang: macro_true[lang] / float(macro_true[lang] + macro_fpos[lang])
            for lang in langset
        }
        recall = {
            lang: macro_true[lang] / float(macro_true[lang] + macro_fneg[lang])
            for lang in langset
        }
        fscore = {
            lang: 2 * precision[lang] * recall[lang] /
                  (precision[lang] + recall[lang])
            for lang in langset
        }

        average_precision = sum(precision.values()) / len(langset)
        average_recall = sum(recall.values()) / len(langset)
        average_fscore = sum(fscore.values()) / len(langset)

        print "Macro precision: {}".format(average_precision)
        print "Macro recall: {}".format(average_recall)
        print "Macro f-score: {}".format(average_fscore)

        ten_fold_precision.append(average_precision)
        ten_fold_recall.append(average_recall)
        ten_fold_fscore.append(average_fscore)

        end = time() - start
        print "{} seconds to evaluate {} sentences in fold {}\n".format(
            end, sum(fold_results.values()), fold_counter)

    ### Average over all folds
    overall_accuracy = sum(ten_fold_accuracy) / 10
    overall_mrr = sum(ten_fold_mrr) / 10
    overall_precision = sum(ten_fold_precision) / 10
    overall_recall = sum(ten_fold_recall) / 10
    overall_fscore = sum(ten_fold_fscore) / 10

    print "=============================================="
    print "Average accuracy: {}".format(overall_accuracy)
    print "Average MRR: {}".format(overall_mrr)
    print "Average macro precision: {}".format(overall_precision)
    print "Average macro recall: {}".format(overall_recall)
    print "Average macro f-score: {}".format(overall_fscore)

    return (overall_accuracy, overall_mrr, overall_precision, overall_recall,
            overall_fscore)
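
A hedged usage sketch, assuming tenfold() accepts the same tar paths used in train_nbc above; the weights for the six n-gram lengths are placeholders:

# Hypothetical usage: ten-fold evaluation of the combined cosine model.
stats = evaluator('../../data/udhr/udhr-unicode.tar',
                  model='cosine-combined',
                  weight=[1, 1, 1, 1, 1, 1],
                  seed=42)
accuracy, mrr, precision, recall, fscore = stats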
Example 8
  return exp(result)

'''
train = Counter({'a':1,'b':5,'c':2})
test = Counter({'b':1,'a':1})

langSGT = SGT(train)
langMLE = MLE(train)

print SGTestimate(langSGT,test)
print MLEestimate(langMLE,test)
'''

from extractfeature import sentence2ngrams, get_features
s = "ich bin schwanger"
test = Counter(sentence2ngrams(s, with_word_boundary=True))
print test

trainset = get_features('odin', option='3gram')
sgt_results = []
mle_results = []

'''
german = SGT(trainset['deu'])
wakawaka = SGT(trainset['wkw'])

for x in test:
  print x, trainset['deu'][x], SGTestimate(german, Counter({x:1}))
  print x, trainset['wkw'][x], SGTestimate(wakawaka, Counter({x:1}))

print len(trainset['wkw'])
Example 9
      featureset[lang][len(ngram)][ngram] = count
  for lang in featureset:
    for i in range(6):
      normalise(featureset[lang][i])
  identify = sum_cosine
  DataStr = MultiCounter
  option = "separate"
  with_word_boundary = True

char = None
word = None
numlang = len(featureset)

while True:
  text = raw_input("\nEnter text to be identified: ").decode(sys.stdin.encoding)
  sentfeat = DataStr(sentence2ngrams(text, option=option,
                                     with_word_boundary=with_word_boundary))
  results = identify(featureset, sentfeat)
  result_list = [code for score, code in sorted(results, reverse=True)]
  for i in range(5):
    code = result_list[i]
    print "  {}. {}: {}".format(i+1, code, unicode(ISO2LANG[code][0]).title())
  #print "\tTop ten results: {}".format(" ".join(result_list[0:10]))
  answercode = raw_input("What was the correct answer? ").decode(sys.stdin.encoding)
  try:
    answerlang = unicode(ISO2LANG[answercode][0]).title()
  except IndexError:
    try:
      answerlang = unicode(answercode)
      answercode = LANG2ISO[answerlang.lower()][0]
      print '  We interpret "{}" to mean "{}"'.format(answerlang, answercode)
      answerlang = answerlang.title()