def evaluate_crubadan_odin(filename):
    '''
    Trains models from the Crubadan data and runs them on Odin.

    Character-level and word-level feature sets are loaded with
    get_features('crubadan', ...), one SGT model per language is built from
    each, and every Odin source sentence is scored by both models.

    Output written to `filename`:
      * first line: the Odin labels, space separated;
      * then, for each language in sorted order, two lines -- the character
        model scores followed by the word model scores. Each line starts
        with the language code, followed by one float.hex() value per test
        sentence.

    filename -- path of the results file (opened in 'w' mode, so an
                existing file is overwritten).
    Returns None.
    Raises KeyError if a language in the character features is missing
    from the word features.
    '''
    print("Loading character features...")
    trainsetchar = get_features('crubadan', option='char')
    print("Loading word features...")
    trainsetword = get_features('crubadan', option='word')

    print("Loading test data...")
    # Walk the Odin sentences once so labels and features are guaranteed to
    # stay aligned and the corpus is not read a second time.
    labels = []
    test = []
    for item in odin.source_sents():
        labels.append(item[0])
        test.append((Counter(sentence2ngrams(item[1],
                                             with_word_boundary=True,
                                             option='allgrams')),
                     Counter(item[1].split())))

    print("Calculating results...")
    with open(filename, 'w') as f:
        f.write(' '.join(labels) + '\n')
        labels = None  # Release the label list; it is not needed any more.
        for lang in sorted(trainsetchar.keys()):
            print(lang)
            # pop() drops each training set as soon as its model exists,
            # which keeps memory use down while looping over languages.
            modelchar = SGT(trainsetchar.pop(lang))
            modelword = SGT(trainsetword.pop(lang))
            # Collect the fields and join once instead of growing a string.
            charresult = [lang]
            wordresult = [lang]
            for charfeat, wordfeat in test:
                charresult.append(float.hex(modelchar.estimate(charfeat)))
                wordresult.append(float.hex(modelword.estimate(wordfeat)))
            f.write(' '.join(charresult) + '\n')
            f.write(' '.join(wordresult) + '\n')
    print("Done!")
def featurize(text, all_features, option="3gram"):
    """
    Inputs a sentence string and outputs the np.array() of feature counts.

    text         -- the sentence to featurize.
    all_features -- iterable of features; it fixes the order (and length)
                    of the returned vector.
    option       -- passed through to sentence2ngrams() to select the kind
                    of n-gram extracted.

    Returns a numpy array whose k-th entry is the number of times the k-th
    feature of `all_features` occurs among the n-grams of `text` (0 when it
    does not occur).
    """
    import numpy as np
    from collections import Counter
    from extractfeature import sentence2ngrams

    # Extract and count the n-grams once up front rather than once per
    # feature; the counts do not depend on the feature being looked up.
    counts = Counter(sentence2ngrams(text, option=option))
    return np.array([counts[j] for j in all_features])
def evaluate_crubadan_odin(filename):
    '''
    Trains models from the Crubadan data and runs them on Odin.

    Loads the Crubadan character and word feature sets, builds one SGT model
    per language from each of them, and scores every Odin source sentence
    with both models.

    The results file `filename` (overwritten if it exists) contains:
      * a first line with the Odin labels, space separated;
      * for each language, in sorted order, one line of character-model
        scores and one line of word-model scores. A line is the language
        code followed by one float.hex() value per test sentence.

    Returns None. A KeyError is raised if a language found in the character
    features has no entry in the word features.
    '''
    print("Loading character features...")
    trainsetchar = get_features('crubadan', option='char')
    print("Loading word features...")
    trainsetword = get_features('crubadan', option='word')

    print("Loading test data...")
    # One pass over the corpus builds both lists, so labels and features
    # cannot get out of step and the sentences are only read once.
    labels = []
    test = []
    for item in odin.source_sents():
        labels.append(item[0])
        test.append((Counter(sentence2ngrams(item[1],
                                             with_word_boundary=True,
                                             option='allgrams')),
                     Counter(item[1].split())))

    print("Calculating results...")
    with open(filename, 'w') as f:
        f.write(' '.join(labels) + '\n')
        labels = None  # Free the labels once they have been written.
        for lang in sorted(trainsetchar.keys()):
            print(lang)
            # Popping removes the raw counts once the model is built, so
            # memory shrinks as the loop advances.
            modelchar = SGT(trainsetchar.pop(lang))
            modelword = SGT(trainsetword.pop(lang))
            # Build each row as a list and join it once at the end.
            charresult = [lang]
            wordresult = [lang]
            for charfeat, wordfeat in test:
                charresult.append(float.hex(modelchar.estimate(charfeat)))
                wordresult.append(float.hex(modelword.estimate(wordfeat)))
            f.write(' '.join(charresult) + '\n')
            f.write(' '.join(wordresult) + '\n')
    print("Done!")
def sugarlid_cosine(text, option='3gram', data_source='crubadan'):
    """
    Cosine Vector based sugarlid.

    The n-grams of `text` are joined into one space-separated query string.
    For every language returned by get_features(data_source, option=option)
    a string is built in which each n-gram is repeated as many times as its
    count, and the two strings are compared with cosine_similarity().

    Returns a list of (score, language) tuples sorted from highest to
    lowest; languages whose score is not positive are left out.
    """
    from cosine import cosine_similarity

    profiles = get_features(data_source, option=option)

    try:
        query_vector = " ".join(sentence2ngrams(text, option=option))
    except TypeError:
        # The n-grams are not plain strings (presumably sequences of
        # tokens -- TODO confirm), so glue each one together with "_" first.
        query_vector = " ".join(
            "_".join(gram) for gram in sentence2ngrams(text, option=option))
    print(query_vector)

    scored = []
    for lang in profiles:
        counts = profiles[lang]
        lang_vector = " ".join(
            [str(gram + " ") * counts[gram] for gram in counts])
        similarity = cosine_similarity(query_vector, lang_vector)
        if similarity > 0:
            scored.append((similarity, lang))
    scored.sort(reverse=True)
    return scored
def sugarlid_cosine(text, option='3gram', data_source='crubadan'):
    """
    Cosine Vector based sugarlid.

    Builds a space-separated string of the n-grams of `text`, builds a
    comparable string for each language in the features loaded from
    `data_source` (every n-gram repeated according to its count), and
    scores the pair with cosine_similarity().

    Returns the (score, language) tuples with a positive score, sorted in
    descending order.
    """
    from cosine import cosine_similarity

    lang_profiles = get_features(data_source, option=option)

    try:
        query_vector = " ".join(sentence2ngrams(text, option=option))
    except TypeError:
        # Fallback for n-grams that cannot be joined directly (presumably
        # tuples of tokens -- TODO confirm): flatten each with "_".
        flattened = ["_".join(gram)
                     for gram in sentence2ngrams(text, option=option)]
        query_vector = " ".join(flattened)
    print(query_vector)

    matches = []
    for code in lang_profiles:
        profile = lang_profiles[code]
        repeated = [str(gram + " ") * profile[gram] for gram in profile]
        lang_vector = " ".join(repeated)
        score = cosine_similarity(query_vector, lang_vector)
        if score > 0:
            matches.append((score, code))
    matches.sort(reverse=True)
    return matches
def classify_odin(sentence, verbose=True):
    '''
    Given an input string, classifies it based on Odin character n-grams.
    Effectively an informal test.

    sentence -- the text to classify.
    verbose  -- when true, print the ten best (score, language) pairs.

    Returns the full list of (score, language) pairs, best first. One
    SGT(train, min=6000) model is built per language in the 'odin'
    character features and asked to estimate() the sentence's n-gram counts.
    '''
    test = Counter(sentence2ngrams(sentence,
                                   with_word_boundary=True,
                                   option='allgrams'))
    trainset = get_features('odin', option='char')

    sgt_results = sorted(
        [(SGT(trainset[lang], min=6000).estimate(test), lang)
         for lang in trainset],
        reverse=True)

    if verbose:
        for entry in sgt_results[:10]:
            print(entry)
    return sgt_results
def classify_odin(sentence, verbose=True):
    '''
    Given an input string, classifies it based on Odin character n-grams.
    Effectively an informal test.

    Every language in the 'odin' character features gets its own
    SGT(train, min=6000) model, which scores the n-gram counts of
    `sentence`. The (score, language) pairs are returned sorted from best
    to worst; with `verbose` set, the first ten are also printed.
    '''
    ngram_counts = Counter(sentence2ngrams(sentence,
                                           with_word_boundary=True,
                                           option='allgrams'))
    trainset = get_features('odin', option='char')

    ranking = []
    for lang in trainset:
        model = SGT(trainset[lang], min=6000)
        ranking.append((model.estimate(ngram_counts), lang))
    ranking.sort(reverse=True)

    if verbose:
        for pair in ranking[:10]:
            print(pair)
    return ranking
def train_nbc(train=True):
    '''
    Builds one ({'3gram': ngram}, language) feature set per n-gram of every
    sentence extracted from the data sources, keeping only languages found
    in ISO2LANG, and prints how many feature sets were collected.

    train -- when true, return nbc.train(featuresets); otherwise pickle the
             feature sets to '3grams-featuresets.pk' and return None.

    Only the UDHR archive is enabled. The full set of sources that has been
    used here is:
    data_source = {'odin':'../../data/odin/odin-cleaner.tar',
                   'udhr':'../../data/udhr/udhr-unicode.tar',
                   'omniglotphrase':'../../data/omniglot/omniglotphrases.tar'}
    '''
    data_source = {'udhr': '../../data/udhr/udhr-unicode.tar'}

    featuresets = []
    for source in data_source:
        for lang, sent in extract_features_from_tarfile(data_source[source]):
            if lang not in ISO2LANG:
                continue
            featuresets.extend(
                ({'3gram': gram}, lang) for gram in sentence2ngrams(sent))
    print(len(featuresets))

    if train:
        return nbc.train(featuresets)
    with codecs.open('3grams-featuresets.pk', 'wb') as fout:
        pickle.dump(featuresets, fout)
def test(test_sentence, classifier, option=''):
    """
    Classifies `test_sentence` n-gram by n-gram and combines the results.

    Each n-gram is passed to classifier.prob_classify() as {'3gram': ngram};
    the probability of its most likely label is filed under that label.
    The probabilities collected for each label are then reduced to a single
    score by a mean selected from the first three letters of `option`:
      'geo' -- geometric_mean
      'ari' -- arithmetic_mean
      anything else -- arigeo_mean (arithmetic-geometric mean)

    Returns (best_label, answers), where `answers` maps each label to its
    score and `best_label` is the label with the highest score.
    Raises ValueError if the sentence produces no n-grams.
    """
    # Probability of the winning label for every n-gram, grouped by label.
    classes = defaultdict(list)
    for gram in sentence2ngrams(test_sentence):
        distribution = classifier.prob_classify({'3gram': gram})
        best = distribution.max()
        classes[best].append(distribution.prob(best))

    # Pick the averaging function once, then score every label with it.
    prefix = option[:3]
    if prefix == 'geo':
        mean = geometric_mean
    elif prefix == 'ari':
        mean = arithmetic_mean
    else:
        mean = arigeo_mean
    answers = {label: mean(probs) for label, probs in classes.items()}

    return max(answers, key=answers.get), answers
def featurize(text, all_features, option="3gram"):
    """
    Inputs a sentence string and outputs the np.array() of feature counts.

    text         -- the sentence to featurize.
    all_features -- iterable of features; it determines the order and the
                    length of the returned vector.
    option       -- forwarded to sentence2ngrams() to choose the n-gram
                    type.

    Returns a numpy array holding, for each feature in `all_features`, how
    often it occurs among the n-grams of `text` (0 if absent).
    """
    # The counts are the same for every feature, so compute them once
    # instead of re-extracting the n-grams inside the loop.
    counts = Counter(sentence2ngrams(text, option=option))
    return np.array([counts[j] for j in all_features])
normalise(featureset[lang][i]) identify = sum_cosine DataStr = MultiCounter option = "separate" with_word_boundary = True char = None word = None numlang = len(featureset) while True: input = raw_input("\nEnter text to be identified: ").decode( sys.stdin.encoding) sentfeat = DataStr( sentence2ngrams(input, option=option, with_word_boundary=with_word_boundary)) results = identify(featureset, sentfeat) result_list = [code for score, code in sorted(results, reverse=True)] for i in range(5): code = result_list[i] print " {}. {}: {}".format(i + 1, code, unicode(ISO2LANG[code][0]).title()) #print "\tTop ten results: {}".format(" ".join(result_list[0:10])) answercode = raw_input("What was the correct answer? ").decode( sys.stdin.encoding) try: answerlang = unicode(ISO2LANG[answercode][0]).title() except IndexError: try: answerlang = unicode(answercode)
def evaluator(data_source, option="all", model="cosine", tfidf=False,
              with_word_boundary=True, seed=0, warnings=False, weight=None):
    """
    Segments the data into 90-10 portions using tenfold(), then trains a
    model using 90% of the data and evaluates on the remaining 10%.

    data_source        -- passed to tenfold() to produce (train, test) folds.
    option             -- n-gram option passed to sentence2ngrams(); forced
                          to "separate" for the "cosine-combined" model.
    model              -- "cosine" or "cosine-combined".
    tfidf              -- for "cosine" only: apply tfidfize() after
                          normalising.
    with_word_boundary -- passed to sentence2ngrams().
    seed               -- passed to tenfold() as randseed.
    warnings           -- print a message for training sentences that are
                          skipped (unknown language or no features).
    weight             -- for "cosine-combined" only: six component weights
                          passed to normalise(); defaults to all ones.

    Per fold, prints accuracy, mean reciprocal rank and macro-averaged
    precision / recall / f-score. The macro averages cover only languages
    that have at least one true positive, one false positive and one false
    negative in the fold; if there is no such language they are reported
    as 0.0.

    Returns (accuracy, mrr, precision, recall, fscore) averaged over the
    folds that were evaluated, or None if `model` is not recognised.

    NOTE(review): the ratios below rely on true division; under Python 2
    that requires `from __future__ import division` at the top of the file
    -- confirm.
    """
    from universalcorpus.miniethnologue import ISO2LANG, MACRO2LANG
    from extractfeature import sentence2ngrams
    from collections import defaultdict, Counter
    from time import time

    ### Choose the data structure to record features, and the function that
    ### will be called when identifying a sentence
    if model == "cosine":
        DataStr = Counter
        # The sentence feature vectors will not be normalised, to save time.
        # This does not affect classification.
        identify = dot_product
    elif model == "cosine-combined":
        DataStr = MultiCounter
        identify = sum_cosine
        option = "separate"
        if not weight:
            weight = [1, 1, 1, 1, 1, 1]
    else:
        print("Sorry, the model '{}' isn't available!".format(model))
        return None

    ### Get ready to record these statistics
    ten_fold_accuracy = []
    ten_fold_mrr = []
    ten_fold_precision = []
    ten_fold_recall = []
    ten_fold_fscore = []
    fold_counter = 0

    ### Set up the tenfold cross-validation, then evaluate on each fold
    for train, test in tenfold(data_source, randseed=seed):
        fold_counter += 1
        print("Loading fold {}...".format(fold_counter))
        start = time()

        ### Extract the features
        featureset = defaultdict(DataStr)
        for lang, trainsent in train:
            if lang in ISO2LANG or lang in MACRO2LANG:
                trainsentcount = DataStr(
                    sentence2ngrams(trainsent, option=option,
                                    with_word_boundary=with_word_boundary))
                if len(trainsentcount) > 0:
                    featureset[lang].update(trainsentcount)
                elif warnings:
                    print("*** No features for: {}".format(trainsent))
            elif warnings:
                print("*** {} not recognised!".format(lang))

        ### Process the features to produce weights
        if model == "cosine":
            print("Normalising to unit length...")
            for lang in featureset:
                normalise(featureset[lang])  # Updates featureset.
            if tfidf:
                print("Calculating tf-idf...")
                tfidfize(featureset)  # Updates featureset.
        elif model == "cosine-combined":
            print("Normalising and re-weighting components...")
            for lang in featureset:
                for i in range(6):
                    # Updates featureset.
                    normalise(featureset[lang][i], weight[i])

        print("Evaluating...")
        # Records the number of times the correct language is at a
        # specific rank
        fold_results = Counter()
        # These three are to calculate precision, recall, and f-score for
        # each language
        macro_true = defaultdict(int)
        macro_fpos = defaultdict(int)
        macro_fneg = defaultdict(int)

        ### Identify each sentence in the test data
        for lang, testsent in test:
            ### Extract features
            sentfeat = DataStr(
                sentence2ngrams(testsent, option=option,
                                with_word_boundary=with_word_boundary))
            if len(sentfeat) == 0:
                print("*** No features for: {}".format(testsent))
                continue

            ### Predict the language
            results = identify(featureset, sentfeat)
            result_list = [code for score, code
                           in sorted(results, reverse=True)]
            try:
                # Compare the prediction with the answer
                rank = result_list.index(lang) + 1
            except ValueError:
                # If the language was not seen in training
                rank = float('inf')

            ### Note the result
            fold_results[rank] += 1
            if rank == 1:
                macro_true[lang] += 1
            else:
                macro_fneg[lang] += 1
                # With no candidates at all there is no wrongly predicted
                # language to charge the false positive to.
                if result_list:
                    macro_fpos[result_list[0]] += 1

        ### Calculate statistics for this fold
        evaluated = sum(fold_results.values())
        if evaluated:
            accuracy = fold_results[1] / evaluated
            mrr = sum([count / rank
                       for rank, count in fold_results.items()]) / evaluated
        else:
            # No test sentence produced features: avoid dividing by zero.
            accuracy = 0.0
            mrr = 0.0
        print("Accuracy: {}".format(accuracy))
        ten_fold_accuracy.append(accuracy)
        print("Mean Reciprocal Rank: {}".format(mrr))
        ten_fold_mrr.append(mrr)

        # Restricting to languages present in all three tallies keeps every
        # denominator below non-zero.
        langset = (set(macro_true.keys()) & set(macro_fpos.keys())
                   & set(macro_fneg.keys()))
        if langset:
            precision = {
                lang: macro_true[lang] / (macro_true[lang] + macro_fpos[lang])
                for lang in langset
            }
            recall = {
                lang: macro_true[lang] / (macro_true[lang] + macro_fneg[lang])
                for lang in langset
            }
            fscore = {
                lang: 2 * precision[lang] * recall[lang]
                / (precision[lang] + recall[lang])
                for lang in langset
            }
            average_precision = sum(precision.values()) / len(langset)
            average_recall = sum(recall.values()) / len(langset)
            average_fscore = sum(fscore.values()) / len(langset)
        else:
            # E.g. a fold with no mistakes has no false positives, so the
            # intersection is empty; report 0.0 rather than crash.
            print("*** No language qualifies for macro scores in this fold; "
                  "reporting 0.0")
            average_precision = 0.0
            average_recall = 0.0
            average_fscore = 0.0
        print("Macro precision: {}".format(average_precision))
        print("Macro recall: {}".format(average_recall))
        print("Macro f-score: {}".format(average_fscore))
        ten_fold_precision.append(average_precision)
        ten_fold_recall.append(average_recall)
        ten_fold_fscore.append(average_fscore)

        elapsed = time() - start
        print("{} seconds to evaluate {} sentences in fold {}\n".format(
            elapsed, evaluated, fold_counter))

    ### Average over all folds
    # Divide by the number of folds actually evaluated instead of assuming
    # ten; fall back to 1 so that zero folds still yields zeros.
    num_folds = len(ten_fold_accuracy) or 1
    overall_accuracy = sum(ten_fold_accuracy) / num_folds
    overall_mrr = sum(ten_fold_mrr) / num_folds
    overall_precision = sum(ten_fold_precision) / num_folds
    overall_recall = sum(ten_fold_recall) / num_folds
    overall_fscore = sum(ten_fold_fscore) / num_folds
    print("==============================================")
    print("Average accuracy: {}".format(overall_accuracy))
    print("Average MRR: {}".format(overall_mrr))
    print("Average macro precision: {}".format(overall_precision))
    print("Average macro recall: {}".format(overall_recall))
    print("Average macro f-score: {}".format(overall_fscore))
    return (overall_accuracy, overall_mrr, overall_precision, overall_recall,
            overall_fscore)
return exp(result) ''' train = Counter({'a':1,'b':5,'c':2}) test = Counter({'b':1,'a':1}) langSGT = SGT(train) langMLE = MLE(train) print SGTestimate(langSGT,test) print MLEestimate(langMLE,test) ''' from extractfeature import sentence2ngrams, get_features s = "ich bin schwanger" test = Counter(sentence2ngrams(s, with_word_boundary=True)) print test trainset = get_features('odin', option='3gram') sgt_results = [] mle_results = [] ''' german = SGT(trainset['deu']) wakawaka = SGT(trainset['wkw']) for x in test: print x, trainset['deu'][x], SGTestimate(german, Counter({x:1})) print x, trainset['wkw'][x], SGTestimate(wakawaka, Counter({x:1})) print len(trainset['wkw'])
featureset[lang][len(ngram)][ngram] = count for lang in featureset: for i in range(6): normalise(featureset[lang][i]) identify = sum_cosine DataStr = MultiCounter option = "separate" with_word_boundary = True char = None word = None numlang = len(featureset) while True: input = raw_input("\nEnter text to be identified: ").decode(sys.stdin.encoding) sentfeat = DataStr(sentence2ngrams(input, option=option, with_word_boundary=with_word_boundary)) results = identify(featureset, sentfeat) result_list = [code for score, code in sorted(results, reverse=True)] for i in range(5): code = result_list[i] print " {}. {}: {}".format(i+1, code, unicode(ISO2LANG[code][0]).title()) #print "\tTop ten results: {}".format(" ".join(result_list[0:10])) answercode = raw_input("What was the correct answer? ").decode(sys.stdin.encoding) try: answerlang = unicode(ISO2LANG[answercode][0]).title() except IndexError: try: answerlang = unicode(answercode) answercode = LANG2ISO[answerlang.lower()][0] print ' We interpret "{}" to mean "{}"'.format(answerlang, answercode) answerlang = answerlang.title()