def update(weights, predseq, labelseq, postagseq, vecs1, vecs2, sent):
    for i in range(len(predseq)):
        true = labelseq[i]
        pred = predseq[i]
        pos = postagseq[i]
        vec1 = vecs1[i]
        vec2 = vecs2[i]
        if i == 0:
            prev_true = '*'
            prev_pred = '*'
        else:
            prev_true = labelseq[i - 1]
            prev_pred = predseq[i - 1]
        if true != pred:
            true_feats = features.extract(sent[i], true, prev_true, pos, vec1, vec2)
            for feat in true_feats:
                if feat in weights:
                    weights[feat] += step
            pred_feats = features.extract(sent[i], pred, prev_pred, pos, vec1, vec2)
            for feat in pred_feats:
                if feat in weights:
                    weights[feat] -= step
    return weights
def ikProblemToFeatures(ikproblem, featureList):
    """Standard feature extractor for IKProblems"""
    if isinstance(ikproblem, dict):
        # assume it's a JSON object already
        return features.extract(ikproblem, featureList)
    elif isinstance(ikproblem, IKProblem):
        jsonObj = ikproblem.toJson()
        return features.extract(jsonObj, featureList)
    elif isinstance(ikproblem, IKObjective):
        return ikObjectiveToFeatures(ikproblem, featureList)
    else:
        assert hasattr(ikproblem, '__iter__'), "IK problem must either be an IKProblem, single IKObjective, or a list"
        return sum([ikObjectiveToFeatures(o, f) for o, f in zip(ikproblem, featureList)], [])
def testPOSBIGRAMS(self):
    text = "I love you."
    results = features.extract(text, [Features.POSBIGRAMS], False)
    result = results[Features.POSBIGRAMS]
    self.assertEqual(1, result['(VBP)-(PRP)'])
    self.assertEqual(1, result['(PRP)-(VBP)'])
    self.assertEqual(1, result['(PRP)-(.)'])
    results = features.extract(text, [Features.POSBIGRAMS], True)
    result = results[Features.POSBIGRAMS]
    self.assertEqual(1/3.0, result['(VBP)-(PRP)'])
    self.assertEqual(1/3.0, result['(PRP)-(VBP)'])
    self.assertEqual(1/3.0, result['(PRP)-(.)'])
    self.assertEqual(1, sum(result.values()))
def testPOSUnigrams(self):
    text = "I love you."
    results = features.extract(text, [Features.POSUNIGRAM], False)
    result = results[Features.POSUNIGRAM]
    self.assertEqual(1, result['VBP'])
    self.assertEqual(2, result['PRP'])
    self.assertEqual(1, result['.'])
    results = features.extract(text, [Features.POSUNIGRAM], True)
    result = results[Features.POSUNIGRAM]
    self.assertEqual(1/4.0, result['VBP'])
    self.assertEqual(2/4.0, result['PRP'])
    self.assertEqual(1/4.0, result['.'])
    self.assertEqual(1, sum(result.values()))
def update(weights, predseq, labelseq, sent, postagseq, info, ad):
    for i in range(len(predseq) + 1):
        if i == len(predseq):
            word = ''
            pos = ''
            true = '<STOP>'
            pred = '<STOP>'
        else:
            word = sent[i]
            true = labelseq[i]
            pred = predseq[i]
            pos = postagseq[i]
        if i == 0:
            prev_true = '*'
            prev_pred = '*'
        else:
            prev_true = labelseq[i-1]
            prev_pred = predseq[i-1]
        if true != pred or (i == len(predseq) and prev_true != prev_pred):
            true_feats = extract(word, true, prev_true, pos, info)
            pred_feats = extract(word, pred, prev_pred, pos, info)
            up = set(true_feats).difference(set(pred_feats))
            down = set(pred_feats).difference(set(true_feats))
            # ADAGRAD: accumulate per-feature update counts
            for u in up:
                if u in ad:
                    ad[u] += 1
            for d in down:
                if d in ad:
                    ad[d] += 1
            for u in up:
                if u in weights:
                    if ad[u] > 0.0:
                        weights[u] += step / math.sqrt(ad[u])  # ADAGRAD-scaled step
                    else:
                        weights[u] += step  # fallback when no count has accumulated
            for d in down:
                if d in weights:
                    if ad[d] > 0.0:
                        weights[d] -= step / math.sqrt(ad[d])  # ADAGRAD-scaled step
                    else:
                        weights[d] -= step  # fallback when no count has accumulated
    return weights
def test_level2(sourceword, target):
    bestoutfn = "../L2output/{0}.{1}.best".format(sourceword, target)
    oofoutfn = "../L2output/{0}.{1}.oof".format(sourceword, target)
    bestoutfile = open(bestoutfn, 'w')
    oofoutfile = open(oofoutfn, 'w')
    level2_classifier = util_run_experiment.get_pickled_classifier(sourceword, target, 'level2')
    frd1, frd2, frd3, frd4 = sorted(list(get_four_friends(target)))  ## Need 4 more features from level1.
    classfrd1, classfrd2, classfrd3, classfrd4 = get_level1_classifiers(frd1, frd2, frd3, frd4, sourceword)
    # finaldir = "../trialdata/alltrials/"
    finaldir = "../finaltest"
    problems = util_run_experiment.get_test_instances(finaldir, sourceword)
    for problem in problems:
        level1_features = features.extract(problem)
        answer_frd1 = classfrd1.classify(level1_features)
        answer_frd2 = classfrd2.classify(level1_features)
        answer_frd3 = classfrd3.classify(level1_features)
        answer_frd4 = classfrd4.classify(level1_features)
        level2_features = train_extracted_level2.extend_features(
            level1_features,
            (answer_frd1, answer_frd2, answer_frd3, answer_frd4),
            frd1, frd2, frd3, frd4)
        level2_answer = level2_classifier.classify(level2_features)
        level2_dist = level2_classifier.prob_classify(level2_features)
        oof_answers = util_run_experiment.topfive(level2_dist)
        print(output_one_best(problem, target, level2_answer), file=bestoutfile)
        print(output_five_best(problem, target, oof_answers), file=oofoutfile)
def main():
    parser = argparse.ArgumentParser(description='clwsd')
    parser.add_argument('--sourceword', type=str, nargs=1, required=True)
    parser.add_argument('--targetlang', type=str, nargs=1, required=True)
    parser.add_argument('--classifier', type=str, nargs=1, required=False)
    args = parser.parse_args()
    all_target_languages = "de es fr it nl".split()
    assert args.targetlang[0] in all_target_languages
    target = args.targetlang[0]
    sourceword = args.sourceword[0]
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = get_maxent_classifier(sourceword, target)
    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    ## XXX(alexr): fix later.
    stanford.taggerhome = "/home/alex/software/stanford-postagger-2012-11-11"
    problems = extract_wsd_problems(fn)
    gold_answers = read_gold.get_gold_answers(sourceword, target)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(problem.tokenized)
        print(answer)
        label = gold_answers[problem.instance_id]
        print("CORRECT" if label == answer else "WRONG", end=" ")
        print("should be:", label)
def main():
    parser = argparse.ArgumentParser(description='clwsd')
    parser.add_argument('--sourceword', type=str, required=True)
    parser.add_argument('--targetlang', type=str, required=True)
    parser.add_argument('--taggerhome', type=str, required=True)
    args = parser.parse_args()
    all_target_languages = "de es fr it nl".split()
    assert args.targetlang in all_target_languages
    target = args.targetlang
    sourceword = args.sourceword
    stanford.taggerhome = args.taggerhome
    gold_answers = read_gold.get_gold_answers(sourceword, target)
    instances = get_training_data(sourceword, target)
    print("... training ...")
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = MaxentClassifier.train(instances, trace=0, algorithm='megam')
    print("LABELS", classifier.labels())
    ## with open("../eval/{0}.output".format(sourceword), "w") as outfile:
    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    problems = extract_wsd_problems(fn)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(output_one_best(problem, target, answer))
        label = gold_answers[problem.instance_id]
        print("CORRECT" if label == answer else "WRONG")
        print("distribution was...")
        dist = classifier.prob_classify(featureset)
        for key in dist.samples():
            print("  ", key, dist.prob(key))
def main():
    with open('model.pickle') as f:
        vectorizer, model = pickle.load(f)
    for hyp1, hyp2, ref in sentences():
        f1 = features.extract(ref, hyp1)
        f2 = features.extract(ref, hyp2)
        f = features.diff(f2, f1)
        if f['min_match'] == f['min_match2'] == f['min_match3'] == 0:
            print 0
            continue
        score = model.predict(vectorizer.transform((f,)))  # w . (f_2 - f_1)
        if score > 0:
            print 1
        else:
            print -1
def mrf_optimize(problem):
    """Build the MRF and do the optimization!!"""
    featureset = features.extract(problem)
    for lang in all_target_languages:
        classifier = classifiers[lang]
        unary = unary_penalty_table(classifier, featureset)
        print(unary)
        langnode = Node(lang, unary)
    ## get all the combinations of nodes.
    langpairs = list(itertools.combinations(all_target_languages, 2))
    ## create an edge for each language pair.
    for l1, l2 in langpairs:
        print("building edges for", l1, l2)
        edgepotentials = {}
        for val1 in possible_values(l1):
            for val2 in possible_values(l2):
                cooccurrence = cooccurrences[(l1, l2)]
                joint = cooccurrence.lookup_joint(l1, val1, l2, val2)
                # negative logprob of the joint probability. Definitely the best
                # edge potential, for sure.
                edgepotentials[(val1, val2)] = -math.log(joint, 2)
        Edge(l1, l2, edgepotentials)
    ## XXX how many iterations?
    print("mrf solving!!!")
    answers, oof_answers = beliefprop(10)
    print("mrf solved!!!")
    return answers, oof_answers
def get_training_data_from_extracted(sourceword, targetlang):
    """Return a list of (featureset, label) for training."""
    out = []
    problems = []
    fn = "../trainingdata/{0}.{1}.train".format(sourceword, targetlang)
    with open(fn) as infile:
        lines = infile.readlines()
    lines = [line.strip() for line in lines]
    contexts = [line for line in lines[0::3]]
    indices = [int(line) for line in lines[1::3]]
    labelss = [line.split(",") for line in lines[2::3]]
    assert len(contexts) == len(labelss) == len(indices)
    answers = []
    for context, index, labels in zip(contexts, indices, labelss):
        problem = WSDProblem(sourceword, context, testset=False, head_index=index)
        for label in labels:
            if label == '':
                continue
            problems.append(problem)
            answers.append(label)
    for problem, answer in zip(problems, answers):
        featureset = features.extract(problem)
        label = answer
        assert(type(label) is str)
        out.append((featureset, label))
        # print("###the features are: \n{}".format(featureset))
        # input()
    return out
def extract_all_features(formatted_corpus):
    sent_cnt = 0
    y_symbols = []  # Our array of transitions
    X_dict = list()  # Our matrix
    for sentence in formatted_corpus:
        sent_cnt += 1
        if sent_cnt % 1000 == 0:
            a = 1
            # print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        while queue:
            x = features.extract(stack, queue, graph, FEATURE_NAMES, sentence)
            X_dict.append(x)
            stack, queue, graph, trans = reference(stack, queue, graph)
            y_symbols.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
    # print(y_symbols)
    # print(graph)
    return X_dict, y_symbols
def read_dataset(filename):
    X, y = [], []
    points = features.extract(filename)
    for f, v in points:
        X.append(f.todict())
        y.append(v)
    return X, y
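A minimal usage sketch of the snippet above: it assumes scikit-learn and a hypothetical "train.dat" input file (neither is named in the original), and shows how the feature dicts returned by read_dataset could be vectorized and fitted. DictVectorizer is already used by other snippets in this section; LogisticRegression is an illustrative choice, not the original model.

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

X, y = read_dataset("train.dat")       # hypothetical training file
vec = DictVectorizer(sparse=True)      # list of feature dicts -> sparse matrix
X_matrix = vec.fit_transform(X)
clf = LogisticRegression(max_iter=1000).fit(X_matrix, y)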
def getTrainDS():
    root = './data'
    X = np.mat([0] * (len(markerList)))
    y = np.array([])
    for lang in langs:
        for scDir in os.listdir(os.path.join(root, lang)):
            sc = getSC(os.path.join(root, lang, scDir))
            feature = extract(sc)
            feature = np.asmatrix(feature)
            X = np.append(X, feature, axis=0)
            X = np.asmatrix(X)
            y = np.append(y, [lang])
            print "[SUCCESS] Extracted", scDir
            del feature
    X = X[1:]
    print "Shuffling datasets."
    X, y = shuffle(X, y)
    print "[SUCCESS] Shuffled datasets"
    print "Splitting and saving datasets."
    ds = splitDS(X, y)
    print "[SUCCESS] Split datasets."
    saveDS(ds)
    print "[SUCCESS] Saved datasets."
    del X, y
def testLetterUnigram(self):
    text = "ekin"
    results = features.extract(text, [Features.LETTERUNIGRAM], False)
    result = results[Features.LETTERUNIGRAM]
    self.assertEqual(1, result[ord('e') - ord('a')])
    self.assertEqual(1, result[ord('k') - ord('a')])
    self.assertEqual(1, result[ord('i') - ord('a')])
    self.assertEqual(1, result[ord('n') - ord('a')])
    self.assertEqual(4, sum(result))
    results = features.extract(text, [Features.LETTERUNIGRAM], True)
    result = results[Features.LETTERUNIGRAM]
    self.assertEqual(1/4.0, result[ord('e') - ord('a')])
    self.assertEqual(1/4.0, result[ord('k') - ord('a')])
    self.assertEqual(1/4.0, result[ord('i') - ord('a')])
    self.assertEqual(1/4.0, result[ord('n') - ord('a')])
    self.assertEqual(4/4.0, sum(result))

    text = ""
    results = features.extract(text, [Features.LETTERUNIGRAM], False)
    result = results[Features.LETTERUNIGRAM]
    self.assertEqual(0, sum(result))
    results = features.extract(text, [Features.LETTERUNIGRAM], True)
    result = results[Features.LETTERUNIGRAM]
    self.assertEqual(0, sum(result))

    text = "eee"
    results = features.extract(text, [Features.LETTERUNIGRAM], False)
    result = results[Features.LETTERUNIGRAM]
    self.assertEqual(3, result[ord('e') - ord('a')])
    self.assertEqual(3, sum(result))
    results = features.extract(text, [Features.LETTERUNIGRAM], True)
    result = results[Features.LETTERUNIGRAM]
    self.assertEqual(1.0, result[ord('e') - ord('a')])
    self.assertEqual(1.0, sum(result))

    text = "ee k ee"
    results = features.extract(text, [Features.LETTERUNIGRAM], False)
    result = results[Features.LETTERUNIGRAM]
    self.assertEqual(4, result[ord('e') - ord('a')])
    self.assertEqual(1, result[ord('k') - ord('a')])
    self.assertEqual(5, sum(result))
    results = features.extract(text, [Features.LETTERUNIGRAM], True)
    result = results[Features.LETTERUNIGRAM]
    self.assertEqual(4/5.0, result[ord('e') - ord('a')])
    self.assertEqual(1/5.0, result[ord('k') - ord('a')])
    self.assertEqual(5/5.0, sum(result))

    text = "\nn"
    results = features.extract(text, [Features.LETTERUNIGRAM], False)
    result = results[Features.LETTERUNIGRAM]
    self.assertEqual(1.0, result[ord('n') - ord('a')])
    self.assertEqual(1.0, sum(result))
def main(args: argparse.Namespace) -> None:
    with open(args.data, "r") as src:
        with open(args.features, "w") as out_file:
            for line in src:
                line = word_tokenize(line.replace(":", "_"))
                feature_list = features.extract(line)
                for feature in feature_list:
                    print("\t".join(feature), file=out_file)
                print("", file=out_file)
def get_score(word, current_tag, prev_tag, postag, weights, info):
    score = 0.0
    features_list = extract(word, current_tag, prev_tag, postag, info)
    for feature in features_list:
        if feature in weights:
            score += weights[feature]
    return score
def get_score(word, current_tag, prev_tag, pos, v1, v2, weights):
    score = 0.0
    features_list = extract(word, current_tag, prev_tag, pos, v1, v2)
    for feature in features_list:
        if feature in weights:
            score += weights[feature]
    return score, features_list
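A standalone sketch of how the additive scoring above works. The toy extract() and the weight values are illustrative assumptions, not the original feature templates, and the sketch assumes this extract and get_score live in the same module.

def extract(word, current_tag, prev_tag, pos, v1, v2):
    # toy templates: unigram tag, tag/word pair, and tag bigram
    return ['tag=' + current_tag,
            'tag+word=' + current_tag + '/' + word,
            'bigram=' + prev_tag + '->' + current_tag]

weights = {'tag=NN': 0.5, 'tag+word=NN/dog': 1.2, 'bigram=DT->NN': 0.8}
score, feats = get_score('dog', 'NN', 'DT', 'NN', None, None, weights)
print(score)  # 2.5: the sum of the three matching feature weights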
def playing(colrow, g):
    # takes a board position and adds a player's move
    col, row = colrow[0], colrow[1:3]
    row = BOARD - int(row)
    col = alphabet.index(col)
    tmp = g[0].copy()
    g[0] = g[1]
    g[1] = tmp
    g = features.extract(g, row, col, BOT)
    return prediction(g)
def read_dataset(filename, use_text_features):
    if use_text_features:
        mask = set(['meta', 'text'])
    else:
        mask = set(['meta'])
    X, y = [], []
    points = list(features.filter(features.extract(filename), mask))
    for f, v in points:
        X.append(f.todict())
        y.append(v)
    return X, y
def get_training_data_from_extracted(sourceword, targetlang):
    """Return a list of (featureset, label) for training."""
    frd1, frd2, frd3, frd4 = sorted(list(get_four_friends(targetlang)))  ## Get other four languages.
    ## Get the intersection of four training sentences.
    tool_class = Occurrence(sourceword, frd1, frd2)
    intersection = tool_class.get_common_four_sents(sourceword, frd1, frd2, frd3, frd4)
    out = []
    problems = []
    fn = "../trainingdata/{0}.{1}.train".format(sourceword, targetlang)
    with open(fn) as infile:
        lines = infile.readlines()
    lines = [line.strip() for line in lines]
    contexts = [line for line in lines[0::3]]
    indices = [int(line) for line in lines[1::3]]
    labelss = [line.split(",") for line in lines[2::3]]
    assert len(contexts) == len(labelss) == len(indices)
    print("the length of them...", len(contexts), len(indices), len(labelss))
    # input()
    answers = []
    extention = []
    for context, index, labels in zip(contexts, indices, labelss):
        sentence_id = context + "####" + str(index)
        if sentence_id in intersection:
            ## If this sentence also appears in 4 other languages, we can use more features...
            problem = WSDProblem(sourceword, context, testset=False, head_index=index)
            more_featuress = intersection[sentence_id]
            # print(more_featuress)
            for more_feature in more_featuress:
                for label in labels:
                    if label == "":
                        continue
                    problems.append(problem)
                    # more_features = intersection[context]
                    extention.append(more_feature)
                    answers.append(label)
    print("###intersection for five languages....{}\n".format(len(extention)))
    for problem, answer, more_feature in zip(problems, answers, extention):
        featureset = features.extract(problem)
        featureset = extend_features(featureset, more_feature, frd1, frd2, frd3, frd4)
        label = answer
        assert type(label) is str
        # print("=================@@@@features {}\n@@@@label{}\n".format(featureset, label))
        out.append((featureset, label))
    print("###Length of the output should be the same{}\n".format(len(out)))
    return out
def extract_all_train_features(sents, tagseqs, postagseqs, info):
    featset = set([])
    i = 0
    for sent in sents:
        sys.stderr.write(str(i) + "\r")
        j = 0
        for word in sent:
            tag = tagseqs[i][j]
            postag = postagseqs[i][j]
            if j == 0:
                # first position
                prev = '*'
            else:
                prev = tagseqs[i][j-1]
            featset.update(extract(word, tag, prev, postag, info))  # get a list of all features possible
            j += 1
        # features for the last label
        featset.update(extract('', '<STOP>', tag, '', info))
        i += 1
    featlist = list(featset)
    for f in featlist:
        print f
    return featlist
def extract_features(self):
    """This method is used to extract features (A, B, D)."""
    returnVars = features.extract(self.original_image, self.contour_mask, self.contour)
    if len(returnVars) == 0:
        self.feature_set = returnVars
    else:
        self.feature_set = returnVars[0]
        self.asymmetry_horizontal = returnVars[1]
        self.asymmetry_vertical = returnVars[2]
        self.warp_img_segmented = returnVars[3]
def get_training_data(sourceword, target):
    """Return a list of (featureset, label) for training."""
    out = []
    ## map from id to labels
    gold_answers = read_gold.get_gold_answers(sourceword, target)
    problems = get_training_problems(sourceword)
    ## now collate them.
    for problem in problems:
        theid = problem.instance_id
        featureset = features.extract(problem)
        label = gold_answers[theid]
        out.append((featureset, label))
    return out
def convert_to_features(data_part) -> np.ndarray:
    """This converts a given group to features"""
    current_features = Features()
    feature_list = list()
    last_features = current_features
    for row in data_part.itertuples():
        row_data = Row(row)
        current_features.update(row_data)
        feature_list.append(
            features.extract(row_data, current_features, last_features))
        last_features = current_features.snapshot()
    return np.array(feature_list)
def get_training_data_from_extracted(sourceword, targetlang):
    """Return a list of (featureset, label) for training."""
    frd1, frd2, frd3, frd4 = sorted(list(get_four_friends(targetlang)))  ## Get other four languages.
    classfrd1, classfrd2, classfrd3, classfrd4 = get_level1_classifiers(frd1, frd2, frd3, frd4, sourceword)
    ## Get the intersection of four training sentences.
    tool_class = Occurrence(sourceword, frd1, frd2)
    out = []
    problems = []
    fn = "../trainingdata/{0}.{1}.train".format(sourceword, targetlang)
    with open(fn) as infile:
        lines = infile.readlines()
    lines = [line.strip() for line in lines]
    contexts = [line for line in lines[0::3]]
    indices = [int(line) for line in lines[1::3]]
    labelss = [line.split(",") for line in lines[2::3]]
    assert len(contexts) == len(labelss) == len(indices)
    print("the length of them...", len(contexts), len(indices), len(labelss))
    # input()
    answers = []
    for context, index, labels in zip(contexts, indices, labelss):
        problem = WSDProblem(sourceword, context, testset=False, head_index=index)
        # print(more_featuress)
        for label in labels:
            if label == '':
                continue
            problems.append(problem)
            # more_features = intersection[context]
            answers.append(label)
    for problem, answer in zip(problems, answers):
        level1_features = features.extract(problem)
        answer_frd1 = classfrd1.classify(level1_features)
        answer_frd2 = classfrd2.classify(level1_features)
        answer_frd3 = classfrd3.classify(level1_features)
        answer_frd4 = classfrd4.classify(level1_features)
        level2_features = extend_features(
            level1_features,
            (answer_frd1, answer_frd2, answer_frd3, answer_frd4),
            frd1, frd2, frd3, frd4)
        label = answer
        assert(type(label) is str)
        # print("=================@@@@features {}\n@@@@label{}\n".format(featureset, label))
        out.append((level2_features, label))
    print("###Length of the output should be the same{}\n".format(len(out)))
    return out
def main():
    print(os.getcwd())
    print("Create Dataset")
    signal, y = dataset()
    df_y = pd.DataFrame(data=y, columns=['genre'])
    # construct features
    print("Feature Extraction")
    df_x = pd.DataFrame()
    for i in range(0, len(signal)):
        print("number " + str(i))
        new_x = pd.DataFrame(features.extract(signal[i]), index=[i])
        df_x = df_x.append(new_x)
    saveFeature(df_x, df_y)
def disambiguate_words(words):
    """Given a list of words/lemmas, return a list of disambiguation answers for them."""
    classifiers = [classifier_for(word, nonnull=True) for word in words]
    answers = []
    for i in range(len(words)):
        faketagged = [(w, None) for w in words]
        feat = features.extract(faketagged, i)
        classif = classifiers[i]
        ans = classif.classify(feat)
        if ans == UNTRANSLATED:
            ans = mfs_translation(words[i])
            print("MFS!!!", words[i], "==>", ans)
        answers.append(ans)
    return [str(ans) for ans in answers]
def main():
    setup_logging()
    for fn in sys.argv[1:] or ["input.png"]:
        im = cv.LoadImage(fn)
        fts = extract(im)
        pfn = fn + "-features.dat"
        info("Storing feature pickle in %s", pfn)
        dump(fts, file(pfn, "wb"))
        for l, layer in enumerate(fts):
            for fname, fval in layer.items():
                ffn = "%s-feat-%d-%s.png" % (fn, l, fname)
                info("Rendering feature %s", ffn)
                mat2pil(fval).save(ffn)
def prediction(gamemove, col):
    predictmove = gamemove[None, :]
    resultmatrix = model.predict(predictmove)
    resultmatrix = np.reshape(resultmatrix, [BOARD, BOARD])
    result = np.unravel_index(resultmatrix.argmax(), resultmatrix.shape)
    while gamemove[4][result[0]][result[1]] == 0:  # if move is illegal
        resultmatrix[result[0]][result[1]] = 0  # delete from softmax
        result = np.unravel_index(resultmatrix.argmax(), resultmatrix.shape)  # find second best option
    resultmatrix = np.round_(resultmatrix, 1) * 10
    print(alphabet[result[1] - BOARD], BOARD - result[0])
    tmp = gamemove[0].copy()
    gamemove[0] = gamemove[1]
    gamemove[1] = tmp
    gamemove = features.extract(gamemove, result[0], result[1], col)
    return gamemove, resultmatrix
def extract_features(formatted_corpus, feature_names, training=True, model=None):
    non_proj = []
    X_1 = []
    y_1 = []
    sent_cnt = 0
    for sentence in formatted_corpus:
        sent_cnt += 1
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []
        feats = []
        while queue:
            feats.append(features.extract(stack, queue, graph, feature_names, sentence))
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        X_1.extend(feats)
        y_1.extend(transitions)
        # print('Equal graphs:', transition.equal_graphs(sentence, graph))
        if not transition.equal_graphs(sentence, graph):
            non_proj.append(sentence)
        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
    # print(transitions)
    # print(graph)
    # print(len(non_proj))
    # s = sorted(non_proj, key=lambda x: len(x))
    # print([x['form'] for x in s[0]])
    # for x in non_proj:
    #     print(len(x))
    #     print(x)
    return (X_1, y_1)
def prob_disambiguate_words(words):
    """Given a list of words/lemmas, return a list of disambiguation answers for
    them -- return a list of lists, where each sublist is ordered in decreasing
    probability."""
    classifiers = [classifier_for(word, nonnull=True) for word in words]
    answers = []
    for i in range(len(words)):
        faketagged = [(w, None) for w in words]
        feat = features.extract(faketagged, i)
        classif = classifiers[i]
        ## get all possible options, sorted in wrong order
        dist = classif.prob_classify(feat)
        options = [(dist.prob(samp), samp) for samp in dist.samples()]
        options.sort(reverse=True)
        myanswers = [str(lex) for (prob, lex) in options if prob > 0.01]
        print(myanswers)
        answers.append(myanswers)
    return answers
def test(file_name):
    # file_name = "combined_nomeal.csv"
    with open("Guassian_model.pkl", 'rb') as file:
        Guassian_model = pickle.load(file)
    with open("pca_model.pkl", 'rb') as file:
        pca = pickle.load(file)
    test_data = pd.read_csv(file_name, header=None)
    print("---")
    fm = extract(test_data)
    print("---")
    sc = StandardScaler()
    test_data_set = sc.fit_transform(fm)
    pca_dataset = pca.transform(test_data_set)  # apply the PCA already fitted and loaded from disk
    gaussianNB_pred = Guassian_model.predict(pca_dataset)
    print("Classes of your given test " + str(gaussianNB_pred))
    np.savetxt("output.csv", gaussianNB_pred, delimiter=",", fmt='%d')
    print("predicted class labels are saved in output.csv file")
def perform_prediction(split_test_sentences, feature_names, classifier, model):
    print("Extracting features from test Corpus")
    X_matrix_test = []  # The matrix of feature vectors
    Y_vec_test = []  # The vector of predicted outputs
    # We iterate over all sentences that are in the corpus
    for sentence in split_test_sentences:
        stack = []  # Stack of tokens that will be manipulated with shift, reduce, left-arc, and right-arc
        queue = list(sentence)  # Queue of input tokens that will be cycled through
        graph = {}  # The dependency graph that we will store our dependency arcs in
        graph['heads'] = {}  # Create a dictionary inside graph for heads of each sentence
        graph['heads']['0'] = '0'  # The first element in the heads dictionary inside the graph dictionary is '0'
        graph['deprels'] = {}  # Define another dictionary with dependency relations stored inside, again inside graph
        graph['deprels']['0'] = 'ROOT'  # Make the first element in the deprel dictionary the 'ROOT' keyword
        while queue:
            x_elem = features.extract(stack, queue, graph, feature_names, sentence)
            x_predictable = features.vec.transform(x_elem)
            # X_matrix_test.extend(x_elem)
            # stack, queue, graph, y_elem = dparser.reference(stack, queue, graph)
            # WE DO NOT USE Y_VEC_TEST
            # Y_vec_test.append(y_elem)
            predicted_transition = classifier.predict(x_predictable)
            stack, queue, graph, transition = parse_ml(stack, queue, graph, predicted_transition)
def extract_features(csv_path, csv_file):
    data = []
    f_temp = []
    with open(csv_path, "r") as data_file:  # open the files
        for row in data_file:
            strsplit = row.split(',')
            strsplit = list(map(float, strsplit))
            data.append(strsplit)
    no_of_data_per_window = int(sampling_rate * window_size)
    no_of_windows = len(data) // no_of_data_per_window * 2 - 1
    # count = 0
    for i in range(no_of_windows):
        window_slice_data = []
        for j in range(no_of_data_per_window):
            window_slice_data.append(data[i * no_of_data_per_window // 2 + j])
            # count += 1
            # print(count)
        window_slice_data = np.asarray(window_slice_data)[:, 0:3].tolist()
        f_temp.append(features.extract(window_slice_data))
    l_temp = np.full(len(f_temp), table[csv_file])
    return f_temp, l_temp
def getData():
    if os.path.exists('df_no_index_10mfcc.csv'):
        df = pd.read_csv('df_no_index_10mfcc.csv')
        df_y = df['genre']
        df_x = df.drop('genre', axis=1)
        return df_x, df_y
    # features haven't been created
    print("Create Dataset")
    signal, y = saveDataset.dataset()
    df_y = pd.Series(data=y)
    # construct features
    print("Feature Extraction")
    df_x = pd.DataFrame()
    for i in range(len(signal)):
        if i % 50 == 0:
            print('audio {}'.format(i + 1))
        new_x = pd.DataFrame(features.extract(signal[i]), index=[i])
        df_x = df_x.append(new_x)
    saveDataset.saveFeature(df_x, pd.DataFrame(df_y, columns=['genre']))
    return df_x, df_y
def main():
    parser = util_run_experiment.get_argparser()
    args = parser.parse_args()
    assert args.targetlang in all_target_languages
    assert args.sourceword in all_words
    targetlang = args.targetlang
    sourceword = args.sourceword
    trialdir = args.trialdir
    stanford.taggerhome = args.taggerhome
    print("Loading and tagging test problems...")
    problems = util_run_experiment.get_test_instances(trialdir, sourceword)
    print("OK loaded and tagged.")
    ## classifier = get_maxent_classifier(sourceword, targetlang)
    classifier = get_pickled_classifier(sourceword, targetlang, "level1")
    if not classifier:
        print("Couldn't load pickled L1 classifier?")
        return
    print("Loaded pickled L1 classifier!")
    bestoutfn = "../L1output/{0}.{1}.best".format(sourceword, targetlang)
    oofoutfn = "../L1output/{0}.{1}.oof".format(sourceword, targetlang)
    with open(bestoutfn, "w") as bestoutfile, \
         open(oofoutfn, "w") as oofoutfile:
        for problem in problems:
            featureset = features.extract(problem)
            answer = classifier.classify(featureset)
            dist = classifier.prob_classify(featureset)
            oof_answers = util_run_experiment.topfive(dist)
            print(output_one_best(problem, targetlang, answer), file=bestoutfile)
            print(output_five_best(problem, targetlang, oof_answers), file=oofoutfile)
def generate(track):
    blob = extract(track.source, track.details_file)
    features.extract(blob, track)
    update_tags(blob, track)
# collect val and time sequence from addresses
dirPath = 'test_addr'
addrs = os.listdir(dirPath)
p = connectPSQL(psql)
'''
for addr in addrs:
    if addr[0] != 'd':
        color.pImportant('addr file: ' + addr)
        full_path = os.path.join(dirPath, addr)
        data_file = 'test_' + addr.split('.')[0].split('_')[1] + '_database.csv'
        data_file = os.path.join('result', data_file)
        in_csv = collectTxnIn(p, full_path)
        out_csv = collectTxnOut(p, full_path)
        deal_sql.deal_feature(in_csv, out_csv, data_file)
        feature.extract(data_file)
        os.rename(full_path, os.path.join(dirPath, 'done-' + addr))
'''
addr = 'add_ponzi_train.csv'
color.pImportant('addr file: ' + addr)
full_path = os.path.join(dirPath, addr)
data_file = 'test_' + addr.split('.')[0].split('_')[1] + '_database.csv'
data_file = os.path.join('result', data_file)
in_csv = collectTxnIn(p, full_path)
out_csv = collectTxnOut(p, full_path)
deal_sql.deal_feature(in_csv, out_csv, data_file)
feature.extract(data_file)
p.close()
#################################
meal.to_csv("combined_meal.csv", index=False, header=False)
nomeal.to_csv("combined_nomeal.csv", index=False, header=False)
# ###########################################
X = [29 - i for i in range(0, 30)]
# for i in range(10):
#     plt.plot(X, meal.iloc[73, :], marker="X")
#
# print("AFTER FUNCTION")

### call function
feature_Matrix_Meal = pd.DataFrame()
feature_Matrix_Meal = extract(meal)
feature_Matrix_NoMeal = pd.DataFrame()
feature_Matrix_NoMeal = extract(nomeal)
#
# feature_Matrix_Meal['class'] = 1
# feature_Matrix_NoMeal['class'] = 0
#
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
train_meal = sc.fit_transform(feature_Matrix_Meal)
train_nomeal = sc.fit_transform(feature_Matrix_NoMeal)
def ikObjectiveToFeatures(ikgoal, featureList):
    """Given an IKObjective instance and a list of features, returns a feature
    descriptor as a list of floats"""
    jsonObj = loader.toJson(ikgoal)
    return features.extract(jsonObj, featureList)
import download
import reformat
import normalise
import features
import model
import performance
import dataset

if __name__ == "__main__":
    # We begin by downloading the data. The data will be in the form of
    # "events" data: each datapoint for each patient will be a recorded event.
    X, Y = download.download()
    # The event data is reformatted. This is done by selecting the given
    # variables and transforming time-dependent events to a path.
    X = reformat.reformat(X, static_variables=["Age", "Gender"],
                          dynamic_variables=["Creatinine", "Glucose"])
    # Now, we normalise the data.
    X = normalise.normalise(X)
    # We extract features from the input data.
    features = features.extract(X)
    # The dataset is now split into a training and testing set.
    features_train, Y_train, features_test, Y_test = dataset.split(
        features, Y, proportion=0.75)
    # We now train the model with the selected features.
    classifier = model.train(features_train, Y_train)
    # We evaluate performance of the model now.
    performance.evaluate(classifier, features_test, Y_test)
p = connectPSQL(psql)
times = [time.time()]
# for addr in addrs:
for i in range(1):
    addr = 'new1.csv'
    color.pImportant('addr file: ' + addr)
    full_path = os.path.join(dirPath, addr)
    in_csv = collectTxnIn(p, full_path)
    out_csv = collectTxnOut(p, full_path)
    times.append(time.time())
    color.pImportant('collected all txns in ' + str(times[-1] - times[-2]))
    data_file = addr.split('.')[0] + '_database.csv'
    data_file = os.path.join('result', data_file)
    deal_sql.deal_feature(in_csv, out_csv, data_file)
    feature_file = feature.extract(data_file)
    times.append(time.time())
    color.pImportant('dealed all datas in ' + str(times[-1] - times[-2]))
    color.pImportant('')
    '''
    feature_df = pd.read_csv(feature_file)
    label_df = pd.read_csv(os.path.join(dirPath, label), header=None)
    label_df.columns = ['ponzi']
    feature_df['ponzi'] = label_df['ponzi']
    feature_df.to_csv(feature_file, index=None)
    '''
p.close()
color.pImportant('total time used: ' + str(times[-1] - times[0]))
for sentence in formatted_corpus:
    sent_cnt += 1
    if sent_cnt % 1000 == 0:
        a = 1
        # print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
    stack = []
    queue = list(sentence)
    graph = {}
    graph['heads'] = {}
    graph['heads']['0'] = '0'
    graph['deprels'] = {}
    graph['deprels']['0'] = 'ROOT'
    while queue:
        x = features.extract(stack, queue, graph, FEATURE_NAMES, sentence)
        X_test = vec.transform(x)
        predicted_trans_index = model.predict(X_test)[0]
        predicted_trans = dict_classes[predicted_trans_index]
        # Build new graph
        stack, queue, graph, trans = execute_transition(stack, queue, graph, predicted_trans)
        # Save the predicted trans
        y_predicted_symbols.append(trans)
    stack, graph = transition.empty_stack(stack, graph)
def testLetterBigrams(self):
    text = ""
    results = features.extract(text, [Features.LETTERBIGRAMS], False)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(0, sum(result))
    results = features.extract(text, [Features.LETTERBIGRAMS], True)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(0, sum(result))

    text = "eee"
    results = features.extract(text, [Features.LETTERBIGRAMS], False)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(2, result['ee'])
    self.assertEqual(2, sum(result.values()))
    results = features.extract(text, [Features.LETTERBIGRAMS], True)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(1.0, result['ee'])
    self.assertEqual(1.0, sum(result.values()))

    text = "eee eee e ee"
    results = features.extract(text, [Features.LETTERBIGRAMS], False)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(5, result['ee'])
    self.assertEqual(5, sum(result.values()))
    results = features.extract(text, [Features.LETTERBIGRAMS], True)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(1.0, result['ee'])
    self.assertEqual(1.0, sum(result.values()))

    text = "eekke eee e eze zzkzk"
    results = features.extract(text, [Features.LETTERBIGRAMS], False)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(3, result['ee'])
    self.assertEqual(1, result['ek'])
    self.assertEqual(1, result['kk'])
    self.assertEqual(1, result['ke'])
    self.assertEqual(1, result['ez'])
    self.assertEqual(1, result['ze'])
    self.assertEqual(2, result['zk'])
    self.assertEqual(1, result['kz'])
    self.assertEqual(1, result['zz'])
    self.assertEqual(11 + 1, sum(result.values()))
    results = features.extract(text, [Features.LETTERBIGRAMS], True)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(3/12.0, result['ee'])
    self.assertEqual(1/12.0, result['ek'])
    self.assertEqual(1/12.0, result['kk'])
    self.assertEqual(1/12.0, result['ke'])
    self.assertEqual(1/12.0, result['ez'])
    self.assertEqual(1/12.0, result['ze'])
    self.assertEqual(2/12.0, result['zk'])
    self.assertEqual(1/12.0, result['kz'])
    self.assertEqual(1/12.0, result['zz'])
    self.assertEqual(1.0, sum(result.values()))

    text = "ababababa"
    results = features.extract(text, [Features.LETTERBIGRAMS], False)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(4, result['ab'])
    self.assertEqual(4, result['ba'])
    self.assertEqual(8, sum(result.values()))
    results = features.extract(text, [Features.LETTERBIGRAMS], True)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(4/8.0, result['ab'])
    self.assertEqual(4/8.0, result['ba'])
    self.assertEqual(1.0, sum(result.values()))

    text = "ab \nn \tt"
    results = features.extract(text, [Features.LETTERBIGRAMS], False)
    result = results[Features.LETTERBIGRAMS]
    self.assertEqual(1.0, result['ab'])
    self.assertEqual(1.0, sum(result.values()))
def testSaveAndLoad(self):
    text = "ekin and ekin."
    results = features.extract(text, [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS], False)
    features.save(results, "test1")
    returned = features.load("test1")
    os.remove("test1")
    self.assertEquals(results, returned)
    self.assertEqual(2, returned[Features.LETTERUNIGRAM][ord('e') - ord('a')])

    results = features.extract(text, [Features.POSBIGRAMS, Features.LETTERBIGRAMS], True)
    features.save(results, "test2")
    returned = features.load("test2")
    os.remove("test2")
    self.assertEquals(results, returned)

    results = features.extract(text, [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS, Features.POSUNIGRAM, Features.POSBIGRAMS], True)
    features.save(results, "test3")
    returned = features.load("test3")
    os.remove("test3")
    self.assertEquals(results, returned)

    text = ""
    results = features.extract(text, [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS, Features.POSUNIGRAM, Features.POSBIGRAMS], False)
    features.save(results, "test4")
    returned = features.load("test4")
    os.remove("test4")
    self.assertEquals(results, returned)

    text = "\ttrying to test really long text\n\n \a and see if it still works or not \n 101 531 z3z3\n\t"
    text += text
    text += text
    results = features.extract(text, [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS, Features.POSUNIGRAM, Features.POSBIGRAMS], False)
    features.save(results, "test5")
    returned = features.load("test5")
    os.remove("test5")
    self.assertEquals(results, returned)

    text = ['test1', '222aaa', 'threee333', 't\test1']
    results['authors'] = [0, 0, 1, 1]
    results['text'] = list()
    for t in text:
        results['text'].append(features.extract(t, [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS, Features.POSUNIGRAM, Features.POSBIGRAMS], False))
    features.save(results, "test6")
    returned = features.load("test6")
    os.remove("test6")
    self.assertEqual(results['authors'], returned['authors'])
    for idx in range(0, len(returned['text'])):
        feat = features.extract(text[idx], [Features.LETTERUNIGRAM, Features.LETTERBIGRAMS, Features.POSUNIGRAM, Features.POSBIGRAMS], False)
        self.assertEqual(feat, returned['text'][idx])
        print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
    stack = []
    queue = list(sentence)
    graph = {}
    graph['heads'] = {}
    graph['heads']['0'] = '0'
    graph['deprels'] = {}
    graph['deprels']['0'] = 'ROOT'
    transitions = []
    # feature_names = ["stack_pos", "stack_word", "queue_pos", "queue_word", "can_ra", "can_la"]
    while queue:
        # print(queue)
        x = features.extract(stack, queue, graph, feature_names, sentence)
        # This currently produces a long list where each element is a sentence;
        # change it so that each word is its own element.
        X.append(dict(zip(feature_names, x)))
        stack, queue, graph, trans = reference(stack, queue, graph)
        transitions.append(trans)
        y = trans
        y_vector.append(y)
    stack, graph = transition.empty_stack(stack, graph)
    # print('Equal graphs:', transition.equal_graphs(sentence, graph))
    # Poorman's projectivization to have well-formed graphs.
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
X = []
y = []
for sentence in formatted_corpus:
    sent_cnt += 1
    # if sent_cnt % 1000 == 0:
    #     print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
    stack = []
    queue = list(sentence)
    graph = {}
    graph['heads'] = {}
    graph['heads']['0'] = '0'
    graph['deprels'] = {}
    graph['deprels']['0'] = 'ROOT'
    transitions = []
    while queue:
        X.append(features.extract(stack, queue, graph, feature_names, sentence))
        stack, queue, graph, trans = reference(stack, queue, graph)
        transitions.append(trans)
        y.append(trans)
    stack, graph = transition.empty_stack(stack, graph)
    # print('Equal graphs:', transition.equal_graphs(sentence, graph))
    # Poorman's projectivization to have well-formed graphs.
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
# print(transitions)
# print(graph)
vec = DictVectorizer(sparse=True)
modelFilename = model_name + '.sav'
    if sent_cnt % 1000 == 0:
        print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
    stack = []
    queue = list(sentence)
    graph = {}
    graph['heads'] = {}
    graph['heads']['0'] = '0'
    graph['deprels'] = {}
    graph['deprels']['0'] = 'ROOT'
    transitions = []
    x_templist = []
    y_templist = []
    while queue:
        current_dictX, current_Y = features.extract(stack, queue, graph, feature_1, sentence, 1)
        stack, queue, graph, trans = reference(stack, queue, graph)
        transitions.append(trans)
        x_templist.append(current_dictX)
        y_templist.append(current_Y)
    stack, graph = transition.empty_stack(stack, graph)
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
    x_list.extend(x_templist)
    y_list.extend(y_templist)

print("Encoding the features and classes...")
def build_instance(tagged_sentence, annotated, index):
    feat = features.extract(tagged_sentence, annotated, index)
    label = tagged_sentence[index][1]
    return (feat, label)
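A hedged usage sketch for the helper above: it assumes NLTK (which other snippets in this section already use) and hypothetical parallel lists tagged_sentences and annotations; the (featureset, label) pairs from build_instance could then train a classifier.

from nltk import NaiveBayesClassifier

# tagged_sentences and annotations are hypothetical, parallel per-sentence lists
instances = [build_instance(sent, ann, i)
             for sent, ann in zip(tagged_sentences, annotations)
             for i in range(len(sent))]
classifier = NaiveBayesClassifier.train(instances)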
def process(file):
    # used by multiprocessor. Tensors are stored multiple times to reduce RAM usage.
    global gamecounter, kgsdatain, kgsdataout, kgsdatawin
    print file
    gamefile = open(file, 'r')
    gamecounter += 1
    if gamecounter == SAVEFILE:
        lock.acquire()
        newin = np.load('/media/falk/6,0 TB Volume/19 badukmovies/kgsdatain_alllayer.npy')
        newout = np.load('/media/falk/6,0 TB Volume/19 badukmovies/kgsdataout_alllayer.npy')
        newwin = np.load('/media/falk/6,0 TB Volume/19 badukmovies/kgsdatawin_alllayer.npy')
        newin = np.concatenate([newin, kgsdatain]) if newin.size else kgsdatain
        newout = np.concatenate([newout, kgsdataout]) if newout.size else kgsdataout
        newwin = np.concatenate([newwin, kgsdatawin]) if newwin.size else kgsdatawin
        kgsdatain = np.array([])
        kgsdataout = np.array([])
        kgsdatawin = np.array([])
        print("Spielblock gespeichert")
        if len(newin) >= SAVEBIGFILE:
            metacounter = 0
            while os.path.isfile('/media/falk/6,0 TB Volume/19 badukmovies/pro/kgsdatain_alllayer_' + str(metacounter) + '.npy'):
                metacounter += 1
            newinstr = '/media/falk/6,0 TB Volume/19 badukmovies/pro/kgsdatain_alllayer_' + str(metacounter) + '.npy'
            newoutstr = '/media/falk/6,0 TB Volume/19 badukmovies/pro/kgsdataout_alllayer_' + str(metacounter) + '.npy'
            newwinstr = '/media/falk/6,0 TB Volume/19 badukmovies/pro/kgsdatawin_alllayer_' + str(metacounter) + '.npy'
            # this is the final storage location
            np.save(newinstr, newin)
            np.save(newoutstr, newout)
            np.save(newwinstr, newwin)
            newout = np.array([])
            newin = np.array([])
            newwin = np.array([])
            print("--------------- META Spielblock gespeichert--------------------")
        np.save('/media/falk/6,0 TB Volume/19 badukmovies/kgsdatain_alllayer.npy', newin)
        np.save('/media/falk/6,0 TB Volume/19 badukmovies/kgsdataout_alllayer.npy', newout)
        np.save('/media/falk/6,0 TB Volume/19 badukmovies/kgsdatawin_alllayer.npy', newwin)
        lock.release()
        gamecounter = 0
    gamefile = gamefile.read()
    gamefile = gamefile.replace("\n", "")
    gamefile = gamefile.replace("(;", "")
    game = gamefile
    boardpos = initboardpos.copy()
    if "HA[" in gamefile:  # if Handycap
        handycap = int(gamefile.split("HA[", 1)[1][0])  # write down Handycap
        if handycap >= 2:
            (boardpos[0, 15, 3], boardpos[0, 3, 15]) = (1, 1)
            (boardpos[2, 15, 3], boardpos[2, 3, 15]) = (0, 0)
        if handycap >= 3:
            boardpos[0, 15, 15] = 1
            boardpos[2, 15, 15] = 0
        if handycap >= 4:
            boardpos[0, 3, 3] = 1
            boardpos[2, 3, 3] = 0
        if handycap == 5:
            boardpos[0, 9, 9] = 1
            boardpos[2, 9, 9] = 0
        if handycap >= 6:
            (boardpos[0, 9, 3], boardpos[0, 9, 15]) = (1, 1)
            (boardpos[2, 9, 3], boardpos[0, 9, 15]) = (0, 0)
        if handycap == 7:
            boardpos[0, 9, 9] = 1
            boardpos[2, 9, 9] = 0
        if handycap >= 8:
            (boardpos[2, 3, 9], boardpos[0, 15, 9]) = (0, 0)
        if handycap == 9:
            boardpos[0, 9, 9] = 1
            boardpos[2, 9, 9] = 0
    # main sgf disassembly begins here
    for sig, col, y, x in zip(game, game[1:], game[3:], game[4:]):
        # structure: ;B[ic]
        ######################## store prev board, feedback ##########################
        if sig == ";":
            if x not in alphabet:
                break  # error or pass: leave game
            if y not in alphabet:
                break  # error or pass: leave game
            if col not in ("B", "W"):
                break  # error: leave game
            xpos = alphabet.index(x)
            ypos = alphabet.index(y)
            storeboardpos = boardpos[None, :]
            # store to database (uses boardpos from last iteration)
            kgsdatain = np.concatenate([kgsdatain, storeboardpos]) if kgsdatain.size else storeboardpos
            # swap board for agent to always have color of plane 1
            boardpos2 = boardpos[0:2].copy()
            boardpos2[1] = boardpos[0]  # 1st agent is b, 2nd w, 3rd b etc.
            boardpos[0] = boardpos[1]
            boardpos[1] = boardpos2[1]
            onemove = np.zeros([1, BOARD, BOARD], dtype=int)  # add one in feedback board
            onemove[0, xpos, ypos] = 1
            onewin = np.array([1])
            if "RE[B" in game:  # b wins = 1
                onewin[0] = 1
            else:
                onewin[0] = 0
            kgsdatawin = np.concatenate([kgsdatawin, onewin]) if kgsdatawin.size else onewin  # we lose
            kgsdataout = np.concatenate([kgsdataout, onemove]) if kgsdataout.size else onemove
            ############################# add new move #################################
            boardpos = features.extract(boardpos, xpos, ypos, col)  # modifies all layers in boardpos
def testLetterTrigrams(self):
    text = ""
    results = features.extract(text, [Features.LETTERTRIGRAMS], False)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(0, sum(result.values()))
    results = features.extract(text, [Features.LETTERTRIGRAMS], True)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(0, sum(result.values()))

    text = "eee"
    results = features.extract(text, [Features.LETTERTRIGRAMS], False)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(1, result['eee'])
    self.assertEqual(1, sum(result.values()))
    results = features.extract(text, [Features.LETTERTRIGRAMS], True)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(1.0, result['eee'])
    self.assertEqual(1.0, sum(result.values()))

    text = "eee eee e ee"
    results = features.extract(text, [Features.LETTERTRIGRAMS], False)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(2, result['eee'])
    self.assertEqual(2, sum(result.values()))
    results = features.extract(text, [Features.LETTERTRIGRAMS], True)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(1.0, result['eee'])
    self.assertEqual(1.0, sum(result.values()))

    text = "eekke eee e ezee zzkzkz"
    results = features.extract(text, [Features.LETTERTRIGRAMS], False)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(1, result['eek'])
    self.assertEqual(1, result['ekk'])
    self.assertEqual(1, result['kke'])
    self.assertEqual(1, result['eee'])
    self.assertEqual(1, result['eze'])
    self.assertEqual(1, result['zee'])
    self.assertEqual(1, result['zzk'])
    self.assertEqual(2, result['zkz'])
    self.assertEqual(1, result['kzk'])
    self.assertEqual(10, sum(result.values()))
    results = features.extract(text, [Features.LETTERTRIGRAMS], True)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(1/10.0, result['eek'])
    self.assertEqual(1/10.0, result['ekk'])
    self.assertEqual(1/10.0, result['kke'])
    self.assertEqual(1/10.0, result['eee'])
    self.assertEqual(1/10.0, result['eze'])
    self.assertEqual(1/10.0, result['zee'])
    self.assertEqual(1/10.0, result['zzk'])
    self.assertEqual(2/10.0, result['zkz'])
    self.assertEqual(1/10.0, result['kzk'])
    self.assertEqual(10/10.0, round(sum(result.values())))

    text = "ababababa"
    results = features.extract(text, [Features.LETTERTRIGRAMS], False)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(4, result['aba'])
    self.assertEqual(3, result['bab'])
    self.assertEqual(7, sum(result.values()))
    results = features.extract(text, [Features.LETTERTRIGRAMS], True)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(4/7.0, result['aba'])
    self.assertEqual(3/7.0, result['bab'])
    self.assertEqual(1.0, sum(result.values()))

    text = "ab \nn \ttzx"
    results = features.extract(text, [Features.LETTERTRIGRAMS], False)
    result = results[Features.LETTERTRIGRAMS]
    self.assertEqual(1.0, result['tzx'])
    self.assertEqual(1.0, sum(result.values()))