def visitTextFile(self, textfile):
    # Partition the file into one chunk per worker and register the operation.
    splitter = Splitter(textfile.filePath, len(self.workers))
    file_split_result = splitter.split()
    self.operations[textfile.id] = FilePartition(
        textfile.id, len(self.workers), file_split_result, textfile.filePath)
    self._set_collect_count(textfile)
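# Usage sketch (hypothetical, not from the source): visitTextFile appears to be
# a visitor callback that partitions a text file across self.workers, so the
# Splitter here is assumed to expose roughly this interface:
#
#   splitter = Splitter('/data/corpus.txt', num_workers)
#   chunks = splitter.split()   # one chunk descriptor per worker
#   assert len(chunks) == num_workers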
def processQuestion(gloveModel, question, minLen=1, maxLen=3, useAPI=False,
                    useSynonyms=False):
    tagger = POSTagger()
    pos = tagger.parse(question)

    # create splitter and generalizer
    splitter = Splitter()
    if question[-1] == '?' or question[-1] == '.':
        question = question[:-1]
    gen_question = splitter.generalize(question, pos)

    labels = []
    resultsExists = False
    if not useAPI:
        parts = list(splitter.split(gen_question, min=minLen, max=maxLen))
    else:
        resultsExists = True
        apiResult, _ = api.getBinaryRelations(question)
        parts = [rel.predicate for rel in apiResult
                 if len(rel.predicate_positions_) > 1]

    # build camelCase labels for multi-word parts
    for part in parts:
        words = part.split()
        if len(words) > 1:
            labels.append(words[0] + ''.join(w[0].upper() + w[1:].lower()
                                             for w in words[1:]))

    # expand parts with GloVe synonyms of each part's longest word;
    # iterate over a copy of parts so appending is safe
    if useSynonyms:
        predicates = [max(part.split(), key=len) for part in parts]
        for predicate in predicates:
            for part in list(parts):
                if predicate in part:
                    for syn in gloveModel.gloveModel.most_similar(predicate.lower()):
                        parts.append(part.replace(predicate, syn[0]))

    # fall back to plain splitting when the API returned nothing
    if len(parts) == 0:
        resultsExists = False
        parts = list(splitter.split(gen_question, min=minLen, max=maxLen))

    # embed each part
    vectors = [gloveModel.getVector(part) for part in parts]
    return vectors, parts, pos, gen_question, labels, resultsExists
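# Hypothetical invocation (gloveModel is assumed to wrap a word-vector model
# exposing .gloveModel.most_similar() and .getVector(); api is assumed to be a
# relation-extraction client; neither is defined in this snippet):
#
#   vectors, parts, pos, gen_q, labels, found = processQuestion(
#       gloveModel, "Who wrote Hamlet?", minLen=1, maxLen=3)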
def process_text(text):
    splitter = Splitter()
    postagger = POSTagger()
    # Split the text into sentences of words
    splitted_sentences = splitter.split(text)
    # Do part-of-speech tagging on the words
    pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
    # dicttagger is assumed to be a module-level dictionary tagger instance
    dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
    return sum_score(dict_tagged_sentences)
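# Example call, assuming Splitter, POSTagger, dicttagger and sum_score are set
# up as above; sum_score is assumed to reduce the tagged sentences to a single
# numeric sentiment score:
#
#   score = process_text("The plot was great but the acting was terrible.")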
""" This splitter attempts to maximize apparent total surplus """ from splitter import Splitter from splitter import Bid class SurplusMaximizer(Splitter): def score(self, bid, averages): return bid.amount - averages[bid.item] items = ["Room 1", "Room 2", "Room 3"] bids = [Bid("Room 1", "Joey", 10), Bid("Room 1", "Josh", 15), Bid("Room 2", "Joey", 5), Bid("Room 2", "Josh", 0)] s = Splitter() print s.split(items, ["Joey", "Josh"], bids)
columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
           'Age_categories_Missing', 'Age_categories_Infant',
           'Age_categories_Child', 'Age_categories_Teenager',
           'Age_categories_Young Adult', 'Age_categories_Adult',
           'Age_categories_Senior']
target_column = 'Survived'

r.train_machine(train[columns], train[target_column])
holdout = test
all_X = train[columns]
all_y = train[target_column]

train_x, test_x, train_y, test_y = sp.split(train[columns], train[target_column])
# toPrint = sr.get_train()['Age'].describe()
# print(toPrint)
r.train_machine(train_x, train_y)
predictions = r.predict(test_x)
accuracy = mt.model_accuracy(test_y, predictions)

regressor_object = Regressor()
reg = regressor_object.get_regressor()
mt.set_cross_score(reg, all_X, all_y, 10)
mt.sort_score()
scores = mt.get_scores()
cross_accuracy = mt.get_mean()
regressor_object_1 = Regressor()
# -*- coding: utf-8 -*-
"""
Created on Sat May 25 10:06:24 2019

@author: Gerardo Cervantes
"""

from splitter import Splitter

# For testing split keys
split_keys = ['{PGUP}', '{BKSP}', '{F4}']

if __name__ == "__main__":
    splitter = Splitter()
    splitter.split('Livesplit', '{pgup}', 0)
import joblib
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Normalizer, Splitter, and Filter are project-local helpers assumed to be
# importable from the surrounding package.


class Rule(object):
    def __init__(self):
        self.sentences = []
        self.abbreviation = {}
        self.load_data()
        self.load_abbrv()
        self.normalizer = Normalizer()
        self.splitter = Splitter()
        self.corrector = Filter()
        self.lemmatizer = WordNetLemmatizer()
        self.missing_apostrophe_vocab = [
            'isnt', 'arent', 'wasnt', 'werent', 'wont', 'dont', 'didnt',
            'doesnt', 'couldnt', 'shouldnt', 'hasnt', 'havent', 'hadnt'
        ]
        self.tokenizer_mistake_vocab = [
            'isn', 'aren', 'wasn', 'weren', 'won', 'don', 'didn', 'doesn',
            'couldn', 'shouldn', 'hasn', 'haven', 'hadn'
        ]
        self._norm = joblib.load('model.crfsuite')

    def load_data(self):
        with open('../data/reviews.txt', 'r') as myFile:
            self.sentences = myFile.read().splitlines()

    def load_abbrv(self):
        with open('../data/abbreviation.txt', 'r') as myFile:
            self.abbreviation = {
                i.split('---')[0]: i.split('---')[1]
                for i in myFile.read().splitlines()
            }

    def test2(self, sentence):
        tokens = self.splitter.split(sentence.lower())
        tokens = [j for i in tokens for j in i]
        print(tokens)
        X_test = self.normalizer.sent2features(tokens)
        for i in X_test:
            print(i)
        y_pred = self._norm.predict([X_test])
        print(y_pred)

    def process(self, sentence):
        tokens = self.splitter.split(sentence.lower())
        tokens = [j for i in tokens for j in i]
        X_test = self.normalizer.sent2features(tokens)
        y_pred = self._norm.predict([X_test])
        for i in range(len(y_pred[0])):
            if y_pred[0][i] == 'N':
                o, f = self.correct(tokens[i])
                if f:
                    if tokens[i][0].isupper():
                        tokens[i] = o[0].upper() + o[1:]
                    else:
                        tokens[i] = o
        return tokens

    def test(self):
        tokens = [
            self.splitter.split(i.lower()) for i in self.sentences[1001:5000]
        ]
        tokens = [j for i in tokens for j in i]
        X_test = [self.normalizer.sent2features(i) for i in tokens]
        y_pred = self._norm.predict(X_test)
        stats = {}
        output = []
        for i in range(len(tokens)):
            flag = 0
            for j in range(len(y_pred[i])):
                if y_pred[i][j] == 'N':
                    print(tokens[i][j])
                    o, f = self.correct(tokens[i][j])
                    if f:
                        output.append((tokens[i][j], o))
                        if tokens[i][j] not in stats:
                            stats[tokens[i][j]] = 0
                        stats[tokens[i][j]] += 1
                        flag = 1
            if flag == 1:
                print(' '.join(tokens[i]))
                print(y_pred[i])
        with open('inter_correct_2.txt', 'w') as myFile:
            print('start writing')
            for old, new in output:
                print(old, new)
                myFile.write(old + '\t' + new + '\n')

    def correct(self, term):
        ret = []
        flag = False  # whether the term was modified
        for i in term.split('.'):
            # handle elongated words first
            i = i.lower()
            i, res = self.correct_elongated(i)
            if res:
                flag = True
                ret.append(i)
                continue
            # the tokenizer sometimes splits "didn't" into "didn" and "'t"
            if i in self.tokenizer_mistake_vocab:
                ret.append(i)
                continue
            # tokens containing "'" (e.g. 't) come from the tokenizer; skip them
            if "'" in i:
                ret.append(i)
                continue
            # hyphenated compounds are not checked
            if '-' in i:
                ret.append(i)
                continue
            # fix auxiliary negations that are missing the apostrophe
            if i in self.missing_apostrophe_vocab:
                i = self.correct_missing_apostrophe(i)
                flag = True
                ret.append(i)
                continue
            # past-tense forms are already covered by the dictionary; only
            # check noun plurals and third-person verb forms
            if i in self.normalizer.dct \
                    or self.lemmatizer.lemmatize(i, wordnet.NOUN) in self.normalizer.dct \
                    or self.lemmatizer.lemmatize(i, wordnet.VERB) in self.normalizer.dct:
                ret.append(i)
                continue
            # abbreviations
            if i in self.abbreviation:
                i = self.abbreviation[i]
                flag = True
                ret.append(i)
                continue
            if i.isalpha():
                res_s = self.corrector.process(i)
                tmp = []
                for res in res_s.split(' '):
                    if res != i and (
                            self.lemmatizer.lemmatize(res, wordnet.NOUN) in self.normalizer.dct
                            or self.lemmatizer.lemmatize(res, wordnet.VERB) in self.normalizer.dct
                            or self.lemmatizer.lemmatize(res, wordnet.ADJ) in self.normalizer.dct
                            or self.lemmatizer.lemmatize(res, wordnet.ADV) in self.normalizer.dct):
                        flag = True
                        tmp.append(res)
                if len(tmp) != 0:
                    ret.append(' '.join(tmp))
                else:
                    ret.append(i)
                continue
            ret.append(i)
        if len(ret) == 2:
            return ret[0] + '. ' + ret[1], flag
        else:
            return ret[0], flag

    def correct_merged_words(self, term):
        if len(term) < 4:
            return term, False
        for i in range(1, len(term)):
            if term[:i] in self.normalizer.dct and term[i:] in self.normalizer.dct:
                return term[:i] + ' ' + term[i:], True
        return term, False

    def correct_missing_apostrophe(self, term):
        # insert the apostrophe before the final character, e.g. dont -> don't
        return term[:-1] + "'" + term[-1]

    def correct_elongated(self, term):
        count = 0
        while True:
            start, end, flag = self.find_elongated_pos(term)
            if count == 0 and not flag:
                return term, False
            elif not flag:
                return term, True
            else:
                cand1 = term.replace(term[start:end + 1], term[start])
                cand2 = term.replace(term[start:end + 1], term[start] * 2)
                # if candidate1 is in the dictionary, take candidate1
                if cand1 in self.normalizer.dct:
                    term = cand1
                    continue
                # if candidate2 is in the dictionary, take candidate2
                if cand2 in self.normalizer.dct:
                    term = cand2
                    continue
                # otherwise fall back to candidate1
                term = cand1
            count += 1

    def find_elongated_pos(self, term):
        prev = ''
        start = 0
        ct = 1
        for idx, i in enumerate(term):
            if idx == 0:
                prev = i
                start = idx
            else:
                if i == prev:
                    ct += 1
                    if ct > 2:
                        end = idx
                        while end <= len(term) - 1 and term[end] == i:
                            end += 1
                        end = end - 1
                        return start, end, True
                else:
                    ct = 1
                    start = idx
                    prev = i
        return -1, -1, False
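# Hypothetical usage of the Rule pipeline above (requires ../data/reviews.txt,
# ../data/abbreviation.txt and model.crfsuite to exist, exactly as loaded in
# __init__; the expected output is an assumption):
#
#   rule = Rule()
#   print(rule.process("i dont think the moviee was greaat"))
#   # -> corrected token list, e.g. with "dont" -> "don't"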
from unittest import TestCase

from splitter import Splitter  # project-local module under test


class SplitterTest(TestCase):
    def setUp(self):
        self.s = Splitter()

    def test_returns_none_when_loot_is_undivisible_by_number_of_pirates(self):
        self.assertEqual(None, self.s.split([2, 3], 2))

    def test_returns_none_when_there_are_not_enough_gems(self):
        self.assertEqual(None, self.s.split([4, 2], 3))

    def test_returns_none_when_there_is_a_gem_greater_than_share(self):
        self.assertEqual(None, self.s.split([4, 2, 3], 3))

    def test_everybody_gets_the_same_kind_of_bucket_when_we_have_only_one_type_of_gem(self):
        self.assertEqual([[2], [2]], self.s.split([2, 2], 2))
        self.assertEqual([[2], [2], [2]], self.s.split([2, 2, 2], 3))
        self.assertEqual([[2, 2], [2, 2]], self.s.split([2, 2, 2, 2], 2))

    def test_everybody_gets_the_same_kind_of_bucket_when_we_have_the_same_set_of_gem_and_pirates(self):
        self.assertEqual([[3, 2], [3, 2]], self.s.split([3, 2, 3, 2], 2))
        self.assertEqual([[3, 2], [3, 2], [3, 2]], self.s.split([3, 2, 3, 2, 3, 2], 3))

    def test_everybody_gets_the_same_value_with_a_different_number_of_gems(self):
        self.assertEqual([[3], [2, 1]], self.s.split([1, 2, 3], 2))
        self.assertEqual([[3, 2, 2, 2], [3, 2, 2, 2], [3, 2, 2, 2]],
                         self.s.split([3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2], 3))
        # the famous killer case
        self.assertEqual([[7], [5, 2]], self.s.split([7, 5, 2], 2))

    def test_should_not_create_a_bucket_greater_than_share(self):
        self.assertEqual([[13, 1], [7, 7]], self.s.split([13, 7, 7, 1], 2))

    def test_should_rollback_when_the_first_decision_doesnt_fit(self):
        self.assertEqual([[7, 2, 2], [3, 3, 3, 2]],
                         self.s.split([7, 3, 3, 3, 2, 2, 2], 2))

    def test_should_rollback_when_the_second_decision_doesnt_fit_also(self):
        self.assertEqual([[7, 2, 2, 2], [3, 3, 3, 2, 2]],
                         self.s.split([7, 3, 3, 3, 2, 2, 2, 2, 2], 2))
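# The suite runs under the standard unittest runner; a conventional entry point
# (assuming this file sits alongside splitter.py on the import path):
if __name__ == '__main__':
    import unittest
    unittest.main()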