def _declareStuff():
    # Recreate the collection and index folders from scratch.
    try:
        shutil.rmtree('./app/collection')
    except OSError:
        print('Creating collection folder')
    os.mkdir('./app/collection')
    try:
        shutil.rmtree('./app/index')
    except OSError:
        print('Creating index folder')
    os.mkdir('./app/index')
    global aux
    aux = {}
    global sndx
    sndx = {}
    global prfx
    prfx = Trie()
    global psfx
    psfx = Trie()
    global docs
    docs = get_docs("./app/docs/reuters21578.tar.gz")
    global length
    length = get_docs_length("./app/docs/reuters21578.tar.gz")
    global docs_names
    docs_names = names_generator(length)

def test_trie():
    t = Trie(['he', 'hers', 'his', 'she'])
    results = t.match('a his hoge hershe xx.')
    assert len(results) == 5
    rows = [(2, 3), (11, 2), (11, 4), (14, 3), (15, 2)]
    for i in range(5):
        print(results[i], rows[i])
        assert results[i] == rows[i]

class TestTrie(unittest.TestCase):

    def setUp(self):
        self.corpus = ['black', 'blue', 'orange', 'orangered', 'green',
                       'aqua marine']
        self.fixture = Trie()
        for i in self.corpus:
            self.fixture[i] = True

    def test_len(self):
        assert len(self.fixture) == len(self.corpus)

    def test_path_exists(self):
        for i in self.corpus:
            assert self.fixture._path_exists(i)
        assert not self.fixture._path_exists('not a color')

    def test_delitem(self):
        key = self.corpus[0]
        assert self.fixture[key] == True
        del self.fixture[key]
        self.assertRaises(KeyError, self.fixture.__getitem__, key)
        assert not self.fixture._path_exists(key[:-1])

    def test_getitem(self):
        for i in self.corpus:
            assert self.fixture[i] == True
        self.assertRaises(KeyError, self.fixture.__getitem__, 'not a color')

    def test_setitem(self):
        self.assertRaises(KeyError, self.fixture.__setitem__, '', None)

    def test_contains(self):
        for i in self.corpus:
            assert i in self.fixture
        assert 'not a color' not in self.fixture

    def test_iterator(self):
        collected_keys = set([k for k in self.fixture])
        assert collected_keys == set(self.corpus)
        for k in self.fixture:
            assert self.fixture[k] == True

    def test_iterkeys(self):
        keys = set()
        for i in self.fixture.iterkeys():
            keys.add(i)
        assert keys == set(self.corpus)

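# Minimal entry point so the suite above can be run directly. This is an
# assumed addition; the original snippet does not show how its module is
# invoked, so treat it as a sketch rather than the project's actual runner.
if __name__ == '__main__':
    unittest.main()
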
def __init__(self, *args, **kwargs):
    self._dict = {}
    self._trie = Trie(*args, **kwargs)
    d = dict(*args, **kwargs)
    for key, value in d.items():
        self._dict[case_insensitive(key)] = value

def split_file(
        in_file,
        out_file,
        fields="bgg_user_name",
        trie_file=None,
        limits=LIMIT,
        construct=False,
):
    """ split input file along prefixes """

    trie = None

    if trie_file and not construct:
        LOGGER.info("loading trie from file <%s>...", trie_file)
        trie = _trie_from_file(trie_file)

    if not trie:
        LOGGER.info("making trie for <%s>...", in_file)
        full_trie = _make_trie(file=in_file, fields=fields)
        limits = tuple(arg_to_iter(limits)) or (LIMIT, )
        for limit in limits:
            trie = Trie(_prefixes(full_trie, limit=limit))
            LOGGER.info("%d prefixes using limit %d", len(trie), limit)
            out_path = trie_file.format(limit=limit) if trie_file else None
            if not out_path or out_path == "-":
                for prefix, count in trie.items():
                    print(f"{prefix}\t{count}")
            else:
                with open(out_path, "w") as file_obj:
                    for prefix, count in trie.items():
                        file_obj.write(f"{prefix}\t{count}\n")

    LOGGER.info("constructed trie of size %d", len(trie))

    _save_to_prefixes(dst=out_file, trie=trie, file=in_file, fields=fields)

def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
    self._dict = {}
    self._trie = Trie(*args, **kwargs)
    d = dict(*args, **kwargs)
    for key, value in d.iteritems():
        self._dict[case_insensitive(key)] = value

def _load_data(file_path):
    """
    Temporary proof-of-concept function - Loads information from CSV file

    Parses and splits CORP_NME for search_trie
    :param file_path: File path of CSV
    :return: None
    """
    import csv

    # TODO: Move field constants elsewhere
    index_field = 'CORP_NUM'
    end_event_field = 'END_EVENT_ID'
    name_field = 'CORP_NME'

    Search.__search_trie = Trie()
    Search.__cached_name = dict()

    if not os.path.isfile(file_path):
        logger.warning('File not found. Empty search trie instantiated')
        return

    with open(file_path) as file:
        reader = csv.DictReader(file, delimiter=';', quoting=csv.QUOTE_NONE)
        try:
            for row in reader:
                # Ignore columns with specified END_EVENT_ID
                if row[end_event_field] in (None, ''):
                    continue

                # Build cache dictionary
                if row[index_field] not in Search.__cached_name:
                    Search.__cached_name[row[index_field]] = [row[name_field]]
                else:
                    Search.__cached_name[row[index_field]].append(row[name_field])

                # Build search trie
                # Remove non-alphanumeric characters and split words
                clean_name = Search._clean_string(row[name_field])
                for word in clean_name.split():
                    if word not in (None, ''):
                        # Create all possible suffixes of word
                        suffix_list = [word[i:] for i in range(len(word))]
                        for suffix in suffix_list:
                            if suffix not in Search.__search_trie:
                                Search.__search_trie[suffix] = set()
                            Search.__search_trie[suffix].add(row[index_field])
        except UnicodeDecodeError:
            logger.error('Unexpected input at line %s', reader.line_num)

    logger.info('Loaded and indexed data')

def load_data():
    global data_loaded, prefix_tree, data, country_index, name_index
    json_data = open("world_universities_and_domains.json").read()
    data = json.loads(json_data)
    for i in data:
        country_index[i["country"].lower()].append(i)
        name_index[i['name'].lower()] = i
        splitted = i['name'].split(" ")
        if len(splitted) > 1:
            # index the non-leading words of the name as well; the uuid
            # suffix keeps duplicate words from colliding in the index
            for splitted_name in splitted[1:]:
                name_index[splitted_name.lower() + str(uuid.uuid1())] = i
    prefix_tree = Trie(**name_index)
    data_loaded = True

def load_data():
    global data_loaded, prefix_tree, data, country_index, name_index, domain_index
    response = requests.get(
        "https://raw.githubusercontent.com/Hipo/university-domains-list/master/world_universities_and_domains.json")
    data = response.json()
    for i in data:
        country_index[i["country"].lower()].append(i)
        name_index[i['name'].lower()] = i
        for domain in i["domains"]:
            domain_index[domain].append(i)
        splitted = i['name'].split(" ")
        if len(splitted) > 1:
            for splitted_name in splitted[1:]:
                name_index[splitted_name.lower() + str(uuid.uuid1())] = i
    prefix_tree = Trie(**name_index)
    data_loaded = True

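# Illustrative prefix lookup against the tree built above (assumed usage, not
# part of the original module). Assuming Trie is pytrie's StringTrie or
# SortedStringTrie, as in the other snippets here, values(prefix=...) returns
# the records stored under every key starting with the given string.
def autocomplete(query):
    # university records whose indexed name starts with the query string
    return prefix_tree.values(prefix=query.lower())
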
def context_entities(setname, domain_set):
    for domain in domain_set:
        with open(
                JSON_OUTPUT_DIR + '/mentions/{}/{}.json'.format(setname, domain),
                'r') as f:
            # add entities to this trie
            trie = Trie()
            for ent_id in domain2entities[domain]:
                # without considering the multi-category case
                trie.setdefault(entities2name[ent_id].lower(), 0)
                # considering the multi-category case:
                # trie.setdefault(entities2alias[ent_id], 0)
            total_alias_count = 0
            total_doc_count = 0
            matched_text_list = []
            for line in tqdm(f, desc='processing {}...'.format(domain)):
                datum = json.loads(line.strip())
                text = decode(datum['mention_context_tokens'])
                total_doc_count += 1
                i = 0
                matched_text = ''
                while i < len(text):
                    item = trie.longest_prefix_item(text[i:], default=None)
                    if item is not None:
                        prefix, key_id = item
                        # only count a match when it is bounded by whitespace
                        if (i == 0 or text[i-1] == ' ') \
                                and (i+len(prefix) == len(text) or text[i+len(prefix)] == ' '):
                            total_alias_count += 1
                            i += len(prefix)
                            matched_text += '##' + prefix + '##'
                        else:
                            matched_text += prefix
                            i += len(prefix)
                    else:
                        matched_text += text[i]
                        i += 1
                matched_text_list.append(matched_text)
            print('Avg alias count {:.2f} in {} documents.'.format(
                total_alias_count / total_doc_count, domain))
        with open(
                JSON_OUTPUT_DIR + '/mentions/{}/{}_matched.json'.format(setname, domain),
                'w') as f:
            for text in matched_text_list:
                f.write(text + '\n\n')

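# Quick standalone illustration of the pytrie call that drives the matcher
# above (not from the original module): longest_prefix_item(s, default=None)
# returns the (key, value) pair for the longest stored key that is a prefix
# of s, or the default when no key matches.
from pytrie import SortedStringTrie
_t = SortedStringTrie({'new': 0, 'new york': 1})
assert _t.longest_prefix_item('new york city', default=None) == ('new york', 1)
assert _t.longest_prefix_item('boston', default=None) is None
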
def createTrie(self, filePath):
    trie = Trie()
    # with open("/Users/nali/Downloads/final_dict.txt", 'r') as file:
    with open(filePath, 'r') as file:
        lines = file.readlines()
        flag = 0
        for line in lines:
            flag += 1
            parts = line.strip().split('\t')
            if len(parts) > 1:
                words = parts[1].split(';')
                for word in words:
                    if word != '':
                        if trie.get(word) is None:
                            trie[word] = set()
                        trie[word].add(parts[0])
    return trie

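# Hedged usage sketch for the trie returned above (names are assumptions, not
# from the source): each value is a set of ids, so a prefix query can union
# the sets of every matching word via pytrie's values(prefix=...).
def ids_for_prefix(trie, prefix):
    ids = set()
    for id_set in trie.values(prefix=prefix):
        ids |= id_set
    return ids
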
def __init__(self):
    self.full_match = Trie()
    self.repairs_match = Trie()
    self.full_sentences = {}

def _make_trie(file, fields="bgg_user_name", sep=","):
    return Trie(_process_file(file, fields, sep, count=True))

def _trie_from_file(file):
    try:
        return Trie(_prefixes_from_file(file))
    except Exception:
        pass
    return None

            word2soundex[sndx] = []
        if word not in word2soundex[sndx]:
            word2soundex[sndx].append(word)
        # Prefix tree loader
        if word not in prefix_tree:
            prefix_tree[word] = None
        rword = word[::-1]
        # Postfix tree loader
        if rword not in postfix_tree:
            postfix_tree[rword] = None
    return aux, word2soundex, prefix_tree, postfix_tree


if __name__ == '__main__':
    aux = {'one': [0], 'five': [0], 'reuters': [0]}
    w2s = {'O500': ['one'], 'F100': ['five'], 'R362': ['reuters']}
    prfx = Trie(one=None, five=None, reuters=None, chelny=None)
    psfx = Trie(eon=None, evif=None, sretuer=None, ylehc=None)
    doc = next(get_docs("./docs/reuters21578.tar.gz"))
    name = 'collection 1'
    aux, w2s, prfx, psfx = load_data(doc, name, aux, w2s, prfx, psfx)
    print(aux)
    print(w2s)
    print(prfx)
    print(psfx)

            reduce(lambda i, j: i & j, (set(x) for x in res)))
    except:
        message += 'NOTHING FOUND BY YOUR QUERY\n'
    message += '-----------------------------------------------------------\n'
    message += f'InnoGruk found {len(relevant_documents)} documents by your query\n'
    message += '-----------------------------------------------------------'
    return message, relevant_documents, track[2:]


if __name__ == '__main__':
    prfx = Trie(apple=None, borrow=None, friend=None, ginger=None,
                lermontov=None, money=None, november=None, object=None)
    psfx = Trie(elppa=None, worrob=None, dneirf=None, regnig=None,
                votnomrel=None, yenom=None, rebmevon=None, tcejbo=None)
    sndx = {
        'A140': ['apple'],

import os
import re

from pytrie import StringTrie as Trie

myTrie = Trie()   # empty trie for QUERY 2
myTrie1 = Trie()  # empty trie for QUERY 1

path = input('Hello,\nFirstly, enter a path that includes txt files: ')
all_files = os.listdir(path)

# BUILD TRIE FOR QUERY 2
for i in range(len(all_files)):
    # os.listdir returns bare file names, so join them with the directory
    with open(os.path.join(path, all_files[i]), 'r') as f:
        for line in f:
            # take the words of each line, lowercase them and store them
            # in the trie together with the file name
            res = re.findall(r'\w+', line)
            for j in range(len(res)):
                # check whether the key already exists in the trie
                if res[j].lower() not in myTrie:  # word is NOT in the trie
                    myTrie[res[j].lower()] = {all_files[i]}  # word -> set of files
                else:
                    myTrie.get(res[j].lower()).add(all_files[i])  # word IS in the trie: add only the file info

# BUILD TRIE FOR QUERY 1
for i in range(len(all_files)):
    with open(os.path.join(path, all_files[i]), 'r') as f:
        counter = 0
        for line in f:
            res = re.findall(r'\w+', line)

def loadtree(arr):
    global trie
    trie = Trie()
    # store each key reversed so suffix lookups become prefix lookups
    for key in arr:
        trie[key[::-1]] = key

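# Because loadtree() maps each reversed key back to the original word, a
# suffix query is just a prefix query on the trie. Illustrative helper
# (an assumption, not from the source), relying on pytrie's values(prefix=...):
def words_with_suffix(suffix):
    # e.g. words_with_suffix('ing') -> all loaded words ending in 'ing'
    return trie.values(prefix=suffix[::-1])
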
def generate_weight_tree(words, func):
    # map each word to its weight, then build a trie over the mapping
    weights = {}
    for w in words:
        weights[w] = func(w)
    return Trie(weights)

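# Example call (illustrative, not from the source): weight each word by its
# length, then query the longest weighted prefix. The expected result assumes
# Trie here is pytrie's SortedStringTrie.
tree = generate_weight_tree(['apple', 'apples', 'banana'], len)
print(tree.longest_prefix_item('applesauce'))  # ('apples', 6)
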
from flask import jsonify
from pytrie import SortedStringTrie as Trie
import pandas as pd
import time
import json
import cPickle as pickle

trieds = Trie()  # trieds is short for trie data structure.
listofallnames = []


def loadstrings():
    """
    Function to load a csv file, preprocess the data and load it into the
    Trie Datastructure
    """
    start = time.time()
    # loading csv data file as a dataframe using pandas
    df = pd.read_csv("data.csv", error_bad_lines=False)
    listofallnames = {}
    print 'key generation in process, please wait.'
    count = 0
    for index, row in df.iterrows():
        rowdata = {}
        # preprocessing data in the dataframe
        rowdata['firstname'] = str(row['givenName']).lower()
        rowdata['middlename'] = str(row['middleName']).lower()
        rowdata['lastname'] = str(row['surname']).lower()
        if str(row['givenName']) != 'nan':
            try:
                listofallnames[str(rowdata['firstname'])].append(rowdata)
            except KeyError:

    with open('candidates.txt', 'r') as candidates:
        data = candidates.readlines()
        for key in data:
            output.append(key.split("\n")[0])
    return output


onepercent = []
# print(getelse("hello",2))
# judgeoutput = dict()
#
# alpha = "abcdefghijklmnopqrstuvwxyz"
print("Loading Dictionary")
dictionary = loaddictionary()
dictionaryprime = loadprimedictionary()
print("Building Tree Begin", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
t = Trie(dictionary)
tprime = Trie(dictionaryprime)
print("Building Tree Stop", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
print("Loading Candidates")
candidates = loadcandidates()
judgeoutput = dict()
i = 0
j = 0
print("Begin Time:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
for key in candidates:
    if i % 170 == 0:
        print(i / 170, "%")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    judgeoutput[key] = judge(key, j)
    i = i + 1
    j = j + 1

def solution():
    candidatesTokens = []
    blendResults = []
    blendWords = []
    prefixTrie = Trie()
    reversalTrie = Trie()

    def equalLetter(replace, match, char1, char2):
        if char1 == char2:
            return match
        else:
            return replace

    def split(word):
        leng = len(word)
        prefix = word[:int(leng / 2) + 1]
        suffix = word[int(leng / 2 - 1):][::-1]
        splited = [prefix, suffix]
        return splited

    def calPreRe():
        truePositiveAmount = 0
        blendCorrectWords = []
        blendFalseWords = []
        for word in blendWords:
            if word in blendResults:
                truePositiveAmount += 1
                blendCorrectWords.append(word)
        print(" truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")
        # precision and recall
        recall = float(truePositiveAmount) / (len(blendResults) - 32)
        precision = float(truePositiveAmount) / len(blendWords)
        print(" Recall is : ")
        print(recall)
        print("\n\n")
        print(" Precision is : ")
        print(precision)
        print("\n\n")
        for word in blendResults:
            if word not in blendWords:
                blendFalseWords.append(word)
        print("count of blendCorrectWords : " + str(len(blendCorrectWords)))
        print(" blendCorrectWords : " + str(blendCorrectWords))
        print(" count of blendFalseWords : " + str(len(blendFalseWords)))
        print(" blendFalseWords : " + str(blendFalseWords))

    def input():
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        repeatLetters = [
            'aaaa', 'bbbb', 'cccc', 'dddd', 'eeee', 'ffff', 'gggg', 'hhhh',
            'iiii', 'jjjj', 'kkkk', 'llll', 'mmmm', 'nnnn', 'oooo', 'pppp',
            'qqqq', 'rrrr', 'ssss', 'tttt', 'uuuu', 'vvvv', 'wwww', 'xxxx',
            'yyy', 'zzzz'
        ]
        # insert every dictionary word, both forwards and reversed
        for word in wordList:
            word = word.strip()
            word = word.lower()
            prefixTrie[word] = word
            reverseWord = word[::-1]
            reversalTrie[reverseWord] = reverseWord
        file1 = open("data/candidates.txt", "r", encoding='utf-8')
        wordList1 = file1.read().splitlines()
        for word in wordList1:
            word = word.strip()
            word = word.lower()
            flag = 0
            for subString in repeatLetters:
                if subString in word:
                    flag = 1
                    break
            if flag == 0:
                candidatesTokens.append(word)
        file2 = open("data/blends.txt", "r", encoding='utf-8')
        wordList2 = file2.read().splitlines()
        for word in wordList2:
            word = word.split()[0]
            word = word.lower()
            blendResults.append(word)

    def editDistance(word1, word2):
        replace = -1
        match = 1
        deletion = -1
        insertion = -1
        word1Length = len(word1)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(
                    0,
                    A[j][k - 1] + deletion,
                    A[j - 1][k] + insertion,
                    A[j - 1][k - 1] + equalLetter(replace, match,
                                                  word1[k - 1], word2[j - 1]))
        max_item = max(max(row) for row in A)
        return max_item

    def compareUsingLED(pref, reverse=0):
        if reverse:
            trieList = reversalTrie
        else:
            trieList = prefixTrie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * 0.85
        for word in prefixDict:
            LEDValue = editDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(
                    pref, word[:int(len(pref))], winkler=True, scaling=0.1)
                if JWValue > 0.8:
                    return True
        return False

    # input
    input()

    for word in candidatesTokens:
        prefix, reverseSuffix = split(word)
        if compareUsingLED(prefix, 0) and compareUsingLED(reverseSuffix, 1):
            blendWords.append(word)

    calPreRe()

def create_training_instances(input_file, max_seq_length, tokenizer, rng,
                              alias2entities):
    """Create `TrainingInstance`s from raw text."""

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    all_documents = []
    all_alias_token_spans = []

    from pytrie import SortedStringTrie as Trie
    trie = Trie()
    # add entities to this trie
    for alias, ents in alias2entities.items():
        trie.setdefault(alias, 0)

    with open(input_file, "r") as reader:
        for line in tqdm(reader, desc='converting tokens'):
            line = tokenization.convert_to_unicode(line.strip())
            line = json.loads(line)['text']
            tokens = []
            if do_lower_case:
                line = line.lower()
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in line:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        tokens.append(c)
                    else:
                        tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(tokens) - 1)

            # match the aliases occurring in the document
            alias_spans = match_alias(line, trie, alias2entities)
            # these spans refer to coarse-grained tokens; span_end points at
            # the last token of the alias
            alias_token_spans = [(char_to_word_offset[span[0]],
                                  char_to_word_offset[span[1] - 1])
                                 for span in alias_spans]
            for span, token_span in zip(alias_spans, alias_token_spans):
                alias_tokens = ' '.join(tokens[token_span[0]:token_span[1] + 1])
                alias_texts = line[span[0]:span[1]]
                assert alias_tokens in alias2entities, print(
                    alias_tokens, token_span, alias_texts, span)
            # assert all(' '.join(tokens[span[0]: span[1] + 1]) in alias2entities for span in alias_token_spans), \
            #     print([' '.join(tokens[span[0]: span[1] + 1]) for span in alias_token_spans])

            tok_to_orig_index = []  # fine-grained -> coarse-grained
            orig_to_tok_index = []  # coarse-grained -> fine-grained
            real_tokens = []
            for (i, token) in enumerate(tokens):
                orig_to_tok_index.append(len(real_tokens))
                sub_tokens = tokenizer.tokenize(token)
                for sub_token in sub_tokens:
                    tok_to_orig_index.append(i)
                    real_tokens.append(sub_token)

            # Check whether the coarse-grained token at the end of a span is
            # the last token: if so, end at the last fine-grained token;
            # otherwise end at the token just before the first fine-grained
            # token of the next coarse-grained token.
            real_alias_token_spans = []
            for span in alias_token_spans:
                if span[1] == len(tokens) - 1:
                    real_end = orig_to_tok_index[-1]
                else:
                    real_end = orig_to_tok_index[span[1] + 1] - 1
                real_start = orig_to_tok_index[span[0]]
                real_alias_token_spans.append((real_start, real_end))
            # alias_token_spans = [(orig_to_tok_index[span[0]], orig_to_tok_index[span[1]])
            #                      for span in alias_token_spans]

            all_documents.append(real_tokens)
            all_alias_token_spans.append(real_alias_token_spans)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for document_index in tqdm(range(len(all_documents)),
                               total=len(all_documents),
                               desc='creating instances'):
        instances.extend(
            create_instances_from_document(all_documents, document_index,
                                           all_alias_token_spans,
                                           max_seq_length, vocab_words, rng))

    rng.shuffle(instances)
    return instances

t0 = time.time()  # start time
for index, row in train.iterrows():
    # Normalise the names of dishes.
    with_sc = row[1]
    processed_string = preprocessing(with_sc)
    for token in processed_string.split():
        # Create inverted index
        if inverted_index.get(token) is None:
            inverted_index.update({token: set()})
        inverted_index[token].add(row[0])

prefix_tree = Trie(**inverted_index)
t1 = time.time() - t0  # time required for index construction
print("Time taken for Trie generation: {} ".format(t1))


def search_results(query):
    # Return primary keys for query matching strings
    t0 = time.time()  # start time
    found = False
    # apply the same pre-processing as before
    processed_query = preprocessing(query)

def solution():
    trie = Trie()
    reverseTrie = Trie()
    candidatesList = []
    blendList = []
    blendAnswerList = []
    repeatSubString = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee', 'ffff', 'gggg',
                       'hhhh', 'iiii', 'jjjj', 'kkkk', 'llll', 'mmmm', 'nnnn',
                       'oooo', 'pppp', 'qqqq', 'rrrr', 'ssss', 'tttt', 'uuuu',
                       'vvvv', 'wwww', 'xxxx', 'yyy', 'zzzz']

    def inputTrie():
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        # insert every dictionary word, both forwards and reversed
        for word in wordList:
            word = word.strip()
            word = word.lower()
            trie[word] = word
            reverseWord = word[::-1]
            reverseTrie[reverseWord] = reverseWord

    def inputCandidate():
        file = open("data/candidates.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.strip()
            word = word.lower()
            flag = 0
            for subString in repeatSubString:
                if subString in word:
                    flag = 1
                    break
            if flag == 0:
                candidatesList.append(word)

    def inputBlendAnswerList():
        file = open("data/blends.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.split()[0]
            word = word.lower()
            blendAnswerList.append(word)

    def splitWord(word):
        length = len(word)
        prefix = word[:int(length / 2) + 1]
        reverseSuffix = word[int(length / 2 - 1):][::-1]
        splitedCandidate = [prefix, reverseSuffix]
        return splitedCandidate

    def equal(replace, match, char1, char2):
        if char1 == char2:
            return match
        else:
            return replace

    def localEditDistance(word1, word2):
        deletion = -1
        insertion = -1
        replace = -1
        match = 1
        word1Length = len(word1)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(
                    0,
                    A[j][k - 1] + deletion,
                    A[j - 1][k] + insertion,
                    A[j - 1][k - 1] + equal(replace, match,
                                            word1[k - 1], word2[j - 1]))
        max_item = max(max(row) for row in A)
        return max_item

    def comparePrefixNDictUsingLED(pref, reverse=0):
        if reverse:
            trieList = reverseTrie
        else:
            trieList = trie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * 0.85
        for word in prefixDict:
            LEDValue = localEditDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(
                    pref, word[:int(len(pref))], winkler=True, scaling=0.1)
                if JWValue > 0.8:
                    return True
        return False

    def calAccurancy():
        truePositiveAmount = 0
        print(" *********** blendlist : ")
        print(blendList)
        print("\n\n ")
        print(" *********** count blendlist : ")
        print(len(blendList))
        print("\n\n ")
        blendFindedList = []
        blendMissedList = []
        for word in blendList:
            if word in blendAnswerList:
                truePositiveAmount += 1
                blendFindedList.append(word)
        print(" *********** truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")
        # precision and recall
        precision = float(truePositiveAmount) / len(blendList)
        recall = float(truePositiveAmount) / (len(blendAnswerList) - 32)
        print("***** Precision is : ")
        print(precision)
        print("\n\n")
        print("***** Recall is : ")
        print(recall)
        print("\n\n")
        for word in blendAnswerList:
            if word not in blendList:
                blendMissedList.append(word)
        print(" *********** count of blendFindedList : ")
        print(len(blendFindedList))
        print(" *********** blendFindedList : ")
        print(blendFindedList)
        print("\n\n ")
        print(" *********** count of blendMissedList : ")
        print(len(blendMissedList))
        print(" *********** blendMissedList : ")
        print(blendMissedList)
        print("\n\n ")

    # input
    inputTrie()
    inputCandidate()
    inputBlendAnswerList()

    # append filtered candidates to blendList
    for word in candidatesList:
        prefix, reverseSuffix = splitWord(word)
        if comparePrefixNDictUsingLED(prefix, 0) and \
                comparePrefixNDictUsingLED(reverseSuffix, 1):
            blendList.append(word)

    calAccurancy()

    numeros['treinti{}'.format(text)] = 30 + number
    numeros['treinta y {}'.format(text)] = 30 + number
    numeros['cuarenti{}'.format(text)] = 40 + number
    numeros['cuarenta y {}'.format(text)] = 40 + number
    numeros['cincuenti{}'.format(text)] = 50 + number
    numeros['cincuenta y {}'.format(text)] = 50 + number
    numeros['sesenti{}'.format(text)] = 60 + number
    numeros['sesenta y {}'.format(text)] = 60 + number
    numeros['setenti{}'.format(text)] = 70 + number
    numeros['setenta y {}'.format(text)] = 70 + number
    numeros['ochenti{}'.format(text)] = 80 + number
    numeros['ochenta y {}'.format(text)] = 80 + number
    numeros['noventi{}'.format(text)] = 90 + number
    numeros['noventa y {}'.format(text)] = 90 + number

numeros = Trie(numeros)

multiplicadores = Trie({
    'mil': 1000,
    'ientos': 100,
    'cientos': 100,
    'ciento': 100,
    'cientas': 100,
    'cienta': 100,
    'cien': 100
})

momento_del_dia = Trie({
    'de la noche': 'pm',
    'de la mañana': 'am',
    'del mediodia': 'pm',
    'de la tarde': 'pm'
})

time_of_day_to_sum = {
    'noche': 12,

def solution():
    trie = Trie()
    reverseTrie = Trie()
    candidatesList = []
    blendList = []
    blendAnswerList = []

    def inputTrie():
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        # insert every dictionary word, both forwards and reversed
        for word in wordList:
            word = word.strip()
            word = word.lower()
            trie[word] = word
            reverseWord = word[::-1]
            reverseTrie[reverseWord] = reverseWord

    def inputCandidate():
        file = open("data/candidates.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.strip()
            word = word.lower()
            candidatesList.append(word)

    def inputBlendAnswerList():
        file = open("data/blends.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.split()[0]
            word = word.lower()
            blendAnswerList.append(word)

    def splitWord(word):
        length = len(word)
        prefix = word[:int(length / 2)]
        reverseSuffix = word[int(length / 2):][::-1]
        splitedCandidate = [prefix, reverseSuffix]
        return splitedCandidate

    def equal(replace, match, char1, char2):
        if char1 == char2:
            return match
        else:
            return replace

    def localEditDistance(word1, word2):
        deletion = -1
        insertion = -1
        replace = -1
        match = 1
        word1Length = len(word1)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(
                    0,
                    A[j][k - 1] + deletion,
                    A[j - 1][k] + insertion,
                    A[j - 1][k - 1] + equal(replace, match,
                                            word1[k - 1], word2[j - 1]))
        max_item = max(max(row) for row in A)
        return max_item

    def comparePrefixNDictUsingLED(pref, reverse=0):
        if reverse:
            trieList = reverseTrie
        else:
            trieList = trie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * 0.8
        for word in prefixDict:
            LEDValue = localEditDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(
                    pref, word[:int(len(pref))], winkler=True, scaling=0.1)
                if JWValue > 0.95:
                    return True
        return False

    def calAccurancy():
        truePositiveAmount = 0
        print(" *********** blendlist : ")
        print(blendList)
        print("\n\n ")
        print(" *********** count blendlist : ")
        print(len(blendList))
        print("\n\n ")
        for word in blendList:
            if word in blendAnswerList:
                truePositiveAmount += 1
        print(" *********** truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")
        # precision and recall
        precision = float(truePositiveAmount) / len(blendList)
        recall = float(truePositiveAmount) / (len(blendAnswerList) - 32)
        print("***** Precision is : ")
        print(precision)
        print("\n\n")
        print("***** Recall is : ")
        print(recall)
        print("\n\n")

    # input
    inputTrie()
    inputCandidate()
    inputBlendAnswerList()

    # append filtered candidates to blendList
    for word in candidatesList:
        prefix, reverseSuffix = splitWord(word)
        if comparePrefixNDictUsingLED(prefix, 0) and \
                comparePrefixNDictUsingLED(reverseSuffix, 1):
            blendList.append(word)

    calAccurancy()