def createTrie(self, filePath):
    """Build a Trie mapping each word to the set of identifiers it belongs to.

    Each line of the input file is expected to be tab-separated: an
    identifier in the first column and a ';'-joined list of words in the
    second.  Lines without a second column are ignored; empty word entries
    (from trailing/doubled semicolons) are skipped.

    Args:
        filePath: path to the tab-separated dictionary file.

    Returns:
        A Trie mapping word -> set of first-column identifiers.
    """
    trie = Trie()
    with open(filePath, 'r') as file:
        # Iterate the file lazily instead of materializing readlines();
        # the original also kept an unused line counter (`flag`) — removed.
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) > 1:
                for word in parts[1].split(';'):
                    if word:
                        # setdefault collapses the original duplicated
                        # "create set then add" / "add" branches into one.
                        trie.setdefault(word, set()).add(parts[0])
    return trie
def _make_trie(file, fields="bgg_user_name", sep=","):
    """Parse *file* with :func:`_process_file` (counting enabled) and wrap the result in a Trie."""
    processed = _process_file(file, fields, sep, count=True)
    return Trie(processed)
from flask import jsonify from pytrie import SortedStringTrie as Trie import pandas as pd import time import json import cPickle as pickle trieds = Trie() #trieds is short for trie datastructure. listofallnames = [] def loadstrings(): """ Function to load a csv file, preprocess the data and load it into the Trie Datastructure """ start = time.time() df = pd.read_csv("data.csv", error_bad_lines=False ) #loading csv data file as a dataframe using pandas listofallnames = {} print 'key generation in process, please wait.' count = 0 for index, row in df.iterrows(): rowdata = {} #preprocessing data in the dataframe rowdata['firstname'] = str(row['givenName']).lower() rowdata['middlename'] = str(row['middleName']).lower() rowdata['lastname'] = str(row['surname']).lower() if str(row['givenName']) != 'nan': try: listofallnames[str(rowdata['firstname'])].append(rowdata) except KeyError:
def __init__(self):
    """Initialize empty lookup structures.

    Two tries (exact matches and repair matches) plus a plain dict for
    whole-sentence storage.  All three start empty; the assignments are
    independent of one another.
    """
    self.full_sentences = {}
    self.repairs_match = Trie()
    self.full_match = Trie()
numeros['treinti{}'.format(text)] = 30 + number numeros['treinta y {}'.format(text)] = 30 + number numeros['cuarenti{}'.format(text)] = 40 + number numeros['cuarenta y {}'.format(text)] = 40 + number numeros['cincuenti{}'.format(text)] = 50 + number numeros['cincuenta y {}'.format(text)] = 50 + number numeros['sesenti{}'.format(text)] = 60 + number numeros['sesenta y {}'.format(text)] = 60 + number numeros['setenti{}'.format(text)] = 70 + number numeros['setenta y {}'.format(text)] = 70 + number numeros['ochenti{}'.format(text)] = 80 + number numeros['ochenta y {}'.format(text)] = 80 + number numeros['noventi{}'.format(text)] = 90 + number numeros['noventa y {}'.format(text)] = 90 + number numeros = Trie(numeros) multiplicadores = Trie({ 'mil': 1000, 'ientos': 100, 'cientos': 100, 'ciento': 100, 'cientas': 100, 'cienta': 100, 'cien': 100 }) momento_del_dia = Trie({ 'de la noche': 'pm', 'de la mañana': 'am', 'del mediodia': 'pm', 'de la tarde': 'pm' }) time_of_day_to_sum = { 'noche': 12,
t0 = time.time() #Start time for index, row in train.iterrows(): #Normalise the names of dishes. with_sc = row[1] processed_string = preprocessing(with_sc) for token in processed_string.split(): #Create Inverted Index if inverted_index.get(token) == None: inverted_index.update({token: set()}) inverted_index[token].add(row[0]) prefix_tree = Trie(**inverted_index) t1 = time.time() - t0 #Time required for index construction print("Time taken for Trie generation: {} ".format(t1)) def search_results(query): #Return primary keys for query matching strings t0 = time.time() #Start time found = False processed_query = preprocessing( query) #Apply same pre-processing as before
def create_training_instances(input_file, max_seq_length, tokenizer, rng, alias2entities):
    """Create `TrainingInstance`s from raw text.

    Reads one JSON object per line from *input_file* (each with a 'text'
    field), whitespace-tokenizes the text, locates entity aliases via a
    trie built from *alias2entities*, maps alias character spans to
    sub-token spans, and finally builds shuffled training instances.

    Args:
        input_file: path to a JSON-lines corpus (one {'text': ...} per line).
        max_seq_length: maximum sequence length passed through to
            create_instances_from_document.
        tokenizer: sub-word tokenizer exposing .tokenize() and .vocab.
        rng: random source used for shuffling (and downstream sampling).
        alias2entities: mapping of alias string -> entities; only the keys
            are inserted into the matching trie here.

    Returns:
        The shuffled list of training instances.
    """
    def is_whitespace(c):
        # 0x202F is NARROW NO-BREAK SPACE, treated as whitespace too.
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    all_documents = []
    all_alias_token_spans = []
    from pytrie import SortedStringTrie as Trie
    trie = Trie()
    # add entities to this trie (values are dummy zeros; only keys matter)
    for alias, ents in alias2entities.items():
        trie.setdefault(alias, 0)
    with open(input_file, "r") as reader:
        for line in tqdm(reader, desc='converting tokens'):
            line = tokenization.convert_to_unicode(line.strip())
            line = json.loads(line)['text']
            tokens = []
            # NOTE(review): do_lower_case is a module-level global, not a
            # parameter — confirm it is defined wherever this runs.
            if do_lower_case:
                line = line.lower()
            # char_to_word_offset[i] = index of the coarse (whitespace)
            # token containing character i of `line`.
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in line:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        tokens.append(c)
                    else:
                        tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(tokens) - 1)
            # Match aliases occurring in this document.
            alias_spans = match_alias(line, trie, alias2entities)
            # Convert char spans to coarse-token spans; span[1] is
            # exclusive, so span[1] - 1 is the alias's last character.
            alias_token_spans = [(char_to_word_offset[span[0]], char_to_word_offset[span[1] - 1]) for span in alias_spans]
            for span, token_span in zip(alias_spans, alias_token_spans):
                alias_tokens = ' '.join(tokens[token_span[0]:token_span[1] + 1])
                alias_texts = line[span[0]:span[1]]
                # NOTE(review): `print(...)` runs immediately and returns
                # None, so the assert's message is always None — the print
                # is effectively the diagnostic here.
                assert alias_tokens in alias2entities, print(alias_tokens, token_span, alias_texts, span)
            # assert all(' '.join(tokens[span[0]: span[1] + 1]) in alias2entities for span in alias_token_spans), \
            #     print([' '.join(tokens[span[0]: span[1] + 1]) for span in alias_token_spans])
            tok_to_orig_index = []  # fine-grained (sub-token) -> coarse token
            orig_to_tok_index = []  # coarse token -> first sub-token index
            real_tokens = []
            for (i, token) in enumerate(tokens):
                orig_to_tok_index.append(len(real_tokens))
                sub_tokens = tokenizer.tokenize(token)
                for sub_token in sub_tokens:
                    tok_to_orig_index.append(i)
                    real_tokens.append(sub_token)
            # If a span's last coarse token is the document's final token,
            # end at that token's sub-tokens; otherwise end one sub-token
            # before the next coarse token's first sub-token.
            real_alias_token_spans = []
            for span in alias_token_spans:
                if span[1] == len(tokens) - 1:
                    # NOTE(review): orig_to_tok_index[-1] is the FIRST
                    # sub-token of the last coarse token; if that token
                    # splits into several sub-tokens the span may end
                    # early — confirm this is intended.
                    real_end = orig_to_tok_index[-1]
                else:
                    real_end = orig_to_tok_index[span[1] + 1] - 1
                real_start = orig_to_tok_index[span[0]]
                real_alias_token_spans.append((real_start, real_end))
            # alias_token_spans = [(orig_to_tok_index[span[0]], orig_to_tok_index[span[1]])
            #                      for span in alias_token_spans]
            all_documents.append(real_tokens)
            all_alias_token_spans.append(real_alias_token_spans)
    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for document_index in tqdm(range(len(all_documents)), total=len(all_documents), desc='creating instances'):
        instances.extend(
            create_instances_from_document(all_documents, document_index, all_alias_token_spans, max_seq_length, vocab_words, rng))
    rng.shuffle(instances)
    return instances
def loadtree(arr):
    """Rebuild the module-level `trie` from *arr*, keyed by each word reversed.

    Storing reversed keys lets suffix queries be answered as prefix
    lookups; each reversed key maps back to the original word.
    """
    global trie
    trie = Trie()
    for word in arr:
        reversed_key = word[::-1]
        trie[reversed_key] = word
reduce(lambda i, j: i & j, (set(x) for x in res))) except: message += 'NOTHING FOUND BY YOUR QUERY\n' message += '-----------------------------------------------------------\n' message += f'InnoGruk found {len(relevant_documents)} documents by your query\n' message += '-----------------------------------------------------------' return message, relevant_documents, track[2:] if __name__ == '__main__': prfx = Trie(apple=None, borrow=None, friend=None, ginger=None, lermontov=None, money=None, november=None, object=None) psfx = Trie(elppa=None, worrob=None, dneirf=None, regnig=None, votnomrel=None, yenom=None, rebmevon=None, tcejbo=None) sndx = { 'A140': ['apple'],
word2soundex[sndx] = [] if word not in word2soundex[sndx]: word2soundex[sndx].append(word) # Prefix tree loader if not word in prefix_tree: prefix_tree[word] = None rword = word[::-1] # Postfix_tree loader if not rword in postfix_tree: postfix_tree[rword] = None return aux, word2soundex, prefix_tree, postfix_tree if __name__ == '__main__': aux = {'one': [0], 'five': [0], 'reuters': [0]} w2s = {'O500': ['one'], 'F100': ['five'], 'R362': ['reuters']} prfx = Trie(one=None, five=None, reuters=None, chelny=None) psfx = Trie(eon=None, evif=None, sretuer=None, ylehc=None) doc = next(get_docs("./docs/reuters21578.tar.gz")) name = 'collection 1' aux, w2s, prfx, psfx = load_data(doc, name, aux, w2s, prfx, psfx) print(aux) print(w2s) print(prfx) print(psfx)
with open('candidates.txt','r') as candidates: data = candidates.readlines() for key in data: output.append(key.split("\n")[0]) return output onepercent = [] # print(getelse("hello",2)) # judgeoutput = dict() # # alpha = "abcdefghijklmnopqrstuvwxyz" print("Loading Dictionary") dictionary = loaddictionary() dictionaryprime = loadprimedictionary() print("Building Tree Begin",time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) t = Trie(dictionary) tprime = Trie(dictionaryprime) print("Building Tree Stop",time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) print("Loading Candidates") candidates = loadcandidates() judgeoutput = dict() i = 0 j = 0 print("Begin Time:",time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) for key in candidates: if i % 170 == 0: print(i/170,"%") print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) judgeoutput[key] = judge(key,j) i = i + 1 j = j + 1
def solution():
    """Detect blend words among candidates via trie-backed fuzzy matching.

    Loads a dictionary into a forward trie and a reversed trie, filters
    candidates containing long repeated-letter runs, then flags a
    candidate as a blend when both its prefix half and its reversed
    suffix half approximately match dictionary words (local edit
    distance gate followed by a Jaro-Winkler check).  Prints
    precision/recall against data/blends.txt.
    """
    trie = Trie()            # dictionary words keyed as-is (prefix search)
    reverseTrie = Trie()     # dictionary words keyed reversed (suffix search)
    candidatesList = []      # candidates surviving the repeat-letter filter
    blendList = []           # candidates classified as blends
    blendAnswerList = []     # gold-standard blend words
    # Runs of 4 identical letters used to discard noisy candidates.
    # NOTE(review): 'yyy' is only 3 letters, unlike every other entry —
    # possibly a typo, but preserved as-is.
    repeatSubString = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee', 'ffff', 'gggg', 'hhhh',
                       'iiii', 'jjjj', 'kkkk', 'llll', 'mmmm', 'nnnn', 'oooo', 'pppp',
                       'qqqq', 'rrrr', 'ssss', 'tttt', 'uuuu', 'vvvv', 'wwww', 'xxxx',
                       'yyy', 'zzzz']

    def inputTrie():
        # Load the dictionary into both tries (forward and reversed keys).
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        # input prefixly
        for word in wordList:
            word = word.strip()
            word = word.lower()
            trie[word] = word
            reverseWord = word[::-1]
            reverseTrie[reverseWord] = reverseWord

    def inputCandidate():
        # Load candidates, dropping any containing a repeated-letter run.
        file = open("data/candidates.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.strip()
            word = word.lower()
            flag = 0
            for subString in repeatSubString:
                if subString in word:
                    flag = 1
                    break
            if flag == 0:
                candidatesList.append(word)

    def inputBlendAnswerList():
        # Gold answers: first whitespace-separated field of each line.
        file = open("data/blends.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.split()[0]
            word = word.lower()
            blendAnswerList.append(word)

    def splitWord(word):
        # First half (+1 char overlap) and reversed second half (-1 overlap).
        length = len(word)
        prefix = word[:int(length / 2) + 1]
        reverseSuffix = word[int(length / 2 - 1):][::-1]
        splitedCandidate = [prefix, reverseSuffix]
        return splitedCandidate

    def equal(replace, match, char1, char2):
        # Substitution score: `match` on equality, `replace` otherwise.
        if char1 == char2:
            return match
        else:
            return replace

    def localEditDistance(word1, word2):
        # Local-alignment (Smith-Waterman-style) score via a numpy DP table.
        A = [[]]
        deletion = -1
        insertion = -1
        replace = -1
        match = 1
        word1Length = len(word1)
        # NOTE(review): word2Length is computed but never used, and the DP
        # table is sized word1Length x word1Length while indexing
        # word2[j - 1]; this is safe only because callers pass word2 values
        # that start with word1 (so len(word2) >= len(word1)) — confirm.
        word2Length = len(word2)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(0, A[j][k - 1] + deletion, A[j - 1][k] + insertion,
                              A[j - 1][k - 1] + equal(replace, match, word1[k - 1], word2[j - 1]))
        max_item = max(max(row) for row in A)
        return max_item

    def comparePrefixNDictUsingLED(pref, reverse=0):
        # True when some dictionary word sharing prefix `pref` scores above
        # the edit-distance threshold AND the Jaro-Winkler similarity gate.
        if reverse:
            trieList = reverseTrie
        else:
            trieList = trie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * (0.85)
        prefixFlag = 0  # NOTE(review): unused
        for word in prefixDict:
            LEDValue = localEditDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(pref, word[:int(len(pref))],
                                                                  winkler=True, scaling=0.1)
                if JWValue > 0.8:
                    return True
        return False

    def calAccurancy():
        # Report precision/recall of blendList against blendAnswerList,
        # plus the found/missed answer lists.
        truePositiveAmount = 0
        print(" *********** blendlist : ")
        print(blendList)
        print("\n\n ")
        print(" *********** count blendlist : ")
        print(len(blendList))
        print("\n\n ")
        blendFindedList = []
        blendMissedList = []
        for word in blendList:
            if word in blendAnswerList:
                truePositiveAmount += 1
                blendFindedList.append(word)
        print(" *********** truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")
        # precision
        precision = float(truePositiveAmount) / len(blendList)
        # NOTE(review): the -32 offset in the recall denominator is
        # unexplained — presumably excluding answers known to be absent
        # from the candidate set; confirm against the dataset.
        recall = float(truePositiveAmount) / (len(blendAnswerList) - 32)
        print("***** Precision is : ")
        print(precision)
        print("\n\n")
        print("***** Recall is : ")
        print(recall)
        print("\n\n")
        for word in blendAnswerList:
            if word not in blendList:
                blendMissedList.append(word)
        print(" *********** count of blendFindedList : ")
        print(len(blendFindedList))
        print(" *********** blendFindedList : ")
        print(blendFindedList)
        print("\n\n ")
        print(" *********** count of blendMissedList : ")
        print(len(blendMissedList))
        print(" *********** blendMissedList : ")
        print(blendMissedList)
        print("\n\n ")

    # input
    inputTrie()
    inputCandidate()
    inputBlendAnswerList()
    # append filtered candidates to blendList
    for word in candidatesList:
        prefix, reverseSuffix = splitWord(word)
        if comparePrefixNDictUsingLED(prefix, 0) and comparePrefixNDictUsingLED(reverseSuffix, 1):
            blendList.append(word)
    calAccurancy()
def generate_weight_tree(list, func):
    """Build a Trie mapping each key in *list* to its weight ``func(key)``.

    Args:
        list: iterable of keys.  (Parameter name kept for backward
            compatibility with keyword callers, although it shadows the
            builtin.)
        func: callable computing the weight stored for each key.

    Returns:
        A Trie of key -> func(key).
    """
    # The original built a dict named `map` (shadowing the builtin) with a
    # dead `w = w` assignment inside the loop; a comprehension replaces both.
    weights = {w: func(w) for w in list}
    return Trie(weights)
def _trie_from_file(file):
    """Best-effort construction of a Trie from *file*.

    Returns the Trie built from ``_prefixes_from_file(file)``, or ``None``
    when reading or construction fails for any reason.
    """
    result = None
    try:
        prefixes = _prefixes_from_file(file)
        result = Trie(prefixes)
    except Exception:
        # Deliberately swallow all errors: callers treat None as "no trie".
        pass
    return result
import os from pytrie import StringTrie as Trie import re myTrie = Trie() # create empty trie FOR QUERY2 myTrie1 = Trie() #create empty trie FOR QUERY1 path = input('Hello, \nFirstly,Enter a path that includes txt files : ') all_files = os.listdir(path) #BUİLD TRİE FOR QUERY 2 for i in range(len(all_files)): with open(all_files[i], 'r') as f: for line in f: res = re.findall(r'\w+', line) #take list of words line by line ,convert lower and put with filename to trie for j in range(len(res)): #check the key existence in the trie if not myTrie.has_key( res[j].lower()): # word is NOT in the trie myTrie[res[j].lower()] = {all_files[i] } # word-> set of file else: myTrie.get(res[j].lower()).add( all_files[i] ) # if word is IN the trie add only file info to fileSET OF WORD #BUİLD TRİE FOR QUERY 1 for i in range(len(all_files)): with open(all_files[i], 'r') as f: counter = 0 for line in f: res = re.findall(r'\w+', line)
def solution():
    """Detect blend words among candidates via trie-backed fuzzy matching.

    Loads a dictionary into a forward trie and a reversed trie, filters
    candidates containing long repeated-letter runs, then flags a
    candidate as a blend when both its prefix half and its reversed
    suffix half approximately match dictionary words (local edit
    distance gate followed by a Jaro-Winkler check).  Prints recall and
    precision against data/blends.txt.
    """
    candidatesTokens = []   # candidates surviving the repeat-letter filter
    blendResults = []       # gold-standard blend words
    blendWords = []         # candidates classified as blends
    prefixTrie = Trie()     # dictionary words keyed as-is (prefix search)
    reversalTrie = Trie()   # dictionary words keyed reversed (suffix search)

    def equalLetter(replace, match, char1, char2):
        # Substitution score: `match` on equality, `replace` otherwise.
        if char1 == char2:
            return match
        else:
            return replace

    def split(word):
        # First half (+1 char overlap) and reversed second half (-1 overlap).
        leng = len(word)
        prefix = word[:int(leng / 2) + 1]
        suffix = word[int(leng / 2 - 1):][::-1]
        splited = [prefix, suffix]
        return splited

    def calPreRe():
        # Report precision/recall of blendWords against blendResults.
        truePositiveAmount = 0
        blendCorrectWords = []
        blendFalseWords = []
        for word in blendWords:
            if word in blendResults:
                truePositiveAmount += 1
                blendCorrectWords.append(word)
        print(" truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")
        # precision
        # NOTE(review): the -32 offset in the recall denominator is
        # unexplained — presumably excluding answers known to be absent
        # from the candidate set; confirm against the dataset.
        recall = float(truePositiveAmount) / (len(blendResults) - 32)
        precision = float(truePositiveAmount) / len(blendWords)
        print(" Recall is : ")
        print(recall)
        print("\n\n")
        print(" Precision is : ")
        print(precision)
        print("\n\n")
        for word in blendResults:
            if word not in blendWords:
                blendFalseWords.append(word)
        # BUG FIX: the original concatenated str with int/list operands
        # ("..." + len(...), "..." + <list>), raising TypeError at runtime;
        # explicit str() conversions preserve the intended output.
        print("count of blendCorrectWords : " + str(len(blendCorrectWords)))
        print(" blendCorrectWords : " + str(blendCorrectWords))
        print(" count of blendFalseWords : " + str(len(blendFalseWords)))
        print(" blendFalseWords : " + str(blendFalseWords))

    def loadData():
        # Load dictionary, candidates, and gold answers.
        # (Renamed from `input`, which shadowed the builtin.)
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        # Runs of 4 identical letters used to discard noisy candidates
        # ('yyy' has only 3 letters in the original; preserved as-is).
        repeatLetters = [
            'aaaa', 'bbbb', 'cccc', 'dddd', 'eeee', 'ffff', 'gggg', 'hhhh',
            'iiii', 'jjjj', 'kkkk', 'llll', 'mmmm', 'nnnn', 'oooo', 'pppp',
            'qqqq', 'rrrr', 'ssss', 'tttt', 'uuuu', 'vvvv', 'wwww', 'xxxx',
            'yyy', 'zzzz'
        ]
        # input prefixly
        for word in wordList:
            word = word.strip()
            word = word.lower()
            prefixTrie[word] = word
            reverseWord = word[::-1]
            reversalTrie[reverseWord] = reverseWord
        file1 = open("data/candidates.txt", "r", encoding='utf-8')
        wordList1 = file1.read().splitlines()
        for word in wordList1:
            word = word.strip()
            word = word.lower()
            # Keep only candidates without any repeated-letter run.
            if not any(subString in word for subString in repeatLetters):
                candidatesTokens.append(word)
        file2 = open("data/blends.txt", "r", encoding='utf-8')
        wordList2 = file2.read().splitlines()
        for word in wordList2:
            # Gold answers: first whitespace-separated field of each line.
            word = word.split()[0]
            word = word.lower()
            blendResults.append(word)

    def editDistance(word1, word2):
        # Local-alignment (Smith-Waterman-style) score via a numpy DP table.
        # NOTE(review): the table is sized len(word1) x len(word1) while
        # indexing word2[j - 1]; safe only because callers pass word2 values
        # that start with word1 (so len(word2) >= len(word1)).
        replace = -1
        match = 1
        deletion = -1
        insertion = -1
        word1Length = len(word1)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(
                    0, A[j][k - 1] + deletion, A[j - 1][k] + insertion,
                    A[j - 1][k - 1] + equalLetter(replace, match, word1[k - 1], word2[j - 1]))
        max_item = max(max(row) for row in A)
        return max_item

    def compareUsingLED(pref, reverse=0):
        # True when some dictionary word sharing prefix `pref` scores above
        # the edit-distance threshold AND the Jaro-Winkler similarity gate.
        if reverse:
            trieList = reversalTrie
        else:
            trieList = prefixTrie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * (0.85)
        for word in prefixDict:
            LEDValue = editDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(
                    pref, word[:int(len(pref))], winkler=True, scaling=0.1)
                if JWValue > 0.8:
                    return True
        return False

    # input
    loadData()
    for word in candidatesTokens:
        prefix, reverseSuffix = split(word)
        if compareUsingLED(prefix, 0) and compareUsingLED(reverseSuffix, 1):
            blendWords.append(word)
    calPreRe()
def solution():
    """Detect blend words among candidates via trie-backed fuzzy matching.

    A stricter variant: candidates are split exactly in half (no overlap),
    the edit-distance threshold is 0.8, and the Jaro-Winkler gate is 0.95.
    Prints precision/recall against data/blends.txt.
    """
    trie = Trie()            # dictionary words keyed as-is (prefix search)
    reverseTrie = Trie()     # dictionary words keyed reversed (suffix search)
    candidatesList = []      # all candidate words (no filtering here)
    blendList = []           # candidates classified as blends
    blendAnswerList = []     # gold-standard blend words

    def inputTrie():
        # Load the dictionary into both tries (forward and reversed keys).
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        # input prefixly
        for word in wordList:
            word = word.strip()
            word = word.lower()
            trie[word] = word
            reverseWord = word[::-1]
            reverseTrie[reverseWord] = reverseWord

    def inputCandidate():
        # Load every candidate unfiltered (unlike the overlapping variant).
        file = open("data/candidates.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.strip()
            word = word.lower()
            candidatesList.append(word)

    def inputBlendAnswerList():
        # Gold answers: first whitespace-separated field of each line.
        file = open("data/blends.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.split()[0]
            word = word.lower()
            blendAnswerList.append(word)

    def splitWord(word):
        # Exact halves: prefix and reversed suffix, no character overlap.
        length = len(word)
        prefix = word[:int(length / 2)]
        reverseSuffix = word[int(length / 2):][::-1]
        splitedCandidate = [prefix, reverseSuffix]
        return splitedCandidate

    def equal(replace, match, char1, char2):
        # Substitution score: `match` on equality, `replace` otherwise.
        if char1 == char2:
            return match
        else:
            return replace

    def localEditDistance(word1, word2):
        # Local-alignment (Smith-Waterman-style) score via a numpy DP table.
        A = [[]]
        deletion = -1
        insertion = -1
        replace = -1
        match = 1
        word1Length = len(word1)
        # NOTE(review): word2Length is computed but never used, and the DP
        # table is sized word1Length x word1Length while indexing
        # word2[j - 1]; this is safe only because callers pass word2 values
        # that start with word1 (so len(word2) >= len(word1)) — confirm.
        word2Length = len(word2)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(
                    0, A[j][k - 1] + deletion, A[j - 1][k] + insertion,
                    A[j - 1][k - 1] + equal(replace, match, word1[k - 1], word2[j - 1]))
        max_item = max(max(row) for row in A)
        return max_item

    def comparePrefixNDictUsingLED(pref, reverse=0):
        # True when some dictionary word sharing prefix `pref` scores above
        # the edit-distance threshold AND the (strict, 0.95) Jaro-Winkler gate.
        if reverse:
            trieList = reverseTrie
        else:
            trieList = trie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * (0.8)
        prefixFlag = 0  # NOTE(review): unused
        for word in prefixDict:
            LEDValue = localEditDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(
                    pref, word[:int(len(pref))], winkler=True, scaling=0.1)
                if JWValue > 0.95:
                    return True
        return False

    def calAccurancy():
        # Report precision/recall of blendList against blendAnswerList.
        truePositiveAmount = 0
        print(" *********** blendlist : ")
        print(blendList)
        print("\n\n ")
        print(" *********** count blendlist : ")
        print(len(blendList))
        print("\n\n ")
        for word in blendList:
            if word in blendAnswerList:
                truePositiveAmount += 1
        print(" *********** truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")
        # precision
        precision = float(truePositiveAmount) / len(blendList)
        # NOTE(review): the -32 offset in the recall denominator is
        # unexplained — presumably excluding answers known to be absent
        # from the candidate set; confirm against the dataset.
        recall = float(truePositiveAmount) / (len(blendAnswerList) - 32)
        print("***** Precision is : ")
        print(precision)
        print("\n\n")
        print("***** Recall is : ")
        print(recall)
        print("\n\n")

    # input
    inputTrie()
    inputCandidate()
    inputBlendAnswerList()
    # append filtered candidates to blendList
    for word in candidatesList:
        prefix, reverseSuffix = splitWord(word)
        if comparePrefixNDictUsingLED(
                prefix, 0) and comparePrefixNDictUsingLED(reverseSuffix, 1):
            blendList.append(word)
    calAccurancy()