def createTrie(self, filePath):
    """Build a Trie mapping each word to the set of identifiers it belongs to.

    Each line of the input file is expected to be tab-separated: an
    identifier in the first column and a ';'-joined list of words in the
    second.  Lines without a second column are ignored; empty word entries
    (from trailing/doubled semicolons) are skipped.

    Args:
        filePath: path to the tab-separated dictionary file.

    Returns:
        A Trie mapping word -> set of first-column identifiers.
    """
    trie = Trie()
    with open(filePath, 'r') as file:
        # Iterate the file lazily instead of materializing readlines();
        # the original also kept an unused line counter (`flag`) — removed.
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) > 1:
                for word in parts[1].split(';'):
                    if word:
                        # setdefault collapses the original duplicated
                        # "create set then add" / "add" branches into one.
                        trie.setdefault(word, set()).add(parts[0])
    return trie
def _make_trie(file, fields="bgg_user_name", sep=","):
    """Parse *file* with :func:`_process_file` (counting enabled) and wrap the result in a Trie."""
    processed = _process_file(file, fields, sep, count=True)
    return Trie(processed)
from flask import jsonify from pytrie import SortedStringTrie as Trie import pandas as pd import time import json import cPickle as pickle trieds = Trie() #trieds is short for trie datastructure. listofallnames = [] def loadstrings(): """ Function to load a csv file, preprocess the data and load it into the Trie Datastructure """ start = time.time() df = pd.read_csv("data.csv", error_bad_lines=False ) #loading csv data file as a dataframe using pandas listofallnames = {} print 'key generation in process, please wait.' count = 0 for index, row in df.iterrows(): rowdata = {} #preprocessing data in the dataframe rowdata['firstname'] = str(row['givenName']).lower() rowdata['middlename'] = str(row['middleName']).lower() rowdata['lastname'] = str(row['surname']).lower() if str(row['givenName']) != 'nan': try: listofallnames[str(rowdata['firstname'])].append(rowdata) except KeyError:
def __init__(self):
    """Initialize empty lookup structures.

    Two tries (exact matches and repair matches) plus a plain dict for
    whole-sentence storage.  All three start empty; the assignments are
    independent of one another.
    """
    self.full_sentences = {}
    self.repairs_match = Trie()
    self.full_match = Trie()
numeros['treinti{}'.format(text)] = 30 + number numeros['treinta y {}'.format(text)] = 30 + number numeros['cuarenti{}'.format(text)] = 40 + number numeros['cuarenta y {}'.format(text)] = 40 + number numeros['cincuenti{}'.format(text)] = 50 + number numeros['cincuenta y {}'.format(text)] = 50 + number numeros['sesenti{}'.format(text)] = 60 + number numeros['sesenta y {}'.format(text)] = 60 + number numeros['setenti{}'.format(text)] = 70 + number numeros['setenta y {}'.format(text)] = 70 + number numeros['ochenti{}'.format(text)] = 80 + number numeros['ochenta y {}'.format(text)] = 80 + number numeros['noventi{}'.format(text)] = 90 + number numeros['noventa y {}'.format(text)] = 90 + number numeros = Trie(numeros) multiplicadores = Trie({ 'mil': 1000, 'ientos': 100, 'cientos': 100, 'ciento': 100, 'cientas': 100, 'cienta': 100, 'cien': 100 }) momento_del_dia = Trie({ 'de la noche': 'pm', 'de la mañana': 'am', 'del mediodia': 'pm', 'de la tarde': 'pm' }) time_of_day_to_sum = { 'noche': 12,
t0 = time.time() #Start time for index, row in train.iterrows(): #Normalise the names of dishes. with_sc = row[1] processed_string = preprocessing(with_sc) for token in processed_string.split(): #Create Inverted Index if inverted_index.get(token) == None: inverted_index.update({token: set()}) inverted_index[token].add(row[0]) prefix_tree = Trie(**inverted_index) t1 = time.time() - t0 #Time required for index construction print("Time taken for Trie generation: {} ".format(t1)) def search_results(query): #Return primary keys for query matching strings t0 = time.time() #Start time found = False processed_query = preprocessing( query) #Apply same pre-processing as before
def create_training_instances(input_file, max_seq_length, tokenizer, rng, alias2entities):
    """Create `TrainingInstance`s from raw text.

    Reads one JSON object per line from *input_file* (each with a 'text'
    field), whitespace-tokenizes the text, locates entity aliases via a
    trie built from *alias2entities*, maps alias character spans to
    sub-token spans, and finally builds shuffled training instances.

    Args:
        input_file: path to a JSON-lines corpus (one {'text': ...} per line).
        max_seq_length: maximum sequence length passed through to
            create_instances_from_document.
        tokenizer: sub-word tokenizer exposing .tokenize() and .vocab.
        rng: random source used for shuffling (and downstream sampling).
        alias2entities: mapping of alias string -> entities; only the keys
            are inserted into the matching trie here.

    Returns:
        The shuffled list of training instances.
    """
    def is_whitespace(c):
        # 0x202F is NARROW NO-BREAK SPACE, treated as whitespace too.
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    all_documents = []
    all_alias_token_spans = []
    from pytrie import SortedStringTrie as Trie
    trie = Trie()
    # add entities to this trie (values are dummy zeros; only keys matter)
    for alias, ents in alias2entities.items():
        trie.setdefault(alias, 0)
    with open(input_file, "r") as reader:
        for line in tqdm(reader, desc='converting tokens'):
            line = tokenization.convert_to_unicode(line.strip())
            line = json.loads(line)['text']
            tokens = []
            # NOTE(review): do_lower_case is a module-level global, not a
            # parameter — confirm it is defined wherever this runs.
            if do_lower_case:
                line = line.lower()
            # char_to_word_offset[i] = index of the coarse (whitespace)
            # token containing character i of `line`.
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in line:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        tokens.append(c)
                    else:
                        tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(tokens) - 1)
            # Match aliases occurring in this document.
            alias_spans = match_alias(line, trie, alias2entities)
            # Convert char spans to coarse-token spans; span[1] is
            # exclusive, so span[1] - 1 is the alias's last character.
            alias_token_spans = [(char_to_word_offset[span[0]], char_to_word_offset[span[1] - 1]) for span in alias_spans]
            for span, token_span in zip(alias_spans, alias_token_spans):
                alias_tokens = ' '.join(tokens[token_span[0]:token_span[1] + 1])
                alias_texts = line[span[0]:span[1]]
                # NOTE(review): `print(...)` runs immediately and returns
                # None, so the assert's message is always None — the print
                # is effectively the diagnostic here.
                assert alias_tokens in alias2entities, print(alias_tokens, token_span, alias_texts, span)
            # assert all(' '.join(tokens[span[0]: span[1] + 1]) in alias2entities for span in alias_token_spans), \
            #     print([' '.join(tokens[span[0]: span[1] + 1]) for span in alias_token_spans])
            tok_to_orig_index = []  # fine-grained (sub-token) -> coarse token
            orig_to_tok_index = []  # coarse token -> first sub-token index
            real_tokens = []
            for (i, token) in enumerate(tokens):
                orig_to_tok_index.append(len(real_tokens))
                sub_tokens = tokenizer.tokenize(token)
                for sub_token in sub_tokens:
                    tok_to_orig_index.append(i)
                    real_tokens.append(sub_token)
            # If a span's last coarse token is the document's final token,
            # end at that token's sub-tokens; otherwise end one sub-token
            # before the next coarse token's first sub-token.
            real_alias_token_spans = []
            for span in alias_token_spans:
                if span[1] == len(tokens) - 1:
                    # NOTE(review): orig_to_tok_index[-1] is the FIRST
                    # sub-token of the last coarse token; if that token
                    # splits into several sub-tokens the span may end
                    # early — confirm this is intended.
                    real_end = orig_to_tok_index[-1]
                else:
                    real_end = orig_to_tok_index[span[1] + 1] - 1
                real_start = orig_to_tok_index[span[0]]
                real_alias_token_spans.append((real_start, real_end))
            # alias_token_spans = [(orig_to_tok_index[span[0]], orig_to_tok_index[span[1]])
            #                      for span in alias_token_spans]
            all_documents.append(real_tokens)
            all_alias_token_spans.append(real_alias_token_spans)
    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for document_index in tqdm(range(len(all_documents)), total=len(all_documents), desc='creating instances'):
        instances.extend(
            create_instances_from_document(all_documents, document_index, all_alias_token_spans, max_seq_length, vocab_words, rng))
    rng.shuffle(instances)
    return instances
def loadtree(arr):
    """Rebuild the module-level `trie` from *arr*, keyed by each word reversed.

    Storing reversed keys lets suffix queries be answered as prefix
    lookups; each reversed key maps back to the original word.
    """
    global trie
    trie = Trie()
    for word in arr:
        reversed_key = word[::-1]
        trie[reversed_key] = word
reduce(lambda i, j: i & j, (set(x) for x in res))) except: message += 'NOTHING FOUND BY YOUR QUERY\n' message += '-----------------------------------------------------------\n' message += f'InnoGruk found {len(relevant_documents)} documents by your query\n' message += '-----------------------------------------------------------' return message, relevant_documents, track[2:] if __name__ == '__main__': prfx = Trie(apple=None, borrow=None, friend=None, ginger=None, lermontov=None, money=None, november=None, object=None) psfx = Trie(elppa=None, worrob=None, dneirf=None, regnig=None, votnomrel=None, yenom=None, rebmevon=None, tcejbo=None) sndx = { 'A140': ['apple'],
word2soundex[sndx] = [] if word not in word2soundex[sndx]: word2soundex[sndx].append(word) # Prefix tree loader if not word in prefix_tree: prefix_tree[word] = None rword = word[::-1] # Postfix_tree loader if not rword in postfix_tree: postfix_tree[rword] = None return aux, word2soundex, prefix_tree, postfix_tree if __name__ == '__main__': aux = {'one': [0], 'five': [0], 'reuters': [0]} w2s = {'O500': ['one'], 'F100': ['five'], 'R362': ['reuters']} prfx = Trie(one=None, five=None, reuters=None, chelny=None) psfx = Trie(eon=None, evif=None, sretuer=None, ylehc=None) doc = next(get_docs("./docs/reuters21578.tar.gz")) name = 'collection 1' aux, w2s, prfx, psfx = load_data(doc, name, aux, w2s, prfx, psfx) print(aux) print(w2s) print(prfx) print(psfx)
with open('candidates.txt','r') as candidates: data = candidates.readlines() for key in data: output.append(key.split("\n")[0]) return output onepercent = [] # print(getelse("hello",2)) # judgeoutput = dict() # # alpha = "abcdefghijklmnopqrstuvwxyz" print("Loading Dictionary") dictionary = loaddictionary() dictionaryprime = loadprimedictionary() print("Building Tree Begin",time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) t = Trie(dictionary) tprime = Trie(dictionaryprime) print("Building Tree Stop",time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) print("Loading Candidates") candidates = loadcandidates() judgeoutput = dict() i = 0 j = 0 print("Begin Time:",time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) for key in candidates: if i % 170 == 0: print(i/170,"%") print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) judgeoutput[key] = judge(key,j) i = i + 1 j = j + 1
def solution():
    """Detect blend words among candidates via trie-backed fuzzy matching.

    Loads a dictionary into a forward trie and a reversed trie, filters
    candidates containing long repeated-letter runs, then flags a
    candidate as a blend when both its prefix half and its reversed
    suffix half approximately match dictionary words (local edit
    distance gate followed by a Jaro-Winkler check).  Prints
    precision/recall against data/blends.txt.
    """
    trie = Trie()            # dictionary words keyed as-is (prefix search)
    reverseTrie = Trie()     # dictionary words keyed reversed (suffix search)
    candidatesList = []      # candidates surviving the repeat-letter filter
    blendList = []           # candidates classified as blends
    blendAnswerList = []     # gold-standard blend words
    # Runs of 4 identical letters used to discard noisy candidates.
    # NOTE(review): 'yyy' is only 3 letters, unlike every other entry —
    # possibly a typo, but preserved as-is.
    repeatSubString = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee', 'ffff', 'gggg', 'hhhh',
                       'iiii', 'jjjj', 'kkkk', 'llll', 'mmmm', 'nnnn', 'oooo', 'pppp',
                       'qqqq', 'rrrr', 'ssss', 'tttt', 'uuuu', 'vvvv', 'wwww', 'xxxx',
                       'yyy', 'zzzz']

    def inputTrie():
        # Load the dictionary into both tries (forward and reversed keys).
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        # input prefixly
        for word in wordList:
            word = word.strip()
            word = word.lower()
            trie[word] = word
            reverseWord = word[::-1]
            reverseTrie[reverseWord] = reverseWord

    def inputCandidate():
        # Load candidates, dropping any containing a repeated-letter run.
        file = open("data/candidates.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.strip()
            word = word.lower()
            flag = 0
            for subString in repeatSubString:
                if subString in word:
                    flag = 1
                    break
            if flag == 0:
                candidatesList.append(word)

    def inputBlendAnswerList():
        # Gold answers: first whitespace-separated field of each line.
        file = open("data/blends.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.split()[0]
            word = word.lower()
            blendAnswerList.append(word)

    def splitWord(word):
        # First half (+1 char overlap) and reversed second half (-1 overlap).
        length = len(word)
        prefix = word[:int(length / 2) + 1]
        reverseSuffix = word[int(length / 2 - 1):][::-1]
        splitedCandidate = [prefix, reverseSuffix]
        return splitedCandidate

    def equal(replace, match, char1, char2):
        # Substitution score: `match` on equality, `replace` otherwise.
        if char1 == char2:
            return match
        else:
            return replace

    def localEditDistance(word1, word2):
        # Local-alignment (Smith-Waterman-style) score via a numpy DP table.
        A = [[]]
        deletion = -1
        insertion = -1
        replace = -1
        match = 1
        word1Length = len(word1)
        # NOTE(review): word2Length is computed but never used, and the DP
        # table is sized word1Length x word1Length while indexing
        # word2[j - 1]; this is safe only because callers pass word2 values
        # that start with word1 (so len(word2) >= len(word1)) — confirm.
        word2Length = len(word2)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(0, A[j][k - 1] + deletion, A[j - 1][k] + insertion,
                              A[j - 1][k - 1] + equal(replace, match, word1[k - 1], word2[j - 1]))
        max_item = max(max(row) for row in A)
        return max_item

    def comparePrefixNDictUsingLED(pref, reverse=0):
        # True when some dictionary word sharing prefix `pref` scores above
        # the edit-distance threshold AND the Jaro-Winkler similarity gate.
        if reverse:
            trieList = reverseTrie
        else:
            trieList = trie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * (0.85)
        prefixFlag = 0  # NOTE(review): unused
        for word in prefixDict:
            LEDValue = localEditDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(pref, word[:int(len(pref))],
                                                                  winkler=True, scaling=0.1)
                if JWValue > 0.8:
                    return True
        return False

    def calAccurancy():
        # Report precision/recall of blendList against blendAnswerList,
        # plus the found/missed answer lists.
        truePositiveAmount = 0
        print(" *********** blendlist : ")
        print(blendList)
        print("\n\n ")
        print(" *********** count blendlist : ")
        print(len(blendList))
        print("\n\n ")
        blendFindedList = []
        blendMissedList = []
        for word in blendList:
            if word in blendAnswerList:
                truePositiveAmount += 1
                blendFindedList.append(word)
        print(" *********** truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")
        # precision
        precision = float(truePositiveAmount) / len(blendList)
        # NOTE(review): the -32 offset in the recall denominator is
        # unexplained — presumably excluding answers known to be absent
        # from the candidate set; confirm against the dataset.
        recall = float(truePositiveAmount) / (len(blendAnswerList) - 32)
        print("***** Precision is : ")
        print(precision)
        print("\n\n")
        print("***** Recall is : ")
        print(recall)
        print("\n\n")
        for word in blendAnswerList:
            if word not in blendList:
                blendMissedList.append(word)
        print(" *********** count of blendFindedList : ")
        print(len(blendFindedList))
        print(" *********** blendFindedList : ")
        print(blendFindedList)
        print("\n\n ")
        print(" *********** count of blendMissedList : ")
        print(len(blendMissedList))
        print(" *********** blendMissedList : ")
        print(blendMissedList)
        print("\n\n ")

    # input
    inputTrie()
    inputCandidate()
    inputBlendAnswerList()
    # append filtered candidates to blendList
    for word in candidatesList:
        prefix, reverseSuffix = splitWord(word)
        if comparePrefixNDictUsingLED(prefix, 0) and comparePrefixNDictUsingLED(reverseSuffix, 1):
            blendList.append(word)
    calAccurancy()
def generate_weight_tree(list, func):
    """Build a Trie mapping each key in *list* to its weight ``func(key)``.

    Args:
        list: iterable of keys.  (Parameter name kept for backward
            compatibility with keyword callers, although it shadows the
            builtin.)
        func: callable computing the weight stored for each key.

    Returns:
        A Trie of key -> func(key).
    """
    # The original built a dict named `map` (shadowing the builtin) with a
    # dead `w = w` assignment inside the loop; a comprehension replaces both.
    weights = {w: func(w) for w in list}
    return Trie(weights)
def _trie_from_file(file):
    """Best-effort construction of a Trie from *file*.

    Returns the Trie built from ``_prefixes_from_file(file)``, or ``None``
    when reading or construction fails for any reason.
    """
    result = None
    try:
        prefixes = _prefixes_from_file(file)
        result = Trie(prefixes)
    except Exception:
        # Deliberately swallow all errors: callers treat None as "no trie".
        pass
    return result
import os from pytrie import StringTrie as Trie import re myTrie = Trie() # create empty trie FOR QUERY2 myTrie1 = Trie() #create empty trie FOR QUERY1 path = input('Hello, \nFirstly,Enter a path that includes txt files : ') all_files = os.listdir(path) #BUİLD TRİE FOR QUERY 2 for i in range(len(all_files)): with open(all_files[i], 'r') as f: for line in f: res = re.findall(r'\w+', line) #take list of words line by line ,convert lower and put with filename to trie for j in range(len(res)): #check the key existence in the trie if not myTrie.has_key( res[j].lower()): # word is NOT in the trie myTrie[res[j].lower()] = {all_files[i] } # word-> set of file else: myTrie.get(res[j].lower()).add( all_files[i] ) # if word is IN the trie add only file info to fileSET OF WORD #BUİLD TRİE FOR QUERY 1 for i in range(len(all_files)): with open(all_files[i], 'r') as f: counter = 0 for line in f: res = re.findall(r'\w+', line)
def solution():
    """Detect blend words among candidates via trie-backed fuzzy matching.

    Loads a dictionary into a forward trie and a reversed trie, filters
    candidates containing long repeated-letter runs, then flags a
    candidate as a blend when both its prefix half and its reversed
    suffix half approximately match dictionary words (local edit
    distance gate followed by a Jaro-Winkler check).  Prints recall and
    precision against data/blends.txt.
    """
    candidatesTokens = []   # candidates surviving the repeat-letter filter
    blendResults = []       # gold-standard blend words
    blendWords = []         # candidates classified as blends
    prefixTrie = Trie()     # dictionary words keyed as-is (prefix search)
    reversalTrie = Trie()   # dictionary words keyed reversed (suffix search)

    def equalLetter(replace, match, char1, char2):
        # Substitution score: `match` on equality, `replace` otherwise.
        if char1 == char2:
            return match
        else:
            return replace

    def split(word):
        # First half (+1 char overlap) and reversed second half (-1 overlap).
        leng = len(word)
        prefix = word[:int(leng / 2) + 1]
        suffix = word[int(leng / 2 - 1):][::-1]
        splited = [prefix, suffix]
        return splited

    def calPreRe():
        # Report precision/recall of blendWords against blendResults.
        truePositiveAmount = 0
        blendCorrectWords = []
        blendFalseWords = []
        for word in blendWords:
            if word in blendResults:
                truePositiveAmount += 1
                blendCorrectWords.append(word)
        print(" truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")
        # precision
        # NOTE(review): the -32 offset in the recall denominator is
        # unexplained — presumably excluding answers known to be absent
        # from the candidate set; confirm against the dataset.
        recall = float(truePositiveAmount) / (len(blendResults) - 32)
        precision = float(truePositiveAmount) / len(blendWords)
        print(" Recall is : ")
        print(recall)
        print("\n\n")
        print(" Precision is : ")
        print(precision)
        print("\n\n")
        for word in blendResults:
            if word not in blendWords:
                blendFalseWords.append(word)
        # BUG FIX: the original concatenated str with int/list operands
        # ("..." + len(...), "..." + <list>), raising TypeError at runtime;
        # explicit str() conversions preserve the intended output.
        print("count of blendCorrectWords : " + str(len(blendCorrectWords)))
        print(" blendCorrectWords : " + str(blendCorrectWords))
        print(" count of blendFalseWords : " + str(len(blendFalseWords)))
        print(" blendFalseWords : " + str(blendFalseWords))

    def loadData():
        # Load dictionary, candidates, and gold answers.
        # (Renamed from `input`, which shadowed the builtin.)
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        # Runs of 4 identical letters used to discard noisy candidates
        # ('yyy' has only 3 letters in the original; preserved as-is).
        repeatLetters = [
            'aaaa', 'bbbb', 'cccc', 'dddd', 'eeee', 'ffff', 'gggg', 'hhhh',
            'iiii', 'jjjj', 'kkkk', 'llll', 'mmmm', 'nnnn', 'oooo', 'pppp',
            'qqqq', 'rrrr', 'ssss', 'tttt', 'uuuu', 'vvvv', 'wwww', 'xxxx',
            'yyy', 'zzzz'
        ]
        # input prefixly
        for word in wordList:
            word = word.strip()
            word = word.lower()
            prefixTrie[word] = word
            reverseWord = word[::-1]
            reversalTrie[reverseWord] = reverseWord
        file1 = open("data/candidates.txt", "r", encoding='utf-8')
        wordList1 = file1.read().splitlines()
        for word in wordList1:
            word = word.strip()
            word = word.lower()
            # Keep only candidates without any repeated-letter run.
            if not any(subString in word for subString in repeatLetters):
                candidatesTokens.append(word)
        file2 = open("data/blends.txt", "r", encoding='utf-8')
        wordList2 = file2.read().splitlines()
        for word in wordList2:
            # Gold answers: first whitespace-separated field of each line.
            word = word.split()[0]
            word = word.lower()
            blendResults.append(word)

    def editDistance(word1, word2):
        # Local-alignment (Smith-Waterman-style) score via a numpy DP table.
        # NOTE(review): the table is sized len(word1) x len(word1) while
        # indexing word2[j - 1]; safe only because callers pass word2 values
        # that start with word1 (so len(word2) >= len(word1)).
        replace = -1
        match = 1
        deletion = -1
        insertion = -1
        word1Length = len(word1)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(
                    0, A[j][k - 1] + deletion, A[j - 1][k] + insertion,
                    A[j - 1][k - 1] + equalLetter(replace, match, word1[k - 1], word2[j - 1]))
        max_item = max(max(row) for row in A)
        return max_item

    def compareUsingLED(pref, reverse=0):
        # True when some dictionary word sharing prefix `pref` scores above
        # the edit-distance threshold AND the Jaro-Winkler similarity gate.
        if reverse:
            trieList = reversalTrie
        else:
            trieList = prefixTrie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * (0.85)
        for word in prefixDict:
            LEDValue = editDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(
                    pref, word[:int(len(pref))], winkler=True, scaling=0.1)
                if JWValue > 0.8:
                    return True
        return False

    # input
    loadData()
    for word in candidatesTokens:
        prefix, reverseSuffix = split(word)
        if compareUsingLED(prefix, 0) and compareUsingLED(reverseSuffix, 1):
            blendWords.append(word)
    calPreRe()
def solution():
    """Detect blend words among candidates via trie-backed fuzzy matching.

    A stricter variant: candidates are split exactly in half (no overlap),
    the edit-distance threshold is 0.8, and the Jaro-Winkler gate is 0.95.
    Prints precision/recall against data/blends.txt.
    """
    trie = Trie()            # dictionary words keyed as-is (prefix search)
    reverseTrie = Trie()     # dictionary words keyed reversed (suffix search)
    candidatesList = []      # all candidate words (no filtering here)
    blendList = []           # candidates classified as blends
    blendAnswerList = []     # gold-standard blend words

    def inputTrie():
        # Load the dictionary into both tries (forward and reversed keys).
        file = open("data/dict.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        # input prefixly
        for word in wordList:
            word = word.strip()
            word = word.lower()
            trie[word] = word
            reverseWord = word[::-1]
            reverseTrie[reverseWord] = reverseWord

    def inputCandidate():
        # Load every candidate unfiltered (unlike the overlapping variant).
        file = open("data/candidates.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.strip()
            word = word.lower()
            candidatesList.append(word)

    def inputBlendAnswerList():
        # Gold answers: first whitespace-separated field of each line.
        file = open("data/blends.txt", "r", encoding='utf-8')
        wordList = file.read().splitlines()
        for word in wordList:
            word = word.split()[0]
            word = word.lower()
            blendAnswerList.append(word)

    def splitWord(word):
        # Exact halves: prefix and reversed suffix, no character overlap.
        length = len(word)
        prefix = word[:int(length / 2)]
        reverseSuffix = word[int(length / 2):][::-1]
        splitedCandidate = [prefix, reverseSuffix]
        return splitedCandidate

    def equal(replace, match, char1, char2):
        # Substitution score: `match` on equality, `replace` otherwise.
        if char1 == char2:
            return match
        else:
            return replace

    def localEditDistance(word1, word2):
        # Local-alignment (Smith-Waterman-style) score via a numpy DP table.
        A = [[]]
        deletion = -1
        insertion = -1
        replace = -1
        match = 1
        word1Length = len(word1)
        # NOTE(review): word2Length is computed but never used, and the DP
        # table is sized word1Length x word1Length while indexing
        # word2[j - 1]; this is safe only because callers pass word2 values
        # that start with word1 (so len(word2) >= len(word1)) — confirm.
        word2Length = len(word2)
        A = np.zeros([word1Length, word1Length])
        # construct matrix
        for j in range(word1Length):
            for k in range(word1Length):
                A[j][k] = max(
                    0, A[j][k - 1] + deletion, A[j - 1][k] + insertion,
                    A[j - 1][k - 1] + equal(replace, match, word1[k - 1], word2[j - 1]))
        max_item = max(max(row) for row in A)
        return max_item

    def comparePrefixNDictUsingLED(pref, reverse=0):
        # True when some dictionary word sharing prefix `pref` scores above
        # the edit-distance threshold AND the (strict, 0.95) Jaro-Winkler gate.
        if reverse:
            trieList = reverseTrie
        else:
            trieList = trie
        prefixDict = trieList.keys(prefix=pref)
        threshold = len(pref) * (0.8)
        prefixFlag = 0  # NOTE(review): unused
        for word in prefixDict:
            LEDValue = localEditDistance(pref, word)
            if LEDValue >= threshold:
                JWValue = jarowinklerSimilarity.get_jaro_distance(
                    pref, word[:int(len(pref))], winkler=True, scaling=0.1)
                if JWValue > 0.95:
                    return True
        return False

    def calAccurancy():
        # Report precision/recall of blendList against blendAnswerList.
        truePositiveAmount = 0
        print(" *********** blendlist : ")
        print(blendList)
        print("\n\n ")
        print(" *********** count blendlist : ")
        print(len(blendList))
        print("\n\n ")
        for word in blendList:
            if word in blendAnswerList:
                truePositiveAmount += 1
        print(" *********** truePositive : ")
        print(truePositiveAmount)
        print("\n\n ")
        # precision
        precision = float(truePositiveAmount) / len(blendList)
        # NOTE(review): the -32 offset in the recall denominator is
        # unexplained — presumably excluding answers known to be absent
        # from the candidate set; confirm against the dataset.
        recall = float(truePositiveAmount) / (len(blendAnswerList) - 32)
        print("***** Precision is : ")
        print(precision)
        print("\n\n")
        print("***** Recall is : ")
        print(recall)
        print("\n\n")

    # input
    inputTrie()
    inputCandidate()
    inputBlendAnswerList()
    # append filtered candidates to blendList
    for word in candidatesList:
        prefix, reverseSuffix = splitWord(word)
        if comparePrefixNDictUsingLED(
                prefix, 0) and comparePrefixNDictUsingLED(reverseSuffix, 1):
            blendList.append(word)
    calAccurancy()