Code example #1
File: Q6.py  Project: Samriddhakc/LeetCodeCollection
    def replaceWords(self, dictionary, sentence):
        '''
        :param dictionary:
        :param sentence:
        :return:
        '''

        lis = sentence.split(" ")
        trie = Trie()
        for word in dictionary:
            trie.insert(word)

        # O(len(sentence)) time | O(26 * len(dictionary))  + O(len(sentence)) space.

        for idx, word in enumerate(sentence.split(" ")):
            root = trie.root
            result = []
            for w in word:
                if root and root.children[trie.getIndex(w)]:
                    if root.children[trie.getIndex(w)].isEndNode:
                        result.append(w)
                        break
                    root = root.children[trie.getIndex(w)]
                else:
                    root = None
                result.append(w)
            lis[idx] = "".join(result)
        return " ".join(lis)
Code example #2
File: Q3.py  Project: Samriddhakc/LeetCodeCollection
    def longestWord(self, words):
        '''
        :param words:
        :return:
        '''
        ''' Populate the trie: O(len(words) * max(len(word))) time.
            Use DFS to find the deepest branch in which every node ends a word.
            As max(len(word)) is bounded, O(len(words)) time | O(len(words)) space. '''

        trie = Trie()
        for word in words:
            trie.insert(word)

        root = trie.root
        max_prefix = ""
        stack = [[root, ""]]

        while stack:
            curr_node, prefix = stack.pop()
            if len(max_prefix) < len(prefix):
                max_prefix = prefix

            for i in range(25, -1, -1):
                if curr_node.children[i]:
                    if curr_node.children[i].isEndNode:
                        stack.append(
                            [curr_node.children[i], prefix + chr(97 + i)])
        return max_prefix
Code example #3
class TestTrie(unittest.TestCase):
    def setUp(self):
        self.trie = Trie()
        self.case = [
            "A", "a", "aa", "aal", "aalii", "aam", "Aani", "aardvark",
            "aardwolf", "Aaron", "Aaronic", "Aaronite", "Aaronitic", "Aaru",
            "Ab", "Ababdeh", "Ababua", "abac", "abacay", "abacinate"
        ]

    def testInsertSearch(self):
        words = set(self.case)
        length = len(words) // 2
        setA = set()
        for i in range(length // 2):
            setA.add(words.pop())
        setB = words
        for word in setA:
            self.trie.insert(word)
        for word in setA:
            self.assertTrue(self.trie.search(word))
        for word in setB:
            self.assertFalse(self.trie.search(word))

    def testStartWith(self):
        prefixes = set(["A", "a", "aa", "aal", "Aaron", "Ab", "aba", "abac"])
        others = [
            "abaciscus", "abacist", "aback", "abactinal", "Abe", "abaction"
        ]
        for word in set(self.case) - prefixes:
            self.trie.insert(word)
        for prefix in prefixes:
            self.assertTrue(self.trie.startsWith(prefix))
        for word in others:
            self.assertFalse(self.trie.startsWith(word))
Code example #4
File: Q2.py  Project: Samriddhakc/LeetCodeCollection
    def suggestedProducts(self, products, searchWord):
        '''
        :param products:
        :param searchWord:
        :return:
        '''

        trie = Trie()

        for word in products:
            trie.insert(word)

        prefix = ""
        result = []
        p_crawl = trie.root

        prefix = ""

        # O ( len(searchWord)^2 * len(products) * 26 * len(max(products)) ) time
        # O( len(max(products)) * 26 * len(products) ) space

        for s in searchWord:
            prefix += s
            curr_result = []
            p_crawl = p_crawl.children[trie.getIndex(s)]
            if p_crawl:
                curr_result = self.dfs(p_crawl, prefix)
            result.append(curr_result)
        return result
Code example #5
    def test_insert_trie_multi(self):
        trie = Trie()

        trie.insert([1, 1, 1])
        trie.insert([1, 1, 2])
        trie.insert([1, 1, 3])
        trie.insert([1, 2, 1])
        trie.insert([1, 2, 2])
        trie.insert([1, 2, 3])
        trie.insert([1, 3, 4])
        trie.insert([5, 1, 1])
        trie.insert([5, 3, 1])

        collection = trie.collect([])
        print(collection)
        self.assertEqual(
            [[1, 1, 1], [1, 1, 2], [1, 1, 3], [1, 2, 1], [1, 2, 2], [1, 2, 3],
             [1, 3, 4], [5, 1, 1], [5, 3, 1]], collection)

        collection11 = trie.collect([1, 1])
        print(collection11)
        collection5 = trie.collect([5])
        self.assertEqual([[1, 1, 1], [1, 1, 2], [1, 1, 3]], collection11)
        self.assertEqual([[5, 1, 1], [5, 3, 1]], collection5)
        print(collection5)
Code example #6
from typing import List


def autocomplete(prefix: str, possible_queries: List[str]) -> List[str]:
    # Add all query strings to the Trie
    trie = Trie()
    for word in possible_queries:
        trie.insert(word)
    # Get the nested dictionary for input prefix
    prefix_dict = trie.find(prefix)
    # Get all words from this dictionary
    return complete_words(prefix, prefix_dict)
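
The autocomplete function above relies on Trie.find returning a nested dictionary and on a complete_words helper (a version of which appears later in the PrefixMapSum example). A rough sketch of a dictionary-backed trie compatible with that call is shown below; ENDS_HERE and the method bodies are assumptions, not the original implementation.

ENDS_HERE = '#'   # sentinel key marking the end of a word


class Trie:
    def __init__(self):
        self.root = {}

    def insert(self, word):
        level = self.root
        for ch in word:
            level = level.setdefault(ch, {})
        level[ENDS_HERE] = True

    def find(self, prefix):
        # return the nested dict reached by walking prefix, or {} if absent
        level = self.root
        for ch in prefix:
            if ch not in level:
                return {}
            level = level[ch]
        return level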
Code example #7
    def test_insert_trie_one(self):
        trie = Trie()

        arr1 = [1, 2, 3]

        trie.insert(arr1)
        collection = trie.collect([])
        print(collection)
        self.assertEqual([[1, 2, 3]], collection)
Code example #8
def load_trie():
    trie = Trie()
    count = 0
    with open("成语俗语.txt", encoding='utf-8') as f:
        for line in f:
            count += 1
            line = line.strip()
            trie.insert(line)
    print("word num:", count)
    return trie
Code example #9
File: Table.py  Project: macleginn/trie-paradigms
def corpus2table(data_path, table_path=None, lang=None):
    trie = Trie()

    with open(data_path, 'r', encoding='utf-8') as inp:
        for line in inp:
            words = word_tokenize(line)
            for w in words:
                w = non_word_pattern.sub('', w)
                if not w:
                    continue
                trie.insert(f'{w.lower()}#')
    prefix_suffix_tree = trie.get_prefix_suffix_tree()

    print('Tree constructed')

    prefixes = sorted(prefix_suffix_tree.keys())
    suffix_counts = Counter()
    for v in prefix_suffix_tree.values():
        for k, count in v.items():
            suffix_counts[k] += count

    # Take N most common suffixes
    sorted_counts = suffix_counts.most_common(300)
    suffixes = [el[0] for el in sorted_counts]
    freqs = [el[1] for el in sorted_counts]

    d = pd.DataFrame(index=prefixes, columns=suffixes, dtype=int).fillna(0)
    for prefix, suffix_counts_for_prefix in prefix_suffix_tree.items():
        print(prefix)
        for suffix, count in suffix_counts_for_prefix.items():
            if suffix in d.columns:
                d.loc[prefix, suffix] = count
    
    print('Dataframe constructed')

    entropies = d.apply(entropy)

    if lang is not None:
        # Regress entropies on log frequencies
        plt.figure(figsize=(16,10))
        plt.scatter(np.log(freqs), entropies, marker = 'o')
        plt.savefig(f'/home/macleginn/Analyses/bible-tables/img/entropies_log_freqs_{lang}.png')
    
    cutoff = np.quantile(entropies, 0.9)
    d = d.loc[:, entropies > cutoff]

    print('Columns selected')
    
    if table_path is not None:
        d.to_csv(table_path)
        
    return d
Code example #10
    def replaceWords(self, dict, sentence):
        t = Trie()
        words = sentence.split()
        for key in dict:
            t.insert(key)
        new_sentence = []
        for word in words:
            d = t.get_first_word_in_item(word)
            if d:
                new_sentence.append(d)
            else:
                new_sentence.append(word)

        return " ".join(new_sentence)
Code example #11
def make_dot_file(ways, distances, file_name=''):
    unique_ways = [list(x) for x in set(tuple(x) for x in ways)]
    t = Trie(distances)
    for way in unique_ways:
        t.insert(way)
    trie = t.query(["Казань"])

    file_string = ''
    file_string += (f'digraph {file_name}' + '{\n')
    for edge in trie:
        file_string += (f'\t{edge}\n')
    file_string += ('}')
    file_string = file_string.replace('.', '')

    with open(f'data/{file_name}', 'w') as output:
        output.write(file_string)
Code example #12
def is_formation_possible(dictionary, word):
    trie = Trie()
    for w in dictionary:
        trie.insert(w)

    current_node = trie.root
    for i in range(len(word)):
        index = trie.get_index(word[i])
        if not current_node.children[index]:
            return False
        current_node = current_node.children[index]
        if current_node.is_end_word:
            if trie.search(word[i + 1:]):
                return True

    return False
Code example #13
def word_to_subword(infile, outfile):
    preTrie = Trie()
    with open('../data/pre.txt', 'r') as f:
        for line in f:
            line = line.strip()
            preTrie.insert(line)

    suffTrie = Trie()
    with open('../data/suff.txt', 'r') as f:
        for line in f:
            line = line.strip()
            suffTrie.insert(line[::-1])
    print(preTrie.startsWith('including'))
    with open(outfile, 'w') as fw:
        with open(infile, 'r') as fr:
            for line in fr:
                for word in line.strip().split():
                    new_line = []
                    prefix = preTrie.startsWith(word)
                    suffix = suffTrie.startsWith(word[::-1])
                    if suffix:
                        suffix = suffix[::-1]
                    if prefix:
                        if suffix:
                            if len(prefix) + len(suffix) >= len(word):
                                new_line.extend(
                                    [prefix, word[len(prefix):], ''])
                            else:
                                new_line.extend([
                                    prefix, word[len(prefix):-len(suffix)],
                                    suffix
                                ])
                        else:
                            new_line.extend([prefix, word[len(prefix):], ''])
                    else:
                        if suffix:
                            new_line.extend(
                                ['', word[:-len(suffix)], suffix])
                        else:
                            new_line.extend(['', '', ''])
                    new_line = [
                        item if item != '' else '#' for item in new_line
                    ]
                    fw.write(word + '\t' + ' '.join(new_line) + '\n')
    print('word to subword finished successfully')
Code example #14
def generate_kb2id():
    id_dict = dict()
    id_dict['PAD'] = 0
    kb_word_id = 1
    trie = Trie()
    with open(args.kb_path, 'r', encoding='utf-8') as f:
        for word in f:
            word = word.strip()
            if word not in id_dict.keys():
                trie.insert(word)
                id_dict[word] = kb_word_id
                kb_word_id = kb_word_id + 1

    with open(args.trie_path, 'wb') as f:
        pkl.dump(trie, f)

    with open(args.kb2id_path, 'wb') as f:
        pkl.dump(id_dict, f)
Code example #15
File: Table.py  Project: macleginn/trie-paradigms
def corpora2dict(path):
    data_dict = {
        'log_frequency': [],
        'entropy': [],
        'doculect': []
    }
    trie = Trie()
    doculect = path.split('/')[-1].split('.')[0]
    print(doculect)
    with open(path, 'r', encoding='utf-8') as inp:
        for line in inp:
            words = word_tokenize(line)
            for w in words:
                w = non_word_pattern.sub('', w)
                if not w:
                    continue
                trie.insert(f'{w.lower()}#')
    prefix_suffix_tree = trie.get_prefix_suffix_tree()
    prefixes = sorted(prefix_suffix_tree.keys())
    suffix_counts = Counter()
    for v in prefix_suffix_tree.values():
        for k, count in v.items():
            suffix_counts[k] += count
                
    # Take N most common suffixes
    sorted_counts = suffix_counts.most_common(300)
    suffixes = [el[0] for el in sorted_counts]
    freqs = [el[1] for el in sorted_counts]

    # Construct an intermediate data frame to compute entropies
    d = pd.DataFrame(index=prefixes, columns=suffixes, dtype=int).fillna(0)
    for prefix, suffix_counts_for_prefix in prefix_suffix_tree.items():
        for suffix, count in suffix_counts_for_prefix.items():
            if suffix in d.columns:
                d.loc[prefix, suffix] = count

    entropies = d.apply(entropy)
    for f, e in zip(np.log(freqs), entropies):
        data_dict['doculect'].append(doculect)
        data_dict['log_frequency'].append(f)
        data_dict['entropy'].append(e)

    return data_dict
Code example #16
    def test_insert_string(self):
        trie = Trie()

        trie.insert('ant')
        trie.insert('bar')
        trie.insert('bat')
        trie.insert('car')
        trie.insert('cat')
        trie.insert('cry')

        all_words = trie.collect_string('')
        print(f'all words: {all_words}')

        self.assertEqual(['ant', 'bar', 'bat', 'car', 'cat', 'cry'], all_words)

        a_words = trie.collect_string('a')
        print(f'a words: {a_words}')

        self.assertEqual(['ant'], a_words)

        b_words = trie.collect_string('b')
        print(f'b words: {b_words}')

        self.assertEqual(['bar', 'bat'], b_words)

        c_words = trie.collect_string('c')
        print(f'c words: {c_words}')

        self.assertEqual(['car', 'cat', 'cry'], c_words)

        ca_words = trie.collect_string('ca')
        print(f'ca words: {ca_words}')

        self.assertEqual(['car', 'cat'], ca_words)
Code example #17
	def findLongestWord(self):

		word = self.word
		trie = Trie()
		queue = deque()

		# for each key, enqueue (key, remaining suffix) pairs for every prefix of the key already in the trie, then insert the key
		for key in word: # from longest to shortest
			prefixes = trie.getAllPrefix(key)
			for pf in prefixes:
				queue.append((key, key[len(pf):]))
			trie.insert(key)

		# get the longest word from the provided dictionary
		longest_word = ['','']
		flag = 2 # keep the first two longest
		dic = {} # visited words

		while queue: 
			key,suffix = queue.popleft()
			if key not in dic and suffix in trie:
				dic[key] = True
				if len(key) > len(longest_word[0]):
					longest_word[1] = longest_word[0]
					longest_word[0] = key
				elif len(key) > len(longest_word[1]):
					longest_word[1] = key
			else:
				prefixes = trie.getAllPrefix(suffix)
				for pf in prefixes:
					queue.append((key, suffix[len(pf):]))

		# print result
		print("longest_word 1 is", longest_word[0], ", length is", len(longest_word[0]))
		print("longest_word 2 is", longest_word[1], ", length is", len(longest_word[1]))
		print("total words that can be combined from other words:", len(dic))

		return
Code example #18
class PrefixMapSum:
    def __init__(self):
        self._trie = Trie()
        self.values = defaultdict(int)

    def insert(self, key: str, value: int):
        self._trie.insert(key)
        self.values[key] = value

    def sum(self, prefix: str):
        # Get all possible words with prefix
        words = self.complete_words(prefix, self._trie.find(prefix))
        # Sum values from values dictionary for all possible words
        return sum(self.values[word] for word in words)

    def complete_words(self, prefix, prefix_dict: dict):
        words = []
        for key, next_level in prefix_dict.items():
            if key == ENDS_HERE:
                words.append(prefix)
            else:
                words.extend(self.complete_words(prefix + key, next_level))
        return words
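
A hypothetical usage of PrefixMapSum above (the keys, values, and expected sums are illustrative only, and ENDS_HERE / Trie.find are assumed to behave as in the dictionary-based sketch shown earlier):

pms = PrefixMapSum()
pms.insert("columbia", 3)
pms.insert("colossal", 4)
pms.insert("ambush", 5)
print(pms.sum("col"))   # 3 + 4 = 7
print(pms.sum("amb"))   # 5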
Code example #19
                    elif len(tag) == 2:
                        temp_tag = tag[0] + " " + tag[1]
                    elif len(tag) == 3:
                        temp_tag = tag[0] + " " + tag[1] + " " + tag[2]
                    elif len(tag) == 4:
                        temp_tag = tag[0] + " " + tag[1] + " " + tag[
                            2] + " " + tag[3]
                    elif len(tag) == 5:
                        temp_tag = tag[0] + " " + tag[1] + " " + tag[
                            2] + " " + tag[3] + " " + tag[4]
                    elif len(tag) == 6:
                        temp_tag = tag[0] + " " + tag[1] + " " + tag[
                            2] + " " + tag[3] + " " + tag[4] + " " + tag[5]

                    if (len(word) == 1):
                        Lexi.insert(word[0], temp_tag)
                    elif (len(word) == 2):
                        Lexi.insert(word[0] + " " + word[1], temp_tag)
                    elif (len(word) == 3):
                        Lexi.insert(word[0] + " " + word[1] + " " + word[2],
                                    temp_tag)

###### Main Menu ####################################################
#####################################################################

Exit = False
while (not Exit):
    Key = raw_input(
        "=====:: Main Menu ::===============\n\n 1.Parse\n 2.Edit Grammar Or Lexicon\n 3.Represent Data Or Grammar\n 4.Exit\n\n :: "
    )
    if (int(Key) == 1):
Code example #20
class Extractor(object):
    def __init__(self, rfpath, max_len=4):
        self.prefixTree = Trie()
        self.suffixTree = Trie(direction='suffix')

        self.vocabulary = []
        self.len_dict = dict()
        # counting words of up to n characters requires (n+1)-grams
        self.max_len = max_len + 1

        text = Cleaner.preprocess_text(rfpath)
        self.buildTreesAndDics(text)
        self.prefixTree.set_entropy()
        self.suffixTree.set_entropy()

        self.words = dict()

    def buildTreesAndDics(self, text):
        tic = time()

        pbar = tqdm(range(self.max_len))
        for i in pbar:
            pbar.set_description("buildTreesAndDics, %d-gram \n" % (i + 1))
            n_gram_list = sum(map(lambda x: Cleaner.n_gram(x, i + 1), text),
                              [])
            self.len_dict[i + 1] = len(n_gram_list)
            if i >= 1:
                self.vocabulary.extend(list(set(n_gram_list)))
            for word in n_gram_list:
                self.prefixTree.insert(word, i + 1)
                self.suffixTree.insert(word, i + 1)
        print("build tree done! %.2fs" % (time() - tic))

    def score(self, candidate):
        '''
        Example: 淘宝
        h_r_l: left branch entropy of 宝
        h_l_r: right branch entropy of 淘
        '''
        children = set()
        h_l, count = calculate_entropy(candidate,
                                       self.prefixTree,
                                       return_count=True)
        h_r = calculate_entropy(candidate, self.suffixTree, return_count=False)
        max_score = 0
        for seg_index in range(1, len(candidate)):
            pmi = cal_pmi(candidate, self.len_dict, seg_index, self.suffixTree)

            left_candidate = candidate[:seg_index]
            right_candidate = candidate[seg_index:]
            # drop overlapping sub-words; this is rather crude and favours long words, e.g. between '牛逼' and '牛逼牛逼牛逼' the latter wins
            if left_candidate in self.words:
                children.add(left_candidate)
            if right_candidate in self.words:
                children.add(right_candidate)

            h_r_l = calculate_entropy(right_candidate,
                                      self.prefixTree,
                                      return_count=False)
            h_l_r = calculate_entropy(left_candidate,
                                      self.suffixTree,
                                      return_count=False)
            score = min(h_l_r, h_r_l)
            if score > max_score:
                max_score = score

        if h_l == 0 or h_r == 0:
            return count, 0, 0

        for child in children:
            del self.words[child]
        max_score = pmi + min(h_l, h_r) - max_score
        return count, max_score, max_score * count

    def extract_words(self, thresh=None):
        # calculate PMI and freq remove dict words
        if thresh:
            for word in tqdm(self.vocabulary):
                count, score, final = self.score(word)
                if score > thresh:
                    self.words[word] = {
                        "candidate": word,
                        "count": count,
                        "score": score,
                        "final": final
                    }
            words = pd.DataFrame.from_dict(list(self.words.values()))
        else:
            words = pd.DataFrame(self.vocabulary, columns=['candidate'])
            words[['count', 'score', 'final']] = words.apply(
                lambda x: pd.Series(self.score(x['candidate'])), axis=1)
        if words.shape[0]:
            words = words.sort_values("final",
                                      ascending=False).reset_index(drop=True)
        return words
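
The score docstring above refers to left and right branch entropy, but calculate_entropy and cal_pmi themselves are not part of this snippet. As a hedged sketch (an assumption, not the project's code), the branch entropy of a trie node could be computed from its child counts like this:

import math


def branch_entropy(child_counts):
    # Shannon entropy of the distribution over a node's children;
    # a hypothetical stand-in for the calculate_entropy helper used above
    total = sum(child_counts)
    if total == 0:
        return 0.0
    return -sum((c / total) * math.log(c / total) for c in child_counts if c)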
Code example #21
'''
You're given a dictionary of strings, and a key. Check if the key is composed of an arbitrary number of concatenations of strings from the dictionary. For example: 

dictionary: "world", "hello", "super", "hell" 
key: "helloworld" --> return true 
key: "superman" --> return false 
key: "hellohello" --> return true
'''
from Trie import Trie

words = ["world", "hello", "super", "hell" ]
trie = Trie()

for word in words:
    trie.insert(word, 1)

def search(root, key, new_start = False):
    
    if root == None:
        return False
    
    if new_start:
        if not root.children.get(key[0], None):
            return False
    
    if(len(key) == 0):
        if root.data == 1:
            return True
        return False
        
    #Since we still have characters left, we search for the child node using the next
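
The search function above is cut off by the listing. A hedged completion of the same word-break idea, keeping the fragment's assumptions that node.children is a dict keyed by character and node.data == 1 marks the end of an inserted word, could look like this (a sketch, not the original author's continuation):

def search_key(node, key):
    # return True if key can be split into one or more dictionary words
    if len(key) == 0:
        # the key is fully consumed; valid only if a word ends exactly here
        return node.data == 1
    child = node.children.get(key[0])
    # keep extending the current dictionary word with the next character...
    if child is not None and search_key(child, key[1:]):
        return True
    # ...or, if a complete word ends at this node, restart from the trie root
    return node.data == 1 and search_key(trie.root, key)

# e.g. search_key(trie.root, "helloworld") -> True; search_key(trie.root, "superman") -> False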
Code example #22
'''Made by Jackson Bremen ||| Written Summer 2018, Refactored Winter 2020
Trie data structure used from the GitHub project below, with additional functionality added.
https://www.wordplays.com/boggle treats line crossing as legal.
'''

import readline
from Trie import Trie

dictionary = Trie()
with open('allScrabbleWords.txt', 'r') as file:
    for i in file.read().split():
        dictionary.insert(i)

board = []
with open('boggleBoard.txt', 'r') as file:
    for line in file.readlines():
        board.append([])
        for char in line.split():
            board[-1].append(char.upper())


def print_board(board):
    for y in board:
        for x in y:
            if len(x) == 2:
                print(x, end=' ')
            else:
                print(x, end='  ')
        print()
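
The boggle script above stops after loading the dictionary and the board. A rough solver sketch follows; it assumes the Trie exposes search(word) and a truthy startsWith(prefix) as in other snippets in this collection (which may not match this project's Trie) and does not reuse a cell within a single path:

def solve_board(board, dictionary, min_len=3):
    # find dictionary words on the board by DFS from every cell,
    # pruning any path that is not a prefix of a dictionary word
    rows, cols = len(board), len(board[0])
    found = set()

    def dfs(r, c, prefix, visited):
        prefix += board[r][c]
        if not dictionary.startsWith(prefix):
            return
        if len(prefix) >= min_len and dictionary.search(prefix):
            found.add(prefix)
        for dr in (-1, 0, 1):
            for dc in (-1, 0, 1):
                nr, nc = r + dr, c + dc
                if (dr or dc) and 0 <= nr < rows and 0 <= nc < cols \
                        and (nr, nc) not in visited:
                    dfs(nr, nc, prefix, visited | {(nr, nc)})

    for r in range(rows):
        for c in range(cols):
            dfs(r, c, '', {(r, c)})
    return sorted(found)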

Code example #24
class preprocess:
    def __init__(self):
        self.trie = Trie()
        self.spam_count = 0
        self.normal_count = 0
        self.dump_csv = []

    def is_ascii(self, s):
        return all(ord(c) < 128 for c in s)

    def remove_non_ascii(self, s):
        return re.sub(r'[^\x00-\x7f]', r' ', s)

    def get_trie(self):
        data = pandas.read_csv("spam_or_not_spam.csv")
        for index, row in data.dropna().iterrows():
            # if index > 2600:
            #     continue
            is_spam = int(row["label"])

            seen = set()
            email = self.remove_non_ascii(row["email"])
            for word in email.split():
                if word in seen:
                    continue

                seen.add(word)
                word = ''.join(filter(str.isalnum, word))
                if word.isdigit():
                    continue
                self.trie.insert(word, is_spam)

            if is_spam == 1:
                self.spam_count += 1
            else:
                self.normal_count += 1

    def dfs(self, word="", node=None):
        if not node:
            node = self.trie.root

        if node.is_word:
            self.dump_csv.append({
                "word": word,
                "spam": node.spam_count,
                "normal": node.normal_count
            })

        for c in node.children:
            self.dfs(word + c, node.children[c])

    def process(self):
        self.get_trie()
        self.dfs()

        new_df = pandas.DataFrame(self.dump_csv,
                                  columns=['word', 'spam', 'normal'])

        print("test: ", self.trie.search("interred").normal_count)

        with open("cleaned.csv", "w") as f:
            new_df.to_csv(f, header=True, mode='w', line_terminator="\n")
Code example #25
class Extractor(object):

    def __init__(self, rfpath=None, text=None, max_len=4):
        self.prefixTree = Trie()
        self.suffixTree = Trie(direction='suffix')

        self.vocabulary = []
        self.len_dict = dict()
        # counting words of up to n characters requires (n+1)-grams
        self.max_len = max_len + 1

        if rfpath is not None:
            text = Cleaner.preprocess_text(rfpath)
        elif text is None:
            raise ValueError()

        self.buildTreesAndDics(text)
        self.prefixTree.set_entropy()
        self.suffixTree.set_entropy()

        self.words = dict()

    def buildTreesAndDics(self, text):
        tic = time()

        for i in range(self.max_len):
            n_gram_list = sum(
                map(lambda x: Cleaner.n_gram(x, i + 1), text), [])
            self.len_dict[i + 1] = len(n_gram_list)
            if i >= 1:
                self.vocabulary.extend(list(set(n_gram_list)))
            for word in n_gram_list:
                self.prefixTree.insert(word, i + 1)
                self.suffixTree.insert(word, i + 1)
            sys.stdout.write('build tree done %d/%d\r' % (i, self.max_len))

    def score(self, candidate, cnt_thresh):
        '''
        Example: 淘宝
        h_r_l: left branch entropy of 宝
        h_l_r: right branch entropy of 淘
        '''
        children = set()
        h_l, count = calculate_entropy(
            candidate, self.prefixTree, return_count=True)
        if count < cnt_thresh:
            return count, None, None

        h_r = calculate_entropy(candidate, self.suffixTree, return_count=False)
        min_score = np.inf
        for seg_index in range(1, len(candidate)):
            left_candidate = candidate[:seg_index]
            right_candidate = candidate[seg_index:]

            if left_candidate in self.words:
                children.add(left_candidate)
            if right_candidate in self.words:
                children.add(right_candidate)

            h_r_l = calculate_entropy(
                right_candidate, self.prefixTree, return_count=False)
            h_l_r = calculate_entropy(
                left_candidate, self.suffixTree, return_count=False)
            pmi = cal_pmi(candidate, self.len_dict, seg_index, self.suffixTree)
            score = pmi - min(h_l_r, h_r_l)
            if score < min_score:
                min_score = score
                

        if h_l == 0 or h_r == 0:
            return count, 0, 0

        min_score += min(h_l, h_r)

        for child in children:
            # if its count is at least that of the sub-segment, keep the longer word
            if min_score > self.words[child]['score']:
                del self.words[child]
        return count, min_score, min_score * count

    def extract_words(self, score_thresh=4.0, cnt_thresh=20):
        # calculate PMI and freq remove dict words
        for i, word in enumerate(self.vocabulary):
            res = self.score(word, cnt_thresh)
            count, score, final = res
            if score is None or score < score_thresh:
                continue
            self.words[word] = {"candidate": word,
                                "count": count, "score": score, "final": final}
            sys.stdout.write('extract words done %d/%d\r' %(i, len(self.vocabulary)))
        words = pd.DataFrame.from_dict(list(self.words.values())).sort_values("final", ascending=False).reset_index(drop=True)
        return words
Code example #26
File: main.py  Project: WheatGroup/leetcode_coding
from Trie import Trie
trie = Trie()

trie.insert("apple")
result1 = trie.search("apple")
result2 = trie.search("app")
result3 = trie.startsWith("app")
trie.insert("app")
result4 = trie.search("ap")
print(result1, result2, result3, result4)

Code example #27
class SpellSuggestion(object):
    def __init__(self, regex=r"[\w]+"):
        """ initialize the WORDS dictionary which the key is a word and the value is the occurrences of the key  """
        self.trie = Trie()
        self.regex = regex

        with open(english_words, "r") as f:  # Create a dictionary for storing all the words and its occurrences
            self.WORDS = Counter(self.words_token(f.read()))

        for word in self.WORDS.keys():  # put all the words in Trie
            self.trie.insert(word)

    def words_token(self, text):
        """ extract all the words from text with lowercase """
        if text is not None and text != "":
            return re.findall(self.regex, text.lower(), re.MULTILINE)
        else:
            return ["UWindsor"]

    def probability_of_word(self, word):
        """ calculate the probability of a given word """
        # return self.WORDS[word] / len(self.WORDS.values())
        return self.WORDS[word] / sum(self.WORDS.values())

    def edit_distance_1(self, word):
        """ get all combinations from the given word which edit distance is 1 """
        letters = string.ascii_lowercase
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(replaces + deletes + transposes + inserts)

    def edit_distance_2(self, word):
        """ get all combinations from the given word which edit distance is 2 """
        return set(e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1))

    def shown(self, words):
        """ return words that are shown in WORDS based on edit distance which is ether 1 or 2 """
        return set(w for w in words if w in self.WORDS)

    def candidates(self, word):
        """ get all the candidate words from the given word """
        return self.shown([word]) or self.shown(self.edit_distance_1(word)) or self.shown(self.edit_distance_2(word)) or [word]

    def correct_word(self, word) -> str:
        """ get the most probable word suggestion for the given word """
        return max(self.candidates(word), key=self.probability_of_word)

    def spell_checker(self, words) -> str:
        """ check all the words and return the words with modifications """
        tokens = self.words_token(words)
        return " ".join([self.correct_word(token) for token in tokens])

    # def spell_checker(self, word):
    #     """ get the most probable spelling suggestion for the given word. """
    #     # return sorted(self.candidates(word), key=self.probability_of_word)[0]
    #     return max(self.candidates(word), key=self.probability_of_word)

    def auto_completer(self, prefix, top=5):
        """ return number of top auto complete suggestion according the prefix """
        return self.trie.autocomplete(prefix, top)  # get the top word accourding to the given prefix
Code example #28
def sort_list(arr):
    trie = Trie()
    for s in arr:
        trie.insert(s)
    return list(find(trie.root, ''))
Code example #29
File: endpoints.py  Project: reddy100/CustomerAPI
class SimpleAPI():
    def __init__(self):
        self.titleTrie = Trie()
        self.categoryNameTrie = Trie()
        self.brandNameTrie = Trie()

        self.productDict={}
        self.productIdDict={}
        self.titleDict={}
        self.brandIdDict={}
        self.brandNameDict={}
        self.categoryIdDict={}
        self.categoryNameDict={}

        self.keywordFrequencyDict={}

    def _combinations(self, L, final,tmp=None):
        if tmp is None:
            tmp = []
        if L==[]:
            final.append(tmp)
        else:
            for i in L[0]:
                self._combinations(L[1:], final,tmp+[i])
        return final

    def _getListOfTypeCombinations(self, conditions):
        listOfTypes = []
        for condition in conditions:
            listOfTypes.append(self._combinations([[condition['type']], [value.lower() for value in condition['values']]], []))
        listOfTypeCombinations = self._combinations(listOfTypes, [])
        return listOfTypeCombinations

    def _addToDict(self, d, key, value):
        if key in d:
            d.get(key).append(value)
        else:
            d[key]=[value]

    def _addToFreqDict(self, keyList):
        for keys in keyList.split():
            keys = regex.sub(' ', keys).split()
            for key in keys:
                if key in self.keywordFrequencyDict:
                    self.keywordFrequencyDict[key] = self.keywordFrequencyDict[key] + 1
                else:
                    self.keywordFrequencyDict[key] = 1

    def initializeApi(self, fileName):
        with open(fileName,encoding='utf8') as in_file:
            for line in in_file:
                output = []
                columns = line.split("\t")
                for index, c in enumerate(columns):
                    c=c.lower()
                    output.append(c)
                    if index==1:
                        self._addToFreqDict(c)
                        self.titleTrie.insert(c)
                    elif index==3:
                        self.brandNameTrie.insert(c)
                    elif index==5:
                        self.categoryNameTrie.insert(c)
                        self.productIdDict[output[0]]=output[0]
                self._addToDict(self.titleDict, output[1], output[0])
                self._addToDict(self.brandIdDict, output[2], output[0])
                self._addToDict(self.brandNameDict, output[3], output[0])
                self._addToDict(self.categoryIdDict, output[4], output[0])
                self._addToDict(self.categoryNameDict, output[5][:-1], output[0])
                self.productDict[output[0]]=output

    def endpoint1(self, type, prefix):
        prefix=prefix.lower()
        if type=='title':
            return list(self.titleTrie.allWordsStartingWithPrefix(prefix))
        elif type=='category':
            return list(self.categoryNameTrie.allWordsStartingWithPrefix(prefix))
        elif type=='brand':
            return list(self.brandNameTrie.allWordsStartingWithPrefix(prefix))

    def endpoint2(self, conditions, pagination):
        responseHeadings = ['productId', 'title', 'brandId', 'brandName', 'categoryId', 'categoryName']
        fromPagination = pagination['from']
        sizePagination = pagination['size']
        resultingProductIds=[]
        listOfTypeCombinations = self._getListOfTypeCombinations(conditions)
        for typeCombination in listOfTypeCombinations:
            setList=[]
            for type in typeCombination:
                setList.append(getattr(self, type[0] + 'Dict').get(type[1]))
            resultingProductIdsByTypeCombination = set.intersection(*list(map(set, setList)))
            resultingProductIds+=resultingProductIdsByTypeCombination
        paginatedResultingProductIds = resultingProductIds[fromPagination:fromPagination+sizePagination]
        searchResults=[]
        for productId in paginatedResultingProductIds:
            searchResults.append(dict(zip(responseHeadings, self.productDict[productId])))
        return searchResults

    def endpoint3(self, keywords):
        searchResult = {}
        searchResult['keywordFrequencies']=[]
        for keyword in keywords:
            keywordFrequency = self.keywordFrequencyDict.get(keyword.lower(), 0)
            searchResult['keywordFrequencies']=searchResult['keywordFrequencies']+[[keyword, str(keywordFrequency)]]
        return searchResult

    def closeEndpoint(self):
        self.titleTrie = None
        self.categoryNameTrie = None
        self.brandNameTrie = None

        self.productDict={}
        self.productIdDict={}
        self.titleDict={}
        self.brandIdDict={}
        self.brandNameDict={}
        self.categoryIdDict={}
        self.categoryNameDict={}

        self.keywordFrequencyDict={}
Code example #31
File: Pinyin.py  Project: Velkan/NLP-test
class pinyin(object):
    def __init__(self, pinyins):
        self.pinyins = pinyins
        # read in all valid pinyin syllables
        self.tree = Trie()
        f = open('pinyin/pinyin_list.txt')
        # f = open('pinyin_list.txt')
        for line in f:
            self.tree.insert(line.split()[0])
        f.close()

    def split(self):
        '''
        Splitting function
        @param pinyin:  pinyin string, str
        @return:        list of pinyin syllables after splitting, list
        '''
        # letters that can start a pinyin syllable
        pinyin_initials = ['a', 'b', 'e', 'p', 'm', 'f', 'd',
                           't', 'n', 'l', 'g', 'k', 'h', 'j',
                           'q', 'x', 'r', 'z', 'c', 's', 'y', 'w']
        # pinyin_initials = self.tree.root.children
        iuv = ['i', 'u', 'v']
        grn = ['g', 'r', 'n']

        input = ''
        result = []

        for i in range(len(self.pinyins)):
            c = self.pinyins[i]
            # read in the character c
            input += c
            # c is i|u|v and is the first letter of the pinyin string
            if c in iuv and len(input) == 1:
                return False, None
            # the current input is a valid pinyin syllable or a prefix of one
            if self.tree.find_initial_with(input):
                continue
            # c is an initial consonant
            if c in pinyin_initials:
                # the preceding input is a valid pinyin syllable
                if self.tree.find_initial_with(input[:-1]):
                    # split before c
                    result.append(input[:-1])
                    input = input[-1:]
                    continue
                else:
                    return False, None
            # the second-to-last letter is g|r|n
            elif input[-2:-1] in grn:
                # splitting before g|r|n leaves a valid syllable
                if self.tree.find_initial_with(input[:-2]):
                    # split before g|r|n
                    result.append(input[:-2])
                    input = input[-2:]
                    continue
                # splitting after g|r|n leaves a valid syllable
                elif self.tree.find_initial_with(input[:-1]):
                    # split after g|r|n
                    result.append(input[:-1])
                    input = input[-1:]
                    continue
            else:
                # split off the current syllable on its own
                result.append(input)
                input = ''

        result.append(input)

        return True, result
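
A hypothetical usage of the splitter above (it assumes pinyin/pinyin_list.txt is present; the exact segmentation depends on the syllable list that is loaded):

ok, parts = pinyin('zhangsan').split()
if ok:
    print(parts)   # e.g. ['zhang', 'san']
else:
    print('not a valid pinyin sequence')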
Code example #32
File: addent2wiki.py  Project: CogComp/JEANS
    for line in open(os.path.join(dir,f"ent_ids_{id}"), "r", encoding="utf-8"):
        line = line.strip().split("\t")
        eid, url = line[0], line[1]
        ent = line[1].split("/")[-1]
        ent_name = ent.split("(")[0].replace("_"," ").strip()
        ent = "dbpedia/" + ent
        fout.write(f"{eid}\t{url}\t{ent}\t{ent_name}\n")
    fout.close()

# dump_path
trie = Trie()
all_ents = set([])
for line in open(dump_path,"r",encoding="utf-8"):
    # lower
    line = line.rstrip('\n').split('\t')
    trie.insert(line[3], line[2])
    all_ents.add(line[2])
print("Built trie.")

out_file = f"../data/wiki_db/{lang1}_{lang2}/{id2lang[id]}.txt"
corpora_file = f"../data/wiki/{id2lang[id]}.txt"
num = 0
found = 0
found_entity = set([])
with open(out_file, "w") as f:
    t0 = time.time()
    for line in open(corpora_file, "r", encoding="utf-8"):
        hit4line = 0
        num += 1
        # lower
        # line = line.lower()