def __init__(self):
    self._data = trie({})
    self.lang = None
    self.name = None
    self.ver = None
    self.sha = hashlib.sha1()
    self._hashed = None
def search_users(accounts, search_term):
    global dir_path
    # Like search using PyTrie
    t = trie(users_dict)
    # Get list of aliases
    aliases = t.keys(prefix=search_term)
    # Search through aliases and get corresponding userIds and put them ALL in one list
    users = []
    names = []
    for alias in aliases:
        names = names + users_dict[alias]
    for account in accounts:
        dir_path = log_path + '/' + account
        os.chdir(dir_path)
        for name in names:
            if name in os.listdir(dir_path):
                users.append(account + '/' + name)
                break
    return users
def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
    self._dict = {}
    self._trie = trie(*args, **kwargs)
    d = dict(*args, **kwargs)
    for key, value in d.iteritems():
        self._dict[case_insensitive(key)] = value
def draw_simple(tech, sequence):
    'Init variables, arrays, trie'
    global gTrie
    gTrie = trie()
    snps = []
    gene = tech.gene
    seg_loc = gt.seq_pos(gene, sequence)

    'Prepare plot'
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5, forward=True)
    ax.invert_yaxis()

    'Prepare axis'
    ax.xaxis.tick_top()
    ax.xaxis.set_ticks_position('both')  # THIS IS THE ONLY CHANGE
    ax.autoscale_view()
    yPos = 0

    'Draw the sequence first'
    gt.draw_sequence(seg_loc, sequence, ax, yPos)
    length = 0

    "Read a .csv file with motifs, build a Trie from it and store in gTrie"
    with open(tech.motif_file.path) as f:
        motifs = f.readlines()
    for m in motifs[1:]:
        m = m.split(',')[0]
        length = len(m)
        gTrie[m] = 1

    'Get Motifs and their positions'
    motifs = gt.trieGetMotifs(gTrie, sequence, length)
    motif_pos = gt.getMotifPosition(gTrie, sequence, length)
    yPos += 1
    gt.draw_all_motifs(tech, seg_loc, sequence, motifs, motif_pos, yPos, ax)

    'Read SNP'
    if tech.snp_file is not None:
        snps = gt.readSNP(tech.snp_file.path)

    "Check for each SNP if it belongs to a sequence and eventually plot"
    gt.process_SNP(tech, sequence, seg_loc, motifs, motif_pos, snps, ax)

    'Last figure modifications'
    plt.ylabel("Motifs")
    plt.legend(loc='upper right')
    plt.xticks(())
    plt.yticks(())
    save_folder = 'media/graphics/Bild' + str(tech.pk) + '.png'
    plt.savefig(save_folder)
    return save_folder
def clear():
    # delete the old trie and declare a new one.
    global inv_idx
    del inv_idx
    # clear uploaded_files directory
    os.system('rm uploaded_files/*')
    inv_idx = trie()
    return 'Index cleared.'
def _create_trie_from_dictionary_lines(self, dict_lines):
    dict_trie = trie()
    for line in dict_lines:
        line = line.strip()
        word, meta = self.split_line(line)
        if not dict_trie.has_key(word):
            dict_trie[word] = set()
        dict_trie[word].add(meta)
    return dict_trie
def _reset_trie(self):
    paths = defaultdict(lambda: defaultdict(list))
    for a in self.actions:
        cu = a.uri
        if cu is not None:
            path = ""
            paths["/"][a.request_method].append(a)
            for sub in cu.split('/'):
                if sub:
                    path += "/" + sub
                    paths[path][a.request_method].append(a)
    self._trie = trie(paths.items())
def __init__(self):
    super(APIBlueprint, self).__init__()
    self._metadata = {}
    self._name = None
    self._overview = None
    self._groups = OrderedDict()
    self._trie = trie()
    self._data_structures = OrderedDict()

    def strip():
        del self.strip
        return self

    # trick Markdown at the end of the conversion
    self.strip = strip
def _create_prefix_trie(self):
    """
    Fetches all the keywords in the `JAVA_RESERVED_WORDS` file and stores
    them into a SortedStringTrie (from `pytrie`).
    """
    english_dictionary_word_set = set()
    with open(ENGLISH_DICTIONARY) as dictionary:
        for word in dictionary:
            word = word.strip().lower()
            english_dictionary_word_set.add(word)
    english_dictionary_word_set = sorted(english_dictionary_word_set)
    english_dictionary_word_set = [
        (word, i) for i, word in enumerate(english_dictionary_word_set)
    ]
    english_dictionary = trie(english_dictionary_word_set)
    return english_dictionary
def build_mirrors():
    from portage.util import stack_dictlist, grabdict
    global _mirrors
    tmp = {}
    thirdpartymirrors = {}
    if not _thirdpartymirrors:
        thirdpartymirrors = portage.settings.thirdpartymirrors()
    else:
        thirdparty_lists = [grabdict(x) for x in _thirdpartymirrors]
        thirdpartymirrors = portage.util.stack_dictlist(thirdparty_lists, incremental=True)
    for prefix, mirrors in thirdpartymirrors.iteritems():
        for mirror in mirrors:
            tmp[mirror] = prefix
    _mirrors = trie(tmp)
## Author: Ryan Kingston
## Last Updated: 5/2011
## Description: Contains functions to convert a text input-stream
##   into bigram and probability tables for various prediction
##   algorithms such as Viterbi.

from __future__ import division  # floating-point division
import os, re, time, string, operator
from cPickle import load, dump
import mirror_functions
from pytrie import StringTrie as trie

path_separator = "\\"

## Initialize corpus of words
corpus = trie()

## Initialize bigram tries
wordbigrams = trie()
charbigrams = trie()
tranbigrams = trie()
obsbigrams = trie()

## Initialize probability tries
startProbs = trie()
transProbs = trie()
obsProbs = trie()

## All structures that are to be swapped to disk
structs = ("wordbigrams", "charbigrams", "tranbigrams", "obsbigrams",
           "transProbs", "startProbs", "obsProbs", "corpus")
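# --- Illustrative addition (not part of the original module) ---
# A minimal sketch of how word-bigram counts could be accumulated in the
# StringTrie declared above. The "w1 w2" key scheme and the helper name
# count_word_bigrams are assumptions, not taken from the original code:
# keying each bigram as "first second" lets items(prefix="first ") retrieve
# every bigram that starts with a given word.
def count_word_bigrams(tokens, bigram_trie):
    for first, second in zip(tokens, tokens[1:]):
        key = first + " " + second
        bigram_trie[key] = bigram_trie.get(key, 0) + 1

count_word_bigrams("the cat sat on the mat".split(), wordbigrams)
print(wordbigrams.items(prefix="the "))  # e.g. [('the cat', 1), ('the mat', 1)]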
def get_ngram_vocab_prob(m, vocab, sent, ngram, ltrie):
    # ngram > 1
    lsent = sent if type(sent) is list else sent.split()
    ldic = []  # 0, 1, 2, ..., ngram - 1
    for i in xrange(ngram):
        ldic.append({})
    state_in = kenlm.State()
    m.NullContextWrite(state_in)
    # Use <s> as context. If you don't want <s>, use m.NullContextWrite(state).
    # m.BeginSentenceWrite(ngram_state)
    probs = []
    dist = {}
    for v in vocab:
        state_out = kenlm.State()
        full_score = m.BaseFullScore(state_in, v, state_out)
        # print full_score.log_prob, full_score.ngram_length, full_score.oov
        # probs.append((full_score.log_prob, full_score.ngram_length, full_score.oov, v))
        dist[v] = (full_score.log_prob, full_score.ngram_length, full_score.oov)
    # given 0 word, probs
    # probs.sort(reverse=True)  # lg->sm
    ldic[0]['null'] = trie(dist)
    for wid in range(len(lsent)):
        prev_words = lsent[wid - (ngram - 2) if wid - (ngram - 2) >= 0 else 0:wid + 1]
        gram_m1 = len(prev_words)
        for i in range(1, gram_m1 + 1):
            l_gram_prev = prev_words[-i:]
            s_gram_prev = ' '.join(l_gram_prev)
            # print s_gram_prev
            if s_gram_prev not in ldic[i]:
                probs = []
                state_in = kenlm.State()
                m.NullContextWrite(state_in)
                for w in l_gram_prev:
                    # print w, l_gram_prev
                    ngram_state = kenlm.State()
                    full_score = m.BaseFullScore(state_in, w, ngram_state)
                    # print w
                    # print full_score
                    state_in = ngram_state
                s = time.time()
                for v in vocab:
                    state_out = kenlm.State()
                    # full_score.ngram_length is the matched ngram length ending with v in
                    # (l_gram_prev + v)
                    full_score = m.BaseFullScore(ngram_state, v, state_out)
                    # print v
                    # print full_score
                    # probs.append((full_score.log_prob, full_score.ngram_length, full_score.oov, v))
                    dist[v] = (full_score.log_prob, full_score.ngram_length, full_score.oov)
                print time.time() - s
                print 'add....', len(dist)
                # probs.sort(reverse=True)
                j = 0
                sq = time.time()
                print dist['wonderful']
                print time.time() - sq
                for k, v in dist.iteritems():
                    if j < 10:
                        print k, v
                    j += 1
                ldic[i][s_gram_prev] = trie(dist)
                sq = time.time()
                tdist = trie(dist)
                print 'create trie: ', time.time() - sq
                print tdist.longest_prefix('wandskafjkasdjfas')
                j = 0
                sq = time.time()
                print tdist['wonderful']
                print time.time() - sq
                for k, v in tdist.iteritems():
                    if j < 10:
                        print k, v
                    j += 1
    for i in xrange(ngram):
        ltrie.append(trie(ldic[i]))
    text = file_buf.read().decode()
    tokens = get_tokens(text, True)
    if tokens is not None:
        if file_name.endswith('.pdf'):
            id = file_name
            for text, para in tokens:
                index_words(inv_idx, para, id, text)
        elif file_name.endswith('.txt'):
            for i, (text, para) in enumerate(tokens, 0):
                id = f'{file_name}_para_{i}'
                index_words(inv_idx, para, id, text)


def index_input_files(inv_idx):
    for root, _, files in os.walk('input_files'):
        for file in files:
            file_path = os.path.join(root, file)
            print(file_path)
            with open(file_path, 'rb') as f:
                index_file(inv_idx, f, file_path)


if __name__ == '__main__':
    inv_idx = trie()
    index_input_files(inv_idx)
    print(inv_idx.get_by_prefix('lorem'))
    print(inv_idx.get_by_prefix('process'))
from __future__ import print_function

import re
import sys

import portage
from gentoolkit.query import Query
from gentoolkit.package import Package
from gentoolkit.helpers import get_cpvs
import gentoolkit.pprinter as pp

import helpers
from pytrie import SortedStringTrie as trie

_mirrors = trie()
_generate_diff = False
_thirdpartymirrors = []
_src_uri_re = re.compile("SRC_URI=(['\"])(.*?)\\1", re.S)


def pmsg(package, msg):
    print(pp.cpv(str(package)) + ": " + msg)


def generate_diff(package, bad_uris):
    from difflib import unified_diff
    ebuild = package.ebuild_path()
    before = open(ebuild).read()
pq.qsize()   # get size
pq.get()     # dequeue an element (using priority)

# set
from sets import Set
s = set()
s.add(1)     # add an element
s.remove(1)  # remove an element

# trie
from pytrie import SortedStringTrie as trie
t = trie({
    'all': 2, 'allot': 3, 'alloy': 4, 'aloe': 5,
    'an': 0, 'ant': 1, 'are': 6, 'be': 7
})  # arg is a dict
t.keys('al')   # get keys prefixed by 'al'
t.items('al')  # get (key, value) pairs for keys prefixed by 'al'
t.longest_prefix('antonym', None)       # get the longest key that is a prefix of 'antonym'
t.longest_prefix_item('allstar', None)  # get the item ((key, value) tuple) for the longest key that is a prefix of 'allstar'
t.longest_prefix_item('area', None)     # get the item ((key, value) tuple) for the longest key that is a prefix of 'area'
t.iter_prefixes('al')  # get an iterator over the keys that are prefixes of 'al'
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
from telegram import Bot
import urllib3
import json
import logging
import requests
import configparser
import redis
import re
import os
import sys
import jieba
import jieba.analyse
import jieba.posseg
import pytrie

forbid_words_filter = pytrie.trie()
with open('forbid.txt') as f:
    for line in f:
        line = line.strip('\n')
        if line != '':
            forbid_words_filter.insert(line)
forbid_words_filter.build_fail()

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
urllib3.disable_warnings()
logging.getLogger('urllib3').setLevel(logging.CRITICAL)
logger = logging.getLogger(__name__)
logger.setLevel(logging.WARN)
from pytrie import SortedStringTrie as trie

t = trie(an=0, ant=1, all=2, allot=3, alloy=4, aloe=5, are=6, be=7)
print t

# https://www.youtube.com/watch?v=RIUY7ieyH40

'''
A trie is a tree data structure that is used to store a mapping where the
keys are sequences, usually strings over an alphabet. In addition to
implementing the mapping interface, tries facilitate finding the items for
a given prefix, and vice versa, finding the items whose keys are prefixes
of a given key K. As a common special case, finding the longest-prefix
item is also supported.
'''

'''
Algorithmically, tries are more efficient than binary search trees (BSTs)
both in lookup time and memory when they contain many keys sharing
relatively few prefixes. Unlike hash tables, trie keys don't need to be
hashable.
'''

# Trie data structure: https://reterwebber.wordpress.com/2014/01/22/data-structure-in-python-trie/
# >>> from pytrie import SortedStringTrie as trie
# >>> t = trie(an=0, ant=1, all=2, allot=3, alloy=4, aloe=5, are=6, be=7)
# >>> t
# {'all': 2, 'allot': 3, 'alloy': 4, 'aloe': 5, 'an': 0, 'ant': 1, 'are': 6, 'be': 7}
# >>> t.keys(prefix='al')
# ['all', 'allot', 'alloy', 'aloe']
# >>> t.items(prefix='an')
# [('an', 0), ('ant', 1)]
# >>> t.longest_prefix('antonym')
# 'ant'
# >>> t.longest_prefix_item('allstar')
# ('all', 2)
# >>> t.longest_prefix_value('area', default='N/A')
# 6
# >>> t.longest_prefix('alsa')
# Traceback (most recent call last):
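# --- Illustrative addition (not from the original snippet) ---
# The transcript above is cut off at the traceback: longest_prefix() raises
# KeyError when no key in the trie is a prefix of the query. Passing the
# optional default (as other snippets in this file do) avoids that:
from pytrie import SortedStringTrie as trie

t = trie(an=0, ant=1, all=2, allot=3, alloy=4, aloe=5, are=6, be=7)
print t.longest_prefix('antonym')                 # 'ant'
print t.longest_prefix('alsa', None)              # None instead of KeyError
print t.longest_prefix_value('alsa', default=-1)  # -1 instead of KeyError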
def draw_differ(tech1, tech2, sequence):
    gene = tech1.gene
    seg_loc = gt.seq_pos(gene, sequence)
    tech1.sequence = sequence
    tech2.sequence = sequence

    'Init variables, arrays, trie'
    global gTrie1
    global gTrie2
    gTrie1 = trie()
    gTrie2 = trie()

    "Read a .csv file with motifs, build a Trie from it and store in gTrie"
    length = 0
    with open(tech1.motif_file.path) as f:
        motifs = f.readlines()
    for m in motifs[1:]:
        m = m.split(',')[0]
        length = len(m)
        gTrie1[m] = 1
    with open(tech2.motif_file.path) as f:
        motifs = f.readlines()
    for m in motifs[1:]:
        m = m.split(',')[0]
        length = len(m)
        gTrie2[m] = 1

    cMotifs = gt.getMatchingMotifs(gTrie1, tech2.motif_file.path)
    motifs = getDifferencMotifs(gTrie1, gTrie2, tech1, tech2, cMotifs)
    dMotifs = diff_motif_seq(sequence, motifs, length)
    dPos = diff_pos(sequence, motifs, length)

    'Prepare plot'
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5, forward=True)
    ax.invert_yaxis()

    'Prepare axis'
    ax.xaxis.tick_top()
    ax.xaxis.set_ticks_position('both')  # THIS IS THE ONLY CHANGE
    ax.autoscale_view()
    yPos = 0

    'Draw the sequence first'
    gt.draw_sequence(seg_loc, sequence, ax, yPos)
    yPos += 1

    'Draw Motifs'
    for i, m in enumerate(dMotifs):
        techNr = checkFindIn(gTrie1, gTrie2, m)
        if gt.checkShift(dMotifs[i], dPos[i], dPos[i - 1]):
            yPos += 2
        else:
            yPos = 1
        if techNr == 1:
            draw_motif(seg_loc, sequence, dMotifs[i], dPos[i], yPos, 'black')
            erd = gt.get_MotifERD(tech1.motif_file.path, dMotifs[i])
            print erd
            gt.draw_Information(ax, erd, dPos[i], yPos)
        if techNr == 2:
            draw_motif(seg_loc, sequence, dMotifs[i], dPos[i], yPos, 'grey')
            erd = gt.get_MotifERD(tech2.motif_file.path, dMotifs[i])
            print erd
            gt.draw_Information(ax, erd, dPos[i], yPos)

    'Read SNP'
    snps = gt.readSNP(tech1.snp_file.path)
    # "Check for each SNP if it belongs to a sequence and eventually plot"
    gt.process_SNP(tech1, sequence, seg_loc, dMotifs, dPos, snps, ax)

    'Last figure modifications'
    plt.ylabel("Difference")
    plt.legend(loc='upper right')
    plt.xticks(())
    plt.yticks(())
    save_folder = 'media/graphics/Bild_differ' + str(tech1.pk) + str(tech2.pk) + '.png'
    plt.savefig(save_folder)
    return save_folder
import Display
# execfile('D:\Documents\Repos\dotphrase\dotPhrases.py')

# get all of the phrases and keys and create the trie
# Read the .ini file
config = ConfigParser.ConfigParser()
config.read('dict.ini')
keys = []
vals = []
for section in config.sections():
    keys.append(config.get(section, 'Key'))
    vals.append(config.get(section, 'Phrase'))
t = trie().fromkeys(keys, vals)

# read a simple CSV file
# first line is all keys
# second line is all phrases
# ordering must match up!
# lines = [line.rstrip() for line in open('phrases.txt')]
# keys = [l.lower() for l in lines[0].split(',')]
# vals = lines[1].split(',')

# global variables
phraseChoice = 1
keyBuffer = []
phrases = []
keyCharacters = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}
#!/usr/bin/env python
import sys

import pytrie

t = pytrie.trie()
with open('forbid.txt') as f:
    for line in f:
        line = line.strip('\n')
        if line != '':
            t.insert(line)
t.build_fail()

for line in sys.stdin:
    print(t.process(line))
def draw_common(tech1, tech2, sequence):
    gene = tech1.gene
    seg_loc = gt.seq_pos(gene, sequence)
    tech1.sequence = sequence
    tech2.sequence = sequence

    'Init variables, arrays, trie'
    global gTrie
    gTrie = trie()

    'Prepare plot'
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5, forward=True)
    ax.invert_yaxis()

    'Prepare axis'
    ax.xaxis.tick_top()
    ax.xaxis.set_ticks_position('both')  # THIS IS THE ONLY CHANGE
    ax.autoscale_view()
    yPos = 0

    'Draw the sequence first'
    gt.draw_sequence(seg_loc, sequence, ax, yPos)
    length = 0

    "Read a .csv file with motifs, build a Trie from it and store in gTrie"
    with open(tech1.motif_file.path) as f:
        motifs = f.readlines()
    for m in motifs[1:]:
        m = m.split(',')[0]
        length = len(m)
        gTrie[m] = 1

    motifs = gt.getMatchingMotifs(gTrie, tech2.motif_file.path)
    common_m = common_motif_seq(sequence, motifs, length)
    common_p = common_pos(sequence, common_m, length)
    yPos += 1

    'Draw Motifs'
    gt.draw_all_motifs(tech1, seg_loc, sequence, common_m, common_p, yPos, ax)

    'Read SNP'
    if tech1.snp_file.path is not None:
        snps1 = gt.readSNP(tech1.snp_file.path)
        gt.process_SNP(tech1, sequence, seg_loc, common_m, common_p, snps1, ax)
    if tech2.snp_file.path is not None:
        snps2 = gt.readSNP(tech2.snp_file.path)
        gt.process_SNP(tech2, sequence, seg_loc, common_m, common_p, snps2, ax)

    'Last figure modifications'
    plt.ylabel("Common Motifs")
    plt.legend(loc='upper right')
    plt.xticks(())
    plt.yticks(())
    save_folder = 'media/graphics/Bild_common' + str(tech1.pk) + str(tech2.pk) + '.png'
    plt.savefig(save_folder)
    return save_folder
    name_index[raw].append({
        'source': 'raws',
        'item': raw,
        'raw_type': raw_type
    })

raws = parse_raws()
orderables = parse_orderables()
craftables = parse_craftables()

index_by_name = collections.defaultdict(list)
index_by_tool = collections.defaultdict(list)

database = {
    'raws': raws,
    'indices': {
        'by_name': index_by_name,
        'by_tool': index_by_tool,
    }
}

insert(database, 'orderables', orderables)
insert(database, 'craftables', craftables)
insert_raws(database, raws)

by_name_trie = trie()
for name in index_by_name.keys():
    by_name_trie[name] = name

with open('db.json', 'w', encoding='utf-8') as fp:
    json.dump(database, fp, indent=4, sort_keys=True, default=json_default)

with open('db.trie', 'wb') as fp:
    pickle.dump(by_name_trie, fp, protocol=4)
        if len(line) == 0:
            break
        # split by '\t'
        values = (line.split('\n')[0]).split('\t')
        # extract values
        brandDict_List.append(values[0].lower())
        length = len(values[0].split(' '))
        if length > maxBrandLength:
            maxBrandLength = length

print("# BRAND NAMES IN DICTIONARY:", len(brandDict_List))
print('MAXIMUM BRAND NAME LENGTH:', maxBrandLength, '\n\n')

## build a trie for brand names in dictionary
t = trie()
for i in range(len(brandDict_List)):
    t[brandDict_List[i]] = i
print('DONE BUILDING BRAND NAMES TRIE...\n\n')

print("################# LOADING DEVELOPMENT DATA SET #####################\n")
## read in development set
dataFile = curdir + os.sep + devFile
product_dev = []  # development set
# open the file
with open(dataFile, 'r', encoding="ISO-8859-1") as infile:
    line = infile.readline()
    if len(line) == 0:
        break
    try:
Takes in a string, which is tab separated and newline separated, and
outputs a graph, in a to-be-determined format.
"""
# import marisa_trie as trie
from pytrie import SortedStringTrie as trie
from time import time

words = open('englishWords.txt', 'r')
wordlines = words.readlines()
wordlines = [l.strip() for l in wordlines]
one = [1 for i in xrange(len(wordlines))]
zipped = zip(wordlines, one)
T = trie(zipped)


def boardToGraph(board):
    rows = board.split('\n')
    matrix = [row.split('\t') for row in rows]
    if len(matrix) != 4 or len(matrix[0]) != 4:
        print matrix
        print "something is wrong"
        raise Exception("Something is wrong")
    node_set = set()
    node_dict = {}
    for i in range(16):
        d = {}
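# --- Illustrative addition (not from the original snippet) ---
# A hypothetical 4x4 board in the tab/newline format the docstring above
# describes; boardToGraph() splits it on '\n' and '\t' into a 4x4 matrix.
board = '\n'.join('\t'.join(row) for row in
                  [list('abcd'), list('efgh'), list('ijkl'), list('mnop')])
# boardToGraph(board)  # the function body is truncated above, so not called here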
            max_word_len = len(word)
            long_word = word
        if email_trie.has_key(word):
            email_trie[word] = email_trie[word] + 1
        else:
            email_trie[word] = 1
    print reader.line_num
    print "max word len:", max_word_len, "\n", long_word, "\n num words:", len(email_trie)


def save_trie_mysql(email_trie):
    sql_handler = mysql_handler('scam', 'scam', 'test')
    sql_handler.connect()
    for word in email_trie:
        parameter = "('" + word + "'," + str(email_trie[word]) + ")"
        # print "parameter: ", parameter
        try:
            # TODO: compose an insert-trie query
            result = sql_handler.do_query("insert into email_trie values" + parameter)
        except:
            continue
    sql_handler.disconnect()


if __name__ == '__main__':
    email_trie = trie()  # initialize an empty trie
    build_word_freq(email_trie)
    for word in email_trie:
        print email_trie[word]
    # print email_trie
    save_trie_mysql(email_trie)
from pytrie import SortedStringTrie as trie

t = trie(an=0, ant=1, all=2, allot=3, alloy=4, aloe=5, are=6, be=7)
print t
print t.keys(prefix='al')
print t.items(prefix='an')
print t.longest_prefix('antonym')
print t.longest_prefix_value('alsa', default=-1)
print list(t.iter_prefixes('allotment'))
print list(t.iter_prefix_items('antonym'))
# PoS-tagger's.
# - 2011.02.08: -t has a new effect (capitalization indicator):
#     append 0 if the token starts with a lower-case character,
#     1 if the token starts with an upper-case character and is not the first in the sentence,
#     2 if the token starts with an upper-case character and is the first in the sentence,
#     3 if the token starts with a non-alphabetic character

import codecs, re, sys
from optparse import OptionParser
from pytrie import StringTrie as trie

# Some globals
# Name of the file that contains stemming exceptions in the form of <surface form>TAB<stem> per line
stemexcfile = sys.path[0] + '/stem.exc'  # in the same directory as this script
stemexc = {}  # surface form => stem
stemexcpref = trie()  # surface form prefix => stem
opts = None  # command-line options

# surff: string, surface form
# poss: string, Part-of-Speech tag returned by the PoS-tagger
# morphstr: string, the list of "stem1/pos1/...(+stem2/pos2/...)*" morphological
#           analyses returned by the morphological analyzer
# 1. find surff in stemexc; if found, return the stem associated there
# 2. if there are no analyses (morphstr == None), return surff
# 3. find surff in stemexcpref; if found (the longest prefix),
#    3.1 then check if the associated stem exists among the stems in the analyses; if yes, return it
# 4. if there is only 1 analysis, return its stem (if there is no stem in the analysis, return surff)
# 5. if there are more than 1 analyses:
#    5.1 select stems from morphstr whose 1st PoS-tag is equal to, or (if none such), contains poss
#    5.2 if there are more than one, return the longest one
#    5.3 if there are none, return (surff, True)
# Return: (identified stem, is_surfaceform_used_as_stem) pair
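# --- Illustrative addition (not part of the original script) ---
# A minimal sketch of the stem-selection steps spelled out above; the function
# name find_stem and the parsing of morphstr into "stem/pos/..." analyses are
# assumptions, not the original implementation.
def find_stem(surff, poss, morphstr):
    # 1. exact stemming exception wins
    if surff in stemexc:
        return stemexc[surff], False
    # 2. no morphological analyses: fall back to the surface form
    if morphstr is None:
        return surff, True
    anas = [a.split('/') for a in morphstr.split('+')]
    stems = [a[0] for a in anas]
    # 3. longest surface-form prefix with a known stem, if the analyzer
    #    also proposed that stem
    pref = stemexcpref.longest_prefix(surff, None)
    if pref is not None and stemexcpref[pref] in stems:
        return stemexcpref[pref], False
    # 4. a single analysis: return its stem (or surff if the stem is empty)
    if len(stems) == 1:
        return stems[0] or surff, False
    # 5. several analyses: prefer stems whose 1st PoS-tag equals poss, then
    #    those whose 1st PoS-tag contains poss; return the longest match
    matching = [a[0] for a in anas if len(a) > 1 and a[1] == poss]
    if not matching:
        matching = [a[0] for a in anas if len(a) > 1 and poss in a[1]]
    if matching:
        return max(matching, key=len), False
    return surff, True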
        new_test_phrase = smart_str((phrase + ' ')) + letter.encode('iso-8859-2')
        grabbed_list_depper = get_suggestion(new_test_phrase, LANG, TLD)
        grabbed_list_depper_size = len(grabbed_list_depper)
        if grabbed_list_depper_size == 0:
            # TODO!!, really? - how about 1?
            continue
        elif grabbed_list_depper_size > 0:
            # TODO (up to ten, otherwise print)
            l = level + 2
            if not trie_.has_key(new_test_phrase):
                # Reduce the number of calls (print directly, do not call the scrobbler again)
                grabWords(new_test_phrase, file_path, False, l, for_)


def getAllSuggestion(phrase, file_path, intendations=True):
    cleared_phrase = leaveAccetableSigns(phrase)
    lowered_cleared = cleared_phrase.lower()
    grabWords(phrase, file_path, intendations)


def main():
    if len(sys.argv) != 3:
        print "Usage error: two parameters must be given"
    else:
        getAllSuggestion(sys.argv[1], sys.argv[2])


if __name__ == "__main__":
    trie_ = trie()
    counter = 0
    main()