Example #1
0
 def __init__(self):
     # Empty string-keyed trie backing this object's data.
     self._data = trie({})
     # Presumably package metadata (language/name/version) — filled in later.
     self.lang = None
     self.name = None
     self.ver = None
     # Incremental SHA-1 of the content; _hashed caches the final digest.
     self.sha = hashlib.sha1()
     self._hashed = None
def search_users(accounts, search_term):
    """Return 'account/name' entries whose alias starts with search_term.

    Aliases are prefix-matched against the module-level ``users_dict`` via a
    PyTrie; each matching alias maps to a list of candidate names.  For each
    account directory under ``log_path``, at most the first matching name is
    reported.
    """
    global dir_path

    # Like search using PyTrie: prefix match over all aliases.
    t = trie(users_dict)
    aliases = t.keys(prefix=search_term)

    # Search through aliases and get corresponding userIds, ALL in one list.
    names = []
    for alias in aliases:
        names.extend(users_dict[alias])

    users = []
    for account in accounts:
        dir_path = log_path + '/' + account
        os.chdir(dir_path)  # preserved: cwd side effect may be relied upon

        # Hoisted out of the name loop: one directory scan per account
        # instead of one os.listdir() call per candidate name.
        entries = set(os.listdir(dir_path))
        for name in names:
            if name in entries:
                users.append(account + '/' + name)
                break  # at most one match per account

    return users
Example #3
0
 def __init__(self):
     # Trie-backed storage for this object's data (starts empty).
     self._data = trie({})
     # Metadata slots, populated after construction.
     self.lang = None
     self.name = None
     self.ver = None
     # Running SHA-1; _hashed holds the cached digest once computed.
     self.sha = hashlib.sha1()
     self._hashed = None
    def __init__(self, *args, **kwargs):
        # pylint: disable=super-init-not-called
        # Keeps two parallel stores: a plain dict keyed by the
        # case-normalized key, and a trie keyed by the original key.
        self._dict = {}
        self._trie = trie(*args, **kwargs)

        d = dict(*args, **kwargs)
        # NOTE(review): iteritems() is Python 2 only; items() on Python 3.
        for key, value in d.iteritems():
            self._dict[case_insensitive(key)] = value
def draw_simple(tech, sequence):
        
    'Init variables, arrays, trie'
    global gTrie
    gTrie = trie()
    
    snps = [ ]

    gene = tech.gene  
    seg_loc = gt.seq_pos(gene, sequence)  
        
    'Prepare plot'
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5, forward=True)
    ax.invert_yaxis()

    'Prepare axis'
    ax.xaxis.tick_top()
    ax.xaxis.set_ticks_position('both') # THIS IS THE ONLY CHANGE
    ax.autoscale_view()
    yPos = 0
    'Draw the sequence first'
    gt.draw_sequence(seg_loc,sequence, ax, yPos)

    length = 0
    "Read a .csv file with motifs, build a Trie from it and store in gTrie"
    with open(tech.motif_file.path) as f:
        motifs = f.readlines()
        for m in motifs[1:]:
            m = m.split(',')[0]
            length = len(m)
            gTrie[m] = 1

    'Get Motifs and their positions'
    motifs = gt.trieGetMotifs(gTrie, sequence, length)
    motif_pos = gt.getMotifPosition(gTrie, sequence, length)
    yPos += 1
    
    gt.draw_all_motifs(tech, seg_loc,sequence, motifs, motif_pos, yPos, ax)    
            
    'Read SNP'
    if not tech.snp_file == None:
        snps = gt.readSNP(tech.snp_file.path)
    
    "Check for each SNP if it belongs to a sequence and eventually plot"
    gt.process_SNP(tech, sequence, seg_loc, motifs, motif_pos, snps, ax)
    
    'Last figure modifucations'
    plt.ylabel("Motifs")
    plt.legend(loc='upper right')
    plt.xticks(())
    plt.yticks(())
    

    save_folder = 'media/graphics/Bild' + str(tech.pk) + '.png'
    plt.savefig(save_folder)
    
    return save_folder
def clear():
    """Reset the module-level inverted index and remove all uploaded files.

    Returns a status string for the caller.
    """
    global inv_idx

    # Remove uploaded files with the stdlib instead of shelling out to `rm`
    # (portable, and safe for filenames the shell would mangle).
    upload_dir = 'uploaded_files'
    if os.path.isdir(upload_dir):
        for entry in os.listdir(upload_dir):
            path = os.path.join(upload_dir, entry)
            if os.path.isfile(path):
                try:
                    os.remove(path)
                except OSError:
                    # Best-effort, matching the ignored exit status of the
                    # original `os.system('rm ...')` call.
                    pass

    # Rebinding is enough; the old trie is garbage-collected.  The previous
    # `del inv_idx` was unnecessary and raised NameError if unset.
    inv_idx = trie()
    return 'Index cleared.'
Example #7
0
    def _create_trie_from_dictionary_lines(self, dict_lines):
        """Build a trie mapping each word to the set of its metadata entries.

        Each line is stripped and split by self.split_line into (word, meta);
        duplicate words accumulate their metas in a single set.
        """
        dict_trie = trie()

        for line in dict_lines:
            word, meta = self.split_line(line.strip())
            # dict.has_key() no longer exists on Python 3; setdefault both
            # tests for and initializes the entry in one step.
            dict_trie.setdefault(word, set()).add(meta)

        return dict_trie
Example #8
0
 def _reset_trie(self):
     """Rebuild self._trie: every URI prefix of each action's route maps to
     a {request_method: [actions]} dict."""
     paths = defaultdict(lambda: defaultdict(list))
     for action in self.actions:
         uri = action.uri
         if uri is None:
             continue
         # Every routed action is also reachable at the root path.
         paths["/"][action.request_method].append(action)
         prefix = ""
         for segment in uri.split('/'):
             if not segment:
                 continue
             # Accumulate "/seg1", "/seg1/seg2", ... prefixes.
             prefix += "/" + segment
             paths[prefix][action.request_method].append(action)
     self._trie = trie(paths.items())
Example #9
0
 def _reset_trie(self):
     """Rebuild self._trie mapping each URI prefix of every action's route
     to {request_method: [actions]}."""
     paths = defaultdict(lambda: defaultdict(list))
     for a in self.actions:
         cu = a.uri
         if cu is not None:
             path = ""
             # Every routed action is also registered under the root path.
             paths["/"][a.request_method].append(a)
             for sub in cu.split('/'):
                 if sub:
                     # Accumulate "/seg1", "/seg1/seg2", ... prefixes.
                     path += "/" + sub
                     paths[path][a.request_method].append(a)
     self._trie = trie(paths.items())
Example #10
0
    def __init__(self):
        super(APIBlueprint, self).__init__()
        # Parsed-document state, filled in during conversion.
        self._metadata = {}
        self._name = None
        self._overview = None
        self._groups = OrderedDict()
        self._trie = trie()
        self._data_structures = OrderedDict()

        # Markdown calls .strip() on the result at the end of conversion;
        # this one-shot shim removes itself and returns the object unchanged.
        def strip():
            del self.strip
            return self

        # trick Markdown in the end of the conversion
        self.strip = strip
Example #11
0
    def __init__(self):
        super(APIBlueprint, self).__init__()
        # Document state populated while parsing the blueprint.
        self._metadata = {}
        self._name = None
        self._overview = None
        self._groups = OrderedDict()
        self._trie = trie()
        self._data_structures = OrderedDict()

        # Self-deleting stand-in for str.strip(): Markdown invokes .strip()
        # on the conversion result, so return self unchanged exactly once.
        def strip():
            del self.strip
            return self

        # trick Markdown in the end of the conversion
        self.strip = strip
 def _create_prefix_trie(self):
     """
     Reads every word from the ``ENGLISH_DICTIONARY`` file, lower-cases
     and de-duplicates it, and stores the words into a SortedStringTrie
     (from ``pytrie``) mapping each word to its sorted rank.
     """
     # NOTE(review): the previous docstring said JAVA_RESERVED_WORDS, but
     # the code reads ENGLISH_DICTIONARY — documented what the code does.
     words = set()
     with open(ENGLISH_DICTIONARY) as dictionary:
         for word in dictionary:
             words.add(word.strip().lower())
     # Sorted for deterministic ranks; the trie accepts (key, value) pairs.
     return trie((word, i) for i, word in enumerate(sorted(words)))
Example #13
0
def build_mirrors():
    """Populate the module-level _mirrors trie (mirror URL -> prefix name).

    Mirror definitions come from portage's configured third-party mirrors,
    or — if _thirdpartymirrors paths were supplied — from those files.
    """
    from portage.util import stack_dictlist, grabdict

    global _mirrors

    tmp = {}
    thirdpartymirrors = {}

    if not _thirdpartymirrors:
        # No explicit mirror files given: use portage's own configuration.
        thirdpartymirrors = portage.settings.thirdpartymirrors()
    else:
        # Parse each given file and merge the prefix -> [mirrors] lists.
        thirdparty_lists = [grabdict(x) for x in _thirdpartymirrors]
        thirdpartymirrors = portage.util.stack_dictlist(thirdparty_lists, incremental=True)

    # Invert to mirror-URL -> prefix for longest-prefix lookups in the trie.
    # NOTE(review): iteritems() is Python 2 only.
    for prefix, mirrors in thirdpartymirrors.iteritems():
        for mirror in mirrors:
            tmp[mirror] = prefix

    _mirrors = trie(tmp)
Example #14
0
## Author: Ryan Kingston
## Last Updated: 5/2011
## Description: Contains functions to convert a text input-stream
##      into bigram and probability tables for various prediction
##      algorithms such as Viterbi.

from __future__ import division #floating-point division
import os, re, time, string, operator
from cPickle import load, dump
import mirror_functions
from pytrie import StringTrie as trie

# Path separator — presumably Windows-specific; verify on other platforms.
path_separator = "\\"

## Initialize corpus of words
corpus = trie()

## Initialize bigram tries
wordbigrams = trie()
charbigrams = trie()
tranbigrams = trie()
obsbigrams = trie()

## Initialize probability tries
startProbs = trie()
transProbs = trie()
obsProbs = trie()

## All structures that are to be swapped to disk
## NOTE(review): presumably resolved by name (e.g. via globals()) when
## pickling/unpickling — confirm against the swap code.
structs = ("wordbigrams","charbigrams","tranbigrams", "obsbigrams", \
                   "transProbs", "startProbs", "obsProbs", "corpus")
Example #15
0
def get_ngram_vocab_prob(m, vocab, sent, ngram, ltrie):
    """Score every vocabulary word against each n-gram context of `sent`
    using kenlm model `m`, and append one trie of distributions per context
    length (0..ngram-1) to `ltrie`.

    Python 2 code (print statements, xrange, iteritems).  Requires ngram > 1.
    """
    # ngram > 1
    lsent = sent if type(sent) is list else sent.split()

    # ldic[i] maps a context string of i words -> trie of
    # word -> (log_prob, matched ngram length, is_oov).
    ldic = []
    # 0, 1, 2, ..., ngram - 1
    for i in xrange(ngram):
        ldic.append({})

    state_in = kenlm.State()
    m.NullContextWrite(state_in)
    # Use <s> as context.  If you don't want <s>, use m.NullContextWrite(state).
    # m.BeginSentenceWrite(ngram_state)
    probs = []
    dist = {}
    # Zero-word context: unconditional score for every vocabulary word.
    for v in vocab:
        state_out = kenlm.State()
        full_score = m.BaseFullScore(state_in, v, state_out)
        # print full_score.log_prob, full_score.ngram_length, full_score.oov
        #probs.append((full_score.log_prob, full_score.ngram_length, full_score.oov, v))
        dist[v] = (full_score.log_prob, full_score.ngram_length, full_score.oov)
    # given 0 word, probs
    # probs.sort(reverse=True)    # lg->sm
    ldic[0]['null'] = trie(dist)

    for wid in range(len(lsent)):
        # Window of up to (ngram - 1) words ending at position wid.
        prev_words = lsent[wid - (ngram - 2) if wid - (ngram - 2) >= 0 else 0:wid + 1]
        gram_m1 = len(prev_words)
        for i in range(1, gram_m1 + 1):
            l_gram_prev = prev_words[-i:]
            s_gram_prev = ' '.join(l_gram_prev)
            # print s_gram_prev
            if not s_gram_prev in ldic[i]:
                probs = []
                # Feed the context words into the model one by one to reach
                # the state after l_gram_prev.
                state_in = kenlm.State()
                m.NullContextWrite(state_in)
                for w in l_gram_prev:
                    # print w, l_gram_prev
                    ngram_state = kenlm.State()
                    full_score = m.BaseFullScore(state_in, w, ngram_state)
                    # print w
                    # print full_score
                    state_in = ngram_state

                s = time.time()
                # NOTE(review): `dist` is reused across contexts without
                # clearing; this works only because every vocab key is
                # overwritten each pass — confirm vocab is constant.
                for v in vocab:
                    state_out = kenlm.State()
                    full_score = m.BaseFullScore(ngram_state, v, state_out)
                    # print v
                    # print full_score
                    # full_score.ngram_length is the matched ngram length ending with v in
                    # (l_gram_prev + v)
                    #probs.append((full_score.log_prob, full_score.ngram_length, full_score.oov, v))
                    dist[v] = (full_score.log_prob, full_score.ngram_length, full_score.oov)

                print time.time() - s
                print 'add....', len(dist)
                # probs.sort(reverse=True)
                j = 0
                sq = time.time()
                print dist['wonderful']
                print time.time() - sq
                for k, v in dist.iteritems():
                    if j < 10:
                        print k, v
                    j += 1
                ldic[i][s_gram_prev] = trie(dist)

                # Timing/diagnostic block comparing dict vs trie lookup.
                sq = time.time()
                tdist = trie(dist)
                print 'create trie: ', time.time() - sq

                print tdist.longest_prefix('wandskafjkasdjfas')

                j = 0
                sq = time.time()
                print tdist['wonderful']
                print time.time() - sq
                for k, v in tdist.iteritems():
                    if j < 10:
                        print k, v
                    j += 1

    # One trie per context length, in order 0..ngram-1.
    for i in xrange(ngram):
        ltrie.append(trie(ldic[i]))
        text = file_buf.read().decode()
        tokens = get_tokens(text, True)

    if tokens is not None:
        if file_name.endswith('.pdf'):
            id = file_name
            for text, para in tokens:
                index_words(inv_idx, para, id, text)
        elif file_name.endswith('.txt'):
            for i, (text, para) in enumerate(tokens, 0):
                id = f'{file_name}_para_{i}'
                index_words(inv_idx, para, id, text)


def index_input_files(inv_idx):
    """Walk the 'input_files' tree and index every file found into inv_idx."""
    for dirpath, _dirnames, filenames in os.walk('input_files'):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            print(full_path)

            with open(full_path, 'rb') as handle:
                index_file(inv_idx, handle, full_path)


if __name__ == '__main__':
    # Build the inverted index over input_files/ and run two sample
    # prefix queries against it.
    inv_idx = trie()
    index_input_files(inv_idx)

    print(inv_idx.get_by_prefix('lorem'))
    print(inv_idx.get_by_prefix('process'))
Example #17
0
from __future__ import print_function

import re
import sys
import portage

from gentoolkit.query import Query
from gentoolkit.package import Package
from gentoolkit.helpers import get_cpvs
import gentoolkit.pprinter as pp

import helpers
from pytrie import SortedStringTrie as trie

# Module state: mirror-URL trie, diff flag, optional mirror-file paths.
_mirrors = trie()
_generate_diff = False
_thirdpartymirrors = []

# Matches SRC_URI='...' or SRC_URI="..." (quote style captured) across lines.
_src_uri_re = re.compile("SRC_URI=(['\"])(.*?)\\1", re.S)


def pmsg(package, msg):
    """Print `msg` prefixed by the (colorized) package name."""
    print("{0}: {1}".format(pp.cpv(str(package)), msg))


def generate_diff(package, bad_uris):
    from difflib import unified_diff

    ebuild = package.ebuild_path()
    before = open(ebuild).read()
Example #18
0
pq.qsize()  # get size
pq.get()  # dequeue an element (using priority)

# set
# NOTE(review): `from sets import Set` is Python 2 only and unused here —
# the builtin set() is what is actually instantiated below.
from sets import Set
s = set()
s.add(1)  # add an element
s.remove(1)  # remove an element

# trie
from pytrie import SortedStringTrie as trie
t = trie({
    'all': 2,
    'allot': 3,
    'alloy': 4,
    'aloe': 5,
    'an': 0,
    'ant': 1,
    'are': 6,
    'be': 7
})  # arg is a dict
t.keys('al')  # get keys prefixed by 'al'
t.items('al')  # get values associated with keys prefixed by 'al'
t.longest_prefix('antonym',
                 None)  # get the longest key that is a prefix of 'antonym'
t.longest_prefix_item(
    'allstar', None
)  # get the value associated with the longest key that is a prefix of 'allstar'
t.longest_prefix_item(
    'area', None
)  # get item ((key,value) tuple) associated with the longest key that is a prefix of 'area'
t.iter_prefixes('al')  # get iterator over the keys that are prefixes of 'al'
Example #19
0
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
from telegram import Bot
import urllib3
import json
import requests
import configparser
import redis
import re
import os
import sys
import jieba
import jieba.analyse
import jieba.posseg
import pytrie

# Forbidden-word filter.  NOTE(review): this `pytrie` exposes
# insert()/build_fail() — an Aho-Corasick-style API, not the mapping-style
# PyTrie package; confirm which package is installed.
forbid_words_filter = pytrie.trie()
with open('forbid.txt') as f:
    for line in f:
        line = line.strip('\n')
        if line != '':
            forbid_words_filter.insert(line)

# Build the automaton's failure links after all words are inserted.
forbid_words_filter.build_fail()

# NOTE(review): `logging` is used below but not imported in this snippet —
# confirm an `import logging` exists elsewhere in the file.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
urllib3.disable_warnings()
logging.getLogger('urllib3').setLevel(logging.CRITICAL)
logger = logging.getLogger(__name__)
logger.setLevel(logging.WARN)
Example #20
0
from pytrie import SortedStringTrie as trie

# Build a SortedStringTrie from keyword arguments (key=value pairs).
t = trie(an=0, ant=1, all=2, allot=3, alloy=4, aloe=5, are=6, be=7)

# NOTE(review): Python 2 print statement.
print t

#https://www.youtube.com/watch?v=RIUY7ieyH40
'''
A trie is an tree data structure that is used to store a mapping where the keys are sequences, usually strings over an alphabet. In addition to implementing the mapping interface, tries facilitate finding the items for a given prefix, and vice versa, finding the items whose keys are prefixes of a given key K. As a common special case, finding the longest-prefix item is also supported.
'''
'''
Algorithmically, tries are more efficient than binary search trees (BSTs) both in lookup time and memory when they contain many keys sharing relatively few prefixes. Unlike hash tables, trie keys don’t need to be hashable.
'''

#Trie data strcuture https://reterwebber.wordpress.com/2014/01/22/data-structure-in-python-trie/
# >>> from pytrie import SortedStringTrie as trie
# >>> t = trie(an=0, ant=1, all=2, allot=3, alloy=4, aloe=5, are=6, be=7)
# >>> t
# {'all': 2, 'allot': 3, 'alloy': 4, 'aloe': 5, 'an': 0, 'ant': 1, 'are': 6, 'be': 7}
# >>> t.keys(prefix='al')
# ['all', 'allot', 'alloy', 'aloe']
# >>> t.items(prefix='an')
# [('an', 0), ('ant', 1)]
# >>> t.longest_prefix('antonym')
# 'ant'
# >>> t.longest_prefix_item('allstar')
# ('all', 2)
# >>> t.longest_prefix_value('area', default='N/A')
# 6
# >>> t.longest_prefix('alsa')
# Traceback (most recent call last):
Example #21
0
def draw_differ(tech1, tech2, sequence):
    """Plot the motifs that DIFFER between tech1 and tech2 on `sequence`
    (tech1-only in black, tech2-only in grey), overlay tech1's SNPs, save
    the figure, and return the saved path.  Python 2 (print statements)."""
    
    gene = tech1.gene  
    seg_loc = gt.seq_pos(gene, sequence)  
    
    tech1.sequence = sequence
    tech2.sequence = sequence
    
    'Init variables, arrays, trie'
    global gTrie1
    global gTrie2
    
    gTrie1 = trie()
    gTrie2 = trie()
    
    "Read a .csv file with motifs, build a Trie from it and store in gTrie"
    length = 0
    with open(tech1.motif_file.path) as f:
        motifs = f.readlines()
        # Skip the CSV header; the motif string is the first column.
        for m in motifs[1:]:
            m = m.split(',')[0]
            # NOTE(review): keeps only the LAST motif's length — assumes all
            # motifs are equally long.
            length = len(m)
            gTrie1[m] = 1
    with open(tech2.motif_file.path) as f:
        motifs = f.readlines()
        for m in motifs[1:]:
            m = m.split(',')[0]
            length = len(m)
            gTrie2[m] = 1
            
    # Motifs common to both techs (used to exclude them from the diff).
    cMotifs = gt.getMatchingMotifs(gTrie1, tech2.motif_file.path)
    
    motifs = getDifferencMotifs(gTrie1, gTrie2, tech1, tech2, cMotifs)
    
    dMotifs = diff_motif_seq(sequence, motifs, length)
    dPos = diff_pos(sequence, motifs, length)
    
    'Prepare plot'
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5, forward=True)
    ax.invert_yaxis()

    'Prepare axis'
    ax.xaxis.tick_top()
    ax.xaxis.set_ticks_position('both') # THIS IS THE ONLY CHANGE
    ax.autoscale_view()

    yPos = 0
    'Draw the sequence first'
    gt.draw_sequence(seg_loc,sequence, ax, yPos)
    yPos += 1
    'Draw Motifs'
    for i, m in enumerate(dMotifs):
        # 1 if the motif belongs to tech1's trie, 2 if tech2's.
        techNr = checkFindIn(gTrie1, gTrie2, m)
        # Stagger overlapping motifs vertically.
        if gt.checkShift(dMotifs[i], dPos[i], dPos[i-1]):
            yPos += 2 
        else:
            yPos = 1
        if techNr == 1:
            draw_motif(seg_loc,sequence, dMotifs[i], dPos[i], yPos, 'black')
            erd = gt.get_MotifERD(tech1.motif_file.path, dMotifs[i])
            print erd
            gt.draw_Information(ax, erd, dPos[i], yPos)
        if techNr == 2:
            draw_motif(seg_loc,sequence, dMotifs[i], dPos[i], yPos, 'grey')
            erd = gt.get_MotifERD(tech2.motif_file.path, dMotifs[i])
            print erd
            gt.draw_Information(ax, erd, dPos[i], yPos)

    'Read SNP'
    # NOTE(review): unlike draw_simple, no None guard on tech1.snp_file —
    # confirm a SNP file is always present for this code path.
    snps = gt.readSNP(tech1.snp_file.path)
    
    # "Check for each SNP if it belongs to a sequence and eventually plot"
    gt.process_SNP(tech1, sequence, seg_loc, dMotifs, dPos, snps, ax)
    
    'Last figure modifications'
    plt.ylabel("Difference")
    plt.legend(loc='upper right')
    plt.xticks(())
    plt.yticks(())
    
    save_folder = 'media/graphics/Bild_differ' + str(tech1.pk) + str(tech2.pk) + '.png'
    plt.savefig(save_folder)
    
    return save_folder
Example #22
0
import Display

#execfile('D:\Documents\Repos\dotphrase\dotPhrases.py')


#get all of the phrases and keys and create the trie
#Read the .ini file
config = ConfigParser.ConfigParser()
config.read('dict.ini')
keys = []
vals = []
for section in config.sections():
    keys.append(config.get(section, 'Key'))
    vals.append(config.get(section, 'Phrase'))

# BUG FIX: trie().fromkeys(keys, vals) bound the *entire* vals list to every
# key (dict.fromkeys shares one value); pair each key with its own phrase.
t = trie(zip(keys, vals))

#read a simple CSV file
#first line is all keys
#second line is all phrases
#ordering must match up!
# lines = [line.rstrip() for line in open('phrases.txt')]
# keys = [l.lower() for l in lines[0].split(',')]
# vals = lines[1].split(',')

#global variables for the key-listener state
phraseChoice = 1
keyBuffer = []
phrases = []
# Characters accepted as part of a phrase key (lower-case letters only).
keyCharacters = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
                 'u', 'v', 'w', 'x', 'y', 'z'}
Example #23
0
#!/usr/bin/env python

import sys
import pytrie

# Filter stdin through a forbidden-word automaton built from forbid.txt.
# NOTE(review): this `pytrie` exposes insert()/build_fail()/process() — an
# Aho-Corasick-style API, not the mapping-style PyTrie package.
t = pytrie.trie()

with open('forbid.txt') as f:
    for line in f:
        line = line.strip('\n')
        if line != '':
            t.insert(line)

# Build failure links once all words are inserted.
t.build_fail()

for line in sys.stdin:
    print(t.process(line))

Example #24
0
def draw_common(tech1, tech2, sequence):
    """Plot the motifs COMMON to tech1 and tech2 on `sequence`, overlay the
    SNPs of both techs (when their SNP files exist), save the figure, and
    return the saved path."""

    gene = tech1.gene
    seg_loc = gt.seq_pos(gene, sequence)

    tech1.sequence = sequence
    tech2.sequence = sequence

    'Init variables, arrays, trie'
    global gTrie
    gTrie = trie()

    'Prepare plot'
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5, forward=True)
    ax.invert_yaxis()

    'Prepare axis'
    ax.xaxis.tick_top()
    ax.xaxis.set_ticks_position('both') # THIS IS THE ONLY CHANGE
    ax.autoscale_view()

    yPos = 0
    'Draw the sequence first'
    gt.draw_sequence(seg_loc,sequence, ax, yPos)

    length = 0
    "Read a .csv file with motifs, build a Trie from it and store in gTrie"
    with open(tech1.motif_file.path) as f:
        motifs = f.readlines()
        # Skip the CSV header; the motif string is the first column.
        for m in motifs[1:]:
            m = m.split(',')[0]
            length = len(m)
            gTrie[m] = 1

    motifs = gt.getMatchingMotifs(gTrie, tech2.motif_file.path)

    common_m = common_motif_seq(sequence, motifs, length)
    common_p = common_pos(sequence, common_m, length)

    yPos += 1
    'Draw Motifs'
    gt.draw_all_motifs(tech1, seg_loc,sequence, common_m, common_p, yPos, ax)

    'Read SNP'
    # BUG FIX: previously guarded on `techN.snp_file.path == None`, which
    # dereferences .path BEFORE the None check and so cannot guard a missing
    # SNP file; test the snp_file itself, as draw_simple does.
    if not tech1.snp_file == None:
        snps1 = gt.readSNP(tech1.snp_file.path)
        gt.process_SNP(tech1, sequence, seg_loc, common_m, common_p, snps1, ax)
    if not tech2.snp_file == None:
        snps2 = gt.readSNP(tech2.snp_file.path)
        gt.process_SNP(tech2, sequence, seg_loc, common_m, common_p, snps2, ax)

    'Last figure modifications'
    plt.ylabel("Common Motifs")
    plt.legend(loc='upper right')
    plt.xticks(())
    plt.yticks(())

    save_folder = 'media/graphics/Bild_common' + str(tech1.pk) + str(tech2.pk) + '.png'
    plt.savefig(save_folder)

    return save_folder
Example #25
0
            name_index[raw].append({ 'source': 'raws', 'item': raw, 'raw_type': raw_type })

raws       = parse_raws()
orderables = parse_orderables()
craftables = parse_craftables()

# Secondary indices over the database, filled in by insert()/insert_raws().
index_by_name = collections.defaultdict(list)
index_by_tool = collections.defaultdict(list)

database = {
    'raws': raws,
    'indices': {
        'by_name': index_by_name,
        'by_tool': index_by_tool,
    }
}

insert(database, 'orderables', orderables)
insert(database, 'craftables', craftables)
insert_raws(database, raws)

# Prefix-search trie over item names (identity mapping: name -> name).
by_name_trie = trie()
for name in index_by_name.keys():
    by_name_trie[name] = name

# JSON for the database; the trie is pickled separately since it is not
# JSON-serializable.
with open('db.json', 'w', encoding = 'utf-8') as fp:
    json.dump(database, fp, indent = 4, sort_keys = True, default = json_default)

with open('db.trie', 'wb') as fp:
    pickle.dump(by_name_trie, fp, protocol = 4)
Example #26
0
    if len(line) == 0:
        break
    # split by '\t'
    values = (line.split('\n')[0]).split('\t')
    # extract values       
    brandDict_List.append(values[0].lower())
    
    length = len(values[0].split(' '))    
    if length > maxBrandLength:
        maxBrandLength = length

print ("# BRAND NAMES IN DICTIONARY:", len(brandDict_List))
print ('MAXIMUM BRAND NAME LENGTH:', maxBrandLength,'\n\n')

## build a trie for brand names in dictionary, mapping name -> list index
t = trie()
# enumerate instead of indexing with range(len(...)).
for i, brand in enumerate(brandDict_List):
    t[brand] = i
print ('DONE BUILDING BRAND NAMES TRIE...\n\n')

print ("################# LOADING DEVELOPMENT DATA SET #####################\n")
## read in development set
dataFile = curdir + os.sep + devFile
product_dev = []    # development set

# open the file
with open(dataFile, 'r', encoding = "ISO-8859-1") as infile:
    line = infile.readline()
    if len(line) == 0:
        break
    try:
Example #27
0
Takes in a string, which is tab separated and newline separated, and outputs
a graph, in a to-be-determined format. 
"""


# import marisa_trie as trie
from pytrie import SortedStringTrie as trie

from time import time

# Load the English word list and build a trie mapping each word to 1.
words = open('englishWords.txt', 'r')
wordlines = [line.strip() for line in words.readlines()]
one = [1] * len(wordlines)
zipped = zip(wordlines, one)
T = trie(zipped)



def boardToGraph(board):
	rows = board.split('\n')
	matrix = [row.split('\t') for row in rows]
	if len(matrix) != 4 or len(matrix[0]) != 4:
		print matrix
		print "something is wrong"
		raise Exception("Something is wrong")

	node_set = set()
	node_dict = {}
	for i in range(16):
		d = {}
Example #28
0
					max_word_len = len(word)
					long_word = word
				if email_trie.has_key(word):
					email_trie.__setitem__(word,\
							email_trie.__getitem__(word) + 1)
				else:
					email_trie.__setitem__(word, 1)
			print reader.line_num
		print "max word len:", max_word_len, "\n",long_word, "\n num words:", email_trie.__len__()

def save_trie_mysql(email_trie):
	"""Persist each word -> count pair from email_trie into the email_trie
	table, skipping rows that fail to insert (e.g. duplicates)."""
	sql_handler = mysql_handler('scam', 'scam', 'test')
	sql_handler.connect()
	
	for word in email_trie:
		# SECURITY: values are concatenated straight into the SQL text —
		# switch to a parameterized query if mysql_handler supports one.
		parameter = "('"	+ word + "'," +	str(email_trie[word]) + ")"
		#print "parameter: ", parameter;
		try:
			sql_handler.do_query("insert into email_trie values" + parameter)
		except Exception:
			# Narrowed from a bare except (which also swallowed
			# KeyboardInterrupt/SystemExit); still best-effort per row.
			continue
	sql_handler.disconnect()

if __name__ == '__main__':
	# Build the word-frequency trie from the e-mail corpus, dump the counts
	# (Python 2 print statements), then persist to MySQL.
	email_trie = trie()#initialize an empty trie
	build_word_freq(email_trie)
	for word in email_trie:
		print email_trie.__getitem__(word)
	#print email_trie
	save_trie_mysql(email_trie)
from pytrie import SortedStringTrie as trie

# Demo of SortedStringTrie queries (Python 2 print statements throughout).
t = trie(an=0, ant=1, all=2, allot=3, alloy=4, aloe=5, are=6, be=7)

print t

print t.keys(prefix='al')

print t.items(prefix='an')

print t.longest_prefix('antonym')

# default=-1 avoids KeyError when no key is a prefix of 'alsa'.
print t.longest_prefix_value('alsa', default=-1)

print list(t.iter_prefixes('allotment'))

print list(t.iter_prefix_items('antonym'))






Example #30
0
#   PoS-tagger's.
# - 2011.02.08: -t has new effect (capitalization indicator):
#   append 0 if token starts with a lower-case character,
#   1 if token starts with upper-case character and is not the first in sentence,
#   2 if token starts with upper-case character and is first in sentence
#   3 if token starts with a non-alphabetic character

import codecs, re, sys
from optparse import OptionParser
from pytrie import StringTrie as trie

# Some globals
# Module-level stemmer state, populated at startup.
# Name of file that contains stemming exceptions in the form of <surface form>TAB<stem> each line
stemexcfile = sys.path[0] + '/stem.exc' # in the same directory as this script is in
stemexc = {} # surf form => stem
stemexcpref = trie() # surface form prefix => stem
opts = None # command-line options

# surff: string, surface form
# poss: string, Part-of-Speech tag returned by PoS-tagger
# morphstr: string, the list of "stem1/pos1/...(+stem2/pos21/...)*" morphological analyses returned by morph. analyzer
# 1. find surff in stemexc; if found, return stem associated there
# 2. if there are no anas (morphstr==None), return surff
# 3. find surff in stemexcpref; if found (the longest prefix),
#  3.1 then check if associated stem exists among stems in morphs; if yes, return it 
# 4. if there's only 1 ana, return its stem (if no stem in ana, return surf)
# 5. if more than 1 anas:
#  5.1 select stems from morphstr whose 1st PoS-tag is equal to, or (if none such), contains poss
#  5.2 if there are more than one, return the longest one
#  5.3 if there are none, return (surff, True)
# Return: (identified stem, is_surfaceform_used_as_stem) pair 
Example #31
0
               new_test_phrase = smart_str((phrase + ' ')) + letter.encode('iso-8859-2')

            grabbed_list_depper = get_suggestion(new_test_phrase, LANG, TLD)
            grabbed_list_depper_size = len(grabbed_list_depper)

            if grabbed_list_depper_size == 0: #TODO!!, realy? - how about 1
               continue
            elif grabbed_list_depper_size > 0: #TODO (do dziesieciu, a inczej print)
               l = level + 2
               if not trie_.has_key( new_test_phrase ):
                  grabWords(new_test_phrase, file_path, False, l, for_) # Reduce number of calls (print directly, not call scrobbler again)
           
def getAllSuggestion(phrase, file_path, intendations = True):
   """Fetch all suggestions for `phrase` and record them into `file_path`."""
   # NOTE(review): cleared_phrase/lowered_cleared are computed but never
   # used — grabWords receives the raw phrase.  Confirm whether the cleaned,
   # lower-cased form was meant to be passed instead.
   cleared_phrase = leaveAccetableSigns(phrase)
   lowered_cleared = cleared_phrase.lower()
   grabWords(phrase, file_path, intendations)

def main():
   """CLI entry point: expects exactly two args, <phrase> <output path>."""
   # NOTE(review): Python 2 print statement below.
   if len(sys.argv) != 3:
      print "Error usage, two parameters should been given"
   else: 
      getAllSuggestion(sys.argv[1], sys.argv[2])

if __name__ == "__main__":
    # `trie_` and `counter` are module globals read by the crawl helpers.
    trie_ = trie()
    counter = 0
    main()