Code example #1
def run_ctrie(ngram_list, write_file):
    t = trie.trie()

    print('create trie...')
    beg = time.time()
    for ngram in ngram_list:
        sub = t.setdefault(ngram, 0)
        sub.data += 1
    print('time=', time.time() - beg, 's')

    print('Search...')
    beg = time.time()
    for i in range(find_epoch):
        for ngram in ngram_list:
            sub = t.find_trie(ngram)
            sub.data += 1
    print('time=', time.time() - beg, 's')

    print('write1...')
    beg = time.time()
    with open(write_file + '.1.txt', 'wt') as f:
        for keys, data in trie.TrieIter(t, True):
            f.write(' '.join(str(i) for i in keys) + '\t{}\n'.format(data))
    with open(write_file + '.2.txt', 'wt') as f:
        for n in range(1, max_order + 1):
            for keys, data in trie.LevelIter(t, n, True):
                f.write(' '.join(str(i) for i in keys) + '\t{}\n'.format(data))
    print('time=', time.time() - beg, 's')
Code example #2
  def test_basic(self):
    t = trie()
    t.add('next')
    t.add('nexas')
    find = t.search('next')
    self.assertTrue(find)
    self.assertTrue(t.search('nexas'))
    # search for a substring of an inserted word
    self.assertFalse(t.search('nexa'))

    # search for a string that is not in the trie
    self.assertFalse(t.search('null'))
Code example #3
def main():
    language_trie = trie.trie(
        "input.txt")  # To load trie from scheme file. clean code. tested
    level_2_list = get_finalized_suggestion(language_trie, arginp, argcount,
                                            "bigram_mal_corpus.txt")
    #training_phase()
    #test(language_trie,"malayalam.txt",module1=True)
    symspell_python.create_dictionary("malayalam.txt")
    #print(symspell_python.dictionary)
    for ii in level_2_list:
        ##print(mlphone_calculator(ii[0]))
        print(symspell_python.get_suggestions(ii[0]))
Code example #4
  def test_delete(self):
    t = trie()
    t.add('next')
    t.add('nexas')
    self.assertTrue(t.delete('nexas'))
    self.assertFalse(t.delete('nexas'))
    self.assertFalse(t.delete('ne'))
    self.assertFalse(t.search('nexas'))
    # search for a substring of the word just deleted
    self.assertFalse(t.search('nexa'))
    # make sure deletion does not affect other words
    self.assertTrue(t.search('next'))
    self.assertFalse(t.search('ne'))
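
The unit tests in examples #2 and #4 exercise a trie with add, search, and delete methods but do not show the class itself. A minimal sketch of a class with that interface (inferred only from the calls in the tests, not taken from any particular project) could look like this:

class trie:
    """Minimal character trie supporting add, search, and delete."""

    def __init__(self):
        self.children = {}    # maps a character to a child trie node
        self.is_word = False  # True if a stored word ends at this node

    def add(self, word):
        node = self
        for ch in word:
            node = node.children.setdefault(ch, trie())
        node.is_word = True

    def search(self, word):
        node = self._find(word)
        return node is not None and node.is_word

    def delete(self, word):
        # returns True only if the word was actually stored
        node = self._find(word)
        if node is None or not node.is_word:
            return False
        node.is_word = False  # child nodes are kept; only the word flag is cleared
        return True

    def _find(self, word):
        node = self
        for ch in word:
            node = node.children.get(ch)
            if node is None:
                return None
        return node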
Code example #5
def get_list_of_candidates(target_string, PAM, gRNA_length,
                           exclude_stop_codons, consider_negative, alt_pams):
    target = target_string
    if consider_negative:
        target_rev = reverse_complement(target)
    if alt_pams is None:
        PAMs = NFiller(PAM).get_list()
    else:
        for pam in alt_pams:
            if len(pam) != 3:
                raise ValueError('Length of one or more PAMs not set to 3')
            for character in pam:
                if character != 'A' and character != 'C' and character != 'G' and character != 'T':
                    raise ValueError('Invalid PAM has been entered')
        PAMs = alt_pams
    len_pams = 3
    candidates_rev = []
    candidates = []
    for PAM in PAMs:
        candidates.extend(find_candidates(target, PAM, gRNA_length))
        if consider_negative:
            candidates_rev.extend(find_candidates(target_rev, PAM,
                                                  gRNA_length))
    trie_dic = trie.trie()
    for candidate in candidates:
        key = candidate[1]
        if exclude_stop_codons and ('TAG' in key or 'TAA' in key
                                    or 'TGA' in key):
            continue
        candidate_position = candidate[0]
        if key not in trie_dic.keys():
            trie_dic[key] = ['+', candidate_position]
        else:
            trie_dic[key].append(candidate_position)
    for candidate in candidates_rev:
        if exclude_stop_codons and ('TAG' in candidate[1] or 'TAA'
                                    in candidate[1] or 'TGA' in candidate[1]):
            continue
        key = reverse_complement(candidate[1])
        candidate_position = len(
            target_string) - candidate[0] - gRNA_length - len_pams
        if key not in trie_dic.keys():
            trie_dic[key] = ['-', candidate_position]
        else:
            trie_dic[key].append(candidate_position)
    return trie_dic
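
Example #5 calls a reverse_complement helper that is not shown in the excerpt. A common implementation (hypothetical here; the original project may define it differently) is:

def reverse_complement(seq):
    # complement each base and reverse the result
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    return ''.join(complement[base] for base in reversed(seq))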
Code example #6
File: part1-trie.py  Project: ryrency/Misc
def test():
    wordsFilename = "top100k.txt"
    tagsFilename = "skip_hashtags-dev.txt"
    t = trie.trie()
    t.insert_file(wordsFilename)

    matches = []
    hash_line = ""
    for line in open(tagsFilename):
        if (line[0] == "#"):
            hashtag = line[1:-1]  # strip leading pound and newline
            hash_line = hashtag
            matches = max_match_hashtag(hashtag, t)

            print hash_line
            if matches:
                for match in matches:
                    print "   ", match
Code example #7
    def find_family(self):
        family_list = {}
        search_tree = trie()
        skus = self.qb_df.index.values

        for sku in skus:
            search_tree.insert(sku)
            family_sku = self.qb_df.ix[sku, 'Family sku']
            if pd.notna(family_sku):
                family_list[sku] = family_sku.split(", ")

        for i in skus:
            prefix_i = self.find_family_prefix(i)
            for j in skus:
                prefix_j = self.find_family_prefix(j)
                if i != j and search_tree.find_prefix(prefix_j) and prefix_i == prefix_j:
                    if i not in family_list:
                        family_list[i] = []
                    family_list[i].append(j)
        self.family = family_list
Code example #8
def build_kmers_tries(kmers_filename, goodkeys_filename, badkeys_filename, 
                      kmers_trie_filename, genome, altpam, pampos, maxcount, n):
    util.check_file_exists(kmers_filename)
    if goodkeys_filename:
        goodkeys = gzip.open(goodkeys_filename, 'w')
    if badkeys_filename:
        badkeys = gzip.open(badkeys_filename,'w')

    kmers_trie = trie.trie()

    f = gzip.open(kmers_filename)
    for line in f:
        kmer, coord = line.strip().split()
        kmer2 = kmer[n:]

        if kmers_trie.has_key(kmer2):
            arr = kmers_trie[kmer2]
            if len(arr) < maxcount + 1:
                coord_int = util.map_coord_to_int(coord, genome)
                arr = np.append(arr, coord_int)
                arr[0] = len(arr) - 1
                kmers_trie[kmer2] = arr
        else:
            coord_int = util.map_coord_to_int(coord, genome)
            label = 0
            if pampos == 'start' and any(kmer.startswith(p) for p in altpam):
                label = 1
            if pampos == 'end' and any(kmer.endswith(p) for p in altpam):
                label = 1
            kmers_trie[kmer2] = np.array([label, coord_int])
            
            # only write keys when the corresponding output file was requested
            if label == 0 and goodkeys_filename:
                goodkeys.write('%s\n' % kmer)
            if label != 0 and badkeys_filename:
                badkeys.write('%s\n' % kmer)
    
    save_single_trie(kmers_trie, kmers_trie_filename)

    if goodkeys_filename:
        goodkeys.close()
    if badkeys_filename:
        badkeys.close()
    f.close()
Code example #9
File: part1-trie.py  Project: ryrency/Misc
def test_with_gs():
    wordsFilename = "test_vocabulary.txt"
    tagsFilename = "hashtags-test.txt"
    t = trie.trie()
    t.insert_file(wordsFilename)

    gs = []
    gs_line = ""
    matches = []
    hash_line = ""
    for line in open(tagsFilename):
        if (line[0] == "$"):
            gs_line = line[1:-1]
            gs = gs_line.split(',')

        if (line[0] == "#"):
            hashtag = line[1:-1]  # strip leading pound and newline
            hash_line = hashtag
            matches = max_match_hashtag(hashtag, t)

        if gs and matches:
            print "-->", hash_line
            print "GS>", gs_line
            n = min(len(matches), len(gs))
            for i in xrange(0, n):
                print "T:", i, " " + matches[i] + " " + gs[i]
                if (matches[i] != gs[i]):
                    print "ERROR:"
            gs = []
            gs_line = ""
            matches = []
            hash_line = ""

        else:
            print hash_line
            if matches:
                for match in matches:
                    print match
Code example #10
def child(M, X):
    """
    Input: - M: context matrix
           - X: list of attributes
    Output: - res: list of potential children (obj, att)
    """

    res = []

    objX = common_objects(M, X)
    L = without(range(len(M[0])), X)
    t = tr.trie(-1, [], [], [])

    for i in L:
        obji = objr(M, i, objX)
        t.insert_trie(i, obji)

    S = t.equivalence()

    for s in S:
        res.append((s[0], sorted(X + s[1])))  # sorted ?

    return res
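
Example #10 relies on helpers that the excerpt does not define. Judging only by the names and the docstring, plausible (purely illustrative) versions of common_objects and without might be:

def common_objects(M, X):
    # rows (objects) of the context matrix that have every attribute in X
    return [obj for obj in range(len(M)) if all(M[obj][att] for att in X)]

def without(indices, X):
    # the given attribute indices with the members of X removed
    return [i for i in indices if i not in X]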
Code example #11
def create_trie(filename):
    text = open(filename, buffering=1)

    # Create an empty trie called file_trie
    file_trie = trie()

    # Regex for parsing each line
    pattern = re.compile(r"^(\w+?)\s*?(\d+?)$")

    # Begin regular expressions on file line by line
    while (text):
        line = text.readline()
        if not line:
            break

        # Parse with Regex
        match = pattern.match(line)
        if match is not None:
            # Add each line's word and hits to trie
            file_trie.add_child(match.groups()[0], int(match.groups()[1]))

    text.close()

    return file_trie
Code example #12
File: decode.py  Project: asaluja/cca-mt
    log_score = False
if log_score and no_cca: #if we are not writing scores, then log scores will be ignored
    sys.stderr.write("Warning! Ignoring log_score ('-l') option, since no_cca flag ('-c') is on\n")    

param_filename = args[0]
output_dir = args[1]
num_process = int(args[2])
param_fh = open(param_filename, 'rb')
model = cPickle.load(param_fh)
extractor = cPickle.load(param_fh)
param_fh.close()
phrase_pairs = ["[X] ||| " + pair for pair in model.get_tokens()]
phrase_pairs.append("[X] ||| [X,1] [X,2] ||| [1] [2]")
phrase_pairs.append("[X] ||| [X,1] [X,2] ||| [2] [1]")
#dev_grammars=args[3]
grammar_trie = trie(phrase_pairs)
print "Data structures from training stage loaded"
if discretize != "": #compute relevant statistics for discretization
    compute_feature_thresholds(model, discretize)

'''
declaration of list that maintains which sentences have failed across all processes
'''
def init(fs):
    global failed_sentences
    failed_sentences = fs

def main():
    failed_sentences = mp.Manager().list()
    pool = mp.Pool(processes=num_process, initializer=init, initargs=(failed_sentences,))
    for sent_num, line in enumerate(sys.stdin):
Code example #13
            return path
        else:
            return ''


def trie_matching(text, trie):
    occurences = defaultdict(list)
    for i in range(len(text)):
        postfix = text[i:]
        path = prefix_trie_matching(postfix, trie)
        if path:
            occurences[path].append(i)

    return occurences


if __name__ == '__main__':
    inp, out = small_example()
    #inp, out = big_example()
    inp = read_dataset()

    root, edges = trie(map(Seq, inp[1:]))
    occurences = trie_matching(Seq(inp[0]), root)

    positions = ''
    for p in inp[1:]:
        if p in occurences:
            positions += ' '.join(map(str, occurences[p])) + '\n'

    #positions = prefix_trie_matching(Seq(inp[0]), root)
    write_result(positions)
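
Example #13 starts in the middle of prefix_trie_matching, so the trie representation is not visible. The classic version of the algorithm, sketched here over a plain nested-dict trie (a stand-in for the project's own trie/Seq types), walks the text symbol by symbol and reports the pattern spelled out when a leaf is reached:

def build_trie(patterns):
    # nested-dict trie: each node maps a symbol to a child node;
    # the empty-string key marks the end of a stored pattern
    root = {}
    for pattern in patterns:
        node = root
        for symbol in pattern:
            node = node.setdefault(symbol, {})
        node[''] = pattern  # leaf marker storing the full pattern
    return root

def prefix_trie_matching(text, root):
    # returns the stored pattern that is a prefix of text, or '' if none matches
    node = root
    for symbol in text:
        if '' in node:
            return node['']
        if symbol not in node:
            return ''
        node = node[symbol]
    return node.get('', '')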
Code example #14
for f in all_files:  #read all the files with .sgm extension
    if f[-4:] == ".sgm":
        fi = open("reuters21578/" + f, mode='r', encoding='latin-1')
        text = fi.read()
        # print(f)

        while True:
            start = text.find("<REUTERS")
            end = text.find("</REUTERS>")
            #print(text[start:end])

            if start == -1:
                break
            getWords(text[start:end])
            text = text[end + 11:]

tr = trie()  # create a trie object and add every word recorded in the files
for word in inversedict:
    tr.add(word)

triefile = open("trie.pickle", 'wb')  #pickle the trie object for later use
pickle.dump(tr, triefile)
triefile.close()

# dump the inverted index dictionary to JSON for later use
with open('invertedindex.json', 'w') as outfile:
    json.dump(inversedict, outfile)

print(inversedict.keys())
Code example #15
#!/usr/bin/python

import trie

search_structure = trie.trie()


def readAndInsertWords(filename):

    fh = open(filename, "r")
    name_list = fh.readlines()

    for name in name_list:
        name = name[0:len(name) - 1]  # strip the trailing newline
        search_structure.insert(name)
    fh.close()


def searchWord(word):
    return search_structure.search(word)


readAndInsertWords("names.txt")

if __name__ == "__main__":

    while True:
        word = raw_input("Enter word to be searched\n")

        ret_val = searchWord(word)
Code example #16
File: main.py  Project: kritika92/search_trie
import os
import stat
import re

from directory_to_list import directory_to_list
from trie import trie

dirfilepath = input('type the filepath of the directory: ')
dir_to_list = directory_to_list(dirfilepath)
dir_to_list.setprefixes()
myprefix = dir_to_list.get_myprefix()

file_list = dir_to_list.get_file_list()
our_trie = trie(myprefix, file_list)
our_trie.make_trie()

#The prefix/words to search
a = []
p = ''
while (1):
    p = input('enter the prefixes , else to exit enter q: ')
    if (p == 'q'):
        break
    else:
        a.append(p)

our_trie.check_prefixes(a)
Code example #17
 def setUp(self):
     self.dict = trie.trie()
     self.dict.readDict("../dict.txt")
     self.b = board.board(5, self.dict)  # pass the loaded trie, not the builtin dict
Code example #18
File: compute_hg.py  Project: jonsafari/spectral-scfg
        optsDict["nodeMarginal"] = 1 #if true, we print out heat maps
    elif opt[0] == '-f': #if marginal is < 0, we flip the sign
        optsDict["flipSign"] = 1
    elif opt[0] == '-m': #MLE 
        optsDict["MLE"] = 1
    elif opt[0] == '-s': #source norm
        optsDict["sourceNorm"] = 1
    elif opt[0] == '-t': #target norm
        optsDict["targetNorm"] = 1
    elif opt[0] == '-x': #only write out marginals for non-lexical rules in source
        optsDict["onlyXX"] = 1

params_fh = open(args[0], 'rb')
paramDict = cPickle.load(params_fh) #key is 'LHS ||| src RHS'
grammar_rules = [rule for rule in paramDict.keys() if rule != "Pi"] #Pi contains the start of sentence params
grammarTrie = trie(grammar_rules) 
rank = int(args[1])
inputFile = open(args[2], 'r').readlines()
numProcesses = int(args[3])
outDir = args[4]
if not os.path.exists(outDir):
    os.makedirs(outDir)

'''
declaration of list that maintains which sentences have failed across all processes
'''
def init(fs, fls):
    global failed_sentences, flipped_sentences
    failed_sentences = fs
    flipped_sentences = fls
Code example #19
def amma_api_initiate_trie():
	return trie.trie("input.txt") # To load trie from scheme file. clean code. tested
Code example #20
File: wyrd.py  Project: Stellarator-X/Wyrd
def load_model(file_name):
    # only the unpickled trie is needed; the with block also closes the file
    with open(file_name, 'rb') as obj_file:
        tree = pickle.load(obj_file)
    return tree
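
For completeness, a matching save helper (hypothetical; not part of the original wyrd.py) would simply pickle the trie the same way example #20's load_model unpickles it:

def save_model(tree, file_name):
    # serialise the trained trie so load_model can restore it later
    with open(file_name, 'wb') as obj_file:
        pickle.dump(tree, obj_file)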
Code example #21
File: part1-min-first.py  Project: ryrency/Misc
#
# Python 2.6

import trie

#wordsFilename = "top100k.txt"
#wordsFilename = "top10k.txt"
#wordsFilename = "top1k.txt"
wordsFilename = "top100.txt"
#wordsFilename = "top20.txt"
#wordsFilename = "test.txt"

tagsFilename = "hashtags_dev.txt"
tagsFilename = "skip_hashtags-dev.txt"

t = trie.trie()
t.insert_file(wordsFilename)

for line in open(tagsFilename):
    hashtag = line[1:]
    print "----------------------" + hashtag
    part = ""
    index = 0
    while index < len(hashtag):
        part = part + hashtag[index]
        if t.is_word(part):
            print part
            part = ""
        index += 1
    print "----------------------"
Code example #22
# Also, I won't be using many libraries and want to keep it that way unless absolutely necessary

import pickle
import re
import numpy as np
import sys

from trie import trie

# Building the player model

player = trie()

with open('Resources/booklist.txt') as booklist:
    books = booklist.readlines()
    for book in books:
        fil = open("Resources/" + book[:-1])
        text = fil.read()
        words = re.split(r'\W+', text)
        words = np.unique(np.array(words))
        n_block = 30
        done = "["
        rem = (n_block) * "~"
        block = int(len(words) / n_block)
        for i, word in enumerate(words):
            print(
                f"\rReading from {book[:-1]} : {done}{rem}] : {i*100/(len(words)):0.2f}% ",
                end="")
            if (i % block == 0):
                # Progress Bar
                done += "="
Code example #23
def generate_adjacent_mers(sequence, max_hamming_distance):
    alphabet = 'AGCT'
    t = trie.trie()
    hamming_ball(sequence, max_hamming_distance, alphabet, t)
    return t
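
Example #23 depends on a hamming_ball helper that is not shown. One plausible recursive implementation (an assumption; it only needs to insert every sequence within the given Hamming distance into the dict-like trie t) is:

def hamming_ball(sequence, max_hamming_distance, alphabet, t):
    # record the sequence itself, then recurse on every single-position mutation
    t[sequence] = True
    if max_hamming_distance == 0:
        return
    for i, original in enumerate(sequence):
        for letter in alphabet:
            if letter != original:
                mutated = sequence[:i] + letter + sequence[i + 1:]
                hamming_ball(mutated, max_hamming_distance - 1, alphabet, t)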