def combineWithChild(self):
    #first, let's see if it's even legal for us to combine with one of our children
    if (self.verb.verb.isModal() or self.verb.verb.isHelper()):
        return

    #the first verb to compare
    first = self.child.verb
    firstFull = first.verb.get(True)[0]["full"]

    #and the second
    second = self.verb
    secondFull = second.verb.get(True)[0]["full"]

    forms = (
        (word.word(firstFull + " " + secondFull, first.sentLoc, first.clauseLoc, first.numWords), second),
        (word.word(secondFull + " " + firstFull, first.sentLoc, first.clauseLoc, first.numWords), first),
        (word.word(firstFull + secondFull, first.sentLoc, first.clauseLoc, first.numWords), second),
        (word.word(secondFull + firstFull, first.sentLoc, first.clauseLoc, first.numWords), first)
    )

    #go through all possible combinations
    for f in forms:
        if (len(f[0].translations.searchFromDB()) > 0):
            #store the conjugation and defineable forms
            self.verb = f[0]
            self.conjugation = f[1]

            #we're absorbing our child, so remove him
            self.child = self.child.child

            #and store the verbs we used to return in appendVerbs()
            self.verbs = (first, second)

            #and set our flag that we're combined with our child
            self.isCombined = True
            break
def test_ordered_dict_search(self):
    odict = self.new_ordered_dict(4)
    odict.insert(10, 110)
    odict.insert(12, 112)
    odict.insert(15, 115)
    odict.insert(13, 113)
    odict.insert(14, 114)

    # no common edges in the trie
    with self.assertRaises(KeyError):
        odict.search(0)

    # some common edges in the trie
    with self.assertRaises(KeyError):
        odict.search(11)

    # contained elements
    result = odict.search(10)
    self.assertEqual(result, 110)
    result = odict.search_node(10)
    self.assertEqual(result.key, word(10, 4))
    self.assertEqual(result.value, 110)

    result = odict.search(12)
    self.assertEqual(result, 112)
    result = odict.search_node(12)
    self.assertEqual(result.key, word(12, 4))
    self.assertEqual(result.value, 112)
def test_ordered_dict_insert(self):
    odict = self.new_ordered_dict(8)
    self.assertEqual(odict.size(), 0)

    q = word(12, 8)
    node = odict.insert(q, 112)
    self.assertEqual(node.key, q)
    self.assertEqual(node.value, 112)
    self.assertEqual(odict.size(), 1)

    q = word(14, 8)
    node = odict.insert(q, 114)
    self.assertEqual(node.key, q)
    self.assertEqual(node.value, 114)
    self.assertEqual(odict.size(), 2)

    q = word(13, 8)
    node = odict.insert(q, 113)
    self.assertEqual(node.key, q)
    self.assertEqual(node.value, 113)
    self.assertEqual(odict.size(), 3)

    q = word(77, 8)
    node = odict.insert(q, 177)
    self.assertEqual(node.key, q)
    self.assertEqual(node.value, 177)
    self.assertEqual(odict.size(), 4)
def test_insert(self):
    veb = self.new_trie(4)
    ref = self.new_reference_trie(4)

    # \
    #  0111
    a = word(0b0111, 4)
    ref.insert(a)
    veb.insert(a)
    self.assertEqualTrie(veb, ref)

    #   0111
    #  /
    #  \
    #   1000
    b = word(0b1000, 4)
    ref.insert(b)
    veb.insert(b)
    self.assertEqualTrie(veb, ref)

    #   0111
    #  /
    #  \
    #   1 - 000
    #    \
    #     - 001
    c = word(0b1001, 4)
    ref.insert(c)
    veb.insert(c)
    self.assertEqualTrie(veb, ref)
def test_depths(self):
    trie = self.new_trie(16)

    q = word(0b0011111011101110, 16)
    depths = list(trie._depths(q))
    expect = [q.split_fst(14), q.split_fst(12), word.epsilon]
    self.assertEqual(depths, expect)

    q = word(0b0011010111010001, 16)
    depths = list(trie._depths(q))
    expect = [q.split_fst(14), q.split_fst(12), word.epsilon]
    self.assertEqual(depths, expect)

    trie = self.new_trie(8)

    q = word(0b00111110, 8)
    depths = list(trie._depths(q))
    expect = [q.split_fst(6), q.split_fst(4), word.epsilon]
    self.assertEqual(depths, expect)

    q = word(0b00110101, 8)
    depths = list(trie._depths(q))
    expect = [q.split_fst(6), q.split_fst(4), word.epsilon]
    self.assertEqual(depths, expect)
def test_insert_order(self):
    # seed: 6897201961525902772
    veb = self.new_trie(8)
    ref = self.new_reference_trie(8)

    a = word(0b00110111, 8)
    ref.insert(a)
    veb.insert(a)
    self.assertEqualTrie(veb, ref)

    # \
    #  00 - 011001
    #   \
    #    - 110111
    b = word(0b00011001, 8)
    ref.insert(b)
    veb.insert(b)
    self.assertEqualTrie(veb, ref)

    # \
    #  00 - 011 - 001
    #   |      \
    #   |       - 100
    #   \
    #    - 110 111
    c = word(0b00011100, 8)
    ref.insert(c)
    veb.insert(c)
    self.assertEqualTrie(veb, ref)
def test_has_prefix(self):
    # word itself is a prefix
    a = word(0b1100, 4)
    p = word(0b1100, 4)
    result = a.has_prefix(p)
    self.assertTrue(result)

    # epsilon is a prefix
    a = word(0b1100, 4)
    p = word.epsilon
    result = a.has_prefix(p)
    self.assertTrue(result)

    # a proper prefix is a prefix
    a = word(0b1100, 4)
    p = word(0b110, 3)
    result = a.has_prefix(p)
    self.assertTrue(result)

    # a word which is shorter, but is no prefix
    a = word(0b1100, 4)
    p = word(0b010, 3)
    result = a.has_prefix(p)
    self.assertFalse(result)

    # a word which is longer can't be a prefix
    a = word(0b1100, 4)
    p = word(0b11001, 5)
    result = a.has_prefix(p)
    self.assertFalse(result)
def test_search_phase_two(self):
    trie = Mihai.Tree(16)
    trie.construct([
        0b0111111000000010, 0b1000100100010011, 0b1010101101011110,
        0b1110110010010001, 0b1111100110110010, 0b0110000011111000,
        0b0000011110101100, 0b0101101000111011, 0b0111101010010111,
        0b0001010010110101, 0b0110100011010001, 0b0101010100000001,
        0b1100101010101110, 0b1110001101101010, 0b0010001001100001,
        0b0001101011100100, 0b0111100011011101, 0b0100000010000111,
        0b1100110011100000, 0b0101010100110111, 0b1000111001111010,
        0b0000101100001000, 0b1000001010000011, 0b0010011101100011,
        0b1010110101110111, 0b0110100100101001, 0b0011101101101101,
        0b0100010000000101, 0b0000101001001101, 0b1011000111100100
    ])

    # q is below (u,v)
    q = word(0b1101110101100101, 16)
    index = trie._lca_search2(q, 2, 2, 4)
    self.assertEqual(index, 3)

    # q is in between (u,v)
    q = word(0b1000011010001001, 16)
    index = trie._lca_search2(q, 4, 4, 8)
    self.assertEqual(index, 5)

    # q is below (u,v)
    q = word(0b1010100101111010, 16)
    index = trie._lca_search2(q, 5, 5, 8)
    self.assertEqual(index, 6)

    # q is below (u,v)
    q = word(0b1100101111111111, 16)
    index = trie._lca_search2(q, 5, 5, 8)
    self.assertEqual(index, 7)

    # q is in between (u,v)
    q = word(0b1010000010100001, 16)
    index = trie._lca_search2(q, 4, 4, 8)
    self.assertEqual(index, 4)

    # q is in between (u,v)
    q = word(0b0110100011110001, 16)
    index = trie._lca_search2(q, 8, 8, 12)
    self.assertEqual(index, 10)

    # q is in between (u,v)
    q = word(0b0101010100000000, 16)
    index = trie._lca_search2(q, 12, 12, 16)
    self.assertEqual(index, 14)

    # q is in between (u,v)
    q = word(0b0000011110100101, 16)
    index = trie._lca_search2(q, 12, 12, 16)
    self.assertEqual(index, 12)

    # q is contained
    q = word(0b0111111000000010, 16)
    index = trie._lca_search2(q, 16, 16, 16)
    self.assertEqual(index, 16)
def test_hash(self):
    a = word(12, 8)
    b = word(12, 8)

    hashmap = {}
    hashmap[a] = False
    hashmap[b] = True

    self.assertEqual(hash(a), hash(b))
    self.assertTrue(hashmap[a], 'a should hash to the same location as b')
    self.assertTrue(hashmap[b], 'b should hash to the same location as a')
def test_pred(self):
    self.assertEqual(word(3, 2).pred(), word(2, 2))
    self.assertEqual(word(2, 2).pred(), word(1, 2))
    self.assertEqual(word(1, 2).pred(), word(0, 2))

    with self.assertRaises(TypeError):
        word(0, 2).pred()
def insert(self, q, value=None):
    """ Insert q into the trie """
    q = word(q, self.w)
    start_node = self.root.child(q)

    # q navigates into an empty subtree of the root
    if start_node is None:
        node = self._insert_leaf(q, value, self.root)
        self._size += 1
        return node

    assert \
        start_node.is_leaf() or (
            start_node.left is not None and start_node.right is not None
        ), "start_node is either a leaf or a strict branch node"

    lca, child = self.lowest_common_ancestor(q)

    # q is already present; just update its value
    if lca.is_leaf_of(q):
        lca.value = value
        return lca

    assert \
        child is not None, \
        "the subtree in which q belongs is not empty"

    _, new_node = self._insert_node(q, value, lca, child)
    self._size += 1
    return new_node
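# --- Hedged usage sketch (not from the original source) ---
# The enclosing class for insert() above is not shown in these snippets, so
# `trie` below is a hypothetical alias for it, constructed with a word size w
# (mirroring new_trie(w) in the tests). It exercises only the API visible in
# these snippets: insert() returning the node, and the companion search_node().
t = trie(8)                        # assumption: trie(w) builds an empty trie
node = t.insert(12, 'twelve')      # 12 is coerced to word(12, 8) internally
assert t.search_node(12) is node   # the inserted leaf can be found again
assert t.search_node(13) is None   # absent keys yield None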
def __participleMeanings(self, participles, meanings):
    #add our participles to our meanings
    for p in participles:
        presentParticiple = p.verb.isPresentParticiple()
        forms = p.verb.get(unknownHelper = True)

        #save the full form of our word for the translation
        origWord = p.verb.word

        #if we found no conjugations for the verb, then we had something like "gesehenen",
        #so we need to get a new word from the stem of the participle, then we let the
        #translator run through all its stuff and get the meaning of the verb, and then to
        #the output it goes
        if (len(forms) == 0):
            p = word.word(p.verb.getParticipleStem()[0], p.sentLoc, p.clauseLoc, p.numWords)
            forms = p.verb.get(True)

        fullForm = forms[0]["full"]
        loc = p.sentLoc

        #fix for python 2.4
        tense = "past participle"
        if (fullForm == origWord):
            tense = "infinitive"
        elif (presentParticiple):
            tense = "present participle"

        for t in p.get("verb"):
            meanings.append({
                "en": "(" + tense + ") " + t["en"],
                "de": fullForm,
                "deOrig": origWord,
                "deWordLocation": loc
            })
def parse(self, links):
    strwords = []
    for link in links:
        #get html
        with urllib.request.urlopen(link) as url:
            page = url.read()

        #beautiful soup object
        soup = BeautifulSoup(page)

        #extract and combine paragraphs
        paragraphs = soup.find_all('p')
        for x in paragraphs:
            #separate all words (returns list)
            strwords = strwords + x.getText().split()

    #test regex
    regex = r'\.$|\W$'
    self.filtered = parse.test_regex(strwords, regex)

    #parse words.
    #regex: \.$ matches a period at the end of the string OR \W$ matches a
    #special char at the end of the string (which also covers special chars
    #standing alone).
    #still needs work on apostrophes, missed spaces after periods, and
    #word?word; it leaves empty strings instead of deleting them, I think.
    #this should be made into a method.
    strwords[:] = [re.sub(regex, '', x) for x in strwords]

    #Counter returns a dictionary-like object
    strwords = Counter(strwords)
    for x, y in strwords.items():
        #could also use collections.defaultdict
        if x in self.words:
            self.words[x].incr_count(y)
        else:
            self.words[x] = word(x, None, None, None, None, y, None)
def get_set(doc_num, num_set):
    word_set = {}
    doc_dir = os.listdir(train_path)
    w_dict = {}
    for dd in doc_dir:
        f_list = os.listdir(train_path + dd)
        print "get in the ---> " + dd + " <---"
        for fpath in f_list:
            d_path = train_path + dd + '/' + fpath
            with open(d_path, "rb") as d_file:
                list_tmp = []
                lines = d_file.readlines()
                for line in lines:
                    tokens = nltk.regexp_tokenize(line, pattern)
                    for t in tokens:
                        if t.lower() not in stopword:
                            list_tmp.append(t.lower())
                set_tmp = set(list_tmp)
                for w in set_tmp:
                    if w in word_set:
                        word_set[w].update_dict(cat_dic[dd])
                    else:
                        #surprise! if the dict is not initialized per word,
                        #all words end up sharing the same dict!
                        word_set[w] = word(w, 0, 0, 0, {cat_dic[dd]: 1})

    #get the word_in_doc nums
    for idx in word_set:
        word_set[idx].get_docs()
        word_set[idx].get_widf(doc_num)
        word_set[idx].get_s(doc_num, num_set)
    return word_set
def lowest_common_ancestor(self, q):
    """
    lca   - lowest common ancestor
    child - child of the lowest common ancestor in the direction of q

    return [lca, child]
    """
    q = word(q, self.w)
    start_node = self.root.child(q)

    # q navigates into an empty subtree of the root
    if start_node is None:
        return [self.root, None]

    curr = start_node
    c, i = q.split(curr.edge.w)

    # c == curr.edge iff q.has_prefix(curr.key)
    while not curr.is_leaf() and c == curr.edge:
        curr = curr.child(i)
        c, i = i.split(curr.edge.w)

    if c == curr.edge:
        return [curr, None]
    return [curr.parent, curr]
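# --- Hedged sketch of the lowest_common_ancestor() contract (not from the
# original source; `trie` is the same hypothetical alias as above) ---
t = trie(4)
t.insert(0b1100)
lca, child = t.lowest_common_ancestor(word(0b1101, 4))
# lca is the deepest existing node whose key is a prefix of q (here the
# root), and child is lca's subtree in the direction of q (here the 1100
# leaf); child is None when that subtree is empty.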
def new_node(self, q, value=None):
    q = word(q, self.w)
    node = self.root.new_node(q, value)

    # create a new loose node
    node.parent = None
    return node
def List2Trie(self, filename):
    self.CharCounts = {}
    totalCharCount = 0
    self.CharCounts = dict([(i, 1) for i in range(len(variations.Chars))])
    self.allWords = {}
    self.allWords[-1] = word(self, -1)
    self.BaseWord = -1

    splited_words = open(os.path.join(os.path.dirname(__file__), "splited.csv")).readlines()#[:10000]
    c = 0
    self.totalWordCount = 0
    for s in splited_words:
        c += 1
        m = re.match("(.*?),([0-9]+)", s)
        if len(m.groups()) == 2:
            st1 = m.groups()[0]
            chrs = variations.SplitIntoChars(st1)
            cnt = int(m.groups()[1])
            self.totalWordCount += cnt
            for i in chrs:
                self.CharCounts[i] += cnt
            totalCharCount += len(chrs) * cnt
            self.GetIndexedWord(self.BaseWord).Add(chrs, cnt, st1)

    bw = open("charprobabilities.dat", 'wb')
    charprobabilities = ""
    for i in range(len(variations.Chars)):
        self.CharCounts[i] = math.log(self.CharCounts[i]) - math.log(totalCharCount)
        charprobabilities += variations.Chars[i] + "," + str(self.CharCounts[i]) + "\n"
        bw.write(pack('d', self.CharCounts[i]))
    bw.close()
    open("charprobabilities.csv", 'w').write(charprobabilities)

    self.xCountCalcAll(self.BaseWord, 0)
    self.Save(filename, "modified.csv")
def processa_linea_vocabolo(self, line):
    splittedline = line.split(";")
    if len(splittedline) != 4:
        # every line is supposed to have four fields
        print(bcolors.FAIL, "Line: " + line + " ignored; it does not contain 4 distinct fields separated by \";\"", bcolors.RESET)
        return None

    solution = splittedline[0]
    rule = splittedline[1]
    term = splittedline[2]
    group = splittedline[3]

    solution_set = {}
    solution_set["e"] = ["è", "é", "e"]
    solution_set["o"] = ["ò", "ó", "o"]
    solution_set["s"] = ["ss", "s", "sss"]
    solution_set["z"] = ["zz", "z", "zzz"]

    if group not in solution_set.keys():
        print(bcolors.FAIL, "Line: " + line + " ignored; the group: ", group, " is not valid", bcolors.RESET)
        return None
    elif solution not in solution_set[group]:
        print(bcolors.FAIL, "Line: " + line + " ignored; for group: ", group, " the accepted solutions are: ", solution_set[group], bcolors.RESET)
        return None

    #if we got this far, everything is fine
    lexic = word.word(solution, rule, term, group)
    return lexic

possible_solution_set = ["è", "é", "ò", "ó"]
def test_ordered_dict_random_remove(self, seed=None):
    with self.random(seed) as rand:
        size = rand.randint(0, 150)
        samples = rand.sample(xrange(255), size)

        while len(samples) > 0:
            odict1 = self.new_ordered_dict(8)
            odict2 = self.new_ordered_dict(8)

            odict1.extend(samples)
            rand.shuffle(samples)
            val = samples.pop()
            result = odict1.remove(val)
            odict2.extend(samples)

            self.assertEqual(result.key, word(val, 8))
            self.assertEqual(odict1.elements(), odict2.elements())
            self.assertEqual(odict1.size(), len(samples))

            min1 = odict1.min_node()
            min2 = odict2.min_node()
            self.assertEqual(min1 and min1.key, min2 and min2.key)

            max1 = odict1.max_node()
            max2 = odict2.max_node()
            self.assertEqual(max1 and max1.key, max2 and max2.key)
def test_successor_with_lca(self):
    # special case: trie is empty
    trie = self.new_trie(8)
    q = word(54, 8)
    result = trie.successor_with_lca(q, trie.root, None)
    self.assertIsNone(result)
def successor_node(self, q):
    q = word(q, self.w)

    # q is in the set
    child = self.search_node(q)
    if child is not None:
        return child.next_leaf()

    for c in self._depths(q):
        if c in self.T_d:
            lca, child = self.lowest_common_ancestor_start(q, c)
            return self.successor_with_lca(q, lca, child)

        successor = self._successor(q, c)
        if successor:
            return successor

        predecessor = self._predecessor(q, c)
        if predecessor:
            return predecessor.next_leaf()

    return None
def buildWords(theList):
    global theWords
    with open('wordsEn.txt', 'r') as filetxt:
        for line in filetxt:
            for aword in line.split():
                newWord = word.word(aword.strip())
                theWords.append(newWord)
def insert(self, q, value=None):
    q = word(q, self.w)
    try:
        new_node = self._insert(q, q, value)
        self._size += 1
        return new_node
    except ValueError as err:
        # the key already exists; _insert reports the existing node
        return err.args[0]
def remove(self, q):
    q = word(q, self.w)
    try:
        removed = self._remove(q)
        self._size -= 1
        return removed
    except KeyError:
        # re-raise with the raw integer key
        raise KeyError(q.x)
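# --- Hedged usage sketch for the insert()/remove() pair above (not from the
# original source; `odict` stands in for whatever class defines them,
# mirroring new_ordered_dict(w) in the tests) ---
d = odict(8)
d.insert(249, 1249)
removed = d.remove(249)
assert removed.key == word(249, 8) and removed.value == 1249
# removing a missing key raises KeyError carrying the raw integer:
try:
    d.remove(249)
except KeyError:
    pass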
def test_insert_3bit(self):
    veb = self.new_ordered_dict(3)
    xs = []
    for x in xrange(8):
        veb.insert(x)
        xs.append(word(x, 3))
    self.assertEqual(veb.elements(), xs)
def test_predecessor_query_is_in_successor_tree(self):
    items = [41, 72, 110, 150, 210]
    trie = self.new_trie(8, items)

    result = trie.predecessor(90)
    expect = word(72, 8)
    self.assertEqual(result, expect)
def test_search_phase_one(self):
    trie = Mihai.Tree(16)
    trie.construct([
        0b0111111000000010, 0b1000100100010011, 0b1010101101011110,
        0b1110110010010001, 0b1111100110110010, 0b0110000011111000,
        0b0000011110101100, 0b0101101000111011, 0b0111101010010111,
        0b0001010010110101, 0b0110100011010001, 0b0101010100000001,
        0b1100101010101110, 0b1110001101101010, 0b0010001001100001,
        0b0001101011100100, 0b0111100011011101, 0b0100000010000111,
        0b1100110011100000, 0b0101010100110111, 0b1000111001111010,
        0b0000101100001000, 0b1000001010000011, 0b0010011101100011,
        0b1010110101110111, 0b0110100100101001, 0b0011101101101101,
        0b0100010000000101, 0b0000101001001101, 0b1011000111100100
    ])

    q = word(0b1101110101100101, 16)
    index = trie._lca_search1(q, 0, 0, trie.sqrt_log_u - 1)
    self.assertEqual(index, 0)

    q = word(0b1001011010001001, 16)
    index = trie._lca_search1(q, 0, 0, trie.sqrt_log_u - 1)
    self.assertEqual(index, 0)

    q = word(0b1010100101111010, 16)
    index = trie._lca_search1(q, 0, 0, trie.sqrt_log_u - 1)
    self.assertEqual(index, 4)

    q = word(0b1010000010100001, 16)
    index = trie._lca_search1(q, 0, 0, trie.sqrt_log_u - 1)
    self.assertEqual(index, 4)

    q = word(0b1100101111111111, 16)
    index = trie._lca_search1(q, 0, 0, trie.sqrt_log_u - 1)
    self.assertEqual(index, 4)

    q = word(0b0110100011110001, 16)
    index = trie._lca_search1(q, 0, 0, trie.sqrt_log_u - 1)
    self.assertEqual(index, 8)

    q = word(0b0101010100000000, 16)
    index = trie._lca_search1(q, 0, 0, trie.sqrt_log_u - 1)
    self.assertEqual(index, 12)

    q = word(0b0000011110100101, 16)
    index = trie._lca_search1(q, 0, 0, trie.sqrt_log_u - 1)
    self.assertEqual(index, 12)

    q = word(0b0111111000000010, 16)
    index = trie._lca_search1(q, 0, 0, trie.sqrt_log_u)
    self.assertEqual(index, 16)
def test_create(self):
    # test normal creation
    a = word(15, 4)
    self.assertIsInstance(a, word)

    # test normal creation with a word as argument
    a = word(a, 4)
    self.assertIsInstance(a, word)

    # 4095 needs 12 bits, but the word can only hold 8 bits
    with self.assertRaises(TypeError):
        a = word(4095, 8)

    # you can't create a word from a word whose word size does not match
    with self.assertRaises(TypeError):
        a = word(15, 4)
        a = word(a, 8)
def search_node(self, q):
    q = word(q, self.w)
    lca, child = self.lowest_common_ancestor(q)
    if lca.is_leaf_of(q):
        return lca
    return None
def test_split_concat(self):
    xs = [(w, x) for w in range(4) for x in range(8)]
    for w, x in xs:
        a = word(x, 3)
        c, i = a.split(w)
        b = c.concat(i)
        self.assertEqual(a, b)
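# --- Hedged worked example of the split()/concat() identity tested above
# (not from the original source): split(w) appears to divide a word into its
# first w bits and the remaining bits, judging from split_fst() and the
# edge-walking code elsewhere in these snippets. ---
a = word(0b110, 3)
c, i = a.split(1)        # expected: c == word(0b1, 1), i == word(0b10, 2)
assert c.concat(i) == a  # concat() glues the halves back together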
def successor_node(self, q):
    q = word(q, self.w)

    # tree is empty
    if self.root.is_leaf():
        return None

    lca, child = self.lowest_common_ancestor(q)
    return self.successor_with_lca(q, lca, child)
def test_common_prefix_split(self):
    # a = 0000 1100 = 12
    # b = 0000 1100 = 12
    # p = 0000 1100 = 12
    # suffix1 = 0 with word size 0
    # suffix2 = 0 with word size 0
    a = b = p = word(12, 8)
    s1 = s2 = word(0, 0)
    pre, suf1, suf2 = a.common_prefix_split(b)
    self.assertEqual(pre, p)
    self.assertEqual(suf1, s1)
    self.assertEqual(suf2, s2)

    # a = 1111 0001 0001 1100 = 61724
    # b = 1111 0010 1001 0100 = 62100
    # p = 1111 00 = 60
    # suffix1 = 01 0001 1100 = 284 with word size 10
    # suffix2 = 10 1001 0100 = 660 with word size 10
    a = word(61724, 16)
    b = word(62100, 16)
    p = word(60, 6)
    s1 = word(284, 10)
    s2 = word(660, 10)
    pre, suf1, suf2 = a.common_prefix_split(b)
    self.assertEqual(pre, p)
    self.assertEqual(suf1, s1)
    self.assertEqual(suf2, s2)

    # a = 1100 1101 0001 1100 = 52508
    # b = 0100 1100 1001 0100 = 19604
    # p = empty = 0 with word size 0
    # suffix1 = 1100 1101 0001 1100 = 52508
    # suffix2 = 0100 1100 1001 0100 = 19604
    a = word(52508, 16)
    b = word(19604, 16)
    p = word(0, 0)
    s1, s2 = a, b
    pre, suf1, suf2 = a.common_prefix_split(b)
    self.assertEqual(pre, p)
    self.assertEqual(suf1, s1)
    self.assertEqual(suf2, s2)
def add_word(self):
    pair = self.to_add.pop(0)
    while pair[1] in self.word_card_map:
        if pair[1] != self.word_card_map[pair[1]]:
            self.to_add.append((pair[0], pair[1] + ' 2'))
        if len(self.to_add) == 0:
            return
        pair = self.to_add.pop(0)
    w = word(pair[0], pair[1], False)
    card(self, w)
def test_guess_word(self):
    word.raw_input = lambda _: give_answer(.5, 'çekirge')
    word.word.say = lambda _: True
    word.display = lambda _: True

    w = word.word('çekirge', 'grasshopper')
    for i in range(2000):
        w.guess_word()

    self.assertEqual(2001, w.num_times_seen)
    self.assertEqual(True, abs(float(w.num_times_correct) / w.num_times_seen - .5) < .03)

    word.raw_input = lambda x: raw_input(x)
def test_ordered_dict_remove(self):
    odict = self.new_ordered_dict(8)
    odict.insert(249, 1249)

    with self.assertRaises(KeyError):
        odict.remove(12)

    result = odict.remove(249)
    self.assertEqual(result.key, word(249, 8))
    self.assertEqual(result.value, 1249)
    self.assertEqual(odict.size(), 0)
def search_node(self, q):
    q = word(q, self.w)
    try:
        root = self.T[q]
    except KeyError:
        return None

    # root is the parent of the searched leaf
    q = q.remove_prefix(root.key)[0]
    return root.child(q)
def test_ordered_dict_update_value(self):
    odict = self.new_ordered_dict(8)
    self.assertEqual(odict.size(), 0)

    q = word(12, 8)
    node1 = odict.insert(12, 15)
    node2 = odict.insert(q, 18)

    self.assertIs(node1, node2)
    self.assertEqual(node1.value, 18)
    self.assertEqual(odict.size(), 1)
def chooseWord():
    global wordLength
    global answer
    global theWords
    global wordIdx
    global trys

    wordIdx = randint(0, len(theWords) - 1)
    #DEBUG print(wordIdx)
    wordLength = theWords[wordIdx].wordLength
    #DEBUG print(theWords[wordIdx].getString())
    answer = word.word(" ", True, wordLength)
    trys = 0
def __doTranslations(self, fullForm):
    if (not verbNode.doTranslations):
        return

    #check if we're a `kennen lernen` type guy
    toTranslate = fullForm
    if (self.conjugation.word != self.verb.word):
        words = self.verb.word.split(" ")

        #get the original form of the word
        words[len(words) - 1] = self.conjugation.word
        toTranslate = " ".join(words)

    trans = word.word(toTranslate).get("verb")
    self.__meaning(trans, fullForm)
def test_heap(self):
    word.word.say = lambda _: True
    word.display = lambda _: True

    for pair in [('çekirge', 'grasshopper'), ('okul yılı', 'school year'),
                 ('akort etmek', 'to tune'), ('oynamak', 'to perform'),
                 ('düğmelemek', 'to button'), ('korku', 'fear'),
                 ('memeli', 'mammal'), ('gelir', 'revenue')]:
        text, meaning = pair
        word.raw_input = lambda _: give_answer(.5, text)
        w = word.word(text, meaning)
        heap.heap_node(self.pl, w)
        self.check_values()

    for i in range(20):
        h = self.pl.heap_root
        word.raw_input = lambda _: give_answer(.5, h.word.text)
        h.word.guess_word()
        h.update()
        self.check_values()
def wordCount(self, text):
    #This function returns a dictionary with each word in the text and the
    #number of times it occurs
    print "[-] Preparing datastructures for analysis"
    text = self.sanitise(text)
    text = text.split(' ')
    for word in text:
        word = word.replace(' ', "").strip()
        try:
            self.wordCountDict[word] = self.wordCountDict[word] + 1
        except KeyError:
            self.wordCountDict[word] = 1

    for w in self.wordCountDict:
        wordOb = wd.word(w, int(self.wordCountDict[w]))
        self.wordObjectList.append(wordOb)
    self.stats()
def get_word_dict(path):
    with open(path['lex_path'], 'r') as infile:
        word_dict = {}
        py_dict = {}
        for line in infile.readlines():
            str_tmp = line.rstrip().split('\t')
            word_dict[str_tmp[0]] = word(str_tmp[0], {}, 0)
            py_tmp = get_pinyin(str_tmp[1:])
            if py_tmp in py_dict:
                py_dict[py_tmp].append(str_tmp[0])
            else:
                py_dict[py_tmp] = [str_tmp[0]]

    print "Have got the word dict"
    #max_len = max(len(x) for x in word_dict)
    return word_dict, py_dict
def get_voc_set():
    word_dict = {}
    word_no = 0
    doc_dir = os.listdir(tr_data_path)
    for doc_cat in doc_dir:
        file_list = os.listdir(tr_data_path + doc_cat)
        print 'Processing files in folder: ' + doc_cat
        for file_path in file_list:
            doc_f = open(tr_data_path + doc_cat + '/' + file_path, 'rb')
            document = doc_f.read()
            doc_f.close()
            tokens = set(nlp.word_tokenize(document))
            for w in tokens:
                if w not in word_dict:
                    word_dict[w] = word(w, word_no)
                    word_no += 1
                word_dict[w].update_dict(cat_dict[doc_cat])
    return word_dict
def translate(query, beAggressive):
    """Does the heavy lifting of translating the input"""
    try:
        query = utf8.encode(query)
        if (sentenceFigurer.canTranslate(query)):
            s = sentenceFigurer(query)
            return s.translate(beAggressive)
        else:
            w = word.word(query)
            return w.get()
    except:
        if (app.config.get('debug', False)):
            raise
        else:
            return []
def __translateInheritedTense_modal(self, uberParent):
    """
    Translates an inherited tense.

    When we get here, it means we're doing something like:
        "Ich würde bleiben müssen" -> "I would have to stay"
    """
    form = uberParent.verb.verb.get(True)[0]
    stem = uberParent.verb.verb.getStem()

    #this is the only case I can think of right now; more to come, I'm sure
    if (form["subj2"] == stem):
        self.setTense(tenses.INFINITIVE)

        #and add our translations to the output
        for v in self.conjugation.verb.get(True):
            trans = word.word(v["full"]).get("verb")
            self.__meaning(trans, v["full"])
def translate(self, beAggressive):
    """Assumes we can translate it, then runs a sentence guesser on it"""
    #remove any character that can't be used in a word
    tmpClauses = [re.sub(u"[^a-zA-Z0-9ÄÖÜäöüß\s]*", "", r.strip())
                  for r in re.split("[,\.\?\!\;\:]*", self.query)
                  if len(r) > 0]

    #do a pass over the sentence to count the words in each clause
    words = []
    numWords = 0
    for c in tmpClauses:
        w = c.split(" ")
        numWords += len(w)
        words.append(w)

    #and now do a final pass to build up our word objects
    loc = 0
    ret = []
    for w in words:
        wLen = len(w)
        w = [word.word(w, loc + i, i, wLen) for w, i in zip(w, range(0, wLen))]
        ret += clauseFigurer().translate(w, beAggressive)
        loc += wLen
    return ret
# Pygame Hangman
# hangman.py
# [email protected]
# A simple implementation of Hangman using python3 and pygame

import pygame
import word
from random import randint

#game variables
wordList = []
trys = 0
wordLength = 0
wordIdx = -1
answer = word.word(" ")
complete = False
lost = False
theWords = []
flashC = 0
isFlash = True

# Define some colors
black = (  0,   0,   0)
white = (255, 255, 255)
green = (  0, 255,   0)
red   = (255,   0,   0)

def chooseKey(event):
    key = "a"
    if event.key == pygame.K_b:
        key = "b"
def translate(self, words, beAggressive):
    """Given a complete clause, finds relations amongst verbs and determines their tenses."""
    #run for all the possible verbs (participles could be included in this list)
    tmpVerbs = [v for v in words if v.isVerb()]
    if (len(tmpVerbs) == 0):
        tmpVerbs = [v for v in words if v.isVerb(ignoreLocation = True)]

    if (len(tmpVerbs) == 0):
        participles = []
        meanings = []
        [participles.append(w) for w in words if w.verb.isPastParticiple()]
        [participles.append(w) for w in words if w not in participles and w.verb.isPresentParticiple()]
        self.__participleMeanings(participles, meanings)
        return meanings

    #lowercase the verbs -- we need this for our compares later
    for v in tmpVerbs:
        v.word = v.word.lower()
        v.verb.word = v.verb.word.lower()

    #all the possible verbs in the sentence
    possibleVerbs = [v for v in tmpVerbs if not v.verb.isPresentParticiple()]

    #the present participles that were originally mistaken for verbs -- they were excluded in
    #the above statement, so we need to grab them here
    participles = [v for v in tmpVerbs if v not in possibleVerbs]

    #only add in past participles if they're not in our list of possible verbs -- if it is really
    #a participle and included in the list of possible verbs, it will be pruned out later
    [participles.append(w) for w in words if w not in possibleVerbs and w.verb.isPastParticiple()]

    #present participles are easy -> only add them if they were not gotten from the mistaken list of
    #verbs above
    [participles.append(w) for w in words if w not in participles and w.verb.isPresentParticiple()]

    #step 2: since we are in a clause, we have isolation from all other verbs, so let's
    #start building out our verb tree
    #
    #do we have a separable prefix that needs re-attaching?
    lastWord = words[len(words) - 1]
    if (lastWord.isSeparablePrefix() and len(possibleVerbs) > 0):
        #attempt to see if, when we add the prefix to the verb, it is still a verb
        prefixed = word.word(lastWord.word + possibleVerbs[0].word, possibleVerbs[0].sentLoc,
            possibleVerbs[0].clauseLoc, possibleVerbs[0].numWords)
        if (prefixed.isVerb()):
            tmpVerbs.remove(possibleVerbs[0])
            possibleVerbs[0] = prefixed #it's a separable verb, so replace it

    #pass it onto the tree constructor to build out our verb tree
    tree = verbTree()
    tree.build(possibleVerbs)

    #do our first pass on the tree to clean out the remaining participles
    tree.translate(translate = False)

    #clear our ambiguous words
    ambi = tree.pruneAmbiguousWords(beAggressive)
    if (len(ambi) > 0):
        [tmpVerbs.remove(v) for v in ambi if v in tmpVerbs]
        [possibleVerbs.remove(v) for v in ambi if v in possibleVerbs]

        #and rebuild our tree...again
        tree.build(possibleVerbs)

        #do our second pass on the tree, if we removed some "sein"s
        tree.translate(translate = False)

    for i in (1, 2):
        #add the mistaken participles to our participle list
        participles += tree.pruneParticiples()
        [possibleVerbs.remove(v) for v in participles if v in possibleVerbs]
        tree.build(possibleVerbs)

    #do a final pass (now that it's clean) for the actual tenses and translations
    tree.translate(translate = True)

    #debugging dump of the tenses and nodes
    tree.dump()

    #grab all the used verbs
    verbs = tmpVerbs[:]
    [verbs.remove(v) for v in tree.getVerbs() if v in verbs]

    #only add participles to our list if they're not already in the list (no duplicates allowed)
    #anything left over in verbs at this point was not used in the tree, so chances are it is
    #a participle
    [participles.append(v) for v in verbs if v not in participles]

    #the meanings of the used, conjugated verbs
    meanings = tree.getMeanings()
    self.__participleMeanings(participles, meanings)
    return meanings
import word

a1 = word.word("I")
a2 = word.word_question("want")
a3 = word.word("eat")
a4 = word.word("soup")

# test
print a1, a2, a3, a4
def __translateWithHelper(self, parent):
    #grab our helper's conjugations and stuff
    helperConj = parent.verb.verb.getStem()
    helper = parent.verb.verb.get(unknownHelper = True)[0]

    #if we're going for simple tenses
    if (helper["stem"] == "hab" or helper["stem"] == "sein"):
        #it's possible that we have numerous verbs that take the same past-tense form
        verbs = []
        stem = self.conjugation.verb.getStem()

        #is the verb in the right form for having a helper?
        #check here to make sure that the entered verb is in the right past-tense form
        for v in self.conjugation.verb.get(helper = helper["full"]):
            #make sure we have the right helper, too
            if (v["perfect"] == stem and v["hilfsverb"] == helper["full"]):
                verbs.append(word.word(v["full"]))

        #two loops...otherwise things get far too indented and painful
        for v in verbs:
            used = False

            #process the translation into its proper output form
            if (helperConj in (helper["third"], helper["firstPlural"], helper["first"],
                helper["thirdPlural"], helper["stem"])):
                self.setTense(tenses.PAST_PERFECT)
                used = True
            elif (helperConj == helper["subj2"]):
                self.setTense(tenses.CONDITIONAL_PAST)
                used = True
            elif (helperConj == helper["preterite"]):
                self.setTense(tenses.PLUSQUAM)
                used = True

            #and set the translations with the full form of our word
            #it can grab from our node the conjugated values, &etc.
            if (used):
                self.__doTranslations(v.word)

    #this is a special-case tense -> the combination of a helper and a modal...owwies
    elif (helper["stem"] == "werd"
        and self.verb.word in (word.canoo.helperHaben, word.canoo.helperSein)
        and self.child != None
        and self.child.conjugation.verb.getStem() == self.child.conjugation.verb.get(True)[0]["perfect"]
    ):
        self.setTense(tenses.FUTURE2_HELPER)
        self.child.setTense(tenses.FUTURE2)
        self.child.__doTranslations(self.child.conjugation.verb.get(True)[0]["full"])

    #something going on with werden -> conditional present, passive voice
    elif (helper["stem"] == "werd"):
        conjugatedStem = self.conjugation.verb.getStem()

        #all the possible verbs (ex: gedenken + denken for gedacht)
        for v in self.conjugation.verb.get(helper["full"]):
            used = False

            #if we're looking at an unconjugated form of the verb: sehen
            if (conjugatedStem == v["perfect"]):
                if (helperConj == helper["preterite"]):
                    self.setTense(tenses.PASSIVE_PAST)
                    used = True
                elif (helperConj in (helper["third"], helper["firstPlural"], helper["first"],
                    helper["thirdPlural"], helper["stem"])):
                    self.setTense(tenses.PASSIVE_PRESENT)
                    used = True
            elif (conjugatedStem == v["stem"]):
                if (helperConj == helper["subj2"]):
                    self.setTense(tenses.CONDITIONAL)
                    used = True
                elif (helperConj in (helper["third"], helper["firstPlural"], helper["first"],
                    helper["thirdPlural"], helper["stem"])):
                    self.setTense(tenses.FUTURE)
                    used = True

            if (used):
                self.__doTranslations(v["full"])
def test_to_str(self):
    w = word.word('çöişüğıÇÖİŞIÜĞasd', 'Testing Turkish Characters', False)
    self.assertEqual(w.text, 'çöişüğıÇÖİŞIÜĞasd')
    self.assertEqual(str(w), 'çöişüğıÇÖİŞIÜĞasd')
#!/usr/bin/env python
# Filename: readgre.sh
# Author: LIU Yang
# Create Time: Sun Aug 25 03:04:20 HKT 2013
# License: LGPL v2.0+
# Contact Me: [email protected]

import fileinput, shelve
import word
from config import GRE_DB

wordbook = dict()
windex = 10000  # Magic number 4 GRE words, no portable issue HA HA HA!

for line in fileinput.input():
    items = line.split()
    name, mean = items[0], ' '.join(items[1:])
    wordbook[windex] = word.word(name, mean)
    windex += 1

for idx in wordbook:
    print idx, wordbook[idx]

gre_db = shelve.open(GRE_DB)
for windex in wordbook:
    gre_db[str(windex)] = wordbook[windex]
gre_db.close()
)
)
...
)
'''

parts = ['n', 'aj', 'av', 'pr', 'ab']
desc = [
    [False, False, False, True,  False],
    [True,  False, False, False, False],
    [True,  True,  True,  True,  False],
    [True,  True,  True,  True,  False],
    [True,  True,  True,  True,  False]
]

dummy = word.word('')
dummy.part = ':dnn'

def describes(head, tail):
    if (head == 'cj') or \
       (tail == 'cj') or \
       (head == 'dn') or \
       (tail == 'dn'):
        return False
    if head[0] == ':':
        if head[1:3] == 'dn':
            return False
    if tail[0] == ':':
        if tail[1:3] == 'dn':
            return describes(head, tail[3:])
    # single key
    res = match_pattern(items, partkeys)
    if res == []:
        res = match_idiom(items, combine)
    if res == []:
        res = match_idiom(items, partkeys)
    if res == []:
        sys.stderr.write('ERR: partkey not found in line: %s' % line)
        continue  # skip lines we could not match instead of unpacking []

    # do cache
    index, wname, mean = res
    if wordbook.has_key(index):
        wordbook[index] = wordbook[index] + word.word(wname, mean)
    else:
        wordbook[index] = word.word(wname, mean)
# End reading from file
#
# store into database
word_db = shelve.open(IBT_DB)
for idx in wordbook:
    word_db[idx] = wordbook[idx]
word_db.close()

if __name__ == '__main__':
    db_in = shelve.open(IBT_DB)
    for idx in db_in.keys():
        print db_in[idx]
    db_in.close()