def split_word(sentence):
    sentence = format_sentence(sentence)
    words = sentence.split(' ')
    word_array = []
    hash_table = get_hashtable()
    i = 0
    len_ = len(words)
    while i < len_:
        # Greedy match: try the trigram first, then the bigram, then the
        # single word; unknown words are marked with a leading '~'.
        if i + 2 < len_ and is_exist(
                (words[i] + ' ' + words[i + 1] + ' ' + words[i + 2]).encode('utf-8'),
                hash_table) == u'\u2713':
            word_array.append(
                Word(words[i] + '_' + words[i + 1] + '_' + words[i + 2], u'\u2713'))
            i += 2
        elif i + 1 < len_ and is_exist(
                (words[i] + ' ' + words[i + 1]).encode('utf-8'),
                hash_table) == u'\u2713':
            word_array.append(Word(words[i] + '_' + words[i + 1], u'\u2713'))
            i += 1
        else:
            if is_exist(words[i].encode('utf-8'), hash_table) == u'\u2713':
                word_array.append(Word(words[i], u'\u2713'))
            elif words[i] != '\n':
                word_array.append(Word('~' + words[i], u'\u274C'))
            else:
                word_array.append(Word(words[i], u'\u2713'))
        i += 1
    return word_array
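# A minimal self-contained sketch of the greedy longest-n-gram matching that
# split_word performs above. A plain set stands in for the module's hash
# table, and KNOWN/greedy_segment are hypothetical names used only for
# illustration (format_sentence, get_hashtable, is_exist and Word are assumed
# to live elsewhere in the real module).
KNOWN = {'new york city', 'new york', 'hello'}

def greedy_segment(sentence):
    words = sentence.split(' ')
    out, i = [], 0
    while i < len(words):
        for n in (3, 2, 1):  # longest phrase first, as in split_word
            if i + n <= len(words) and ' '.join(words[i:i + n]) in KNOWN:
                out.append('_'.join(words[i:i + n]))
                i += n
                break
        else:  # no phrase of any length matched: mark the word unknown
            out.append('~' + words[i])
            i += 1
    return out

# greedy_segment('hello new york city') -> ['hello', 'new_york_city']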
def __init__(self, path, source='text'):
    self.words = []
    if source == 'umarker':
        # XML produced by the uncertaintymaker code
        import untangle
        template = untangle.parse(path)
        for word in template.transcription.words:
            if word.word.cdata not in ["[sil]", "[NOISE]", "[SPEECH]"]:
                self.words.append(
                    Word(word.word.cdata, word.timeStart.cdata,
                         word.timeEnd.cdata))
    elif source == 'ldc-wrd':
        wrd_file = open(path, 'r')
        for line in wrd_file:
            elements = line.split(' ')
            word = elements[2].rstrip()
            time_from = self.convert_time(elements[0], from_time='ldc', to_time='ms')
            time_to = self.convert_time(elements[1], from_time='ldc', to_time='ms')
            self.words.append(Word(word, time_from, time_to))
        wrd_file.close()
    elif source == 'text':
        path = path.encode('ascii', 'ignore')
        for word in word_tokenize(path):
            if word not in ['"', "'", ".", "!", '``', '`', "''", '""']:
                self.words.append(Word(word, '', ''))
def words(self, word, line):
    buf = ''  # token accumulator; renamed from `str` to avoid shadowing the builtin
    for w in word:
        if w in self.symbols and not self.reading_string:
            # Flush the accumulated token before emitting the symbol.
            if buf != '':
                if buf.lower() in self.table:
                    self.channel_values.append(Word(buf, buf.lower(), line))
                else:
                    self.channel_values.append(Word(buf, 'identifier', line))
                buf = ''
            self.channel_values.append(Token(w, w, line))
        else:
            buf += w
            if w == "'" and not self.reading_string:
                self.reading_string = True
            elif w == "'" and self.reading_string:
                self.big_string += buf
                self.channel_values.append(String(self.big_string, line))
                self.reading_string = False
                buf = ''
                self.big_string = ''
    if self.reading_string:
        self.big_string += buf + ' '
    elif buf != '':
        self.channel_values.append(Word(buf, 'identifier', line))
def test_IsWordLegal_IllegalWordBadCollisionsOneLetterAnchorDown_ReturnsFalse(self):
    # Arrange
    board = Board()
    t = Tile('t', -1, -1.0, -1)
    e = Tile('e', -1, -1.0, -1)
    s = Tile('s', -1, -1.0, -1)
    i = Tile('i', -1, -1.0, -1)
    n = Tile('n', -1, -1.0, -1)
    g = Tile('g', -1, -1.0, -1)
    hand = Hand("test", [t, e, s, t, i, n, g, t, e, s, t, i, n, g, s, i, t])
    testing = Word([t, e, s, t, i, n, g])
    sit = Word([s, i, t])

    # Act
    board.PlaceWord(testing, board.GetAnchors()[0], hand, 0, 'down')
    board.PlaceWord(sit, board.GetAnchors()[3], hand, 1, 'across')
    results = board.IsWordLegal(testing, board.GetAnchors()[6], 3, 'down')

    # Assert
    self.assertFalse(results[0])
    self.assertEqual(results[1], 'word creates an invalid word when placed')
def get(self, word, stress_ambiguity=True):
    (word, punct) = gleanPunc(word)
    if self.has(word):
        words = self.dict['Word'][word.lower()]
    elif self.getprep:
        words = self.getprep(word, config=self.config)
    else:
        return [Word(word, [], None)]
    if not words:
        return [Word(word, [], None)]

    if type(words) == list:
        if type(words[0]) == tuple:
            # New word needs to be built
            wordobjs = []
            for wordtuple in words:
                wrd = wordtuple[:2]
                attrs = wordtuple[2] if len(wordtuple) > 2 else {}
                wordobj = self.make(wrd, word)
                for _k, _v in list(attrs.items()):
                    setattr(wordobj, _k, _v)
                wordobjs += [wordobj]
            self.dict['Word'][word.lower()] = wordobjs
            return self.maybeUnstress(wordobjs) if stress_ambiguity else wordobjs
        else:
            wordobjs = words
    else:
        wordobjs = [words]
    return self.maybeUnstress(wordobjs) if stress_ambiguity else wordobjs
def parse_matrix(self, matrix):
    self.__check_format(matrix)
    # Horizontal words: scan each row for word spans.
    for i in xrange(len(matrix)):
        words_coordinates = self.__extract_words_coordinates(matrix[i])
        for (x1, x2) in words_coordinates:
            self.horizontal_words.append(Word(i, x1, x2))
    # Vertical words: scan each column for word spans.
    for j in xrange(len(matrix[0])):
        column = [row[j] for row in matrix]  # loop var renamed from `list` to avoid shadowing the builtin
        words_coordinates = self.__extract_words_coordinates(column)
        for (y1, y2) in words_coordinates:
            self.vertical_words.append(Word(j, y1, y2))
    self.__build_tree(self.horizontal_words[0], False)
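# A plausible stand-alone version of the coordinate extraction that
# parse_matrix relies on, assuming '_' marks an open cell and that single
# cells do not count as words; the real (private) __extract_words_coordinates
# may differ in both respects.
def extract_words_coordinates(cells, open_cell='_'):
    runs, start = [], None
    for idx, cell in enumerate(cells):
        if cell == open_cell and start is None:
            start = idx                       # a run of open cells begins
        elif cell != open_cell and start is not None:
            if idx - start > 1:               # keep runs of two or more cells
                runs.append((start, idx - 1))
            start = None
    if start is not None and len(cells) - start > 1:
        runs.append((start, len(cells) - 1))  # run extends to the edge
    return runs

# extract_words_coordinates(['#', '_', '_', '_', '#', '_']) -> [(1, 3)]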
def read_input():
    input_file = open("input.txt", "r")
    # First line: a clause whose words are separated by "OR".
    line1 = input_file.readline().rstrip("\n").split("OR")
    for i in line1:
        a.add_word(Word(i.strip()))
    # Then a count, followed by that many knowledge-base sentences.
    num = int(input_file.readline())
    for i in range(num):
        line = input_file.readline().rstrip("\n").split("OR")
        new_sen = Sentence()
        for j in line:
            new_sen.add_word(Word(j.strip()))
        KB.append(new_sen)
def toCSVEntry(entry):  # parameter renamed from `str` to avoid shadowing the builtin
    elements = entry.split('\t')
    rel = elements[1]
    source = elements[2]
    target = elements[3]
    sourceForm = ''
    targetForm = ''
    sourceCategory = ''
    targetCategory = ''
    # A trailing '/n', '/v', '/a', '/s' or '/r' is a part-of-speech marker.
    if source.endswith(('/n', '/v', '/a', '/s', '/r')):
        sourceForm = source[:-2].replace('/c/fr/', '').replace('_', ' ').strip()
        sourceCategory = source[-1:]
    else:
        sourceForm = source.replace('/c/fr/', '').replace('_', ' ').strip()
    if target.endswith(('/n', '/v', '/a', '/s', '/r')):
        targetForm = target[:-2].replace('/c/fr/', '').replace('_', ' ').strip()
        targetCategory = target[-1:]
    else:
        targetForm = target.replace('/c/fr/', '').replace('_', ' ').strip()
    line = (sourceCategory + "\t" + source + "\t" + sourceForm + "\t" + rel +
            "\t" + targetCategory + "\t" + target + "\t" + targetForm + "\n")
    idSource = toId(sourceForm, sourceCategory)
    idTarget = toId(targetForm, targetCategory)
    # Reuse cached Word objects where possible (get-or-create).
    try:
        sourceWord = words[idSource]
    except KeyError:
        sourceWord = Word(sourceForm, sourceCategory)
        words[idSource] = sourceWord
    try:
        targetWord = words[idTarget]
    except KeyError:
        targetWord = Word(targetForm, targetCategory)
        words[idTarget] = targetWord
    sourceWord.add_relation(rel, targetWord)
    return line
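# Illustrative call for toCSVEntry above, in the ConceptNet-style
# tab-separated form it expects (the concrete concepts are made up, and the
# module-level words dict and toId helper are assumed):
#
#   toCSVEntry('0\t/r/RelatedTo\t/c/fr/chat/n\t/c/fr/animal/n')
#
# returns the line
#
#   'n\t/c/fr/chat/n\tchat\t/r/RelatedTo\tn\t/c/fr/animal/n\tanimal\n'
#
# and records the chat -> animal relation on the cached Word objects.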
def add(self, word, documentID, occurences):
    if word in self.cache:  # membership test on the dict directly
        self.cache[word].add(documentID, occurences)
    else:
        wordInstance = Word(word, self.directory)
        wordInstance.add(documentID, occurences)
        self.cache[word] = wordInstance
def main():
    content = fileReader()
    content = [Word(line, targetWord) for line in content]  # loop var renamed from `str`
    print(f"Created {len(content)} words")
    content = [word for word in content if word.isValid]
    print(f"Reduced to {len(content)} valid candidates.")
    findMatches(content)
def WriteSonnet(self):
    self.mySonnet = []
    for i in range(self.desiredLines):
        line = []
        followingWord = Word("@")  # "@" is the chain's boundary/sentinel word
        syllables = 0
        # If this line must rhyme, seed it with a word rhyming with the
        # last word of the earlier line it is paired with.
        if self.rhymeLines[i] >= 0:
            followingWord = self.wordChain.GetRhymingWord(
                self.mySonnet[self.rhymeLines[i]][-1], self.rhymeLevel)
            line.insert(0, followingWord.GetWord())
            syllables += followingWord.CountSyllables()
        # Build the line backwards until it has enough syllables.
        while syllables < self.desiredLength:
            nextWord = self.wordChain.GetRandomLeader(followingWord.GetWord())
            for k in range(1, 5):  # retry a few times on a sentinel hit
                if nextWord.GetWord() != "@":
                    break
                nextWord = self.wordChain.GetRandomLeader(followingWord.GetWord())
            if nextWord.GetWord() == "@":
                break
            line.insert(0, nextWord.GetWord())
            followingWord = nextWord
            followingWord.GetWordStress()
            syllables += nextWord.CountSyllables()
        self.PrintProgress(i + 1, syllables)
        self.mySonnet.append(line)
def FindBestMove(self, hand, board):
    # Returns (word, anchor, anchor index, direction).
    # Get candidate words for each anchor, score them, and keep the best.
    anchors = board.GetAnchors()
    random.shuffle(anchors)
    bestWord = Word()
    # Sentinel score; the original used the `min` builtin, which breaks
    # the `>` comparison below in Python 3.
    bestWord.SetScore(float('-inf'))
    for anchor in anchors:
        # Match words to the tiles available in the hand for this anchor.
        words = self.MatchWords(hand, anchor, board)
        if words is not None:
            for word in words.keys():
                word.SetScore(self.heuristic.ScoreWord(word, hand))
                if word.GetScore() > bestWord.GetScore():
                    bestWord = word
                    bestAnchor = anchor
                    bestIndex = words[word][0]
                    bestDirection = words[word][1]
    # No legal move was found at any anchor.
    if bestWord.GetScore() == float('-inf'):
        raise Exception("BRI: No valid word options found!")
    return bestWord, bestAnchor, bestIndex, bestDirection
def MatchWords(self, hand, anchor, board):
    # Match available tiles in hand to possible words for a given anchor.
    anchorWords = anchor.GetPossibleWords()
    handTiles = hand.PeekHand()
    anchorTile = anchor.GetTile()
    if anchorTile.GetLetter() == " ":  # was `is " "`: never compare strings by identity
        handTiles.append(anchorTile)
    tiles = handTiles
    totalHand = Word(tiles)
    options = anchorWords.WordSearch(totalHand)
    optionsCleaned = dict()
    direction = anchor.GetDirection()
    timeStart = time.time()
    shuffledOptions = list(options.GetDict().values())
    random.shuffle(shuffledOptions)
    for strWordList in shuffledOptions:
        for strWord in strWordList:
            if len(strWord) <= len(handTiles):
                word = self.MakeItWord(strWord)
                if anchor.GetLetter() == " ":
                    indices = [int(len(strWord) / 2)]
                else:
                    indices = [i for i, a in enumerate(word.GetString())
                               if a == anchor.GetLetter()]
                for i in indices:
                    if board.IsWordLegal(word, anchor, i, direction):
                        optionsCleaned[word] = (i, direction)
        # Give up after five seconds of searching.
        timeDiff = time.time() - timeStart
        if timeDiff > 5:
            break
    return optionsCleaned
def handle_response(self, response):
    soup = BeautifulSoup(response.decode('utf-8', 'ignore'))
    table = soup.find('table', {'border': '1'})
    is_going_on = False
    for tr in table.findAll('tr'):
        is_going_on = True
        for td in tr.findAll('td'):
            td_value = str(td)
            word = re.search(r'(.*)kelime=(.+)&c(.*)', td_value).group(2)
            is_letter = len(word) == 1
            is_proper_name = word[0].isupper()
            is_phrase = ' ' in word
            w = Word(word, is_letter, is_proper_name, is_phrase)
            if DictionaryConfig.detailed_log:
                print 'word consumed #', len(DictionaryService.regular_words) + 1, ':', w.get_value()
            # Route the word into the matching bucket.
            if w.is_regular():
                DictionaryService.regular_words.append(w)
            elif w.is_proper_name:
                DictionaryService.proper_words.append(w)
            elif w.is_phrase:
                DictionaryService.phrases.append(w)
            DictionaryService.fs.write(w.formatted_value() + "\n")
    return is_going_on
def addWordToList(self, word, fWord):
    if self.wordInList(word):
        return False
    newWord = Word(word, fWord)
    self.wordList.append(newWord)
    return True
def insert_words(fv, hash_set):
    """
    -------------------------------------------------------
    Retrieves every Word in fv and inserts it into a Hash_Set.
    Each Word object in hash_set contains the number of comparisons
    required to insert that Word object from file_variable into
    hash_set.
    -------------------------------------------------------
    Parameters:
        fv - the already open file containing data to evaluate (file)
        hash_set - the Hash_Set to insert the words into (Hash_Set)
    Returns:
        None
    -------------------------------------------------------
    """
    lines = fv.read()
    words = lines.split()
    for word in words:
        if word.isalpha():  # skip tokens containing punctuation or digits
            k = Word(word.lower())
            hash_set.insert(k)
    return
def insert_words(fv, hash_set):
    """
    -------------------------------------------------------
    Retrieves every Word in fv and inserts it into a Hash_Set.
    Each Word object in hash_set contains the number of comparisons
    required to insert that Word object from file_variable into
    hash_set.
    -------------------------------------------------------
    Parameters:
        fv - the already open file containing data to evaluate (file)
        hash_set - the Hash_Set to insert the words into (Hash_Set)
    Returns:
        None
    -------------------------------------------------------
    """
    fv.seek(0)
    for line in fv:
        for word in line.strip().split():
            if word.isalpha():
                w = Word(word.lower())  # renamed from `l`, easily confused with `1`
                hash_set.insert(w)
    fv.close()  # note: this closes the caller's file
    return
def test_frequency_query_of_empty_word(self):
    """
    Make sure code still runs if an empty string is passed into frequency.
    """
    word = Word("", "")
    querier = DataMuseQuerier()
    self.assertTrue(len(str(querier.get_frequency(word))) > 0)
def test_frequency_query_of_fake_word(self):
    """
    Make sure a non-English word still returns something and doesn't
    throw an error.
    """
    fake_word = Word("asdfjk", "")
    querier = DataMuseQuerier()
    self.assertTrue(len(str(querier.get_frequency(fake_word))) > 0)
def filter_corpus_for_relevance(filein, fileout):
    print "Converting ", filein, " to ", fileout
    print "\tReading ", filein
    inpt = open(filein, 'r')
    content = filter(lambda x: x != "",
                     inpt.readlines()[0].replace("\n", "").split(" "))
    inpt.close()
    print "\tNow filtering and writing relevant words"
    print "\t", len(content), "words to go..."
    outpt = open(fileout, 'w')
    cache = dict()  # maps a word to its lemma, or False if irrelevant
    for i in xrange(len(content)):
        if i % 1000000 == 0:
            print "\t\tIteration", i
        word = content[i]
        if word not in cache:
            word_object = Word(word)
            if word_object.relevant():
                cache[word] = word_object.lemma()
            else:
                cache[word] = False
        token = cache[word]
        if token is not False:
            outpt.write(token + " ")
    outpt.close()
    print "Done!"
def tokenize(self, sent, pos_tags):
    # Build a doubly linked list of Word nodes from the sentence.
    head = Word()
    head.word = [sent[0].lower()]
    head.actual = [sent[0]]
    head.pos = pos_tags[0]
    curr = head
    length = len(sent)
    for i in xrange(1, length):
        new_word = Word()
        new_word.left = curr
        curr.right = new_word
        new_word.word = [sent[i].lower()]
        new_word.actual = [sent[i]]
        new_word.pos = pos_tags[i]
        curr = new_word
    return head, length
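# Usage sketch for the doubly linked list that tokenize builds above. The
# Word node type with .word/.actual/.pos/.left/.right attributes is assumed
# from this module; the sentence, tags, and tokenizer name are illustrative.
#
#   head, length = tokenizer.tokenize(['The', 'cat', 'sat'], ['DT', 'NN', 'VBD'])
#   node = head
#   while node is not None:          # walk the .right pointers to the tail
#       print(node.word, node.pos)   # ['the'] DT, then ['cat'] NN, ...
#       node = node.right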
def test_best_synonym_of_empty_word(self):
    """
    Make sure the best-synonym lookup returns an empty string for an
    empty string.
    """
    word_substitutor = WordSubstitutor()
    self.assertEqual(
        word_substitutor.get_best_synonym(Word("", "")).get_word(), "")
def make(self, stressedipasylls_text, token):
    stressedipa = stressedipasylls_text[0]
    sylls_text = stressedipasylls_text[1]
    stress = stressedipa2stress(stressedipa)
    (prom_stress, prom_strength) = getStrengthStress(stress)
    syllphons = self.ipa2phons(stressedipa)
    sylls = []
    for i in range(len(syllphons)):
        syllbody = self.use('SyllableBody', syllphons[i])
        syll = self.use('Syllable', (syllbody, prom_strength[i], prom_stress[i]))
        sylls.append(syll)
    word = Word(token, sylls, sylls_text)
    word.ipa = stressedipa
    word.stress = stress
    word.lang = self.lang
    # A word with no IPA transcription is marked as broken.
    if not word.ipa:
        word.broken = True
    return word
def pseudo2real(self, pseudo_words, increment=False):
    """
    Convert a pseudo sentence to a real sentence.
    If increment is True, update the occurrence count.
    """
    self.words = []
    for pword in pseudo_words:
        try:
            real_word = Word.byAppeared_name(
                pword['appeared_name'].encode('utf-8'))
            if increment:
                real_word.increment()
        except SQLObjectNotFound:
            # We don't have the word yet: look up (or create) its types,
            # then create the Word row itself.
            try:
                main_type = MainType.byName(pword['main_type'].encode('utf-8'))
            except SQLObjectNotFound:
                main_type = MainType(name=pword['main_type'])
            try:
                sub_type = SubType.byName(pword['sub_type'].encode('utf-8'))
            except SQLObjectNotFound:
                sub_type = SubType(name=pword['sub_type'])
            real_word = Word(appeared_name=pword['appeared_name'],
                             appeared_reading=pword['appeared_reading'],
                             base_name=pword['base_name'],
                             base_reading=pword['base_reading'],
                             main_type=main_type.id,
                             sub_type=sub_type.id)
        self.words.append(real_word)
def new_word(self):
    word1 = self.text1.get().lower().strip()
    word2 = self.text2.get().lower().strip()
    if len(word1) == 0 or len(word2) == 0:
        # Report into the message box (the original wrote into the
        # second input field, which looks like a slip).
        self.text3.delete(0.0, END)
        self.text3.insert(0.0, "Fill in all the fields")
        return 0
    elif (word1[0] in ABV_english and word2[0] in ABV_english) or \
         (word1[0] in ABV_russian and word2[0] in ABV_russian):
        self.text3.delete(0.0, END)
        self.text3.insert(0.0, "Not allowed, think twice")
    else:
        languages = (['russian', 'english'] if word1[0] in ABV_russian
                     else ['english', 'russian'])
        word = Word(word=languages[0], translate=languages[1])
        result = word.set(word1, word2)
        if result == -1:
            self.text3.delete(0.0, END)
            self.text3.insert(0.0, "This translation already exists")
        else:
            file = open('new_words.txt', 'a')
            file.write(word1 + '-' + word2 + '\n')
            file.close()
            self.text3.delete(0.0, END)
            self.text3.insert(0.0, "Translation saved!")
def buildWords(word_vectors, features, f_types):
    words = []
    for word_vector in word_vectors:
        word = Word(word_vector, features, f_types)
        words.append(word)
    return words
def comparison_total(hash_set):
    """
    -------------------------------------------------------
    Sums the comparison values of all Word objects in hash_set.
    -------------------------------------------------------
    Parameters:
        hash_set - a hash set of Word objects (Hash_Set)
    Returns:
        total - the total of all comparison fields in the Hash_Set
            Word objects (int)
        max_word - the word having the most comparisons (Word)
    -------------------------------------------------------
    """
    total = 0
    max_word = Word('a')  # placeholder with zero comparisons
    for word in hash_set:
        total += word.comparisons
        if word.comparisons > max_word.comparisons:
            max_word = word
    return total, max_word
def __init__(self):
    print("Word data based on: https://github.com/KKuTu-Korea/KKuTu")
    print("-----------------------------------------------------------")
    self.word = Word()
    self.words = {}  # word dictionary
    self.thirdword = ['', '', '']
    self.usedword = []
def insert_words(fv, hash_set):
    """
    -------------------------------------------------------
    Retrieves every Word in fv and inserts it into a Hash_Set.
    Each Word object in hash_set contains the number of comparisons
    required to insert that Word object from file_variable into
    hash_set.
    -------------------------------------------------------
    Parameters:
        fv - the already open file containing data to evaluate (file)
        hash_set - the Hash_Set to insert the words into (Hash_Set)
    Returns:
        None
    -------------------------------------------------------
    """
    fv.seek(0)
    for line in fv:
        for word in line.split(' '):
            # Ignore punctuation and words containing punctuation.
            if word.isalpha():
                hash_set.insert(Word(word.lower()))
    return
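# Hypothetical driver tying insert_words and comparison_total together. It
# assumes a Hash_Set exposing insert() and iteration, and a Word exposing a
# .comparisons counter, as the docstrings above describe; the file name and
# table size are illustrative.
#
#   hash_set = Hash_Set(101)
#   with open('data.txt', 'r') as fv:
#       insert_words(fv, hash_set)
#   total, max_word = comparison_total(hash_set)
#   print('total comparisons:', total, '| costliest word:', max_word)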
def test_AnchorSearch_FindingWordsWithScript_ReturnsCorrectWords(self):
    # Arrange
    words = Words()
    selectedKeys = []
    s = Tile('S', 1, 0.09480520300163461, 3)
    c = Tile('C', 3, 0.04049934678472928, 29)
    r = Tile('R', 1, 0.07098146383333229, 11)
    i = Tile('I', 1, 0.0885545324304026, 5)
    p = Tile('P', 3, 0.029410465329100584, 41)
    t = Tile('T', 1, 0.06566549066880407, 17)
    word = Word([s, c, r, i, p, t])
    allWordsContainWord = True
    noWordsContainWord = True

    # Act
    selectedWords = words.AnchorSearch(word)
    # Every selected entry must contain the search word...
    for key in selectedWords.GetDict():
        selectedKeys.append(key)
        for element in selectedWords.GetDict().get(key):
            if word.GetString() not in element:
                allWordsContainWord = False
    # ...and no unselected entry may contain it.
    for key in words.GetDict():
        if key not in selectedKeys:
            for element in words.GetDict().get(key):
                if word.GetString() in element:
                    noWordsContainWord = False
                    print(element, "contains the word", word.GetString())

    # Assert
    self.assertTrue(allWordsContainWord)
    self.assertTrue(noWordsContainWord)