def handle_response(self, response):
    soup = BeautifulSoup(response.decode('utf-8', 'ignore'))
    table = soup.find('table', {'border': '1'})
    is_going_on = False
    for tr in table.findAll('tr'):
        is_going_on = True
        for td in tr.findAll('td'):
            td_value = str(td)
            word = re.search(r'(.*)kelime=(.+)&c(.*)', td_value).group(2)
            is_letter = len(word) == 1
            is_proper_name = word[0].isupper()
            is_phrase = ' ' in word
            w = Word(word, is_letter, is_proper_name, is_phrase)
            if DictionaryConfig.detailed_log:
                print 'word consumed #', len(DictionaryService.regular_words) + 1, ':', w.get_value()
            if w.is_regular():
                DictionaryService.regular_words.append(w)
            elif w.is_proper_name:
                DictionaryService.proper_words.append(w)
            elif w.is_phrase:
                DictionaryService.phrases.append(w)
            DictionaryService.fs.write(w.formatted_value() + "\n")
    return is_going_on
def test_IsWordLegal_IllegalWordBadCollisionsOneLetterAnchorDown_ReturnsFalse(self):
    # Arrange
    board = Board()
    t = Tile('t', -1, -1.0, -1)
    e = Tile('e', -1, -1.0, -1)
    s = Tile('s', -1, -1.0, -1)
    i = Tile('i', -1, -1.0, -1)
    n = Tile('n', -1, -1.0, -1)
    g = Tile('g', -1, -1.0, -1)
    hand = Hand("test", [t, e, s, t, i, n, g, t, e, s, t, i, n, g, s, i, t])
    testing = Word([t, e, s, t, i, n, g])
    sit = Word([s, i, t])
    # Act
    board.PlaceWord(testing, board.GetAnchors()[0], hand, 0, 'down')
    board.PlaceWord(sit, board.GetAnchors()[3], hand, 1, 'across')
    results = board.IsWordLegal(testing, board.GetAnchors()[6], 3, 'down')
    # Assert
    # board.PrintBoard()
    self.assertFalse(results[0])
    self.assertEqual(results[1], 'word creates an invalid word when placed')
def add(self, word, documentID, occurences):
    if word in self.cache:
        self.cache[word].add(documentID, occurences)
    else:
        wordInstance = Word(word, self.directory)
        wordInstance.add(documentID, occurences)
        self.cache[word] = wordInstance
def __init__(self): print("Word_data_Based_on : . https://github.com/KKuTu-Korea/KKuTu") print("-----------------------------------------------------------") self.word = Word() self.words = {} # 단어 dict self.thirdword = ['', '', ''] self.usedword = []
def test_AnchorSearch_FindingWordsWithScript_ReturnsCorrectWords(self):
    # Arrange
    words = Words()
    selectedKeys = []
    s = Tile('S', 1, 0.09480520300163461, 3)
    c = Tile('C', 3, 0.04049934678472928, 29)
    r = Tile('R', 1, 0.07098146383333229, 11)
    i = Tile('I', 1, 0.0885545324304026, 5)
    p = Tile('P', 3, 0.029410465329100584, 41)
    t = Tile('T', 1, 0.06566549066880407, 17)
    word = Word([s, c, r, i, p, t])
    allWordsContainWord = True
    noWordsContainWord = True
    # Act
    selectedWords = words.AnchorSearch(word)
    for key in selectedWords.GetDict():
        selectedKeys.append(key)
        for element in selectedWords.GetDict().get(key):
            if word.GetString() not in element:
                allWordsContainWord = False
    for key in words.GetDict():
        if key not in selectedKeys:
            for element in words.GetDict().get(key):
                if word.GetString() in element:
                    noWordsContainWord = False
                    print(element, "contains the word", word.GetString())
    # Assert
    self.assertTrue(allWordsContainWord)
    self.assertTrue(noWordsContainWord)
def filter_corpus_for_relevance(filein, fileout): print "Converting ", filein, " to ", fileout print "\tReading ", filein inpt = open(filein, 'r') content = filter(lambda x: not x == "", inpt.readlines()[0].replace("\n", "").split(" ")) inpt.close() print "\tNow filtering and writing relevant words" print "\t", len(content), "words to go..." outpt = open(fileout, 'w') cache = dict() for i in xrange(len(content)): if i % 1000000 == 0: print "\t\tIteration", i word = content[i] if word not in cache: word_object = Word(word) if word_object.relevant(): cache[word] = word_object.lemma() else: cache[word] = False token = cache[word] if not token == False: outpt.write(token + " ") outpt.close() print "Done!"
def words(self, word, line):
    str = ''
    for w in word:
        if w in self.symbols and self.reading_string == False:
            if str != '':
                if str.lower() in self.table:
                    self.channel_values.append(Word(str, str.lower(), line))
                    # print(Word(str, self.table.get(str.lower()), line).toString())
                else:
                    self.channel_values.append(Word(str, 'identifier', line))
                    # print(Word(str, Tag.IDENTIFIER, line).toString())
                str = ''
            self.channel_values.append(Token(w, w, line))
            # print(Token(w, self.symbols.get(w), line).toString())
        else:
            str += w
        if w == "'" and self.reading_string == False:
            self.reading_string = True
        elif w == "'" and self.reading_string:
            self.big_string += str
            self.channel_values.append(String(self.big_string, line))
            # print(String(self.big_string, line).toString())
            self.reading_string = False
            str = ''
            self.big_string = ''
    if self.reading_string:
        self.big_string += str + ' '
    else:
        if str != '':
            self.channel_values.append(Word(str, 'identifier', line))
def split_word(sentence):
    sentence = format_sentence(sentence)
    words = sentence.split(' ')
    word_array = []
    hash_table = get_hashtable()
    i = 0
    len_ = len(words)
    while i < len_:
        if i + 2 < len_ and is_exist((words[i] + ' ' + words[i + 1] + ' ' + words[i + 2]).encode('utf-8'),
                                     hash_table) == u'\u2713':
            word_array.append(Word(words[i] + '_' + words[i + 1] + '_' + words[i + 2], u'\u2713'))
            i += 2
        elif i + 1 < len_ and is_exist((words[i] + ' ' + words[i + 1]).encode('utf-8'),
                                       hash_table) == u'\u2713':
            word_array.append(Word(words[i] + '_' + words[i + 1], u'\u2713'))
            i += 1
        else:
            if is_exist(words[i].encode('utf-8'), hash_table) == u'\u2713':
                word_array.append(Word(words[i], u'\u2713'))
            else:
                if words[i] != '\n':
                    word_array.append(Word('~' + words[i], u'\u274C'))
                else:
                    word_array.append(Word(words[i], u'\u2713'))
        i += 1
    return word_array
def get(self, word, stress_ambiguity=True):
    # python3 unnecessary:
    # if type(word)==str:
    #     word=word.decode('utf-8',errors='ignore')
    (word, punct) = gleanPunc(word)
    if self.has(word):
        words = self.dict['Word'][word.lower()]
    elif self.getprep:
        words = self.getprep(word, config=self.config)
    else:
        return [Word(word, [], None)]
    if not words:
        return [Word(word, [], None)]
    if type(words) == list:
        if type(words[0]) == tuple:
            # New word needs to be built
            wordobjs = []
            for wordtuple in words:
                wrd = wordtuple[:2]
                attrs = wordtuple[2] if len(wordtuple) > 2 else {}
                wordobj = self.make(wrd, word)
                for _k, _v in list(attrs.items()):
                    setattr(wordobj, _k, _v)
                wordobjs += [wordobj]
            self.dict['Word'][word.lower()] = wordobjs
            return self.maybeUnstress(wordobjs) if stress_ambiguity else wordobjs
        else:
            wordobjs = words
    else:
        wordobjs = [words]
    return self.maybeUnstress(wordobjs) if stress_ambiguity else wordobjs
def filter_corpus_for_relevance(filein, fileout): print "Converting ", filein, " to ", fileout print "\tReading ", filein inpt = open(filein, 'r') content = filter(lambda x : not x == "", inpt.readlines()[0].replace("\n", "").split(" ")) inpt.close() print "\tNow filtering and writing relevant words" print "\t", len(content), "words to go..." outpt = open(fileout, 'w') cache = dict() for i in xrange(len(content)): if i % 1000000 == 0: print "\t\tIteration", i word = content[i] if word not in cache: word_object = Word(word) if word_object.relevant(): cache[word] = word_object.lemma() else: cache[word] = False token = cache[word] if not token == False: outpt.write(token + " ") outpt.close() print "Done!"
def pseudo2real(self, pseudo_words, increment=False):
    """Convert a pseudo sentence to a real sentence.

    If increment is True, we update the occurrence.
    """
    # Word._connection.debug = True
    self.words = []
    for pword in pseudo_words:
        try:
            real_word = Word.byAppeared_name(pword['appeared_name'].encode('utf-8'))
            # Do we increment?
            if increment:
                real_word.increment()
        except SQLObjectNotFound:
            # We don't have the word yet
            try:
                main_type = MainType.byName(pword['main_type'].encode('utf-8'))
            except SQLObjectNotFound:
                main_type = MainType(name=pword['main_type'])
            try:
                sub_type = SubType.byName(pword['sub_type'].encode('utf-8'))
            except SQLObjectNotFound:
                sub_type = SubType(name=pword['sub_type'])
            # We create a new word object
            real_word = Word(appeared_name=pword['appeared_name'],
                             appeared_reading=pword['appeared_reading'],
                             base_name=pword['base_name'],
                             base_reading=pword['base_reading'],
                             main_type=main_type.id,
                             sub_type=sub_type.id)
        self.words.append(real_word)
def setOb(self, ob):
    Word.setOb(self, ob)
    self.autogenJSON()
    for typ in typs:
        if self.rats[typ]:
            self.rats[typ] *= 10
            self.onlyTyp = typ
def make(self, stressedipasylls_text, token):
    stressedipa = stressedipasylls_text[0]
    sylls_text = stressedipasylls_text[1]
    stress = stressedipa2stress(stressedipa)
    (prom_stress, prom_strength) = getStrengthStress(stress)
    syllphons = self.ipa2phons(stressedipa)
    sylls = []
    for i in range(len(syllphons)):
        syllbody = self.use('SyllableBody', syllphons[i])
        syll = self.use('Syllable', (syllbody, prom_strength[i], prom_stress[i]))
        # print token, i, syllbody, syll, syllphons, stressedipa, stress, prom_stress, prom_strength
        sylls.append(syll)
    word = Word(token, sylls, sylls_text)
    word.ipa = stressedipa
    word.stress = stress
    word.lang = self.lang
    # when is word broken?
    if not word.ipa:
        word.broken = True
    return word
def test_case_value(self):
    letters = Letter.from_raw(test.letters)
    grid = Grid(test.grid)
    word = Word(grid, letters, Letter.from_raw(test.wordons))
    print(repr(word))
    assert word.value == test.value
def new_word(self):
    word1 = self.text1.get().lower().lstrip().rstrip()
    word2 = self.text2.get().lower().lstrip().rstrip()
    if len(word1) == 0 or len(word2) == 0:
        self.text2.insert(0.0, "Fill in all the fields")
        return 0
    elif (word1[0] in ABV_english and word2[0] in ABV_english) or \
            (word1[0] in ABV_russian and word2[0] in ABV_russian):
        self.text3.delete(0.0, END)
        self.text3.insert(0.0, "That is not allowed, think twice")
    else:
        languages = ['russian', 'english'] if word1[0] in ABV_russian else ['english', 'russian']
        word = Word(word=languages[0], translate=languages[1])
        result = word.set(word1, word2)
        if result == -1:
            self.text3.delete(0.0, END)
            self.text3.insert(0.0, "This translation already exists")
        else:
            file = open('new_words.txt', 'a')
            file.write(word1 + '-' + word2 + '\n')
            file.close()
            self.text3.delete(0.0, END)
            self.text3.insert(0.0, "Translation saved!")
def __init__(self, path, source='text'):
    self.words = []
    if source == 'umarker':
        # xml from uncertaintymaker code
        import untangle
        template = untangle.parse(path)
        for word in template.transcription.words:
            if word.word.cdata not in ["[sil]", "[NOISE]", "[SPEECH]"]:
                self.words.append(Word(word.word.cdata, word.timeStart.cdata, word.timeEnd.cdata))
    elif source == 'ldc-wrd':
        wrd_file = open(path, 'r')
        for line in wrd_file:
            elements = line.split(' ')
            word = elements[2].rstrip()
            time_from = self.convert_time(elements[0], from_time='ldc', to_time='ms')
            time_to = self.convert_time(elements[1], from_time='ldc', to_time='ms')
            self.words.append(Word(word, time_from, time_to))
        wrd_file.close()
    elif source == 'text':
        path = path.encode('ascii', 'ignore')
        for word in word_tokenize(path):
            if word not in ['"', "'", ".", "!", '``', '`', "''", '""']:
                self.words.append(Word(word, '', ''))
def FindBestMove(self, hand, board):
    # Return word, anchor, anchorindex, direction
    # Get words for each anchor, find best word, compare best words sequentially
    anchors = board.GetAnchors()
    random.shuffle(anchors)
    bestWord = Word()
    bestWord.SetScore(min)
    for anchor in anchors:
        # get list of possible words for each anchor
        # match words to hand
        words = self.MatchWords(hand, anchor, board)
        # check for case no legal move is found
        if words is not None:
            # set scores for words, find best word
            for word in words.keys():
                word.SetScore(self.heuristic.ScoreWord(word, hand))
                if word.GetScore() > bestWord.GetScore():
                    bestWord = word
                    bestAnchor = anchor
                    bestIndex = words[word][0]
                    bestDirection = words[word][1]
    # check for case no legal move is found
    if bestWord.GetScore() is min:
        raise Exception("BRI: No valid word options found!")
    return bestWord, bestAnchor, bestIndex, bestDirection
def prepareWordBufferForDecode(buffer):
    """Add to every word of the buffer the features GOVREF and LABELREF.

    GOVREF is a copy of the feature GOV and LABELREF a copy of LABEL.
    GOV and LABEL are set to initialization values.
    """
    for word in buffer.array:
        word.setFeat('GOV', str(Word.invalidGov()))
        word.setFeat('LABEL', Word.invalidLabel())
def readAllMcfFile(self):
    tokens = []
    for ligne in self.mcfFile:
        ligne = ligne.rstrip()
        tokens = ligne.split("\t")
        w = Word()
        for i in range(0, len(tokens)):
            w.setFeat(self.mcd.getColName(i), tokens[i])
        self.addWord(w)
    self.mcfFile.close()
def find_word(self):
    word = self.text1.get().lower().lstrip().rstrip()
    _word = Word()
    result = _word._translate(word)
    if result:
        self.text3.insert(0.0, result)
    else:
        message = "No matching words\n\nTIME TO ADD SOME!!!"
        self.text3.insert(0.0, message)
def parse_matrix(self, matrix):
    self.__check_format(matrix)
    for i in xrange(len(matrix)):
        words_coordinates = self.__extract_words_coordinates(matrix[i])
        for (x1, x2) in words_coordinates:
            self.horizontal_words.append(Word(i, x1, x2))
    for j in xrange(len(matrix[0])):
        vertical_list = [row[j] for row in matrix]
        words_coordinates = self.__extract_words_coordinates(vertical_list)
        for (y1, y2) in words_coordinates:
            self.vertical_words.append(Word(j, y1, y2))
    self.__build_tree(self.horizontal_words[0], False)
def readNextWord(self):
    line = self.mcfFile.readline()
    if line == "":
        return None
    line = line.rstrip()
    tokens = line.split("\t")
    w = Word()
    for i in range(0, len(tokens)):
        w.setFeat(self.mcd.getColName(i), tokens[i])
    self.addWord(w)
    return w
def read_input():
    input_file = open("input.txt", "r")
    line1 = input_file.readline().rstrip("\n").split("OR")
    for i in line1:
        a.add_word(Word(i.strip()))
    num = int(input_file.readline())
    for i in range(num):
        line = input_file.readline().rstrip("\n").split("OR")
        new_sen = Sentence()
        for j in line:
            new_sen.add_word(Word(j.strip()))
        KB.append(new_sen)
def calculate_by_severity(self):
    # Score based on the "severity" of each word
    dic = self.curtail_dictionary()
    counter = 0
    weight = 0
    wd = Word()
    for key, occurrences in dic.items():
        validated = wd.word_validation(key)
        weight = wd.categorize_word(validated)
        counter += occurrences * weight
    return counter
def toCSVEntry(str):
    elements = str.split('\t')
    rel = elements[1]
    source = elements[2]
    target = elements[3]
    sourceForm = ''
    targetForm = ''
    sourceCategory = ''
    targetCategory = ''
    if source.endswith('/n') or source.endswith('/v') or source.endswith('/a') \
            or source.endswith('/s') or source.endswith('/r'):
        sourceForm = source[:-2].replace('/c/fr/', '').replace('_', ' ').strip()
        sourceCategory = source[len(source) - 1:]
    else:
        sourceForm = source.replace('/c/fr/', '').replace('_', ' ').strip()
    if target.endswith('/n') or target.endswith('/v') or target.endswith('/a') \
            or target.endswith('/s') or target.endswith('/r'):
        targetForm = target[:-2].replace('/c/fr/', '').replace('_', ' ').strip()
        targetCategory = target[len(target) - 1:]
    else:
        targetForm = target.replace('/c/fr/', '').replace('_', ' ').strip()
    line = sourceCategory + "\t" + source + "\t" + sourceForm + "\t" + rel + "\t" + \
        targetCategory + "\t" + target + "\t" + targetForm + "\n"
    idSource = toId(sourceForm, sourceCategory)
    idTarget = toId(targetForm, targetCategory)
    sourceWord = None
    targetWord = None
    try:
        sourceWord = words[idSource]
    except KeyError:
        sourceWord = Word(sourceForm, sourceCategory)
        words[idSource] = sourceWord
    try:
        targetWord = words[idTarget]
    except KeyError:
        targetWord = Word(targetForm, targetCategory)
        words[idTarget] = targetWord
    sourceWord.add_relation(rel, targetWord)
    # sourceWord.print_relations_count()
    return line
def ranked(*words, directory='lookup', func=lambda *x: 1):
    if directory == 'stemming':
        words = list(map(lambda x: stem(x), words))
    query = []
    processed = []
    for w in words:
        if w in processed:
            continue
        query.append(words.count(w))
        processed.append(w)
    query = np.array(list(map(lambda x: x / sum(query), query)))
    documents = set()
    # query = np.array([1/len(words) for i in range(len(words))])
    heap = []
    Words = []
    DFt = []
    for word in processed:
        WordInstance = Word(word, directory)
        Words.append(WordInstance)
        documents = documents.union(WordInstance.documents())
        DFt.append(len(WordInstance.documents()))
    N = len(documents)
    iDFt = list(map(lambda X: np.log10(N / X), DFt))
    for document in documents:
        vector = []
        for WordInstance in Words:
            # calculate the TF(t,d)
            count = WordInstance.count(document)
            if count > 0:
                vector.append(1 + np.log10(count))
            else:
                vector.append(0)
        # tf-idf weighting
        for i in range(len(iDFt)):
            vector[i] = vector[i] * iDFt[i]
        vector = np.array(vector)
        score = func(query, vector)
        heapq.heappush(heap, (score, document))
    return heap
def curtail_dictionary(self):
    aux_dic = {}
    dic = self.get_dictionary()
    wd = Word()
    for key in dic:
        word = wd.word_validation(key)
        occu = dic[key]
        if word in aux_dic:
            aux_dic[word] += occu
        else:
            aux_dic[word] = occu
    return aux_dic
def insert_words(fv, hash_set): """ ------------------------------------------------------- Retrieves every Word in fv and inserts into a Hash_Set. ------------------------------------------------------- Parameters: fv - the already open file containing data to evaluate (file) hash_set - the Hash_Set to insert the words into (Hash_Set) Returns: Each Word object in hash_set contains the number of comparisons required to insert that Word object from file_variable into hash_set. ------------------------------------------------------- """ fv.seek(0) lines = fv.readlines() for line in lines: #print("[{}]".format(line.rstrip())) words = line.split(' ') for word in words: if word.isalpha(): #print("Word: {}".format(word)) # Ignoring any punctuation and words with punctuation _word = Word(word.lower()) hash_set.insert(_word) return
def MatchWords(self, hand, anchor, board):
    # match available tiles in hand to possible words for a certain anchor
    anchorWords = anchor.GetPossibleWords()
    handTiles = hand.PeekHand()
    anchorTile = anchor.GetTile()
    if anchorTile.GetLetter() == " ":
        handTiles.append(anchorTile)
    tiles = handTiles
    totalHand = Word(tiles)
    options = anchorWords.WordSearch(totalHand)
    optionsCleaned = dict()
    direction = anchor.GetDirection()
    timeStart = time.time()
    shuffledOptions = list(options.GetDict().values())
    random.shuffle(shuffledOptions)
    # print(shuffledOptions)
    for strWordList in shuffledOptions:
        for strWord in strWordList:
            if len(strWord) <= len(handTiles):
                word = self.MakeItWord(strWord)
                if anchor.GetLetter() == " ":
                    indices = [int(len(strWord) / 2)]
                else:
                    indices = [i for i, a in enumerate(word.GetString()) if a == anchor.GetLetter()]
                for i in indices:
                    if board.IsWordLegal(word, anchor, i, direction):
                        optionsCleaned[word] = (i, direction)
        timeDiff = time.time() - timeStart
        if timeDiff > 5:
            break
    return optionsCleaned
def comparison_total(hash_set):
    """
    -------------------------------------------------------
    Sums the comparison values of all Word objects in hash_set.
    -------------------------------------------------------
    Parameters:
        hash_set - a hash set of Word objects (Hash_Set)
    Returns:
        total - the total of all comparison fields in the Hash_Set
            Word objects (int)
        max_word - the word having the most comparisons (Word)
    -------------------------------------------------------
    """
    total = 0
    max_word = Word('a')
    for word in hash_set:
        total += word.comparisons
        if word.comparisons > max_word.comparisons:
            max_word = word
    return total, max_word
def addWordToList(self, word, fWord):
    if self.wordInList(word):
        return False
    else:
        newWord = Word(word, fWord)
        self.wordList.append(newWord)
        return True
def extractTopics(self, text_list, threshold=0.5):
    """Return a list of Words which seem to be the current topic."""
    if not text_list:
        return []
    # Make pseudo sentences from the text_list
    pCandidates = []
    for message in text_list:
        pSentence = self.parser.parseSentence(message)
        if not pSentence:
            return []
        # pick out only the candidates from the pseudo words
        candidates = [x for x in pSentence if x['main_type'] in [u'名詞']]
        # print 'candidates:%s' % candidates
        # pCandidates is a list of pseudo words which are candidates for the topic
        if candidates:
            pCandidates.extend(candidates)
    # print 'pCandidates:%s' % pCandidates
    # Next we make a dictionary of occurrences of these particular candidates.
    # Convert the pseudo words to real words
    true_candidates = Sentence()
    true_candidates.pseudo2real(pCandidates)
    # print 'true_candidates:%s' % true_candidates.words
    # make a dictionary {word_id: occurrence}
    sample_words = {}
    for w in true_candidates:
        if sample_words.has_key(int(w.id)):
            sample_words[int(w.id)] += 1
        else:
            sample_words[int(w.id)] = 1
    if len(sample_words) < 1:
        return []
    # print 'sample_words:%s' % sample_words
    # We take the actual occurrences from the database
    base_words = {}
    for word_id in sample_words.keys():
        base_words[word_id] = Word.get(word_id).occurence
    sample_count = 0
    for k, v in sample_words.iteritems():
        sample_count += v
    base_count = 0
    for k, v in base_words.iteritems():
        base_count += v
    scores = {}
    for w in sample_words.keys():
        scores[w] = self.score(float(sample_words[w]), float(sample_count),
                               float(base_words[w]), float(base_count))
    items = [(v, k) for k, v in scores.items()]
    items.sort()
    items.reverse()  # so largest is first
    candidate_keywords = [x[1] for x in items if x[0] > threshold]
    # Fallback - if no topics are found, we choose a random noun.
    # Yucky, but makes Juna more talkative.
    if (not candidate_keywords) and (candidates):
        choice = [candidates[int(random.random() * len(candidates))]]
        s = Sentence()
        s.pseudo2real(choice)
        candidate_keywords = [s[0].id]
    return candidate_keywords
def insert_words(fv, hash_set): """ ------------------------------------------------------- Retrieves every Word in fv and inserts into a Hash_Set. Each Word object in hash_set contains the number of comparisons required to insert that Word object from file_variable into hash_set. ------------------------------------------------------- Parameters: fv - the already open file containing data to evaluate (file) hash_set - the Hash_Set to insert the words into (Hash_Set) Returns: None ------------------------------------------------------- """ lines = fv.read() words = lines.split() for word in words: if word.isalpha(): k = Word(word.lower()) hash_set.insert(k) return
def oracle(c):
    if c.getStack().isEmpty():
        return ('SHIFT', '')
    s0_index = c.getStack().top()
    b0_index = c.getBuffer().getCurrentIndex()
    # print("s0_index = ", s0_index)
    s0_gov_index = int(c.getBuffer().getWord(s0_index).getFeat('GOVREF')) + s0_index
    s0_label = c.getBuffer().getWord(s0_index).getFeat('LABELREF')
    # print('s0_index = ', s0_index, 'b0_index = ', b0_index, 's0_gov_index = ', s0_gov_index,
    #       'b0_gov_index = ', b0_gov_index, 's0 label =', s0_label)
    if s0_gov_index == b0_index:
        return ('LEFT', c.getBuffer().getWord(s0_index).getFeat('LABELREF'))
    if b0_index < c.getBuffer().getLength():
        b0_gov_index = int(c.getBuffer().getWord(b0_index).getFeat('GOVREF')) + b0_index
        if b0_gov_index == s0_index:
            return ('RIGHT', c.getBuffer().getWord(b0_index).getFeat('LABELREF'))
    if ((c.getStack().getLength() > 1)
            # word on top must have all its dependents
            and check_all_dependents_of_word_in_ref_are_in_hyp(c, s0_index)
            # word on top of the stack has a governor
            and (int(c.getBuffer().getWord(c.getStack().top()).getFeat('GOV')) != Word.invalidGov())):
        return ('REDUCE', '')
    # print("no movement possible return SHIFT")
    if not c.getBuffer().endReached():
        return ('SHIFT', '')
    print("The machine is stuck")
    exit(1)
def test_case_find(self):
    letters = Letter.from_raw(test.letters)
    grid = Grid(test.grid)
    words = Word.find_all(letters, grid, test.language)
    assert len(words) == test.wordcount
    assert str(words[0]) == test.topword
    assert words[0].value == test.topvalue
def loadFromParser(self, parsedSentence):
    # Read the parse tree
    parseTree = parsedSentence['parsetree']
    # Read the dependencies
    dependencies = parsedSentence['dependencies']
    # Read the words
    wordsInJson = parsedSentence['words']
    words = map(lambda w: Word.loadFromParser(w), wordsInJson)
    return self(dependencies, words, parseTree)
def deserialize(rep):
    navi = Navigator(rep["systemId"])
    wordsGenerated = []
    for word in rep["wordsGenerated"]:
        wordsGenerated.append(Word.deserialize(word))
    navi.wordsGenerated = wordsGenerated
    navi.length = rep["length"]
    navi.cutoff_length = rep["cutoff_length"]
    return navi
def createWord(word_str, pof_line):
    """
    Parses a part-of-speech line.
    :rtype: Word
    """
    knownPofs = [Word.NOUN, Word.VERB, Word.ADJ]
    ignored = ['OBJECT', 'ADJ_DIST', 'PREFIX']
    match = re.search(ur'^\[([\w]+):(\w+).*\]$', pof_line)
    if not match:
        raise Exception('Unknown line format: %s' % format(pof_line))
    pof = match.group(1)
    if pof not in knownPofs and pof not in ignored:
        print pof_line
        raise Exception('Unknown pof: %s' % format(pof))
    if pof in ignored:
        return None
    word = Word(word_str, '', pof, '')
    word.word_form = match.group(2).capitalize()
    return word
def make(self, stressedipasylls_text, token):
    stressedipa = stressedipasylls_text[0]
    sylls_text = stressedipasylls_text[1]
    stress = self.stressedipa2stress(stressedipa)
    (prom_stress, prom_strength) = self.getStrengthStress(stress)
    syllphons = self.ipa2phons(stressedipa)
    sylls = []
    for i in range(len(syllphons)):
        syllbody = self.use('SyllableBody', syllphons[i])
        syll = self.use('Syllable', (syllbody, prom_strength[i], prom_stress[i]))
        sylls.append(syll)
    word = Word(token, sylls, sylls_text)
    word.stress = stress
    word.lang = self.lang
    return word
def setOb(self, ob): Word.setOb(self, ob) if "translations" in ob: for text in ob["translations"]: if not (text in self.rl.ru): ruWord = RuWord({"text": text}, self.rl, 1) self.rl.add(ruWord) self.trs.append(self.rl.ru[text]) oneTyp = False for typ in typs: if typ in ob: if oneTyp: oneTyp = False break else: oneTyp = typ if oneTyp: self.oneTyp = oneTyp self.rats[oneTyp] *= 10
def tokenize(self, sent, pos_tags):
    head = Word()
    head.word = [sent[0].lower()]
    head.actual = [sent[0]]
    head.pos = pos_tags[0]
    curr = head
    length = len(sent)
    for i in xrange(1, len(sent)):
        new_word = Word()
        new_word.left = curr
        curr.right = new_word
        new_word.word = [sent[i].lower()]
        new_word.actual = [sent[i]]
        new_word.pos = pos_tags[i]
        curr = new_word
    return head, length
def testValidator():
    val = WordValidator()
    word = Word("", "", "")
    try:
        val.wordempty(word.get_id(), word.get_lang(), word.get_word())
        assert False  # a ValueError should have been raised
    except ValueError:
        assert True
    word1 = Word("1", "", "")
    try:
        val.wordempty(word1.get_id(), word1.get_lang(), word1.get_word())
        assert False  # a ValueError should have been raised
    except ValueError:
        assert True
def loadFromParser(self, parsedSentence, tokenCountBeforeThisSentence=0, docId=0):
    # Read the parse tree
    parseTree = parsedSentence['parsetree']
    # Read the dependencies
    dependencies = parsedSentence['dependencies']
    # Read the words
    wordsInJson = parsedSentence['words']
    words = map(lambda w: Word.loadFromParser(w), wordsInJson)
    startIndex = 0
    if tokenCountBeforeThisSentence != 0:
        startIndex = tokenCountBeforeThisSentence + 1
    return self(dependencies, words, parseTree, startIndex, startIndex + len(words) - 1, docId)
def play(self):
    if not self.my_turn:
        return
    words = Word.find_all(self.letters, self.grid, self.locale)
    if len(words) == 0:
        self.swap()
    else:
        for word in words:
            if self.submit_word(word):
                return
        self.swap()
def getWordsAndTheirParses(self, words):
    for word in words:
        lines = word.split("\n")
        firstLine = lines[0]
        splittedFirstLine = firstLine.split(":")
        if len(splittedFirstLine) < 2:
            continue
        wordText = splittedFirstLine[0]
        unique_serialized_word = self.upsertWord(wordText)
        correctMorphParseNumber = splittedFirstLine[1]
        # get correct parse
        correctMorphParseFull = lines[int(correctMorphParseNumber)]
        unique_serialized_word.addParse(correctMorphParseFull)
        # if "Punc" not in correctMorphParseFull:
        w = Word(wordText)  # add all words to all_words_in_corpus
        w.correct_parse = Parse(correctMorphParseFull)
        self.all_words_in_corpus.append(w)
def load_vectors(filename):
    def normalizeString(vec):
        vec = [float(x) for x in vec]
        total = sqrt(sum([v ** 2 for v in vec]))
        new_vec = []
        for v in vec:
            new_vec.append(v / total)
        return tuple(new_vec)

    print "\tLoading projections..."
    f = open(filename, 'r')
    f.readline()
    content = [filter(lambda x: not x in ["\n", ""], l.replace("\n", "").split(" "))
               for l in f.readlines()]
    content = [(l[0], normalizeString(l[1:])) for l in content]
    content = filter(lambda x: not x[1] == None, content)
    words = dict()
    for (word, vector) in content:
        wordClass = Word(word)
        # print word, wordClass
        if wordClass.relevant():
            # print "Keeping: ", word, wordClass.pos()
            words[word.lower()] = vector
    return words
def apply(self, word, other_word, words):
    rule_changed_any_tag = False
    condition = (other_word.assigned_parse.tag == self.tag_b)
    tag_a_is_at_least_one_tag_of_morph_parses_of_word = False
    parse_to_assign_to_word = None
    unique_word = Word.find_word_by_text(word.text, words)
    for parse in unique_word.parses:
        if parse.tag == self.tag_a:
            tag_a_is_at_least_one_tag_of_morph_parses_of_word = True
            parse_to_assign_to_word = parse
            break
    if condition and tag_a_is_at_least_one_tag_of_morph_parses_of_word:
        if word.assigned_parse != parse_to_assign_to_word:
            word.assigned_parse = parse_to_assign_to_word
            rule_changed_any_tag = True
            # print(word.text + " changed." + self.text)
    return rule_changed_any_tag
def tag_word(word, tags):
    '''A helper function to tag words.'''
    # Make the word a Word instance and cleanse it.
    word = Word(word)
    word.cleanse()
    # Open the tagging file and match each tag against the word in question.
    # Once found, replace the word with the tagged word.
    with codecs.open(tags, "r", "utf-8") as tags:
        for tagline in tags:
            tagline = tagline.strip().split(', ')
            if word.clean().lower() == tagline[0]:
                replacement = tagline[1] + str(word) + tagline[2]
                return replacement
    # If the word is not in the tag list, return it as is.
    return str(word)
def _checkSpellingInteractive(self, page):
    """Interactively goes through all wrong words in a page.

    All we do here is save doReplace = True if we want to replace it,
    while doReplace will do the actual replacement.
    """
    title = page.title()
    text = page.get()
    words = page.words
    for w in words:
        w.doReplace = False
    # Go through all wrong words in this page
    for w in words:
        smallword = w.word
        # Check if on ignore list -> continue
        if self.pm.checkIsIgnored(title, smallword):
            continue
        bigword = Word(w.bigword)
        loc = w.location
        # Try to find replacement site
        w.site = text.find(bigword.word, loc)
        if w.site == -1:
            w.site = text.find(bigword.word)
        if w.site == -1:
            pywikibot.output(u"Not found any more in %s: %s" % (title, bigword.word))
            continue
        # We now have a potential site for replacement
        sugg = w.correctword
        w.LocAdd = len(bigword)
        # Check if the word has been replaced in the meantime with the
        # correct suggestion
        if len(text) > loc + len(sugg) and \
                text[w.site:w.site + len(sugg)].lower() == sugg.lower():
            continue
        if smallword == sugg:
            continue
        # Adjust case
        if smallword[0].isupper():
            sugg = sugg[0].upper() + sugg[1:]
        # Print the two words
        pywikibot.output(u"Replace \03{lightred}\"%s\"" % smallword +
                         "\03{default} \nby \03{lightgreen}\"%s\"\03{default}" % sugg)
        # Print context
        pywikibot.output(u"    %s" % text[max(0, w.site - 55):w.site + len(w) + 55])
        choice = pywikibot.inputChoice('',
                                       ['Yes', 'yes', 'No', 'no', 'No but dont save',
                                        'never replace', 'Replace by something else',
                                        '<--!sic!-->', 'Exit and go to next site'],
                                       ['y', '\\', 'n', ']', 'b', 'v', 'r', 's', 'x'])
        # Evaluate user choice
        if choice == 'b':
            continue
        if choice == 'v':
            self.pm.markCorrectWord(smallword)
            continue
        if choice in ('n', ']'):
            self.pm.markCorrectWordPerPage(title, smallword)
            continue
        if choice == 'x':
            return
        if choice == 'r':
            w.replacement = pywikibot.input(u"What should I replace \"%s\" by?" % bigword.word)
            w.doReplace = True
        if choice == 's':
            w.replacement = bigword.word + '<--!sic!-->'
            w.doReplace = True
        if choice in ('y', '\\'):
            w.replacement = bigword.replace(sugg)
            w.doReplace = True
            self.pm.markReplaced(smallword, sugg)
def get_most_likely_morph_parse(self, word_text):
    serialized_word = Word.find_word_by_text(word_text, self.words)
    if serialized_word is None or len(serialized_word.parses) == 0:
        return None
    return serialized_word.parses[0]  # to get the most likely
def analyze_sentence(sentence): sentence = sentence.strip() if sentence == "": return nouns = [] verbs = [] adjectives = [] current_word = start_word current_grammer = start_grammer previous_word = start_word previous_grammer = start_grammer for i in sentence.split(): # See if word exists in our dictionary # If not create it if word_dictionary.has_key(i) == True: current_word = word_dictionary[i] current_grammer = current_word.getGrammer() else: # Get the grammer grammer = summartUtil.getWordFunction(i) # See if that grammer type exists, if not create it if grammer_dictionary.has_key(grammer) == True: current_grammer = grammer_dictionary[grammer] else: current_grammer = Word(grammer) grammer_dictionary[grammer] = current_grammer current_word = Word(i, current_grammer) word_dictionary[i] = current_word # Increase how much the word and grammer have been used current_word.increaseUsage() current_grammer.increaseUsage() # Link words together as valid words that can be next to # eachother previous_word.addPostWord(current_word) current_word.addPreWord(previous_word) previous_grammer.addPostWord(current_grammer) current_grammer.addPreWord(previous_grammer) previous_word = current_word previous_grammer = current_grammer # Check if the word is black listed meaning we don't want # any contextual linking if i in BLACK_LIST: continue # Link the context based on what type of word it is if current_word.getGrammer().getWord() == "noun": nouns.append(current_word) elif current_word.getGrammer().getWord() == "verb": verbs.append(current_word) elif current_word.getGrammer().getWord() == "adjective": adjectives.append(current_word) current_word.addPostWord(end_word) end_word.addPreWord(current_word) current_grammer.addPostWord(end_grammer) end_grammer.addPreWord(current_grammer) for i in nouns: current_word = i for j in nouns: if i == j: continue current_word.addRelated(j) for j in verbs: current_word.addRelated(j) j.addRelated(current_word) for j in adjectives: current_word.addAdjective(j) j.addRelated(current_word)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import summartUtil
from Word import Word
from Markov import Markov
import sys

DEBUG = False

# Words to not type
BLACK_LIST = ["the", "has", "hasn't", "have", "havn't", "a", "an", "is", "it", "to", "its"]

# The grammer a sentence can start with
start_grammer = Word(" ")
# The grammer a sentence can end with
end_grammer = Word(".")
# All the grammer we know about
grammer_dictionary = {start_grammer.getWord(): start_grammer,
                      end_grammer.getWord(): end_grammer}

# The words that can start a sentence
start_word = Word(" ", start_grammer)
# The words that can end a sentence
end_word = Word(".", end_grammer)
# All the words we know about
word_dictionary = {start_word.getWord(): start_word,
                   end_word.getWord(): end_word}


def analyze_sentence(sentence):
    sentence = sentence.strip()
    if sentence == "":
        return
def __init__(self, ob, rl):
    self.trs = []
    Word.__init__(self, ob, rl)
def buildcorpus(corpus, rootpath, filelimit=0):
    # rootpath = corpus.rootpath
    fileids = os.listdir(rootpath)
    hugewordlist = []
    hugewordlist.extend(corpus.words)  # will contain distinct Word instances
    numoffiles = 0
    corpus.set_corpusname(str(max(filelimit, len(fileids))) + "texts")
    for fileid in fileids:
        allwords = nltk.FreqDist()  # will contain all words in this text
        doc_id = fileid.split(".")[0]
        # corpus.inserttext(doc_id)   ##### ! should pass the text itself
        newtext = Text(doc_id)
        path = rootpath + os.sep + fileid
        # lines = readtextlines(path)
        # rawtext = texter.readtxtfile(path)
        rawtext = texter.readnewstext(path)
        lines = texter.splitToSentences(rawtext)
        sntindex = 0
        # each line is a sentence
        for line in lines:
            words = []  # words in this sentence
            words = line.split()
            words = texter.eliminatepunctuation(words)
            words = [word for word in words if not word.isspace()]
            for word in words:
                allwords.inc(word)
                newword = Word(word)
                newword.insertsentenceid(doc_id + "_" + str(sntindex))
                if allwords[word] <= 1:
                    # if this was not added to the hugelist before, add it
                    hugewordlist.append(newword)
            sentence = Sentence(sntindex)
            sntindex = sntindex + 1
            # should the sentence store the Word or the word index?
            for word in words:
                index = hugewordlist.index(Word(word))
                hugewordlist[index].insertsentenceid(doc_id + "_" + str(sntindex - 1))
                sentence.insertword(index)
            newtext.insertsentence(sentence)
        if (not rawtext.isspace()) or (len(allwords) != 0):
            corpus.inserttext(newtext)
        print str(numoffiles), " : finished handling the words-snts-txts ", doc_id
        numofwords = reduce(lambda x, y: x + y, allwords.values())
        for word in hugewordlist:
            cnt = allwords[word.literal]
            # freq = cnt / float(numofwords)
            word.assigntermfreq(cnt, numofwords, doc_id)
            # hugewordlist[index].toscreen()
        numoffiles = numoffiles + 1
        if filelimit == numoffiles:
            break
    # end for - docs
    numofdocs = len(fileids)
    print "computing tf*idf"
    for word in hugewordlist:
        word.computeinvdocfreq(numofdocs)
        word.computeTFIDF()
        # word.toscreen()
    corpus.assignwords(hugewordlist)
    print "corpus length ", str(len(corpus.words)), " words"
    print "huges length ", str(len(hugewordlist)), " words"
    print "exiting buildcorpus()"
    print "pickle-dumping words"
    corpus.pickledumpwords()
def spellcheck_blacklist(self, text, badDict, return_for_db=False,
                         return_words=False, title=None, verbose=False,
                         range_level="full"):
    """
    Checks a single text against the words in the blacklist and returns
    a list of wrong words.
    """
    loc = 0  # the current location in the text we parse
    old_loc = 0
    curr_r = 0
    ranges = self.forbiddenRanges(text, level=range_level)
    ranges = sorted(ranges)
    wrongWords = []
    prepare = []
    j = 0
    # Regex to find the next word: look for any whitespace or control
    # characters followed by a "word" stopping at the next whitespace or
    # control character.
    wordsearch = re.compile(r'([\s\=\<\>\_/-]*)([^\s\=\<\>\_/\-|]+)')
    if self._testcase_compat:
        wordsearch = re.compile(r'([\s\=\<\>\_/-]*)([^\s\=\<\>\_/\-]+)')
    while True:
        if verbose:
            print "== Start wordsearch at location", loc
        match = wordsearch.search(text, loc)
        LocAdd = 0
        j = j + 1
        if not match:
            # No more words on this page
            break
        if verbose:
            print j, "Check '%s'" % text[match.start():match.end()].encode("utf8"), "at loc", loc
        # Check if we are in a forbidden range
        curr_r, loc, in_nontext = self.check_in_ranges(
            ranges, match.start(), match.end(), curr_r, loc)
        if verbose:
            print " -> moved loc pointer to ", loc, "skip is", in_nontext
        if in_nontext:
            continue
        # Split the words up at special places like &nbsp; or a dash
        spl = re.split('&nbsp;', match.group(2))
        if len(spl) > 1:
            LocAdd = 5
        elif len(spl) == 1:
            spl = re.split(u'–', spl[0])
        loc_start = loc + len(match.group(1))  # start of the word
        ww = spl[0]
        LocAdd += len(ww) + 1
        bigword = Word(ww)
        smallword = bigword.derive()
        if verbose:
            print " ==> smallword", smallword.encode("utf8")
        done = False
        for r in ranges:
            # If the end of the range coincides with the start of the word
            # we might not have a full word -> rather discard it.
            if r[1] == loc_start:
                loc += LocAdd
                done = True
                if verbose:
                    print " we are done with ", smallword.encode("utf8"), "due to range", r
        if done:
            continue
        # We advance the location by the characters skipped (group 1)
        loc += len(match.group(1))
        if range_level != "none" or self._testcase_compat:
            done = self._text_skip(text, loc, smallword, title, return_for_db)
        if verbose:
            print " new loc (after accounting for skipped chars)", loc, "which is '%s'" % match.group(1)
        ###################################
        # use this code to insert into the database
        if return_for_db:
            if not done:
                if verbose:
                    print " ===> append word for db: ", smallword.encode("utf8")
                wrongWords.append(smallword)
        else:
            ###################################
            # here we check whether it is wrong
            if not done and smallword.lower() in badDict \
                    and not smallword == '' and not smallword.isupper():
                if not smallword == badDict[smallword.lower()]:
                    if return_words:
                        wrongWords.append(WrongWord(wrong_word=smallword,
                                                    location=loc,
                                                    bigword=bigword.word,
                                                    correctword=badDict[smallword.lower()]))
                    else:
                        wrongWords.append([smallword, bigword, loc,
                                           badDict[smallword.lower()],
                                           text[max(0, loc - 100):min(loc + 100, len(text))]])
        # We advance the location by the characters of the word (group 2)
        loc += LocAdd
        if verbose and len(text) > loc:
            print " new loc (after accounting for word)", loc, "we are at", text[loc].encode("utf8")
    return wrongWords
def __init__(self, ob, rl, after=0):
    Word.__init__(self, ob, rl)
    self.after = after