import re

def viterbi_segment(longstring):
    """
    INPUT: longstring: str, text that needs to be segmented
           (relies on the module-level word_prob function and
           max_word_length, built from a corpus vocabulary)
    OUTPUT: segmented_token: str, the original longstring segmented
    EXAMPLE:
    >>> viterbi_segment('myhomeawayfromhom')
    'my home away from ho m'
    """
    # Strip dots, digits, underscores and hyphens before segmenting.
    longstring = re.sub(r'\.|[0-9]|_|-', '', longstring)
    probs, lasts = [1.0], [0]
    for i in range(1, len(longstring) + 1):
        # Best probability of any segmentation ending at position i,
        # together with the start index k of its last word.
        prob_k, k = max((probs[j] * word_prob(longstring[j:i]), j)
                        for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    # Walk the back-pointers to recover the best segmentation.
    words = []
    i = len(longstring)
    while 0 < i:
        words.append(longstring[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    segmented_token = ' '.join(words)
    return segmented_token
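# A minimal usage sketch for viterbi_segment. The word_prob helper and the
# max_word_length global it assumes are not shown in the original; the toy
# word_counts dictionary below is hypothetical, standing in for unigram
# counts built from a real corpus.
word_counts = {'my': 50, 'home': 40, 'away': 30, 'from': 60, 'ho': 2, 'm': 1}
total_count = sum(word_counts.values())
max_word_length = max(len(w) for w in word_counts)

def word_prob(word):
    # Unseen substrings get a tiny non-zero probability so max() always
    # has a usable candidate and backtracking never stalls.
    return word_counts.get(word, 0.01) / total_count

print(viterbi_segment('myhomeawayfromhom'))  # e.g. 'my home away from ho m'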
def splitTrim(line):
    # Keep only alphanumeric tokens longer than two characters.
    words = []
    temp = re.findall(r'\w+', line)
    for i in temp:
        if len(i) > 2:
            words.append(i)
    return words
def get_topic_words(ldamodel, num_topics=num_topics):
    # Collect the top words of every topic from a gensim LDA model.
    # Note the default argument captures the module-level num_topics
    # at definition time.
    words = []
    for i in range(0, num_topics):
        topics = ldamodel.show_topic(i)
        for topic in topics:
            words.append(topic[0])
    return words
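# A hedged usage sketch for get_topic_words, assuming gensim is installed.
# The three-document toy corpus is hypothetical; show_topic(i) returns
# (word, probability) pairs for topic i, which is why topic[0] above is
# the word itself.
from gensim import corpora
from gensim.models import LdaModel

texts = [['cat', 'dog', 'pet'], ['stock', 'market', 'trade'], ['dog', 'pet', 'food']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
num_topics = 2
lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
print(get_topic_words(lda, num_topics=num_topics))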
def get_words_from_keys(keys):
    '''Helper function for filtering stopwords and non-English words
    from a list of keys. Returns a list of strings.'''
    words = []
    for key in keys:
        if key not in nltk_stopwords and key in nltk_words:
            words.append(key)
    return words
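# Sketch of how the nltk_stopwords and nltk_words globals the function
# assumes might be built. Sets make the membership tests O(1); the NLTK
# 'stopwords' and 'words' corpora must be downloaded first
# (nltk.download('stopwords'); nltk.download('words')).
from nltk.corpus import stopwords, words as nltk_word_corpus

nltk_stopwords = set(stopwords.words('english'))
nltk_words = set(nltk_word_corpus.words())

print(get_words_from_keys(['the', 'apple', 'xyzzy']))  # likely ['apple']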
def list_maker(data):
    """
    Make a list from read-in file lines. Return a list.
    """
    words = []
    # Skip the first 35 lines (presumably header material).
    for i in range(35, len(data)):
        words.append(data[i].rstrip('\n'))
    return words
def segment(text, segs):
    # Split text at every position i where segs[i] == '1';
    # segs is a binary string one character shorter than text.
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i + 1])
            last = i + 1
    words.append(text[last:])
    return words
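# Quick check of segment: a '1' at index i ends a word after text[i],
# and an all-zero mask leaves the text unsplit.
print(segment('helloworld', '000010000'))  # ['hello', 'world']
print(segment('helloworld', '000000000'))  # ['helloworld']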
def _get_all_words_from_node(self, node, curr_word, words):
    # Depth-first trie traversal: a falsy node marks the end of a path,
    # so the characters accumulated in curr_word form one complete word.
    if not node:
        words.append(''.join(curr_word))
        return
    curr_word.append(node.value)
    for child in node.children.values():
        self._get_all_words_from_node(child, curr_word, words)
    curr_word.pop()
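# A minimal, hypothetical trie that fits the traversal above: each node
# stores one character in .value and its children in a dict, and a None
# entry under a '$' sentinel key marks the end of a word, which is what
# triggers the words.append(...) branch.
class TrieNode:
    def __init__(self, value):
        self.value = value
        self.children = {}

class Trie:
    def __init__(self):
        self.root = TrieNode('')

    def insert(self, word):
        node = self.root
        for ch in word:
            node = node.children.setdefault(ch, TrieNode(ch))
        node.children['$'] = None  # end-of-word sentinel

    # Reuse the module-level traversal above as a method.
    _get_all_words_from_node = _get_all_words_from_node

    def all_words(self):
        words = []
        self._get_all_words_from_node(self.root, [], words)
        return words

t = Trie()
t.insert('cat')
t.insert('car')
print(t.all_words())  # ['cat', 'car']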
def WordTokenize(namafile):
    # This function returns an array of words from the document named namafile.
    words = []
    with open(namafile, 'r') as f:
        for line in f:
            for word in line.split():
                if word != '' and word != ' ':
                    words.append(clean(word))
    return words
def brute(string, length, charset):
    # Recursively generate every string over charset of length 1..length,
    # accumulating into the module-level words list.
    global words
    if len(string) == length:
        return
    for char in charset:
        temp = string + char
        words.append(temp)
        brute(temp, length, charset)
    return words
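# Usage sketch for brute: since words is a module-level accumulator, it
# must be reset before each run.
words = []
brute('', 2, 'ab')
print(words)  # ['a', 'aa', 'ab', 'b', 'ba', 'bb']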
def STokenWord(namafile):
    # namafile: string.txt
    # Returns the unique words, already stemmed, with stopwords removed.
    words = []
    with open(namafile, 'r') as f:
        for i in f:
            kata = word_tokenize(i)
            for j in kata:
                if not (ps.stem(j) in words) and not (j in stop_words):
                    words.append(ps.stem(j))
    return words
import re

def split_word(content):
    # Match runs of word characters, digits, hyphens and apostrophes,
    # optionally wrapped in quotes. The original class [_-a-zA-z0-9']
    # had two bugs: '_-a' formed a character range, and 'A-z' also
    # matched characters such as '[' and '^'.
    REG_EXPR = r"'?([-_a-zA-Z0-9']+)'?"
    pattern = re.compile(REG_EXPR)
    matches = pattern.finditer(content)
    words = []
    for match in matches:
        words.append(match.group(0))
    return words
def STokenWord(namafile):
    # namafile: string.txt
    # Returns the unique cleaned words, with stopwords removed.
    words = []
    with open(namafile, 'r') as f:
        for line in f:
            for word in line.split():
                if (clean(word) not in words) and (word not in stop_words):
                    words.append(clean(word))
    if '' in words:
        words.remove('')
    if ' ' in words:
        words.remove(' ')
    return words
def words_and_not_words(xs, ws):
    # Partition xs into items that appear in ws and items that do not,
    # using a dict for O(1) membership checks.
    flwD = {}
    for w in ws:
        flwD[w] = w
    words, not_words = [], []
    for x in xs:
        try:
            words.append(flwD[x])
        except KeyError:
            not_words.append(x)
    return words, not_words
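# Quick check of words_and_not_words:
known, unknown = words_and_not_words(['cat', 'qzx', 'dog'], ['cat', 'dog'])
print(known, unknown)  # ['cat', 'dog'] ['qzx']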
def get_features_from_chunk(self, chunks_tree):
    if chunks_tree.height() > 3:
        raise Exception("Chunk tree is too deep to parse!")
    # Tokens inside a chunk subtree are labelled BIO; bare tokens
    # outside any chunk get O.
    words = []
    labels = []
    for chunk in chunks_tree:
        if isinstance(chunk, nltk.Tree):
            for b in chunk:
                words.append(b[0])
                labels.append('BIO')
        else:
            words.append(chunk[0])
            labels.append('O')
    feats = self.fast_calculate_features(words)
    return zip(feats, labels)
def topTen(classDict, classCount):
    # keys = word, vals = P(word|class); allCount and allProb are
    # module-level totals over the whole corpus.
    words = []
    probs = []
    keys = list(classDict.keys())
    vals = list(classDict.values())
    pClass = classCount / allCount
    for i in range(len(keys)):
        pWord = allProb[keys[i]]
        # Bayes' rule: P(class|word) = P(word|class) * P(class) / P(word)
        p = (vals[i] * pClass) / pWord
        words.append(keys[i])
        probs.append(p)
    # Indices of the ten highest posterior probabilities.
    top = sorted(range(len(probs)), key=lambda i: probs[i])[-10:]
    topWords = []
    for i in top:
        topWords.append(words[i])
    return topWords
def text_cleaning(docs):
    documents = {}
    documents_without_verbs = {}
    for doc in docs:
        words = []
        for word in docs[doc]:
            word = word.lower()
            # Strip punctuation and special characters. The original chain
            # of word.replace(...) calls discarded its result (strings are
            # immutable), so the result must be assigned back.
            for ch in ('|', '\\', '!', '"', '£', '$', '%', '&', '(', ')',
                       '=', '?', '^', ',', '.', '@', '#', "'", '~'):
                word = word.replace(ch, '')
            if (word not in stopwords.words('english')) and \
                    (word not in punctuation_list) and (len(word) > 1) and \
                    (not word.isdigit()) and ('//' not in word):
                # TODO: lemmatization
                words.append(word)
        documents[doc] = words
        # Take nouns and adjectives without verbs and run TF-IDF on those
        # (POS-tagging step left disabled in the original):
        # document_tagged = nltk.pos_tag(words)
        # document_tagged = [(x, y) for (x, y) in document_tagged
        #                    if y in ('VB', 'NN', 'NNS', 'NNP', 'NNPS',
        #                             'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
        #                    or '_' in x]
    # Note: documents_without_verbs is written out but never populated here.
    with open("./data_structures/for_test_noverbs.json", "w+") as f:
        f.write(json.dumps(documents_without_verbs))
    with open("./data_structures/for_test.json", "w+") as f:
        f.write(json.dumps(documents))
    return documents
def callViterbi(text):
    # Viterbi word segmentation over a module-level unigram model:
    # dictionary maps word -> count (and must return 0 for unseen
    # substrings, e.g. a collections.Counter), total is the corpus size,
    # and maximumlength bounds candidate word length.
    scoreList, lasts = [1.0], [0]
    for i in range(1, len(text) + 1):
        prob_k = 0
        k = 0
        for j in range(max(0, i - maximumlength), i):
            maximumProbability = scoreList[j] * (dictionary[text[j:i]] / total)
            if maximumProbability > prob_k:
                prob_k = maximumProbability
                k = j
        scoreList.append(prob_k)
        lasts.append(k)
    # Walk the back-pointers to recover the best segmentation.
    words = []
    i = len(text)
    while 0 < i:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words
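# Usage sketch for callViterbi, assuming the module-level globals are
# built from a Counter (which returns 0 for unseen substrings, as the
# dictionary[text[j:i]] lookup requires). The toy counts are hypothetical.
from collections import Counter

dictionary = Counter({'the': 100, 'cat': 40, 'sat': 30, 'at': 20, 'a': 10})
total = sum(dictionary.values())
maximumlength = max(len(w) for w in dictionary)

print(callViterbi('thecatsat'))  # ['the', 'cat', 'sat']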
def load_event_ontology(file):
    with open(file, "r") as f:
        input_file = f.readlines()
    phase_category2keywords = {}
    expression2words = {}
    # First pass: collect @expression -> list-of-words mappings.
    for line in input_file:
        if not line.strip():
            continue
        if line[0] == "@":
            words = []
            expression = line.split()[0]
            phrases = " ".join(line.split()[1:]).split(",")
            for phrase in phrases:
                words.append(" ".join(phrase.split()).lower())
            expression2words[expression] = words
    # Second pass: build phase -> category -> keyword lists, expanding
    # any @expressions found inside "###" phrase lines.
    for line in input_file:
        if not line.strip():
            continue
        words = line.split()
        if words[0] == "#":
            phase = words[1]
            phase_category2keywords[phase] = {}
            continue
        if words[0] == "##":
            category = "_".join(words[1:])
            phase_category2keywords[phase][category] = []
            continue
        if words[0] == "###":
            phrases = line.replace("###", "").split(",")
            for phrase in phrases:
                expression_flag = False
                for word in phrase.split():
                    if "@" + word in expression2words:
                        expression_flag = True
                        for expression_word in expression2words["@" + word]:
                            final_phrase = " ".join(phrase.split()).replace(
                                word, expression_word)
                            phase_category2keywords[phase][category].append(
                                final_phrase.lower())
                if not expression_flag:
                    phase_category2keywords[phase][category].append(
                        " ".join(phrase.split()).lower())
            continue
    return phase_category2keywords
def separator(chars, exclude=None):
    # Greedy longest-match segmentation with backtracking: if a greedy
    # choice leaves an unsegmentable tail, exclude the last accepted
    # segment and retry from scratch. check() is assumed to test whether
    # a segment is a known word.
    words = []
    if not chars.isalpha():
        return [chars]
    if not exclude:
        exclude = set()
    working_chars = chars
    while working_chars:
        for i in range(len(working_chars), 1, -1):
            segment = working_chars[:i]
            if check(segment) and segment not in exclude:
                words.append(segment)
                working_chars = working_chars[i:]
                break
        else:
            # No segment matched: backtrack by excluding the last word,
            # or give up and return the input unsplit.
            if words:
                exclude.add(words[-1])
                return separator(chars, exclude=exclude)
            return [chars]
    return words
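# Sketch of the check() helper separator assumes, backed by a small
# hypothetical word set, plus a run that exercises the backtracking:
KNOWN_WORDS = {'not', 'no', 'table', 'tables', 'notable'}

def check(segment):
    return segment in KNOWN_WORDS

# 'notable' + 's' fails, then 'not' + 'ables' fails, so both greedy
# choices get excluded and the call settles on ['no', 'tables'].
print(separator('notables'))  # ['no', 'tables']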
def updateFilterWords():
    df = pd.read_excel("./Resource/filterCandidiate.xlsx", index_col=None,
                       header=None)
    allWords = list(set(df[0]))
    # Filter out nulls and one-character entries; non-string cells
    # (e.g. NaN) raise TypeError on len() and are skipped.
    words = []
    for w in allWords:
        try:
            if len(w) > 1:
                words.append(w)
        except TypeError:
            continue
    # Words flagged with 1 in the second column, plus some fixed noise
    # words and every single ASCII letter.
    wordsNeed2Filter = [df[0][i] for i in range(len(df[0])) if df[1][i] == 1]
    wordsNeed2Filter += ["IEEE", "ON", "DOWN", "THIS", "THAT"]
    wordsNeed2Filter += [chr(i) for i in range(97, 123)]  # a-z
    wordsNeed2Filter += [chr(i) for i in range(65, 91)]   # A-Z
    wordsNeed2Filter = set(wordsNeed2Filter)
    pickle.dump(wordsNeed2Filter, open("wordsNeedToBeFiltered.dat", "wb"))
def text_files_to_wordbags():
    count = 0
    files = os.listdir(directory)
    files.sort()
    words = []
    titles = []
    for filename in files:
        if count < max_books:
            if '.txt' in filename:
                count += 1
                book_no = filename.split(' -- ')[0]
                # str.strip('.txt') strips characters, not the suffix, and
                # would also eat leading/trailing t's and x's; slice the
                # extension off instead.
                book_title = filename.split(' -- ')[1]
                if book_title.endswith('.txt'):
                    book_title = book_title[:-4]
                titles.append(book_title)
                print('\n' + book_title)
                text = file_to_text(directory + '/' + filename)
                lemmas = text_to_lemmas(text)
                words.append(lemmas)
    return words, titles
def prediction_data_preparation(test_df, le, thresh):
    '''
    Analysis of model predictions with consideration of probability:
    selects predicted words based on a probability threshold.
    test_df = dataframe with test predictions and probabilities.
    test_df is transformed back into words and '_' is replaced with ''
    for better interpretability.
    The function ensures that the chosen words (probability sentences)
    contain the same number of words as the model prediction.
    '''
    from tqdm import tqdm
    test_df_transformed = test_df.copy()
    transform_columns = [
        '5_ae', '5_a', '4_iaebglebg', '4_iabglbg', '3_mdagmlg', '2_kl',
        '3_mdagrmlg', '2_ka', 'REF'
    ]
    for column in transform_columns:
        test_df_transformed[column] = le.inverse_transform(
            test_df_transformed[column])
    words = []
    test_df_transformed['selected_words'] = None
    test_df_transformed['REF'] = test_df_transformed['REF'].replace('_', '')
    for row in tqdm(range(test_df_transformed.shape[0])):
        if test_df_transformed['probability'][row] > thresh:
            words.append(test_df_transformed['REF'][row])
        else:
            words.append('_')
    test_df_transformed['selected_words'] = words
    test_df_transformed.loc[test_df_transformed['REF'] == '',
                            'selected_words'] = ''
    return test_df_transformed
def lemmatization(docs):
    documents = {}
    for doc in docs:
        words = []
        document_tagged = nltk.pos_tag(docs[doc])
        # Keep nouns, adjectives and verbs (and tokens containing '_')
        # and run TF-IDF on those.
        document_tagged = [
            (x, y) for (x, y) in document_tagged
            if (y in ("JJ", 'JJR', 'JJS', 'VB', 'NN', 'NNS', 'NNP', 'NNPS',
                      'VBD', "VBG", "VBN", "VBP", "VBZ")) or ("_" in x)
        ]
        for word, tag in document_tagged:
            # The original used three independent if's with one trailing
            # else, so every noun and verb was also printed as "left out";
            # elif makes the branches mutually exclusive.
            if tag.startswith("NN"):
                words.append(lemmatizer.lemmatize(word, pos="n"))
            elif tag.startswith("VB"):
                words.append(lemmatizer.lemmatize(word, pos="v"))
            elif tag.startswith("JJ"):
                words.append(lemmatizer.lemmatize(word, pos="a"))
            else:
                print("left out: ", word)
        documents[doc] = words
    with open("./data_structures/for_test_lemmas.json", "w+") as f:
        f.write(json.dumps(documents))
    print(documents)
def generate_keywords(messages, n=8):
    sentences = read_messages(messages)
    stop_words = stopwords.words('english')
    words = []
    common_word_list = create_common_word_list(messages)
    for sentence in sentences:
        sentence_words = word_tokenize(sentence)
        for i in range(len(sentence_words)):
            sentence_words[i] = sentence_words[i].lower()
        words.append(sentence_words)
    flat_words = [item for sublist in words for item in sublist]
    # Drop stopwords and short tokens, then count frequencies.
    final_wordlist = []
    for flat_word in flat_words:
        if flat_word not in stop_words and len(flat_word) > 3:
            final_wordlist.append(flat_word)
    freq = Counter(final_wordlist)
    common_keywords = freq.most_common(n)
    # Keep only keywords that are not globally common across messages.
    final_words = []
    for c in common_keywords:
        if c[0] not in common_word_list:
            final_words.append(c[0])
    return final_words
def attempt_words(word):
    '''Return a list of words that may or may not be real.'''
    # If 0 stands for 'use letter before' and 1 for 'use letter after',
    # then the numbers from 0 to 2**n (where n is the length of the
    # string) represent all possible translations, **if** each number is
    # zero-padded to n places (e.g. "000" instead of "0" for n=3).
    n = len(word)
    options = [bin(i)[2:].zfill(n) for i in range(2**n)]
    words = []
    for option in options:
        new_word = ''
        for j, c in enumerate(word):
            choice = option[j]
            k = ABC.find(c)
            if choice == '0':
                new_c = ABC[k - 1]
            else:
                # Wrap around if we go past the right end of the alphabet.
                new_c = ABC[(k + 1) % len(ABC)]
            new_word += new_c
        words.append(new_word)
    return words
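# Quick check of attempt_words, assuming ABC (not shown in the original)
# is the lowercase alphabet:
ABC = 'abcdefghijklmnopqrstuvwxyz'
print(attempt_words('ab'))  # ['za', 'zc', 'ba', 'bc']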
def findpattern(Apps, Revs):
    App = []
    for i in Revs:
        App.append(i[3])
    App = list(set(App))
    dic = {}
    for i, j in enumerate(App):
        dic[j] = i
    NumApp = len(App)
    for i in range(0, len(Revs)):
        for j in range(0, len(Revs[i])):
            if Revs[i][j] == "NULL":
                Revs[i][j] = ""

    result1 = codecs.open("dis1.txt", "w", "utf-8")
    pairs1 = np.zeros((NumApp, NumApp), dtype=int)  # np.int is removed in modern NumPy
    resultApp = codecs.open("output.txt", "w", "utf-8")

    # The original used spaCy 1.x's spacy.en.English(); on spaCy 2+ the
    # equivalent is spacy.load("en_core_web_sm").
    nlp = spacy.load("en_core_web_sm")
    count = 0
    c = 0
    for rev in Revs:
        count = count + 1
        title = rev[0]
        body = rev[1]
        r = title + u"," + body
        AppID = rev[3]
        # Candidate app mentions: noun chunks plus individual nouns.
        words = []
        doc = nlp(r)
        for chunk in doc.noun_chunks:
            words.append(chunk.orth_)
        for token in doc:
            if token.tag_ in ("NN", "NNS", "NNP", "NNPS"):
                words.append(token.orth_)
        # Also try variants with common prefixes/suffixes stripped.
        for i, j in enumerate(words):
            j = j.lower()
            if j.endswith(' app'):
                words.append(j[:-4])
            if j.endswith(' game'):
                words.append(j[:-5])
            if j.startswith('the '):
                words.append(j[4:])
        found1 = False
        Ans = "NULL"
        for i in words:
            for j in Apps:
                # Skip candidates that are part of the review's own app ID.
                if AppID.find(i) != -1:
                    continue
                try:
                    if j[0] == i.lower() and not found1:
                        result1.write(i + "\t" + title + "\t" + body + "\t" +
                                      j[1] + "\t" + AppID + "\n")
                        # Writing as [Source][Target] == from source to target.
                        pairs1[dic[j[1]], dic[AppID]] = pairs1[dic[j[1]], dic[AppID]] + 1
                        found1 = True
                        Ans = j[1]
                        c = c + 1
                except Exception:
                    pass
                if found1:
                    break
            if found1:
                break
        resultApp.write(rev[0] + "\t" + rev[1] + "\t" + rev[2] + "\t" +
                        rev[3] + "\t" + rev[4] + "\t" + Ans + "\n")
    print('total found', c)
    result1.close()
    resultApp.close()
def _replace(self, sentence, is_spell_check=True):
    # Map each token to itself, a spelling correction, or a placeholder
    # tag for slashes, numbers, URLs and anything unrecognised.
    words = []
    for word in word_tokenize(sentence):
        word = word.strip()
        if "/" in word or "\\" in word:
            words.append("__isslashinword__")
        elif self.word_re.match(word):
            if is_spell_check and word not in self.all_words:
                words.append(self.spell_checker.correction(word))
            else:
                words.append(word)
        elif self.number_re.match(word):
            words.append("__isnumber__")
        elif "__isurl__" in word:
            words.append("__isurl__")
        else:
            words.append("__isinvalidword__")
    return words
# CREATING THE CLUSTER
words_orgn = word_list
# Keep the items of new that already appear in word_list, then append all
# of new again (so overlapping words occur twice, as in the original).
final_word_list = []
for i in range(0, len(new)):
    if new[i] in word_list:
        final_word_list.append(new[i])
words = final_word_list
for i in range(0, len(new)):
    words.append(new[i])
words = np.asarray(words)
# Negated Levenshtein distances serve as a precomputed similarity matrix.
lev_similarity = -1 * np.array([[distance.levenshtein(w1, w2) for w1 in words]
                                for w2 in words])
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed",
                                              damping=0.80)
affprop.fit(lev_similarity)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
    for i in range(0, len(cluster)):
        if cluster[i] in words_orgn:
            for j in range(0, len(new)):
                if Levenshtein.ratio(cluster[i], new[j]) > 0.5:
                    print(cluster[i])
def load_search_words(self):
    words = []
    with open(self.search_word_file_path, 'r') as f:
        for line in f:
            words.append(line.strip())
    return words
for word in vocab:
    # (A disabled pass that collapsed runs of three identical letters was
    # left commented out in the original.)
    word = word.lower()
    p = fl.english_g2p(word)
    if p not in phenomes:
        # First word seen with this phoneme sequence: keep it.
        words.append(word)
        phenomes.append(p)
        conversion[word.lower()] = word.lower()
    else:
        # A homophone already exists; prefer whichever spelling appears in
        # the NLTK word corpus and map the other one onto it.
        i = phenomes.index(p)
        if words[i].lower() not in word_corpus_nltk:
            temp = words[i]
            words[i] = word
            conversion[word.lower()] = word.lower()
            conversion[temp.lower()] = word.lower()
        else:
            conversion[word.lower()] = words[i].lower()

f = open('Final_CB_DS/finalcb_dataset_cleaned.txt', 'r')
g = open('Final_CB_DS/sample_phonetic_ipa_vocab.txt', 'w')