def ngram_in_collection(ngram, coll):
    """Check if any of the ngram's stemmed components are in the collection."""
    s1 = set(stem(word) for word in ngram.split(' '))
    s2 = set(stem(word) for word in coll)
    return len(s1.intersection(s2)) > 0
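# A hedged usage sketch for ngram_in_collection (toy data, not from the
# source): stemming lets "running shoes" overlap a collection holding "run".
from stemming.porter2 import stem  # the stemmer these snippets assume

def _demo_ngram_in_collection():
    # stem("running") == stem("run") == "run", so the stem sets intersect.
    return ngram_in_collection("running shoes", ["run", "walk"])  # True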
def tokenize(self):
    punc = """\\.!?,(){}[]"'"""
    wordarray = []
    for c in self.document.lower().split():
        # Strip punctuation before the stopword check so that tokens like
        # "the," are still recognized as stopwords.
        word = c.strip(punc)
        if stem(word) not in self.corpus.stopwords:
            wordarray.append(stem(word))
    return wordarray
def tokenize_porter(title, body):
    """Break text into words and stem using the Porter stemmer."""
    # Break up words and remove stopwords.
    title_break = stopWords(nltk.word_tokenize(title), lower_case=True)
    body_break = stopWords(nltk.word_tokenize(body), lower_case=True)
    return (["title:" + stem(word) for word in title_break] +
            ["body:" + stem(word) for word in body_break])
def makeFreqDictionaryOfSentenceWords(s1):
    words1 = s1.split()
    dt1 = {}
    for w in words1:
        if w.lower() not in stopwords:
            key = stem(w.lower())
            dt1[key] = dt1.get(key, 0) + 1
    return dt1
def find_collocations_tri(filename):
    text_file = open(filename, 'r')
    most_common_words = find_most_common_words(text_file, 100)
    second_word = None
    third_word = None
    fourth_word = None
    collocations = dict()
    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = fourth_word
            fourth_word = trim_word(word)
            if (first_word not in most_common_words and
                    second_word not in most_common_words and
                    third_word not in most_common_words) and \
               (first_word and first_word[0].islower() and
                    second_word and second_word[0].islower() and
                    third_word and third_word[0].islower()):
                count_collocations_tri(collocations,
                                       stem(first_word.lower()),
                                       stem(second_word.lower()),
                                       stem(third_word.lower()))
    # Extra iteration for the last word; guard against short inputs and
    # normalize the words the same way the loop body does.
    first_word = second_word
    second_word = third_word
    third_word = fourth_word
    if first_word and second_word and third_word:
        count_collocations_tri(collocations,
                               stem(first_word.lower()),
                               stem(second_word.lower()),
                               stem(third_word.lower()))
    sort_collocations_tri(collocations)
def get_pmi(self, word0, word1):
    """Return the pointwise mutual information, a measure of word
    association within a window, for two words. This is normalized
    following Bouma (2009) to avoid infinite values for OOV terms.
    """
    word0 = word0.lower()
    word1 = word1.lower()
    if self.stemming:
        word0 = porter2.stem(word0)
        word1 = porter2.stem(word1)

    if word0 not in self.word_counts or word1 not in self.word_counts:
        return -1

    # Pair counts are stored with the lexicographically smaller word first.
    if word0 < word1:
        pair_counts = self.word_pair_counts[word0][word1]
    else:
        pair_counts = self.word_pair_counts[word1][word0]

    if pair_counts == 0:
        return -1

    # Total count stored under the module-level `anyword` sentinel key.
    num_words = self.word_counts[anyword]

    # TODO: confirm normalization. Currently assuming words are
    # normalized by num_words and pairs by num_words^2.
    ratio = pair_counts / (self.word_counts[word0] * self.word_counts[word1])
    pmi = np.log(ratio)
    normalized_pmi = -pmi / np.log(pair_counts / (num_words * num_words))
    return normalized_pmi
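# A hedged sketch of the arithmetic above on toy counts (assumed, not from
# the source). NPMI divides PMI by -log p(x, y), which bounds it in [-1, 1]:
import numpy as np

def _npmi_sketch(pair_count, count0, count1, num_words):
    # Mirrors get_pmi: pmi = log(pair / (count0 * count1)) under the
    # normalization assumed in the TODO, then divide by -log p(x, y).
    pmi = np.log(pair_count / (count0 * count1))
    return -pmi / np.log(pair_count / (num_words * num_words))

# _npmi_sketch(5.0, 10.0, 20.0, 1000.0) reproduces get_pmi's value for word
# counts 10 and 20, a pair count of 5, and a corpus of 1000 words.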
def read(self, publication_keyword, publication_data):
    words = open(publication_keyword, 'r').readlines()
    for i in range(0, self.topic_number):
        s = stem(words[i].split('\t')[0])
        self.topics[s] = dict()
        self.stemword_dict[s] = words[i].split('\t')[0]

    content = open(publication_data, 'r').readlines()
    counter = 0
    year = ''
    for i in content:
        # Every four lines represent one publication.
        if counter % 3000 == 0:
            print(counter / 3)
        # Record the year of this publication.
        if counter % 4 == 1:
            year = int(i.strip())
        # Parse the keywords of this publication.
        elif counter % 4 == 3:
            keywords = i.strip().split(' ')
            for j in keywords:
                j = stem(j)
                if j in self.topics:
                    if year in self.topics[j]:
                        self.topics[j][year] += 1
                    else:
                        self.topics[j][year] = 1
        counter = counter + 1
def freq(text, index):
    text = text.strip()
    textList = re.split('\W+', text)
    if len(textList) > 1:
        textList = [stem(word) for word in textList]
        setList = list()
        length = len(textList) - 1
        for word in textList:
            if word not in index:
                print 0
                return
            # Shift each posting's position so that all words of the
            # phrase align on the same offset.
            wordSet = {(tuples[0], tuples[1] + length) for tuples in index[word]}
            setList.append(wordSet)
            length -= 1
        docNum = setList[0]
        for Docset in setList:
            docNum = docNum & Docset
        print len(docNum)
    else:
        text = stem(textList[0])
        if text not in index:
            print 0
            return
        print len(index[text])
def calculateScore(query, qID):
    sfile = open('../AP_DATA/stoplist.txt', 'r')
    sList = sfile.read().split('\n')
    query = query.lower()
    qList = re.findall("\w+[\.?\w+]*", query)
    temp = list()
    for term in qList:
        # Strip a single trailing '.' or leading '_' from short tokens.
        if term.endswith('.') and term.count('.') == 1 and len(term) > 1:
            term = term.replace('.', '')
        if term.startswith('_') and term.count('_') == 1 and len(term) > 1:
            term = term.replace('_', '')
        temp.append(term)
    qList = temp
    if index_num == '4':
        qList = [i for i in temp if i not in sList]
        qList = [stem(term) for term in qList]
    if index_num == '3':
        qList = [stem(term) for term in qList]
    if index_num == '2':
        qList = [i for i in temp if i not in sList]
def find_collocations(file_name, data, popular_word):
    text_file = open(file_name, 'r')
    file_content = text_file.read()
    most_common_words = find_most_common_words(file_content, popular_word)
    second_word = None
    third_word = None
    collocations = data
    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = trim_word(word)
            if (first_word not in most_common_words and
                    second_word not in most_common_words) and \
               (first_word and first_word[0].islower() and
                    second_word and second_word[0].islower()):
                count_collocations(collocations,
                                   stem(first_word.lower()),
                                   stem(second_word.lower()))
    # Extra iteration for the last word; guard against short inputs and
    # normalize the words the same way the loop body does.
    first_word = second_word
    second_word = third_word
    if first_word and second_word:
        count_collocations(collocations,
                           stem(first_word.lower()),
                           stem(second_word.lower()))
    collocations = find_whole_collocations_from_stems(collocations, file_content)
    return collocations, most_common_words, file_content
def sentence_matches(self, sentence_text):
    """Returns True iff the sentence contains this mention's upstream
    and downstream participants, and one of the stemmed verbs in the
    sentence matches the stemmed action type.
    """
    has_upstream = False
    has_downstream = False
    has_verb = False

    # Get the first word of the action type and assume this is the verb
    # (e.g. "depends" for "depends on").
    actiontype_words = word_tokenize(self.mention.actiontype)
    actiontype_verb_stemmed = stem(actiontype_words[0])
    words = word_tokenize(sentence_text)

    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.upstream.lower()):
        has_upstream = True
    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.downstream.lower()):
        has_downstream = True
    for word in words:
        if actiontype_verb_stemmed == stem(word):
            has_verb = True
    return has_upstream and has_downstream and has_verb
def find_collocations_penta(text, data, popular_word):
    most_common_words = find_most_common_words(text, popular_word)
    second_word = None
    third_word = None
    fourth_word = None
    fifth_word = None
    sixth_word = None
    collocations = data
    for word in text.split():
        first_word = second_word
        second_word = third_word
        third_word = fourth_word
        fourth_word = fifth_word
        fifth_word = sixth_word
        sixth_word = trim_word(word)
        if (first_word not in most_common_words and
                second_word not in most_common_words and
                third_word not in most_common_words and
                fourth_word not in most_common_words and
                fifth_word not in most_common_words) and \
           (first_word and first_word[0].islower() and
                second_word and second_word[0].islower() and
                third_word and third_word[0].islower() and
                fourth_word and fourth_word[0].islower() and
                fifth_word and fifth_word[0].islower()):
            count_collocations_penta(collocations,
                                     stem(first_word.lower()),
                                     stem(second_word.lower()),
                                     stem(third_word.lower()),
                                     stem(fourth_word.lower()),
                                     stem(fifth_word.lower()))
    # Extra iteration for the last word; guard against short inputs and
    # normalize the words the same way the loop body does.
    first_word = second_word
    second_word = third_word
    third_word = fourth_word
    fourth_word = fifth_word
    fifth_word = sixth_word
    if first_word and second_word and third_word and fourth_word and fifth_word:
        count_collocations_penta(collocations,
                                 stem(first_word.lower()),
                                 stem(second_word.lower()),
                                 stem(third_word.lower()),
                                 stem(fourth_word.lower()),
                                 stem(fifth_word.lower()))
    return collocations, most_common_words
def cleanText(text, entities, category):
    cleanText = text
    hashtags = entities.get('hashtags', [])
    ranges = []
    for hashtag in hashtags:
        if hashtag.get('text', '').lower() == category:
            indices = hashtag.get('indices')
            ranges.append(indices)
    urls = entities.get('urls', [])
    urls.reverse()
    ranges.extend([v for url in urls for k, v in url.iteritems() if k == 'indices'])
    media = entities.get('media', [])
    media.reverse()
    ranges.extend([v for medium in media for k, v in medium.iteritems() if k == 'indices'])
    # Remove entity spans from the end of the text backwards so that
    # earlier indices stay valid.
    ranges = sorted(ranges, key=lambda x: x[0], reverse=True)
    for r in ranges:
        cleanText = cleanText[:r[0]] + cleanText[r[1] + 1:]
    # Drop any word that shares a stem with the category.
    category_stem = stem(category).lower()
    cleanTextList = cleanText.split(' ')
    cleanText = []
    for word in cleanTextList:
        if (category_stem not in stem(word).lower() and
                stem(word).lower() not in category_stem):
            cleanText.append(word)
    cleanText = " ".join(cleanText)
    return cleanText
def main():
    nlp_file = open(sys.argv[1], "r")
    for line in nlp_file:
        words = line.strip().split(" ")
        for word in words:
            print stem(word)
    nlp_file.close()
def getVocabularyStem(content):
    vocabulary = {}
    index = 0
    for i in range(len(content)):
        stemmed = stem(content[i])
        if stemmed not in vocabulary:
            vocabulary[stemmed] = index
            index = index + 1
    return vocabulary
def getSentTf(sent, stopwords):
    doc = dict()
    for word in re.split("[^a-zA-Z0-9]", sent):
        word = word.lower()
        if word != "" and word != "'" and stem(word) not in stopwords:
            key = stem(word)
            doc[key] = doc.get(key, 0) + 1
    return doc
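# A hedged usage sketch for getSentTf (the stopword set here is assumed, not
# from the source): inflected forms collapse onto a single stemmed key.
def _demo_getSentTf():
    return getSentTf("Cats chase cats and a cat naps", {"and", "a"})
    # -> {'cat': 3, 'chase': 1, 'nap': 1} with porter2 stemming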
def filter(self):
    # Do not generate an HTML file; just filter the correct relationships.
    correct_list = list()
    for i in range(0, len(self.linklist)):
        key0 = stem(self.linklist[i][0])
        key1 = stem(self.linklist[i][1])
        if self.judge(key0, key1, i) is False:
            continue
        correct_list.append(i)
    return correct_list
def get_word_embedding(word, w2vmodel):
    # Fall back through progressively normalized forms of the word.
    if word in w2vmodel:
        return w2vmodel[word]
    elif stem(word) in w2vmodel:
        return w2vmodel[stem(word)]
    elif word.lower() in w2vmodel:
        return w2vmodel[word.lower()]
    elif stem(word.lower()) in w2vmodel:
        return w2vmodel[stem(word.lower())]
    else:
        return None
def form_regex_for_common_words():
    expr = ""
    count = 0
    common_words = fp.read().split()
    for word in common_words:
        count += 1
        if count == len(common_words):
            expr += "^" + stem(word) + "$"
        else:
            expr += "^" + stem(word) + "$|"
    return expr
def naive_wc_sim(str1, str2):
    list1 = nltk.word_tokenize(str1)
    list2 = nltk.word_tokenize(str2)
    count = 0
    for w1 in list1:
        stw1 = stem(w1)
        for w2 in list2:
            stw2 = stem(w2)
            if stw1 == stw2:
                count += 1
    return (1.0 * count) / (1.0 * min(len(list1), len(list2)))
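# A hedged worked example for naive_wc_sim (toy strings, not from the source;
# nltk's punkt tokenizer data is assumed installed): "the cat runs" and
# "a cat running" share the stems 'cat' and 'run', and the shorter
# tokenization has 3 tokens, so the score is 2/3.
def _demo_naive_wc_sim():
    return naive_wc_sim("the cat runs", "a cat running")  # ~0.667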
def getDocTf(fileName, stopwords):
    doc = dict()
    with open(fileName, "r") as fi:
        for line in fi:
            for word in re.split("[^a-zA-Z0-9]", line.strip()):
                word = word.lower()
                if word != "" and word != "'" and stem(word) not in stopwords:
                    key = stem(word)
                    doc[key] = doc.get(key, 0) + 1
    return doc
def overlapMeasure(strA, strB, stopwords):
    # Split and lowercase tokens, dropping question words.
    tokA = [x.lower() for x in strA.split(' ')
            if x != 'what' and x != 'why' and x != 'how']
    tokB = [x.lower() for x in strB.split(' ')
            if x != 'what' and x != 'why' and x != 'how']
    # Stem if the stemming package is available; otherwise fall back to
    # comparing the raw tokens.
    try:
        from stemming.porter2 import stem
        tokA = [stem(x) for x in tokA]
        tokB = [stem(x) for x in tokB]
    except ImportError:
        pass
    overlap = naiveOverlap(tokA, tokB, stopwords)
    return overlap
def process_item(self, item, spider):
    url = item["url"]
    title = item["title"]
    main = item["content"]
    title = re.findall(r'[A-Za-z0-9]\w*', title.lower())
    main = re.findall(r'[A-Za-z0-9]\w*', main.lower())
    for i in range(len(main)):
        main[i] = stem(main[i])
    for i in range(len(title)):
        title[i] = stem(title[i])
    # Drop determiners, prepositions, conjunctions, and "to".
    delWord = dict(nltk.pos_tag(main))
    for i in delWord:
        if delWord[i] in ('DT', 'IN', 'CC', 'TO'):
            for j in range(main.count(i)):
                main.remove(i)
    delWord = dict(nltk.pos_tag(title))
    for i in delWord:
        if delWord[i] in ('DT', 'IN', 'CC', 'TO'):
            for j in range(title.count(i)):
                title.remove(i)
    new_main = main + title
    # Record the positions of each stemmed word.
    main_pos = {}
    for i in range(len(new_main)):
        if main_pos.get(new_main[i], 0) == 0:
            main_pos[new_main[i]] = [i]
        else:
            main_pos[new_main[i]].append(i)
    main = Counter(main)
    title = Counter(title)
    # Weight title words more heavily than body words.
    for i in title:
        title[i] *= 2
    for i in title:
        title[i] = max(title[i], main.get(i, 0)) * 2 + min(title[i], main.get(i, 0))
        main[i] = 0
    main.update(title)
    return {
        "url": url,
        "title": item["title"],
        "content": item["content"],
        "words": main,
        "wordspos": main_pos,
    }
def visit(word, depth):
    if depth > max_depth:
        return
    if word in visited:
        return
    word_stem = stem(word)
    visited.add(word)
    if word not in freqs:
        freqs[word] = dict()
        stems[word] = dict()
    # Look up the word's definition via the macOS dictionary service.
    text = DictionaryServices.DCSCopyTextDefinition(None, word, (0, len(word)))
    if not text or len(text) == 0:
        return
    # We don't care about any of the origin/etymology data, so remove it.
    text = text.split('ORIGIN')[0]
    # Remove any punctuation, weird characters, etc.
    filtered_text = re.sub(r'[\W\d]+', ' ', text).lower()
    words = filtered_text.split()
    for w in words:
        w_stem = stem(w)
        if w != word and len(w) >= min_word_len and w_stem != word_stem:
            if w not in freqs:
                freqs[w] = dict()
                stems[w] = dict()
            # Count co-occurrences in both directions, collapsing words
            # that share a stem onto the first surface form seen.
            if w_stem not in stems[word]:
                freqs[word][w] = 1 if w not in freqs[word] else freqs[word][w] + 1
                stems[word][w_stem] = w
            else:
                same_stem = stems[word][w_stem]
                freqs[word][same_stem] = freqs[word][same_stem] + 1
            if word_stem not in stems[w]:
                freqs[w][word] = 1 if word not in freqs[w] else freqs[w][word] + 1
                stems[w][word_stem] = word
            else:
                same_stem = stems[w][word_stem]
                freqs[w][same_stem] = freqs[w][same_stem] + 1
            unigram_freqs[w] = 1 if w not in unigram_freqs else unigram_freqs[w] + 1
            visit(w, depth + 1)
def processQuery():
    stopwords = set()
    stopfile = open("stoplist.txt")
    for stopword in stopfile:
        stopwords.add(stopword.rstrip())
    queries = loadQueries()
    for queryString in queries.keys():
        query = queries[queryString]
        # Handle dots ("."): U.S. becomes US.
        query = ''.join(e for e in query if e != '.')
        # Remove punctuation.
        query = re.sub('[^a-zA-Z0-9\n\.]', ' ', query).rstrip()
        # Remove stop words.
        result = query.split(" ")
        mystr = ''
        for term in result:
            if term == '' or term == ' ':
                continue
            if term not in stopwords:
                # Convert to lower case and stem.
                term = str(term).lower()
                term = str(stem(term))
                mystr += term + ' '
        queriesdict[queryString] = mystr.rstrip()
def ToVS(text):
    VS = dict()
    text = text.lower()
    # Count punctuation-based features before stripping punctuation.
    VS["#!"] = len(re.findall("!", text))
    VS["#?"] = len(re.findall("\\?", text))
    VS["#()"] = len(re.findall("\\(|\\)", text))
    VS["#numbers"] = len(re.findall("\\d+", text))
    VS["##"] = len(re.findall("#", text))
    VS["#{}"] = len(re.findall("\\{|\\}", text))
    VS["#[]"] = len(re.findall("\\[|\\]", text))
    VS["#comparison"] = len(re.findall("<|>|=", text))
    VS['#"'] = len(re.findall('"', text))
    VS['#math'] = len(re.findall('\\+|\\-|\\*|\\/', text))
    VS['#_'] = len(re.findall('_', text))
    VS['#oneCharacter'] = 0
    for c in string.punctuation:
        if c != "-" and c != "_":
            text = text.replace(c, " ")
        else:
            text = text.replace(c, "")
    text = re.sub("[\\s|\\d]+", " ", text)
    text = filter(lambda x: x in string.printable, text)
    for w in text.split(" "):
        word = stem(w)
        if word != "":
            if len(word) == 1:
                VS["#oneCharacter"] += 1
            elif word in VS:
                VS[word] += 1
            else:
                VS[word] = 1
    return VS
def __clean__(self, text):
    """
    Clean up a document through stemming and stop word removal.

    Stemming is the act of removing suffixes from a word to limit variation
    between verb tenses. Stop word removal is the act of removing common
    words from the document that likely play no part in the significance of
    the document.

    :param text: The document to be cleaned
    :return: The given document after stemming and stop word removal
    """
    # Strip URLs, @-mentions, and hashtags.
    text = re.sub("((http:|https:|ftp:|ftps:)//[\w$-_.+!*'(),%=]+)", '', text)
    text = re.sub("(@[\w_]+)", '', text)
    text = re.sub("(#[\w!$-_.+!*'(),%=]+)", '', text)
    # Strip remaining punctuation. (A Unicode property class like \p{P}
    # would need the third-party `regex` module; stdlib `re` rejects it.)
    text = re.sub(r"[^\w\s]+", '', text)
    stopwords = ["a","a's","able","about","above","according","accordingly","across","actually","after",
        "afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already",
        "also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow",
        "anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are",
        "aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b",
        "be","became","because","become","becomes","becoming","been","before","beforehand","behind","being",
        "believe","below","beside","besides","best","better","between","beyond","both","brief","but","by",
        "c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly",
        "changes","clearly","co","com","come","comes","concerning","consequently","consider","considering",
        "contain","containing","contains","corresponding","could","couldn't","course","currently","d",
        "definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't",
        "done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere",
        "enough","entirely","especially","et","etc","even","ever","every","everybody","everyone",
        "everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five",
        "followed","following","follows","for","former","formerly","forth","four","from","further",
        "furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten",
        "greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he",
        "he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers",
        "herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd",
        "i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate",
        "indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd",
        "it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l",
        "last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked",
        "likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean",
        "meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n",
        "name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless",
        "new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel",
        "now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones",
        "only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside",
        "over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus",
        "possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re",
        "really","reasonably","regarding","regardless","regards","relatively","respectively","right","s",
        "said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed",
        "seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several",
        "shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone",
        "something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify",
        "specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than",
        "thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then",
        "thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon",
        "these","they","they'd","they'll","they're","they've","think","third","this","thorough",
        "thoroughly","those","though","three","through","throughout","thru","thus","to","together","too",
        "took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under",
        "unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses",
        "using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was",
        "wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what",
        "what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby",
        "wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever",
        "whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder",
        "would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours",
        "yourself","yourselves","z","zero"]
    sb = []
    text = text.lower()
    for term in text.split():
        term = stem(term)
        if term not in stopwords:
            sb.append(term)
    return ' '.join(sb)
def index(urls):
    """
    Goal: Download a list of webpages.

    Parameter:
        urls: list of strings giving the address of each webpage.
    """
    if not os.path.isdir('files'):
        os.makedirs('files')
    for webpage in urls:
        name = webpage.split('/')[-1]
        os.system("wget " + webpage + " -q -O files/" + name)
        logging.info("Downloaded: " + name)
    b_o_w = {}
    for web_file in os.listdir('files'):
        try:
            text_html = open('files/' + web_file, 'r').read()
            text = [stem(word.lower()) for word in html2text(text_html).split()]
            b_o_w[web_file] = text
            logging.info("Tokenized: " + web_file)
        except:
            # Something strange happened with the webpage of New_York_City.
            print("There is a problem with " + web_file)
    index_file = open("index_file.pck", "w")
    pickle.dump(b_o_w, index_file)
    index_file.close()
def get_trcmparer_sim(origin_sentences):
    flat_sentences = []
    stopwords = get_stopwords("english_stopwords.txt")
    for sentence in origin_sentences:
        sent_tmp = []
        for word in sentence:
            if word.isalnum():
                word = word.lower()
                if word not in stopwords:
                    sent_tmp.append(stem(word))
        flat_sentences.append(sent_tmp)
    trcmp_matrix = np.zeros((len(flat_sentences), len(flat_sentences)), dtype=np.float32)
    for i in range(0, len(flat_sentences)):
        for j in range(i + 1, len(flat_sentences)):
            if len(flat_sentences[i]) == 0 or len(flat_sentences[j]) == 0:
                continue
            intersection_word = intersectionSet(flat_sentences[i], flat_sentences[j])
            trcmp_matrix[i][j] = (len(intersection_word) * 1.0) / (
                np.log(len(flat_sentences[i]) + 1) + np.log(len(flat_sentences[j]) + 1))
            trcmp_matrix[j][i] = trcmp_matrix[i][j]
    trcmp_matrix = scale_0_1(trcmp_matrix)
    return trcmp_matrix
def get_sent_embedding_w2v(sent, w2vmodel, mode, tfidf_vectorizer=None):
    result_vec = []
    set_words = set(sent)
    for w_idx, word in enumerate(set_words):
        if not word.isalnum():
            continue
        word_vector = get_word_embedding(word, w2vmodel)
        if word_vector is None:
            continue
        tf_word = sent.count(word)
        if tfidf_vectorizer is not None:
            # Look up the IDF under the raw word first, then its stem.
            stemmed = stem(word)
            if word in tfidf_vectorizer.vocabulary_:
                idf_word = tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_[word]]
            elif stemmed in tfidf_vectorizer.vocabulary_:
                idf_word = tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_[stemmed]]
            else:
                idf_word = 0
            tf_idf_word = tf_word * idf_word
            word_vector = np.concatenate((word_vector, np.array([tf_idf_word])))
        result_vec.append(word_vector)
    result_vec = np.array(result_vec)
    if mode == "mean":
        return np.mean(result_vec, axis=0)
    return np.sum(result_vec, axis=0)
def stem_inventory(inventory):
    words = inventory.split(" ")
    stem_words = []
    for word in words:
        if "_" in word or '-' in word or "\\" in word:
            # Compound words are kept as-is, without stemming.
            stem_words.append(word)
        else:
            stem_words.append(stem(word))
    return stem_words
def removeStems(data):
    """
    Purpose: Computes the stem of each word in the passed word list.
    Returns: A list containing a stem for each word in the word list.
    """
    stemList = []
    for d in data:
        stemList.append(stem(d))
    return stemList
def foodwordList(keyword):
    tweets = twitterImport.getTweets(keyword)
    tmpList = []
    for tweet in tweets['statuses']:
        tmpList.append([stem(word) for word in tweet['text'].split(" ")])
    finalList = []
    for words in tmpList:
        finalList = finalList + words
    return finalList
def create_features(x):
    phi = defaultdict(lambda: 0)
    for word in x:
        word = stem(word)
        phi["UNI:" + word] += 1
    return phi
def search_fast(term: str, vocab: pd.DataFrame) -> list:
    stemmed_term = stem(term.lower())
    options = []
    for row in vocab[vocab["stemmed"].str.contains(stemmed_term)].itertuples(index=True):
        options.append((row.URI, row.Label))
        if len(options) >= 151:
            break
    return options
def addTerm(line, curr_doc, stopWordsList):
    for term in line.split():
        term = term.lower()
        term = stem(term)
        if term not in stopWordsList and len(term) > 3:
            try:
                curr_doc[term] += 1
            except KeyError:
                curr_doc[term] = 1
def clean(doc):
    remove_digits = str.maketrans('', '', digits)
    doc = doc.translate(remove_digits)
    # Strip HTML markup, then scrub leftover tags, entities, and punctuation.
    doc = BeautifulSoup(doc, "html.parser").text
    doc = re.sub(cleanr, ' ', doc)
    for token in ("<div>", "</div>", ".</div>", "<br />", ".", ":", ",", "_",
                  '-', '(', ')', '#', '/', " div ", " br ", "nbsp", "ndash",
                  "’", "™"):
        doc = doc.replace(token, " ")
    doc = re.sub(r"\&([^;.]*);", " ", doc)
    doc = re.sub(r"([0-9]+)-([0-9]+)", " ", doc)
    doc = re.sub(r"\d", " ", doc)
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    punc_free = re.sub(r"\b\d+\b", " ", punc_free)
    words = word_tokenize(punc_free)
    lemmatized_words = [lemma.lemmatize(word) for word in words]
    finallist = []
    for ch in lemmatized_words:
        if (2 < len(ch) < 13 and ch.encode('utf-8').isalnum()
                and not re.search(r'\d', ch)):
            # Map to a canonical vocabulary entry when one exists, then stem.
            try:
                finallist.append(stem(vocab_mapper[ch]))
            except KeyError:
                finallist.append(stem(ch))
    final = " ".join(finallist)
    return final
def lyrics_to_bow(lyrics):
    """
    Main function to stem and create a bag of words.
    It is what we used for the musiXmatch dataset.
    It is heavily oriented towards English lyrics, we apologize for that.
    INPUT
        lyrics as a string
    RETURN
        dictionary word -> count
        or None if something was wrong (e.g. not enough words)
    """
    # Remove end of lines.
    lyrics_flat = lyrics.replace('\r', '\n').replace('\n', ' ').lower()
    lyrics_flat = ' ' + lyrics_flat + ' '
    # Special cases (English...).
    lyrics_flat = lyrics_flat.replace("'m ", " am ")
    lyrics_flat = lyrics_flat.replace("'re ", " are ")
    lyrics_flat = lyrics_flat.replace("'ve ", " have ")
    lyrics_flat = lyrics_flat.replace("'d ", " would ")
    lyrics_flat = lyrics_flat.replace("'ll ", " will ")
    lyrics_flat = lyrics_flat.replace(" he's ", " he is ")
    lyrics_flat = lyrics_flat.replace(" she's ", " she is ")
    lyrics_flat = lyrics_flat.replace(" it's ", " it is ")
    lyrics_flat = lyrics_flat.replace(" ain't ", " is not ")
    lyrics_flat = lyrics_flat.replace("n't ", " not ")
    lyrics_flat = lyrics_flat.replace("'s ", " ")
    # Remove boring punctuation and weird signs.
    punctuation = (',', "'", '"', ",", ';', ':', '.', '?', '!', '(', ')',
                   '{', '}', '/', '\\', '_', '|', '-', '@', '#', '*')
    for p in punctuation:
        lyrics_flat = lyrics_flat.replace(p, '')
    words = filter(lambda x: x.strip() != '', lyrics_flat.split(' '))
    # Stem words.
    words = map(lambda x: stem(x), words)
    bow = {}
    for w in words:
        if w not in bow:
            bow[w] = 1
        else:
            bow[w] += 1
    # Remove special words that are wrong. Copy the keys so we can pop
    # entries while iterating.
    fake_words = ('>', '<', 'outro~')
    for bw in list(bow.keys()):
        if bw in fake_words:
            bow.pop(bw)
        elif bw.find(']') >= 0:
            bow.pop(bw)
        elif bw.find('[') >= 0:
            bow.pop(bw)
    # Not big enough? Remove instrumental ones among others.
    if len(bow) <= 3:
        return None
    return bow
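# A hedged usage sketch for lyrics_to_bow (toy lyric, not from the dataset):
# contractions are expanded, punctuation stripped, and words stemmed before
# counting, so "running" and "run" land in the same bucket.
def _demo_lyrics_to_bow():
    bow = lyrics_to_bow("I'm running, running... run!\nshe's singing")
    # Roughly: {'i': 1, 'am': 1, 'run': 3, 'she': 1, 'is': 1, 'sing': 1}
    return bow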
def meaning_bag(pos, url):
    '''
    Return a "meaning bag" given a WordSmyth url.

    A meaning bag is a bag of stemmed words derived from a WordSmyth url
    of a definition page.

    pos - either "noun" or "verb"
    url - WordSmyth url (e.g. "https://www.wordsmyth.net/?level=3&ent=dog")
    '''
    # Download the page to a temporary file, parse it, then delete the file.
    filename = wget.download(url, out='{}.html'.format(uuid.uuid4().hex))
    page = open(filename).read()
    soup = BeautifulSoup(page, 'html.parser')
    os.remove(filename)

    bag = set()
    maintable = soup.tbody.find('table', class_="maintable")
    correct_pos = False
    for tr in maintable.tbody.findChildren():
        if tr.get("class") and tr.get("class")[0] == "postitle":
            if tr.find("td", class_="data").a:
                pos_ = tr.find("td", class_="data").a.text
            else:
                pos_ = tr.find("td", class_="data").text
            correct_pos = pos in pos_.split()
        elif tr.get("class") and tr.get("class")[0] == "definition" and correct_pos:
            def_ = tr.find("td", attrs={'class': 'data'}).find_all(
                text=True, recursive=False)[0]
            # Update bag with words from the definition.
            bag.update([t.lower() for t in tokenizer.tokenize(def_)])
            # Check for "similar words".
            if tr.find("td", attrs={'class': 'data'}).find("dl"):
                sim_words = tr.find("td", attrs={'class': 'data'}).dl.dd.a.text
                # Update bag with words from the "similar words" section.
                bag.update([t.lower() for t in tokenizer.tokenize(sim_words)])
        elif tr.get("class") and tr.get("class")[0] == "related_word" and correct_pos:
            rel_words = tr.find("td", class_="data").a.text
            # Update bag with words from the "related words" section.
            bag.update([t.lower() for t in tokenizer.tokenize(rel_words)])
    # Remove stopwords, then stem.
    bag -= STOPWORDS
    bag = set([stem(w) for w in bag])
    return bag
def document_to_query(self, doc):
    """Given a document, transform the source-code-related fields into a
    Lucene query string.
    """
    query = ""
    for field in ["description"]:
        for val in doc.getFields(field):
            if val.stringValue().strip():
                term = QueryParser.escape(val.stringValue())
                # Tokenize.
                term = self.tokenize_string(StandardAnalyzer(), term)
                # Split camelCase identifiers.
                temp = []
                for t in term:
                    temp += self.camel_case_split(t)
                # Remove stopwords.
                temp_2 = [t for t in temp if t not in english_stop_words]
                # Stem.
                temp_3 = [stem(t) for t in temp_2]
                # Remove stopwords again, since stemming can produce
                # new stopword matches.
                temp_4 = [t for t in temp_3 if t not in english_stop_words]
                # Query generation.
                for term in temp_4:
                    query += "%s:%s " % (field, term)

    for field in ["typed_method_call", "methods", "used_classes",
                  "class_instance_creation", "methods_called",
                  "annotations", "literals"]:
        for val in doc.getFields(field):
            if val.stringValue().strip():
                term = QueryParser.escape(val.stringValue())
                java_stoplist = ["java.lang.Object", 'void', 'Global',
                                 'boolean', 'String', 'int', 'char', 'float',
                                 'double', 'write', 'close', 'from',
                                 'println', 'StringBuilder', 'toString',
                                 'mkdir', 'exists']
                if term not in java_stoplist:
                    query += "%s:%s " % (field, term)

    if len(doc.getFields("code_hints")) > 0:
        hints = [hint.stringValue() for hint in doc.getFields("code_hints")]
        for term in hints:
            if term:
                term = QueryParser.escape(term)
                if term not in english_stop_words:
                    query += "code_hints:%s " % term
    return query
def processscript(filename):
    print '\n' + filename
    f = open(filename, 'r')
    s = f.read()
    s = s.replace('\\n', ' ')
    s = s.replace('\\t', ' ')
    s = re.sub(r'[^a-zA-Z]', r'\t', s)
    # Tokenize.
    x = wordpunct_tokenize(s)
    tokenized = len(x)
    # Remove words of length 1-2.
    kept = []
    for word in x:
        if len(word) > 2:
            kept.append(word)
    remove12 = len(kept)
    # Stem.
    fin = {}
    for wd in kept:
        tem = stemport.stem(wd)
        if tem in fin:
            fin[tem] = fin[tem] + 1
        else:
            fin[tem] = 1
    stemmed = len(fin)
    # Remove stop words.
    f = open('stop.txt', 'r')
    for line in f:
        for word in line.split():
            if word in fin:
                del fin[word]
    stopped = len(fin)
    print 'tokenized:'
    print tokenized
    print 'remove12:'
    print remove12
    print 'stemmed:'
    print stemmed
    print 'stopped:'
    print stopped
    result = ''.join('%s ' % k for k, v in fin.iteritems())
    return result
def extract_stem(self, sentence):
    if self._language == 'ko':
        spaced = self._morphs(
            unicodedata.normalize('NFKC', sentence.strip()).translate(self._table))
    elif self._language == 'en':
        spaced = [stem(j) for j in self._morphs(self.normalize_string(sentence))]
    return spaced
def tokenise_stem(text):
    '''Removes punctuation, lowercases all characters, and returns a list
    of the stemmed words, split at spaces or newlines.'''
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    dig = re.compile('\d')
    text_clean1 = punct.sub('', text)
    # Convert all digits to 0 to optimize indexing.
    text_clean = dig.sub('0', text_clean1)
    # Lowercase before splitting so the stems are case-insensitive.
    tokens = text_clean.lower().split()
    stemmed_tokens = [stem(token) for token in tokens]
    return stemmed_tokens
def fp_steps(self, text):
    title = text.strip().lower()
    title_splchar_removed = self.remove_spl_char_regex.sub(" ", title)
    title_number_removed = self.remove_num.sub("", title_splchar_removed)
    words = title_number_removed.split()
    stop_words = nltk.corpus.stopwords.words('english')
    filter_stop_words = [w for w in words if w not in stop_words]
    stemed = [stem(w) for w in filter_stop_words]
    return sorted(stemed)
def get_pages(search_query):
    index = indexer.construct_index('indices/index_1.txt')
    search_query = nltk.word_tokenize(search_query)
    search_query = [stem(word.lower()) for word in search_query]
    print search_query
    pages = set(index.get(search_query[0]))
    for i in xrange(1, len(search_query)):
        word = search_query[i]
        pages = pages.intersection(set(index.get(word)))
    return list(pages)
def clean_text(text: str) -> str:
    """Clean text for TFIDF."""
    # The \p{P} (Unicode punctuation) property class requires the
    # third-party `regex` module; the stdlib `re` module rejects it.
    new_text = regex.sub(r'\p{P}+', ' ', text)
    new_text = [stem(i) for i in new_text.lower().split()
                if not re.findall(r'[0-9]', i)]
    return ' '.join(new_text)
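# A hedged usage sketch for clean_text (example string assumed, not from the
# source): punctuation becomes whitespace, digit-bearing tokens are dropped,
# and the survivors are stemmed.
def _demo_clean_text():
    return clean_text("Cleaning, stemming... 42 times!")  # roughly "clean stem time"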
def normalize(sentence, bad_words=_bad_words):
    res = set()
    if isinstance(sentence, (str, unicode)):
        tokens = token_reg.split(sentence.lower().strip())
        for token in tokens:
            if len(token) > 2 and token not in bad_words:
                stemmed = stem(token)
                if stemmed not in stop_words_en:
                    res.add(token)
    return res
def extract_feature(sent):
    with open("./data/stop_words.txt", "r") as f:
        stop_words = [x.strip() for x in f]
    features = []
    for word in sent:
        if word in stop_words:
            continue
        features.append(stem(word.strip()))
    return features
def encodeName(name, wordBag):
    # One-hot style encoding: 0 for every bag word, 1 where a stem appears.
    encoding = [0 for word in wordBag]
    name = name.lower()
    tokens = word_tokenize(name)
    for t in tokens:
        root = stem(t)
        encoding[wordBag.index(root)] = 1
    return encoding
def baseline(line, stop_list):
    word_ls = line.split()
    # Remove stop words.
    word_ls = [w for w in word_ls if not check_stop_words(w)]
    # Stemming.
    return [stem(w.strip()) for w in word_ls]
def clean_up(tweet):
    # Remove hashtags, mentions, and URLs, then lowercase and strip
    # non-alphanumerics.
    tweet = ' '.join([word for word in tweet.split(' ')
                      if not word.startswith('#') and not word.startswith('@')
                      and not word.startswith('http') and not word.startswith('www')])
    tweet = to_alphanum(tweet).lower()
    tweet = tweet.split(' ')
    sw = set(STOPWORDS)  # Allows for O(1) lookup
    # Porter-stem, drop stopwords (raw or stemmed), and emit uni- and bigrams.
    return ['+'.join(i) for i in ngrams(
        [stem(word) for word in tweet
         if word not in sw and stem(word) not in sw and len(word) > 2], 1, 2)]
def createMapper(valueList):
    mapperDict = {}
    for value in valueList:
        # Map both the lowercased value and its stem back to the original value.
        mappedTo = [value.lower(), stem(value.lower())]
        mapperDict.update(dict.fromkeys(mappedTo, value))
    return mapperDict
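# A hedged usage sketch for createMapper (toy values, not from the source):
# both the lowercased surface form and its stem resolve back to the
# canonical label, so stemmed tokens can be mapped to vocabulary entries.
def _demo_createMapper():
    mapper = createMapper(["Running", "Cats"])
    # Maps 'running' and 'run' -> 'Running'; 'cats' and 'cat' -> 'Cats'.
    return mapper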
def mk_word_dic(f):
    word_dic = defaultdict(int)
    for sen in f:
        for w in sen[3:-1].split():
            w = stem(w)
            if stop_func(w):
                continue
            word_dic[w] += 1
    return word_dic
def count_words_web(content):
    word_dicc = dict()
    content = [stem(s.lower())
               for s in content.translate(table, string.punctuation).split()
               if 2 < len(s) < 13 and s not in common_english_words]
    for word in content:
        if word not in word_dicc:
            word_dicc[word] = 1
        else:
            word_dicc[word] += 1
    return word_dicc
def price_q_keywords(keywords, pre):
    from stemming.porter2 import stem
    from textblob import TextBlob
    words = ['price', 'cost']
    if type(keywords) == list:
        keywords = copy.copy(keywords[0])
    text = TextBlob(keywords)
    logi = any(word in [stem(w) for w in text.stripped.split()]
               for word in words)
    return logi
def stem_token_list(words):
    """
    Use the Porter stemming algorithm to remove suffixes (and in some
    cases prefixes) in order to find the "root word", or stem, of a
    given word.
    """
    stemmed_tokens = []
    for word in words:
        stemmed_tokens.append(stem(word))
    return stemmed_tokens
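# A hedged worked example for stem_token_list (toy input, not from the
# source): porter2 reduces inflected forms to a shared root.
def _demo_stem_token_list():
    return stem_token_list(["stemming", "stemmed", "stems"])
    # -> ['stem', 'stem', 'stem']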
def get_logprob(self, word, alpha=0):
    """Return the log probability of a word in the corpus, with optional
    additive smoothing.
    """
    word = word.lower()
    if self.stemming:
        word = porter2.stem(word)
    return np.log(self.word_counts[word] + alpha) - \
        np.log(self.word_counts[anyword] + len(self.word_counts) * alpha)
def clean_text(text):
    if text:
        # Collapse whitespace, then stem every word.
        clean = ' '.join(text.split())
        red_text = [stem(word) for word in clean.split()]
        return ' '.join(red_text)
    else:
        return text
def cleanse_text(text):
    if text:
        # Remove extra whitespace.
        clean = ' '.join(text.split())
        # Lowercase and stem.
        red_text = [stem(i.lower()) for i in clean.split()]
        return ' '.join(red_text)
    else:
        return text
def token_lower_nostop_stem_list(all_text, stopword_list):
    token_list = tokenisation_text(all_text)
    token_lowerlist = lower_word(token_list)
    token_lowerlist_nostop = [
        str(current_word) for current_word in token_lowerlist
        if str(current_word) not in stopword_list
    ]
    stem_list = [stem(current_word) for current_word in token_lowerlist_nostop]
    return stem_list