def informationgaincompare(doc, text1, text2):
    # compare the entropy contribution (-p * log p) of text1 and text2 terms in doc
    punctpattern = re.compile(r'[,;\'"\)\(}{\[\].!\?<>=+-/*\\:]+')
    stop = set(stopwords.words('english'))

    def contentwords(tokens):
        # drop stopwords and punctuation-only tokens
        return [w for w in tokens if w not in stop and punctpattern.match(w) is None]

    t1 = contentwords(tokenize(text1))
    t2 = contentwords(tokenize(text2))
    docwords = contentwords(tokenize(doc))

    count1 = sum(docwords.count(w) for w in t1)
    count2 = sum(docwords.count(w) for w in t2)

    l = len(docwords)
    if l == 0:
        return (0.0, 0.0)
    p1 = float(count1) / l
    p2 = float(count2) / l
    # -p * log(p) tends to 0 as p -> 0, so guard against math.log(0)
    e1 = -p1 * math.log(p1) if p1 > 0 else 0.0
    e2 = -p2 * math.log(p2) if p2 > 0 else 0.0
    return (e1, e2)
def palavrasChaves(self):
    # NLTK stopword lists for English, Portuguese and Spanish
    stopE = stopwords.words('english')
    stop = stopwords.words('portuguese')
    stopS = stopwords.words('spanish')
    palavrasChaves = []
    textoArtigo = []
    # strip punctuation from the title and split it into words
    for i in self.titulo.lower().replace(',', '').replace('.', '').replace('-', '').replace('(', '').replace(')', '').split():
        # skip Portuguese, English and Spanish stopwords, and ignore words with
        # fewer than 3 characters (this handles words such as the verb "é")
        if i not in stop and i not in stopE and i not in stopS and len(i) > 2:
            textoArtigo.append(i)
    # frequency of each remaining word in the article text
    freq = FreqDist(textoArtigo)
    # take the four most frequent words (freq.items()[:4] no longer works in Python 3)
    items = freq.most_common(4)
    # the most frequent words become the keywords
    for palavra, _ in items:
        palavrasChaves.append(palavra)
    return palavrasChaves
def loadQueries(fileloc): setTags=set() global training_doc_count global set_of_tokens xml_data=open(fileloc,'r') buf=xml_data.readlines() xml_data.close() count = 10 for line in buf: #if count < 0: # break #count =count -1 #print line match = re.search('<row(.*)Body="(.*)" OwnerUserId(.*)Title="(.*)"(.*)Tags="(.*)" Answer(.*)/>', line) if match: body=match.group(2) tokens_in_body = re.findall(r"[\w-]+", body,re.UNICODE) valid_tokens=filter(lambda x: x not in stopwords.words('english') and len(x) >= 3,tokens_in_body) title=match.group(4) tokens_in_title = re.findall(r"[\w-]+",title,re.UNICODE) valid_tokens_in_title=filter(lambda x: x not in stopwords.words('english') and len(x) >= 3, tokens_in_title) valid_tokens.extend(valid_tokens_in_title) tags=match.group(6) tokens_in_tags = re.findall(r"[\w-]+", tags,re.UNICODE) valid_tags=filter(lambda x: x not in stopwords.words('english') and len(x) >= 3, tokens_in_tags) #print valid_tokens #print valid_tags training_set_cluster[training_doc_count]=set(valid_tags) for x in valid_tags: setTags.add(x) add_values_to_dict(valid_tokens,training_doc_count) training_doc_count +=1 print len(main_dict) print len(setTags) print len(document_freq_dict)
def extract_features(self, article, feats, threegram_sent_ppl, fourgram_sent_ppl, fivegram_sent_ppl, sixgram_sent_ppl, index = None): featureSet = {} articleWords = article.replace("<s>", "").replace("</s>", "").split() featureSet["articlelen"] = len(articleWords) fx_words = [word for word in articleWords if word.lower() in stopwords.words('english')] featureSet["fxwordcount"] = len(fx_words)/len(articleWords) non_words = [word for word in articleWords if word.isalpha() != True] featureSet["nonwordcount"] = len(non_words)/len(articleWords) content_words = [word for word in articleWords if word.lower() not in stopwords.words('english')] featureSet["contentwordcount"] = len(content_words)/len(articleWords) featureSet["uniquewords"] = len(set(articleWords))/len(articleWords) featureSet.update(feats) try: sents = [x for x in article.split("\n") if len(x) > 1] ppl_five = ppl_wrangling(sents, fivegram_sent_ppl) ppl_six = ppl_wrangling(sents, sixgram_sent_ppl) ppl_three = ppl_wrangling(sents, threegram_sent_ppl) ppl_four = ppl_wrangling(sents, fourgram_sent_ppl) featureSet["ppl-5"] = ppl_five featureSet["ppl-6"] = ppl_six featureSet["ppl-3"] = ppl_three featureSet["ppl-4"] = ppl_four except: pass featureSet.update(self.posTags(index, article)) return featureSet
def word_standardize(sentences):
    tokens = []
    sentences_st = []
    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))

    st = LancasterStemmer()
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

    words = [w.lower() for w in tokens]
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if not w in punctuation]
    st_words = [st.stem(w) for w in words]

    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stopwords.words('english')]
        sent = [w for w in sent if not w in punctuation]
        sent_result.append(sent)

    return st_words, sent_result
def getBOW():
    predatelist, postdatelist = getDates()
    stpwrds = stopwords.words('english')
    path = './unique/posts'
    idList = []
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.txt')]
    predoc = []
    postdoc = []
    for file in doclist:
        with open(file, 'r') as posts:
            for line in posts:
                if parser.parse(line.split('\t')[1]).date() in predatelist:
                    predoc.append(line.split('\t')[-1].decode('utf-8', 'ignore'))
                elif parser.parse(line.split('\t')[1]).date() in postdatelist:
                    postdoc.append(line.split('\t')[-1].decode('utf-8', 'ignore'))
    texts1 = [[word for word in document.lower().split() if word not in stpwrds]
              for document in predoc]
    texts2 = [[word for word in document.lower().split() if word not in stpwrds]
              for document in postdoc]
    all_tokens_pre = sum(texts1, [])
    # the original summed texts1 here as well, so singleton words were computed
    # from the wrong corpus; the post-date tokens are what is needed
    all_tokens_post = sum(texts2, [])
    tokens_once1 = set(word for word in set(all_tokens_pre) if all_tokens_pre.count(word) == 1)
    tokens_once2 = set(word for word in set(all_tokens_post) if all_tokens_post.count(word) == 1)
    texts1 = [[word for word in text if word not in tokens_once1 and word not in stpwrds and word.isalpha()]
              for text in texts1]
    texts2 = [[word for word in text if word not in tokens_once2 and word not in stpwrds and word.isalpha()]
              for text in texts2]
    return texts1, texts2
def feature_extractor(data): """Extract features from a relation for the classifier.""" features = dict() lmtzr = WordNetLemmatizer() h2, h3, paragraph = data features['h2_' + h2.lower()] = True for word in h2.split(' '): if word.lower() not in stopwords.words('english') and len(word) > 1: features['h2word_' + word.lower()] = True features['h_' + h2.lower()] = True for word in h2.split(' '): if word.lower() not in stopwords.words('english') and len(word) > 1: features['hword_' + word.lower()] = True if h3 != None: features['h3_' + h3.lower()] = True for word in h3.split(' '): if word.lower() not in stopwords.words('english') and len(word) > 1: features['h3word_' + word.lower()] = True features['h_' + h3.lower()] = True for word in h3.split(' '): if word.lower() not in stopwords.words('english') and len(word) > 1: features['hword_' + word.lower()] = True for word in nltk.wordpunct_tokenize(paragraph): if word.lower() not in stopwords.words('english') and len(word) > 1: features[word] = True features['lower_' + word.lower()] = True features['lmtzr_' + lmtzr.lemmatize(word).lower()] = True return features
def get_stopwords(include_trectext_syntax=True):
    # honour the flag (the original always added the TRECTEXT tags)
    ignore_words = []
    if include_trectext_syntax:
        ignore_words.extend(['<doc>', '</doc>', '<text>', '</text>'])
    ignore_words.extend(stopwords.words('english'))
    ignore_words.extend(stopwords.words('dutch'))
    return set(ignore_words)
def remove_stopwords(lines, method=2):
    if method == 0:
        # using nltk stopwords
        stopwords_list = set(stopwords.words("english"))
    elif method == 1:
        # using sklearn stopwords
        stopwords_list = list(text.ENGLISH_STOP_WORDS)
    elif method == 2:
        # union of nltk and sklearn stopwords
        stopwords_list = list(set(stopwords.words("english") + list(text.ENGLISH_STOP_WORDS)))
    else:
        raise ValueError('Method value should be [0-2]')

    without_sw_lines = []
    # run through all lines
    for each_line in lines:
        a_line_without_sw = ''
        # tokenize each line
        tokens = each_line.split()
        # keep only the tokens that are not stopwords
        for each_token in tokens:
            if each_token not in stopwords_list:
                a_line_without_sw = a_line_without_sw + ' ' + each_token
        # recreate the list all over
        without_sw_lines.append(a_line_without_sw)
    return without_sw_lines
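# A small usage sketch for remove_stopwords above, with made-up input lines;
# method=0 only requires the NLTK stopword corpus to be downloaded:
lines = ["this is a simple example line",
         "remove the stop words from each of these lines"]
print(remove_stopwords(lines, method=0))
# expected output (note the leading space each rebuilt line keeps):
# [' simple example line', ' remove stop words lines']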
def find_opinions(tokens, feature, feat, id): fg = 0 for opinion in tokens: if opinion[0] == 'advmod' or opinion[0] == 'neg': if opinion[3].lower() in stopwords.words('english'): continue # endif if feature[1:3] == opinion[1:3]: fg = 1 modifier_set.add(opinion[3]) if id != -1: mods[id].append(opinion[3]) feat.write( feature[3] + ' ' + feature[1] + ' ' + opinion[3] + '\n') # endif # endif elif opinion[0] == 'dep': if opinion[3].lower() in stopwords.words('english'): continue # endif if feature[1:3] == opinion[1:3]: opinions_set.add(opinion[3]) find_opinions( tokens, ['nsubj', opinion[3], opinion[4], feature[3], feature[4]], feat, -1) # endelif # endfor if fg == 0: feat.write(feature[3] + ' ' + feature[1] + '\n')
def find_features(tokens, feat): i = 0 for feature in tokens: if feature[0] == 'nsubj': if feature[3].lower() in stopwords.words('english'): continue if feature[1].lower() in stopwords.words('english'): continue if not valid_feature(tokens, feature): continue # endif mods.append([]) features_set.add(feature[3]) opinions_set.add(feature[1]) find_opinions(tokens, feature, feat, len(mods) - 1) if i != 0: if tokens[i - 1][0] == 'nsubj' and tokens[i - 1][3:5] == feature[3:5]: for mod in mods[len(mods) - 2]: if mod not in mods[len(mods) - 1]: mods[len(mods) - 1].append(mod) feat.write( feature[3] + ' ' + feature[1] + ' ' + mod + '\n') # endif i = i + 1
def extract_bigrams(self, text): text = self.remove_return_lines_and_quotes(text) bigrams = [] st = PorterStemmer() stop = stopwords.words('english') more_stop_words = [ '(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...'] stop = stopwords.words('english') stop = stop + more_stop_words tokens = st.stem(text) tokens = nltk.word_tokenize(tokens.lower()) tokens = [i for i in tokens if i not in stop] tokens = [word for word in tokens if len(word) > 2] bigram_measures = nltk.collocations.BigramAssocMeasures() finder = BigramCollocationFinder.from_words(tokens) finder.apply_freq_filter(2) top_bigrams = finder.nbest(bigram_measures.pmi, 1000) for bg in top_bigrams: bg = " ".join(bg) tag = nltk.pos_tag([bg])[0] if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']: bigrams.append(tag[0]) return bigrams
def CosSim(a, b):
    cossim = 0
    moda = 0
    aa = [word for word in a if word not in stopwords.words()]
    bb = [word for word in b if word not in stopwords.words()]
    # sum of squared frequency values for a
    for i in aa:
        moda = moda + word_frequencies[i] * word_frequencies[i]
    moda = moda ** (.5)
    modb = 0
    for i in bb:
        modb = modb + word_frequencies[i] * word_frequencies[i]
    modb = modb ** (.5)
    # a.b, counted only where the words are equal
    for i in aa:
        for j in bb:
            if i == j:
                cossim = cossim + (word_frequencies[i] * word_frequencies[j])
    if moda * modb == 0.0:
        return 0
    else:
        cossim = cossim / (moda * modb)
        return cossim
def adapted_lesk(context_sentence, ambiguous_word, \ pos=None, option=False,lemma=True,hyperhypo=True, \ stop=True): """ This function is the implementation of the Adapted Lesk algorithm, described in Banerjee and Pederson (2002). It makes use of the lexical items from semantically related senses within the wordnet hierarchies and to generate more lexical items for each sense. see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf """ # Ensure that ambiguous word is a lemma. #ambiguous_word = lemmatize(ambiguous_word) # Get the signatures for each synset. ss_sign = simple_signature(ambiguous_word,lemma=True,hyperhypo=True) #print ss_sign for ss in ss_sign: related_senses = list(set(ss.member_holonyms() + ss.member_meronyms() + ss.part_meronyms() + ss.part_holonyms() + ss.similar_tos() + ss.substance_holonyms() + ss.substance_meronyms())) try: signature = list([j for j in chain(*[i.lemma_names() for i in \ related_senses]) if j not in stopwords.words('english')]) except: signature = list([j for j in chain(*[i.lemma_names for i in \ related_senses]) if j not in stopwords.words('english')]) ss_sign[ss]+=signature context_sentence = lemmatize_sentence(context_sentence) best_sense = compare_overlaps(context_sentence, ss_sign) return best_sense
def clean(self, raw): letters_only = re.sub("[^a-zA-Z#@]", " ", raw) words = letters_only.split() for i in range(0, len(words)): if "#" in words[i]: s = words[i].split('#') words[i] = '# '.join(s) if "@" in words[i]: s = words[i].split('@') words[i] = '@ '.join(s) if "http" in words[i]: s = words[i].split('http') words[i]= "http".join(s) total_stop_words = set(stopwords.words("english")) removed_stop_words = set(stopwords.words("english")[0:20]) stop_words = total_stop_words - removed_stop_words content_words = [w for w in words if not w in stop_words] return " ".join(content_words)
def frequencounting4Up(Listings): """ Get the keywords count and the rank of the keywords :param Listings: the input list of tweets :return: a list of tuple ranked by words counts """ MyCounter = Counter() chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&', '*', '(', ')', ' - ', '_', '+', '=', '@', ':', '\\', ',', ';', '~', '`', '<', '>', '|', '[', ']', '{', '}', '-', '"', '&', 'rt'] UpdatingChars = ['&', 'rt', '', '#dctraffic', '#mdtraffic', '#vatraffic', 'amp', '-'] # This section below will filter out the common english words and punctuations from the target tweets. for line in Listings: if type(line) is str: for word in line.strip().lower().split(): if PunkRemovement(word.strip().lower()) not in UpdatingChars + stopwords.words( 'english') and not word.isdigit(): if len(word) > 1: MyCounter[PunkRemovement(word.strip().lower())] += 1 else: for word in line.text.decode('UTF-8').strip().lower().split(): if PunkRemovement(word.strip().lower()) not in chars + stopwords.words('english'): MyCounter[PunkRemovement(word.strip().lower())] += 1 return MyCounter.most_common()
def annotations_to_words(terms, dag, ipr_map, lower): """ Converts a string of accesssions into a string of the corresponding english-text representations. """ try: sws = stopwords.words('english') except LookupError: nltk.download('stopwords') sws = stopwords.words('english') if lower: sws = set([x.lower() for x in sws]) case = string.lower else: sws = set([x.upper() for x in sws]) case = string.upper go_terms = [t.upper() for t in terms if 'GO' in t.upper()] ipr_terms = [t.upper() for t in terms if t.upper() in ipr_map] go_descriptions = ' '.join([case(dag[t].name) for t in go_terms]).split(' ') ipr_descriptions = ' '.join([case(ipr_map[t]) for t in ipr_terms]).split(' ') go_descriptions = [x.translate(None, string.punctuation) for x in go_descriptions] ipr_descriptions = [x.translate(None, string.punctuation) for x in ipr_descriptions] go_descriptions = [x for x in go_descriptions if case(x) not in sws] ipr_descriptions = [x for x in ipr_descriptions if case(x) not in sws] line = ' '.join(go_descriptions + ipr_descriptions) return line
def freqgen_word(word): connect(word) # get english stopwords stopen = stopwords.words('english') stopfr = stopwords.words('french') #stopsp = stopwords.words('spanish') query={} projection={"text":1} cursor = db.Tweetfind.find(query,projection) texts = pandas.Series(list(cursor)) tokens = [] for text in texts.values: tokens.extend([word.lower().strip(':;,#."-\'!') for word in text['text'].split()]) filtered_tokens=[] st = ['&',' ','it\'s','haven\'t','can\'t','don\'t','i\'m','i\'ve','i\'ll','i\'d','#','e','@','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','rt','(',')'] for word in tokens: try: if (not word.decode('utf-8') in stopen) and (not word.decode('utf-8') in stopfr): if not word in st: filtered_tokens.append(word.decode('utf-8')) except : pass freq_dist = nltk.FreqDist(filtered_tokens) print type(freq_dist) #print freq_dist.plot(25) return freq_dist
def fuzzer(localstring, dbpstring):
    # strip hyphens, commas and periods before comparing
    lwl = localstring.replace('-', '').replace(',', '').replace('.', '').split()
    lfwl = [w for w in lwl if not w in stopwords.words('english')]
    dwl = dbpstring.replace('-', '').split()
    dfwl = [w for w in dwl if not w in stopwords.words('english')]
    ratio = fuzz.token_sort_ratio(str(lfwl), str(dfwl))
    return ratio
def removeStopWords(tokens, lang):
    filteredToken = tokens
    if lang == 'en':
        filteredToken = [w for w in tokens if not w in stopwords.words('english')]
    elif lang == 'es':
        filteredToken = [w for w in tokens if not w in stopwords.words('spanish')]
    return filteredToken
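# removeStopWords above only handles English and Spanish. A hedged sketch of
# one way to generalize it, assuming only the stopword corpora that ship with
# NLTK; the mapping dict and function name are invented for illustration:
from nltk.corpus import stopwords

LANG_TO_CORPUS = {'en': 'english', 'es': 'spanish', 'pt': 'portuguese', 'nl': 'dutch'}

def remove_stopwords_generic(tokens, lang):
    corpus_name = LANG_TO_CORPUS.get(lang)
    if corpus_name is None:
        return tokens  # unknown language: return the tokens unchanged, as the original does
    stop_set = set(stopwords.words(corpus_name))
    return [w for w in tokens if w not in stop_set]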
def pre_process(self, text): for i in range(len(text)): text[i] = text[i].replace("-", " ") word_list = text[i].encode('ascii', 'ignore').lower().split(" ") processed_text = [] count = 0 for word in word_list: if word in stopwords.words('english'): continue if re.match('@\w+', word): continue if re.match('#\w+', word): continue word = re.sub('[0-9]+', 'gotNumber', word) word = re.sub('http(s)?.+', 'gotURL', word) word = re.sub('[^a-zA-Z0-9]', ' ', word) words = word.split(' ') for w in words: if w is not ' ' and len(w) > 1 and w not in stopwords.words('english'): w = self.sno.stem(w) processed_text.append(w) count += 1 print '. ', if count == 11: print '' count = 0 text[i] = processed_text print '' return text
def lazy_stopword_filter(filename): exclude_punctuation = set(['[', ']', '{', '}', '(', ')', ',','!','?',';',':','<', '>']) stop_set = set(stopwords.words('english')) with open("../resources/stopwords.txt", 'r') as f: stop_set = stop_set | set((l.strip() for l in f.readlines())) outfile = sys.argv[2] text = open(filename, 'rb') reader = csv.DictReader(text, delimiter=',', quotechar='"') target = open(outfile, 'wb') fieldnames=['Id', 'Title', 'Body', 'Tags'] writer = csv.DictWriter(target, fieldnames, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL) writer.writerow(dict((fn, fn) for fn in fieldnames)) for line in reader: # remove multiple spaces from all columns for k,v in line.items(): line[k] = ' '.join(v.split()) str_to_write_title = "" for word in line["Title"].split(): word = ''.join(ch for ch in word if ch not in exclude_punctuation) if word.lower() not in stopwords.words('english'): str_to_write_title = str_to_write_title + " " + word.lower() #print(str_to_write_title) str_to_write_body = "" body = html_tag_remover.cleanup_html(line["Body"]) for word in body.split(): # simple tokenization word = ''.join(ch for ch in word if ch not in exclude_punctuation) if word.lower() not in stop_set: str_to_write_body = str_to_write_body + " " + word.lower() #print(str_to_write_body) writer.writerow({'Id': line["Id"], 'Title':str_to_write_title, 'Body': str_to_write_body, 'Tags':line["Tags"]})
def clean_total_words(data):
    # collect the text field of every record
    all_text = [record['text'] for record in data]
    # tokenize each text and flatten into a single word list
    words = [nltk.word_tokenize(text) for text in all_text]
    wordss = list(itertools.chain.from_iterable(words))
    # lowercase every token (the original only lowercased the first len(words) tokens)
    wordss = [w.lower() for w in wordss]
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
    # keep alphabetic, non-stopword tokens
    word_c = [w for w in wordss if w not in stop_words and w.isalpha()]
    return word_c
def evaluate_html(content, html_conf): fdist = FreqDist() if html_conf['usehtml'] == False: logging.info('Discarding HTML tags') return fdist logging.info("\tEvaluating HTML") # try with TITLE tag titles = re.findall("<title>[A-Za-z0-9 ]+</title>", content) for title in titles: root = etree.fromstring(title) words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text)) terms_list = [ x for x in words_list if x.lower() not in stopwords.words('english')] stems = steming(terms_list) for i in range(html_conf['title']): fdist.update(stems) # try with H1 tag headers = re.findall("<h1>[A-Za-z0-9 ]+</h1>", content) for header in headers: root = etree.fromstring(header) words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text)) terms_list = [ x for x in words_list if x.lower() not in stopwords.words('english')] stems = steming(terms_list) for i in range(html_conf['h1']): fdist.update(stems) return fdist
def build_from_text(self,input_file): f=open(input_file, "r" ) all_text=f.read() f.close() tokens =nltk.word_tokenize(all_text) for i,w in enumerate(tokens): tokens[i]=self.lmtzr.lemmatize(tokens[i]).lower() for i in range(len(tokens)): # tokens[i]=self.lmtzr.lemmatize(tokens[i]) if (tokens[i] in stopwords.words("english")): continue if (tokens[i] =="."): continue if self.no_meaning(tokens[i]): continue if tokens[i] not in self.dict: for k in range(self.word_variants): self.vecCollection[k].append(0) self.dict[tokens[i]]=self.word_variants self.inverse_dict.append(tokens[i]) self.word_variants+=1 self.frequencies.append(0) self.vecCollection.append([0,]*self.word_variants) self.frequencies[self.dict[tokens[i]]]+=1 for j in range (i+1,i+window_length+1): if j>=len(tokens): break if (tokens[j] =="."): break if (tokens[j] in stopwords.words("english")): continue if self.no_meaning(tokens[j]): continue if tokens[j] not in self.dict: for k in range(self.word_variants): self.vecCollection[k].append(0) self.dict[tokens[j]]=self.word_variants self.inverse_dict.append(tokens[j]) self.word_variants+=1 self.vecCollection .append([0,]*self.word_variants) self.frequencies.append(0) self.vecCollection[self.dict[tokens[i]]][self.dict[tokens[j]]]+=1 self.vecCollection[self.dict[tokens[j]]][self.dict[tokens[i]]]+=1 #f.close() del(all_text) del(tokens)
def preprocessQuery(query): query = query.lower() query = re.sub(r'[^a-z0-9 ]',' ',query) wordListAll = wordpunct_tokenize(query); # Now combine wordList with operators also # So, wordList now contains (word, Operator) before we do stop word removal wordList = [] i = 0 while i < len(wordListAll): if wordListAll[i] == "AND" or wordListAll[i] == "and": wordList.append( (wordListAll[i+1], "AND") ) i += 2 elif wordListAll[i] == "OR" or wordListAll[i] == "or": wordList.append( (wordListAll[i+1], "OR") ) i += 2 else: wordList.append( (wordListAll[i], "OR") ) i += 1 # Filter the words and remove stop words. filteredWords = [w for w in wordList if not w[0] in stopwords.words('english')] queryTuples = [] queryLen = len(filteredWords) if queryLen > 15: queryTuples = filteredWords else: for word, operator in filteredWords: synonymList = getSynonymList(word) queryTuples.append((word, operator)) for synCount, syn in enumerate(synonymList): if synCount > 3: break # Adding operator OR in synonyms list if its not a stop word syn = re.sub(r'[^a-z0-9 ]', ' ', syn) synList = syn.split() for synOneTerm in synList: if not synOneTerm in stopwords.words('english'): queryTuples.append((synOneTerm, "OR")) # queryTuples list is ready (filtered). Now need to stem this list, # ensuring no duplicacy, same order and operator values finalQueryList = [] porterStemmer = PorterStemmer() for word, operator in queryTuples: finalQueryList.append( (porterStemmer.stem(word), operator) ) # Now removing duplicate items from list seenSet = set() uniqueList = [] for q in finalQueryList: stemWord = q[0] if stemWord in seenSet: continue uniqueList.append(q) seenSet.add(q[0]) return uniqueList
def tf_idf(review1, review2): def get_tokens(document): #remove the punctuation using the character deletion step of translate #no_punctuation = document.translate(None, string.punctuation) tokens = nltk.word_tokenize(document) return tokens def tokenize(text): tokens = nltk.word_tokenize(text) stems = stem_tokens(tokens, stemmer) return stems def stem_tokens(tokens, stemmer): stemmed = [] for item in tokens: stemmed.append(stemmer.stem(item)) return stemmed #document1 = ("I think this is one of the higher crazy selection end Chinese restaurants in Michigan, one of the best that I've ever been to in the US. The seating is great, with professional waiters and waitresses. My boyfriend and I went to this place to have a taste of their famous Peking Duck, and the dish turned out to be really amazing! I like how they came to your desk and serve directly to your plate for the first round, and the second round comes out very quickly too. The shrimp dumplings are great too, much better than the usual ones you may get at a Dim Sun place. Of course the price is a bit higher too. ") #document2 = ("Yummms.coms!! This is some good Chinese food!! They have a full bar, great selection super good location right by my house. My only issue is cost...beer is like 8 bucks and they did not have my favorite the night I was there and it's was a Saturday night. Look I know this is not Chicago...I should not hold them to that standard but dang....Chinese bar should have Tsingtao coming out of their ears on Saturdays. ") #print document1 #token1 = get_tokens(document1.lower()) #token2 = get_tokens(document2.lower()) #print token1 token1 = get_tokens(review1) token2 = get_tokens(review2) count1 = Counter(token1) #count1=Counter(review1) #count2=Counter(review2) #print count1.most_common(10) count2 = Counter(token2) #print count2.most_common(10) #print "\n" filtered1 = [w for w in token1 if not w in stopwords.words('english')] count_filter1 = Counter(filtered1) #print count_filter1.most_common(10) filtered2 = [w for w in token2 if not w in stopwords.words('english')] count_filter2 = Counter(filtered2) #print count_filter2.most_common(10) #print "\n" stemmer = PorterStemmer() stemmed1 = stem_tokens(filtered1, stemmer) stemmed2 = stem_tokens(filtered2, stemmer) count_stem1 = Counter(stemmed1) count_stem2 = Counter(stemmed2) stemmed1=' '.join(stemmed1) stemmed2=' '.join(stemmed2) #print stemmed1 #print stemmed2 documents=[stemmed1,stemmed2] tfidf_vectorizer = TfidfVectorizer() tfidf_matrix = tfidf_vectorizer.fit_transform(documents) #print tfidf_matrix.shape #print cosine_similarity(tfidf_matrix[0:1], tfidf_matrix) return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
def tokenize(text):
    lookup_t = nltk.word_tokenize(text)
    nltk.data.load('tokenizers/punkt/spanish.pickle')
    stops_es = stopwords.words('spanish')
    stops_en = stopwords.words('english')
    stops_custom = ['http']
    tokenized = [word.lower() for word in lookup_t
                 if word not in stops_es
                 and word not in stops_custom
                 and word not in stops_en
                 and word.isalpha() and len(word) > 2]
    return tokenized
def _remove_bigram_stopwords(self, bigrams):
    filtered_words = []
    for w in bigrams:
        if (w[0] in stopwords.words('english')) and (w[1] in stopwords.words('english')):
            pass
        else:
            filtered_words.append(w)
    return filtered_words
def PreparaFiltroTexto(self, recurso):
    tittletype = ""
    for s in recurso.split():
        if s.lower() in stopwords.words('english') or s.lower() in stopwords.words('spanish'):
            tittletype = tittletype + s.lower() + ' '
        else:
            tittletype = tittletype + s.title() + ' '
    tittletype = tittletype[:-1]
    return tittletype
import numpy as np
from numpy import dot
from numpy.linalg import norm
import math
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

#### Set parameters
N_dimension = 2000
Tau = 3
Forgetting_rate = 0

#### Get unique words in the corpus
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words("english")

######################################
####         DEFINITIONS         ####
######################################

#############################
#### COSINE SIMILARITY
#############################
def Cosine(x, y):
    z = 0
    if sum(x) != 0 and sum(y) != 0:
        z = dot(x, y) / (norm(x) * norm(y))
    return z
df2.columns = ['rev', 'p'] df3 = pd.read_csv('twitter train.csv', delimiter=',', encoding='ISO-8859-1') df3.drop('ItemID', axis=1, inplace=True) df3 = df3.rename(columns={'SentimentText': 'rev', 'Sentiment': 'p'}) frames = [df2, df1, df3] result = pd.concat(frames) result df = shuffle(result) df.reset_index(inplace=True) df.drop('index', axis=1, inplace=True) sw = set(stopwords.words('english')) df = df for i in range(len(df)): review = df['rev'][i] review = ' '.join(review.split(' ')[1:]) review = review.lower() review = re.sub('[^a-zA-Z]', ' ', review) review = review.split(' ') review = ' '.join(list(filter(None, review))) words = word_tokenize(review) filt = [w for w in words if not w in sw] df['rev'][i] = ' '.join(filt) df.to_csv('finalmixrev.csv', sep='\t', encoding='utf-8')
def create_new_blog(): if request.method == 'GET': return render_template('new_blog.html') else: title = request.form['title'] user = User.get_by_email(session['email']) new_blog = Blog(user.email, title, user._id) new_blog.save_to_mongo() new_title = title.replace(" ", "_") # to be able to use it in the url my_url = "https://www.rottentomatoes.com/m/" + new_title + "/reviews/" # case sensitivity in url gets corrected automatically by browser # obtain the reviews of the required movie req = requests.get(my_url) content = req.content soup = BeautifulSoup(content, "html.parser") element = soup.find_all("div", {"class": "the_review"}) if len(element) == 0: new_url = req.url + "/reviews/" req = requests.get(new_url) content = req.content soup = BeautifulSoup(content, "html.parser") element = soup.find_all("div", {"class": "the_review"}) # preparing test set test_set_reviews = [] for i in range(len(element)): test_set_reviews.append(element[i].text) corpus2 = [] for i in range(len(test_set_reviews)): review = re.sub( '[^a-zA-Z]', ' ', test_set_reviews[i] ) # remove numbers and punctuations (don't remove letters a-z and A-Z) and second parameter ' ' is used to replace the removed character by a space. review = test_set_reviews[i].lower( ) # convert all letters to lowercase review = review.split( ) # convert the review into a list of different words of the review. ps = PorterStemmer( ) # Stemming process to keep only the root of the word i.e. keep 'love' and not 'loved' stop_words = set(stopwords.words('english')) stop_words.update( ['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}']) review = [ ps.stem(word) for word in review if not word in stop_words ] # retain only those english words in the list that are not present in stopwords. 'set' is used to make the algo faster because python goes through a set faster than a list review = ' '.join( review ) # convert the list of words back to a single string of words. if review == '' or review == ' ': # sometimes, after applying review = 'neutral' corpus2.append(review) if len(corpus2) == 0: # if no reviews found return "Sorry! No reviews yet for this movie. Please check spelling or try some other movie." # create the bag of words from sklearn.feature_extraction.text import TfidfVectorizer vectorizer = TfidfVectorizer(ngram_range=(1, 3)) x_train = vectorizer.fit_transform(corpus) x_train = x_train.astype('float16') x_test = vectorizer.transform(corpus2) x_test = x_test.astype('float16') # fitting SGD Classifier from sklearn.linear_model import SGDClassifier classifier_sgd = SGDClassifier(loss='hinge', shuffle=True, penalty='elasticnet', alpha=0.00001) classifier_sgd.fit(x_train, sentiment_train) # predict y_pred_sgd = classifier_sgd.predict(x_test) res = 0 for i in range(len(y_pred_sgd)): if y_pred_sgd[i] == 4: y_pred_sgd[i] = 3 elif y_pred_sgd[i] == 0: y_pred_sgd[i] = 1 for i in range(len(y_pred_sgd)): if y_pred_sgd[i] == 1: res += 0 elif y_pred_sgd[i] == 2: res += 50 else: res += 100 rate = res / (len(y_pred_sgd)) rate = str(rate) from collections import Counter data = Counter(y_pred_sgd) ans = data.most_common(1)[0][0] # Returns the highest occurring item if ans == 1: return "Negative Reviews!! Drop this Movie. " + "rating is : " + rate elif ans == 2: return "Neutral Reviews!! Go at your own risk. :) " + "rating is : " + rate elif ans == 3: return "Positive Reviews!! Go for it. " + "rating is : " + rate else: return "Sorry! Some Error in Processing"
)""" regex_str = [ emoticons_str, r'<[^>]+>', # HTML tags r'(?:@[\w_]+)', # @-mentions r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and ' r'(?:[\w_]+)', # other words r'(?:\S)', # anything else ] # Create stop word dictionary punctuation = list(string.punctuation) stop = stopwords.words('english') + punctuation + ['rt', 'via', 'amp', 'get', 'gt', '1', '10', 'click'] tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE) emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE) def tokenize(s): s = re.sub(r'[^\x00-\x7f]*', r'', s) return tokens_re.findall(s) def preprocess(s): tokens = tokenize(s) # To lower tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens] return tokens
int(count) + 1) * math.log(total_docs / int(curr)) term_freq_dict[doc_id][field_idx + 1] += 1 term_freq_dict[doc_id][0] = max(term_freq_dict[doc_id][1:7]) total_results = len(term_freq_dict) results = sorted(term_freq_dict.items(), key=lambda x: (x[1], x[0]), reverse=True) final_result = [x[0] for x in results] return (final_result[:min(num_results, len(results))], total_results) if len(sys.argv) < 3: print('Insufficient Arguments provided') exit(0) STOP_WORDS_SET = set(stopwords.words('english')) STEMMER = Stemmer('porter') data_dir = os.path.join('.', "data") field_type_to_index = { 'title': 0, 'body': 1, 'ref': 2, 'infobox': 3, 'link': 4, 'category': 5 } secondary_index = { 'title': [], 'body': [], 'ref': [], 'infobox': [],
# Natural language processing
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
nltk.download('stopwords')

list1 = []
for i in range(0, 5536):
    mail = df.Message[i]
    mail = re.sub('[^a-zA-Z]', ' ', mail)
    mail = mail.lower()
    mailwords = mail.split()
    mailwords = [ps.stem(word) for word in mailwords
                 if word not in stopwords.words('english')]
    mail = ' '.join(mailwords)
    list1.append(mail)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(list1).toarray()
y = df.Status.values

# train_test_split moved from sklearn.cross_validation to sklearn.model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
BOOK_LIST = ['hobbes-leviathan', 'hobbes-liberty', 'hobbes-elements', 'hobbes-law', 'mill-liberty', 'mill-util','locke-understanding', 'locke-treatise', 'hume-treatise', 'hume-morals', 'hume-enquiry', 'berkeley-TOK','berkeley-TD', 'bentham-POM', 'bentham-FOG', 'mill-representative', #'burke-reflections','conway-nature','mill-comte','more-utopia', 'reid-mind', 'hume-religion'] # this is the booklist we will analyse. Must be in the same folder TEST_FILES = ['sidgwick.txt','machiavelli.txt','more-utopia','burke-reflections','smith-sentiments','smith-wealth', 'fedPapers', 'mill-logic', 'kant-CPR', 'russell-AOM', 'russell-external', 'russell-ideals', 'russell-mysticism', 'russell-POP', 'spinoza-ethica', 'spinoza-understanding','Shi-PC', 'Shi-equality', 'Shi-AM', 'Shi-MP'] NUM_CLUSTERS = 6 # how many clusters we want to categorize when we process different individual books. SENTIMENT_LIST = [] # In[9]: #Adding more stopwords. Providing the option of an aggressive word list. # nltk.download('stopwords') #Not necessary if you have done it once stop_words = list(set(stopwords.words('english'))) stop_words.append('\'s')#manually add 's into the stop word list (because it's annoying!) We may add more similar ones. if MORE_SW: #if we want to add more stop words and render a more aggressive stopword list with open('stopwords', 'r') as myfile: sw = [i.strip().split(' ') for i in myfile] sw1 = [val.lower() for sublist in sw for val in sublist] stop_words.extend(sw1) stop_words = set(stop_words) # In[11]: def tokenize(text): ''' Tokenize the words in a texts. If we need tokenize and stemming, we can
@author: bhavyababuta """ import pandas as pd import numpy as np import re import matplotlib matplotlib.use('TkAgg') import matplotlib.pyplot as plt %matplotlib inline from wordcloud import WordCloud,STOPWORDS import nltk from nltk.tokenize import word_tokenize from nltk.corpus import stopwords stopWordList=stopwords.words('english') data=pd.read_csv('Tweets.csv') data.head(20) data.isnull().sum() data.describe(include='all') for columns in data.columns: print('%s'%(columns)) data[columns].value_counts() data['airline'].value_counts() data['retweet_count'].value_counts()
def get_summary(self, input, max_sentences): sentences_original = sent_tokenize(input) #Remove all tabs, and new lines if (max_sentences > len(sentences_original)): print( "Error, number of requested sentences exceeds number of sentences inputted" ) #Should implement error schema to alert user. s = input.strip('\t\n') #Remove punctuation, tabs, new lines, and lowercase all words, then tokenize using words and sentences words_chopped = word_tokenize(s.lower()) sentences_chopped = sent_tokenize(s.lower()) stop_words = set(stopwords.words("english")) punc = set(string.punctuation) #Remove all stop words and punctuation from word list. filtered_words = [] for w in words_chopped: if w not in stop_words and w not in punc: filtered_words.append(w) total_words = len(filtered_words) #Determine the frequency of each filtered word and add the word and its frequency to a dictionary (key - word,value - frequency of that word) word_frequency = {} output_sentence = [] for w in filtered_words: if w in word_frequency.keys(): word_frequency[w] += 1.0 #increment the value: frequency else: word_frequency[w] = 1.0 #add the word to dictionary #Weighted frequency values - Assign weight to each word according to frequency and total words filtered from input: for word in word_frequency: word_frequency[word] = (word_frequency[word] / total_words) #Keep a tracker for the most frequent words that appear in each sentence and add the sum of their weighted frequency values. #Note: Each tracker index corresponds to each original sentence. tracker = [0.0] * len(sentences_original) for i in range(0, len(sentences_original)): for j in word_frequency: if j in sentences_original[i]: tracker[i] += word_frequency[j] #Get the highest weighted sentence and its index from the tracker. We take those and output the associated sentences. for i in range(0, len(tracker)): #Extract the index with the highest weighted frequency from tracker index, value = max(enumerate(tracker), key=operator.itemgetter(1)) if (len(output_sentence) + 1 <= max_sentences) and ( sentences_original[index] not in output_sentence): output_sentence.append(sentences_original[index]) if len(output_sentence) > max_sentences: break #Remove that sentence from the tracker, as we will take the next highest weighted freq in next iteration tracker.remove(tracker[index]) sorted_output_sent = self.sort_sentences(sentences_original, output_sentence) return (sorted_output_sent)
target_word_2 = list[3].lower() target_word_3 = list[4].lower() f.close() # large capital -> small capital discourse_words_1 = [s.replace(s, s.lower()) for s in discourse_words_1] discourse_words_1and2 = [s.replace(s, s.lower()) for s in discourse_words_1and2] # remove '.' and ',' from word list discourse_words_1 = [s.replace('.', '') for s in discourse_words_1] discourse_words_1and2 = [s.replace('.', '') for s in discourse_words_1and2] discourse_words_1 = [s.replace(',', '') for s in discourse_words_1] discourse_words_1and2 = [s.replace(',', '') for s in discourse_words_1and2] # remove stop words from word list stop_words = stopwords.words('english') #print(stop_words) for stop_word in stop_words: while stop_word in discourse_words_1 : discourse_words_1.remove(stop_word) while stop_word in discourse_words_1and2 : discourse_words_1and2.remove(stop_word) # remove "'s" and "'" and "-" and "'d" and "'ll" and "'ve" and "re" from word list discourse_words_1 = [s.replace("'s", '') for s in discourse_words_1] discourse_words_1and2 = [s.replace("'s", '') for s in discourse_words_1and2] discourse_words_1 = [s.replace("'", '') for s in discourse_words_1] discourse_words_1and2 = [s.replace("'", '') for s in discourse_words_1and2] discourse_words_1 = [s.replace("-", '') for s in discourse_words_1] discourse_words_1and2 = [s.replace("-", '') for s in discourse_words_1and2]
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
import pandas as pd
import scipy.stats as sp

tokenizer = WhitespaceTokenizer()
# download only the stopwords corpus instead of opening the interactive downloader
nltk.download('stopwords')
stopword_list = stopwords.words('english')

reviews_df = pd.read_csv("C:/Users/Documents/Yelp/yelp_academic_dataset_review.csv", encoding="utf-8")

positive_terms = []
f = open('C:/Users/Documents/Yelp/positive_terms.txt', "r")
positive_terms = f.read().splitlines()
f.close()

negative_terms = []
f = open('C:/Users/Documents/Yelp/negative_terms.txt', "r")
negative_terms = f.read().splitlines()
f.close()

porter = nltk.PorterStemmer()

def normalize_review_text(text):
    text = text.lower()
    text = remove_punctuation(text)
    text = " ".join(text.split())
    text_tokens = tokenizer.tokenize(text)
    text_tokens = [porter.stem(w) for w in text_tokens if w not in stopword_list]
for node in node_info) #making sure keys are integers node_info_tokenized = {int(k): v for k, v in node_info_tokenized.items()} #with open('./ISAE_Comp/out/node_info_token.json', 'w') as file: # json.dump(node_info_tokenized, file) print("Finished tokenizing {0} entries to dictionnary and saved it to file". format(len(node_info_tokenized.keys())), flush=True) ''' Removing stopwords ''' print("Downloading french stopwords", flush=True) nltk.download('stopwords') stop_words = stopwords.words('french') node_info_filtered = {} def remove_stopwords(node): ''' add an entry on node_info_filtered dict for the node as word list removing stopwords from node_info_tokenized ''' node_info_filtered[node] = [] for w in node_info_tokenized[node]: if w not in stop_words: node_info_filtered[node].append(w) print("Starting stopword removal", flush=True)
b.append(element) return b def category(a): return { '1': 'Negative', '2': 'S Negative', '3': 'Neutral', '4': 'S Positive', '5': 'Positive' }.get(a) #Build a training data set stop = stopwords.words('english') with open("train.tsv") as csvfile: records = csv.reader(csvfile, delimiter='\t') next(records) t = [({ word: True for word in nltk.word_tokenize(row[2]) if word not in stop }, (row[3])) for row in records] print('Train record count: ' + str(len(t))) ##trainlen = int((len(t) * 3 / 4)) ##train = t[:trainlen] ##test = t[trainlen:] ##test file data for later. Might want to incorporate a database read with open("test.tsv") as csvfile: records2 = csv.reader(csvfile, delimiter='\t')
from tkinter.filedialog import askopenfilename from nltk.stem import WordNetLemmatizer from nltk.corpus import stopwords from nltk.tokenize import sent_tokenize, word_tokenize from nltk.stem import PorterStemmer from nltk.stem.snowball import SnowballStemmer idenditify = [ '\033[92m' + 'Ad : Aykut' + '\033[0m', '\033[92m' + 'Soyad: Cengiz' + '\033[0m', '\033[92m' + 'No : 503020190030' + '\033[0m', '\033[92m' + '<Information Retrieval Final Project>' + '\033[0m' ] lemmatizer = WordNetLemmatizer() stop_words = set(stopwords.words('english')) porter = PorterStemmer() snowy = SnowballStemmer("english") translator = str.maketrans('', '', string.punctuation) Tk().withdraw() baslıklar = [ 'doga', 'bilim', 'hukuk', 'din', 'ekonomi', 'is', 'moda', 'siyaset', 'spor' ] diseaseAllergie = dict() diseaseAnxiety = dict() diseaseBipolar = dict() diseaseBrainTumour = dict() diseaseBreastCancer = dict()
from nltk.tokenize import word_tokenize, sent_tokenize from nltk.tag import pos_tag from nltk.probability import FreqDist from nltk.corpus import stopwords from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer from nltk.chunk import ne_chunk sentences = [ 'Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the interactions between computers and human (natural) languages', 'Modern NLP algorithms are based on machine learning, especially statistical machine learning', 'NLP research is gradually shifting from lexical semantics to compositional semantics and, further on, narrative understanding', 'The learning procedures used during machine learning automatically focus on the most common cases, whereas when writing rules by hand it is often not obvious at all where the effort should be directed', 'Produce a readable summary of a chunk of text. Often used to provide summaries of text of a known type, such as articles in the financial section of a newspaper' ] stopword_set = set(stopwords.words('english')) def preprocessing(sentence): tokenized = set(word_tokenize(sentence)) tokenized = tokenized - stopword_set stemmed = [(PorterStemmer().stem(i)) for i in tokenized] lemmatized = [(WordNetLemmatizer().lemmatize(i)) for i in stemmed] return set(lemmatized) def menu_1(): global user_sentences while True: inp = input('Input Sentence : ') if len(inp) > 10: break
for i in battle_text_rep:
    for j in i:
        battle_text.append(j)

# In[10]:

battle_text = [x.lower() for x in battle_text
               if x not in ['—', '==', '===', '', '====']]

# In[11]:

battle_stopwords = []
stopword = stopwords.words("russian")
for i in tqdm_notebook(battle_text):
    if i not in stopword:
        battle_stopwords.append(i)

# In[12]:

dict_freq = sorted(Counter(battle_stopwords).items(), key=lambda x: x[1], reverse=True)

# In[13]:

final_battle = []
def bag_of_words(text, url_path):
    tfidfconverter = TfidfVectorizer(max_features=100, min_df=5, max_df=0.7,
                                     stop_words=stopwords.words('english'))
    X = tfidfconverter.fit_transform(text)  ###.toarray()
    rows = tfidfconverter.get_feature_names()
    BOW_dataset = pd.DataFrame(X.T.todense(), index=rows)
    url_path = os.path.dirname(url_path)
    scipy.sparse.save_npz(os.path.join(url_path, "keywords.npz"), X)
    BOW_dataset.to_csv(os.path.join(url_path, "keywords.csv"))  # dictionary of keywords
print("Welcome to the Search Engine\n") while continueLoop: fromUser = "" user_query = "" print("\n\nSelect from the Following Options:\n\t1.) Search\n\t2.) Exit") from_user = input("Your Choice: ") if from_user == "1": # NOTE: this function is raw_input for Python 2.x print("\nSearching through the ''{0}'' File Cache:".format(doc_basename)) user_query = input("What Is Your Query?: ") formatted_query = (re_sub(r"[^a-zA-Z0-9_ ]+", "", user_query.lower().strip())).split() query = [] for i in range(0, len(formatted_query)): if formatted_query[i] not in stopwords.words("english"): query.append(stemmer.stem(formatted_query[i])) vsm = VSMClass(iic, doc_basename) qr = QueryClass(query, vsm) qr.computeSimilarities(10) # first index = location of unprocessed documents; second index = list of documents in order of similarity > 0 location_and_documents = getDocuments(qr.all_similarities, iic, doc_location, query) if len(location_and_documents[1]) > 0: print("\nResults:") for i in range(0, len(location_and_documents[1])): # NOTE: this might be yielding an encoding error try: print("\t\tURL:\t{0}".format(location_and_documents[1][i]))
import os.path as osp import torch from torch.utils.data import Dataset import transformers import string import nltk from tqdm import tqdm from nltk.corpus import stopwords import logging UNK = '[UNK]' nltk.download('stopwords') nltk.download('punkt') STOP_WORDS = stopwords.words('english') DROPPED = STOP_WORDS + list(string.punctuation) CATEGORY_IDS = {'1-to-1': 0, '1-to-many': 1, 'many-to-1': 2, 'many-to-many': 3} def file_to_ids(file_path): """Read one line per file and assign it an ID. Args: file_path: str, path of file to read Returns: dict, mapping str to ID (int) """ str2id = dict() with open(file_path) as file: for i, line in enumerate(file): str2id[line.strip()] = i
def group_data(roots_df: pd.DataFrame, notes_df: pd.DataFrame, w2v, tokenizer) -> pd.DataFrame: """Group the roots and notes data for modeling.""" # map each note_id to its tokens note_map = dict(notes_df.loc[:, ["note_id", "text"]].values) hadm_map = dict(notes_df.loc[:, ["note_id", "hadm_id"]].values) # join icd roots with notes print("Merging note and roots .....") df = roots_df.merge(notes_df, on="hadm_id", how="inner").dropna() # group by admission print("Grouping by hadm id .....") df = df.groupby("hadm_id").aggregate(list).reset_index() # get unique roots and notes per grouping print("Replicating notes .....") df["roots"] = df["roots"].apply(lambda x: list(set(x))) df["note_id"] = df["note_id"].apply(lambda x: list(set(x))) # replicate root lists for each note they are related to roots = list( it.chain.from_iterable( map(lambda r, nids: [r] * len(nids), df["roots"].tolist(), df["note_id"].tolist()))) # flatten note ids note_ids = list(it.chain.from_iterable(df["note_id"].tolist())) # flatten notes grouped by hadm_id notes = [note_map[nid] for nid in note_ids] # reassign hadm_id for each note id hadm_ids = [hadm_map[nid] for nid in note_ids] # store the resulting replications in a modeling df model_df = pd.DataFrame({ "roots": roots, "text": notes, "hadm_id": hadm_ids }) # tokenize and remove stop words print("Creating tokens .....") all_stops = set(stopwords.words("english")) model_df["tokens"] = model_df["text"]\ .apply(lambda t: [w for w in word_tokenize(t) if w not in all_stops]) # remove rows with no tokens from word2vec model_df["tokens"] = model_df["tokens"]\ .apply(lambda x: [t for t in x if t in w2v]) model_df["tokens"] = model_df["tokens"]\ .apply(lambda x: None if len(x) == 0 else x) model_df = model_df.dropna() # average word embeddings to generate d2v embeddings print("Creating d2v .....") model_df["d2v"] = model_df["tokens"]\ .apply(lambda doc: list(np.mean([w2v[t] for t in doc if t in w2v], axis=0))) # get column for embedding indices print("Creating w2v indices .....") model_df["w2v_idx"] = model_df["tokens"]\ .apply(lambda doc: [w2v.vocab[w].index for w in doc if w in w2v]) # get bert embeddings indices print("Creating bert indices .....") model_df["bert_idx"] = model_df["text"]\ .apply(lambda doc: torch.tensor(tokenizer\ .encode(doc, add_special_tokens=True))\ .unsqueeze(0)) # one hot encode labels mlb = MultiLabelBinarizer() model_df["roots"] = mlb.fit_transform(model_df["roots"]) return model_df, mlb.classes_
def indexer(): with codecs.open("valid_URL.txt","r",encoding='utf8') as fh_book: global word_freq_title_final global document_count global word_freq_final global link_analysis outLinks = [] for line in fh_book: info = line.split() path = info[0] url = info[1] print "Path :" + str(path) url = "http:" + url if(path == "39/373") or (path == "56/176") or (path == "10/451") or (path == "55/433"): print "Pass_bad_URL_hardCode" continue return_val = is_valid(url) if return_val == True: if magic.from_file(path).startswith('HTML') or magic.from_file(path).startswith('XML'): document_count += 1 fh = codecs.open(path,'r',encoding='utf8') soup = BeautifulSoup(fh,'lxml') fh.close() #TODO comment after first run [x.extract() for x in soup.find_all('script')] sample_list = soup.get_text().lower() #comment next two lines outLinks = extract_next_links(soup,url) link_analysis[url] = outLinks elif magic.from_file(path).startswith('ASCII') or magic.from_file(path).startswith('UTF'): document_count += 1 fh = codecs.open(path,'r',encoding='utf8') sample_list = fh.read() else: continue tokenizer = RegexpTokenizer(r'\w+') punct_remove = tokenizer.tokenize(sample_list) token_list_stopwords = [word for word in punct_remove if not word in stopwords.words('english')] stemmer = PorterStemmer() stemmed_list = stem_porter(token_list_stopwords, stemmer) word_freq = Counter(stemmed_list) word_freq_title_final = processTitle(soup,path,stemmed_list) tags = processTags(soup,path) tag_final = createTagIndex(tags,path,stemmed_list) for word in word_freq: # TODO : add check conditions from below if(checkCondition7(word)): indices = [i for i, x in enumerate(stemmed_list) if x == word] length = word_freq[word] totallength = len(word_freq) posting = {} posting["docID"] = path # posting["occurences"] = indices posting["TF"] = length if(word_freq_final.get(word) == None): sample_list = list() sample_list.append(posting) word_freq_final[word] = sample_list else: sample_list1 = word_freq_final.get(word) sample_list1.append(posting) word_freq_final[word] = sample_list1 writeTitleIndex(word_freq_title_final) writeWordIndex(word_freq_final) writeLinks(link_analysis) writeTagIndex(tag_final)
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
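# The snippet above stops after fitting the Naive Bayes model. A short,
# optional continuation for scoring it on the held-out split (standard
# scikit-learn calls, not part of the original example):
from sklearn.metrics import confusion_matrix, accuracy_score

y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))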
del tokens sys.stdout.flush() i += 1 print(multiprocessing.current_process().name + ' has finished processing files') sys.stdout.flush() q.put([stemmed_corp, original_corp]) text_filepath = 'C:/Users/Matt/Documents/Data Science/CW/CLEANED_3/' root_cleaned_filepath = 'C:/Users/Matt/Documents/Data Science/CW/WORDCLOUD_3/' blacklist = ['document_process.csv'] blacklist_words = [ 'ptl', 'lukes', 'june', 'leads////', 'leads/////', 'leads//////' ] custom_stopwords = stopwords.words('english') + blacklist_words + [ punc for punc in string.punctuation ] # stemmer for reducing words stemmer = PorterStemmer() # storing stemmed tokens stemmed_corpus = [] # storing non-stemmed tokens original_corpus = [] # list of currently running threads process_list = [] # queue of information processed by threads q = multiprocessing.Queue() # testing # -1 for all files filesToIter = 2
df_count.plot(x='category', y='number', kind='bar') plt.show() # cleaning dataset stemmer = PorterStemmer() corpus = [] for w in range(len(df['Message'])): msg = df['Message'][w] msg = re.sub("[^a-zA-Z]", " ", msg) msg = msg.lower() msg = msg.split() msg = [ stemmer.stem(word) for word in msg if not word in set(stopwords.words('english')) ] msg = " ".join(msg) corpus.append(msg) # create word vector from sklearn.feature_extraction.text import TfidfVectorizer tf = TfidfVectorizer() tf.fit(corpus) # print(tf.vocabulary_) X = tf.transform(corpus).toarray() Y = df['Category'] # train test split from sklearn.model_selection import train_test_split
def func_tokenize(raw_input): try: #stop_words = set(stopwords.words('english')) new_words_list=stopwords.words('english') new_words_list.append('home')# adding home to stop words list because it is in every document stop_words = set(new_words_list) except: print ('Error creating stop words. Please verify the stopwords were imported prior to running this program') print ('Run the following commands in a python shell to download the stop words') print ('import nltk') print ('nltk.download("stopwords")') try: try: #tags = re.compile('(b\')((\<script.*?\>).*?(\<\/script\>))|((\<style.*?\>).*?(\<\/style\>))|(\<.*?\>)|(\<.*?\/\>)|(\<\/.*?\>)|(&\w+;)|(html)|(\\\\n)|(\\\\x\w\w)',re.DOTALL) #works at removing style tags #tags = re.compile('(b\')((<script.*?>).*?(</script>))|((<style.*?>).*?(</style>))|(<.*?>)|(<.*?/>)|(</.*?>)|(&\w+;)|(html)|(\\\\n)|(\\\\x\w\w)',re.DOTALL) #works at removing style tags #tags = re.compile('(<script>.*?</script>)|(<noscript>.*?</noscript>)|(<!--.*?-->)|(<.*?>)|(<.*?>\w)',re.DOTALL) #tags = re.compile('(<!.*?>)|(<script>.*?</script>)|(<noscript>.*?</noscript>)|(<.*?>)|((\\u[0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ]+)*)',re.DOTALL) #tags = re.compile('(<!.*?>)|(<script>.*?</script>)|(<noscript>.*?</noscript>)',re.DOTALL) #tags = re.compile('(<!.*?>)|(<script>.*?</script>)|(<noscript>.*?</noscript>)|([\\u2000-\\u2100])|(\\u00f8)|(\\u00b0)|([\\u0500-\\u0600])|([\\u5000-\\u6000])',re.DOTALL) tags = re.compile('(^<.*?>)|(^<!.*?>)|(^<script>.*?</script>)|(^<noscript>.*?</noscript>)|([\\u0080-\\uFFEF])',re.DOTALL) #tags = re.compile(r'(<!.*?>)|(<script>.*?</script>)|(<noscript>.*?</noscript>)|(\\u\d*[\s|\w*])',re.DOTALL) #tags = re.compile(r'(<!.*?>)|(<script>.*?</script>)|(<noscript>.*?</noscript>)|([^\\u0200-\\uFFFF])',re.DOTALL)##attempt to remove unicode reg_numbers = re.compile(r'(\s\d+\s)') except: print ('Error in regex', sys.exc_info()[0], sys.exc_info()[1]) ### the following section uses Python 3 conventions #try: ##tr = str.maketrans(" ", " ", string.punctuation)#used to strip punctuation ## need to change for python 2 THis is python 3 #except: #print ('Error removing punctuation', sys.exc_info()[0]) ### End Python 3 section #strip unicode from string try: raw_input = (raw_input.decode('unicode_escape').encode('ascii','ignore')) ## except: #print ('Error removing unicode characters from line var', sys.exc_info()[0], sys.exc_info()[1]) pass try: #line = tags.sub(' ',str(raw_input)) #remove html tags ##python 3 code line = re.sub(tags,' ',str(raw_input)) #remove html tags except: print ('Error removing html tags', sys.exc_info()[0], sys.exc_info()[1]) try: #line= (line.lower().translate(tr).split())#convert line to lower case, remove punctionation and tokenize this uses python 3 requires uncommenting #line= (line.lower().translate(None, string.punctuation).split())#convert line to lower case, remove punctionation and tokenize #This is Python2 version #right_num_spaces=" "*256 punctuation =re.compile('['+string.punctuation+']') line= re.sub(punctuation,' ',line)#remove punctuation with regex but replace with a space to preserve words #line = re.sub(reg_numbers,'',line)#remove numbers from string line=line.lower().split()#convert to lowercase and split into words except: print ('Error Changing case, removing punctuation and spliting', sys.exc_info()[0], sys.exc_info()[1]) try: line=[word for word in line if word not in stop_words] #remove stop words from raw line except: print ('Error with stop words', sys.exc_info()[0], sys.exc_info()[1]) try: stemmer = PorterStemmer() 
#create a stemmer with the nltk porter stemmer line=[stemmer.stem(term) for term in line] #use nltk stemmer to convert to word roots except: print ('Error with stemming', sys.exc_info()[0], sys.exc_info()[1]) pass return line except: print ('Error in tokenizer function', sys.exc_info()[0], sys.exc_info()[1]) pass
topic summary """ import pickle import random # for consistent testing random.seed(1532525625823) raw_data = pickle.load(open("pickles/list-of-reviews.p", "rb")) from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfTransformer from nltk.corpus import stopwords count_vect = CountVectorizer(stop_words=set(stopwords.words('english'))) train_counts = count_vect.fit_transform(random.sample(raw_data, 30000)) raw_data = None btr = pickle.load(open("pickles/dict-of-business-to-reviews.p", "rb")) test_counts = count_vect.transform(btr["Appliance Service Center"]) tfidf_transformer = TfidfTransformer() train_tfidf = tfidf_transformer.fit_transform(train_counts) test_tfidf = tfidf_transformer.transform(test_counts) dtm = train_tfidf dtm_test = test_tfidf vocab = count_vect.get_feature_names()
import pandas as pd
import numpy as np
import pickle
from DataFormatter import create_dataset
from nltk.corpus import stopwords

# EGC
en_stop = set(stopwords.words('french'))
en_stop.add('les')
en_stop.add('a')
en_stop.add('ce')
en_stop.add('cet')
en_stop.add('cette')
en_stop.add('article')
en_stop.add('approche')
en_stop.add('données')
en_stop.add('non')
en_stop.update(set(stopwords.words('english')))

data = pd.read_csv('Data/egc.csv', sep="\t")
data['txt'] = data['title'].astype(str) + ". " + data['abstract'].astype(str)
doc_set = list(data['txt'])
years = np.array(data['year'])
years = years.flatten().tolist()

dataset = create_dataset(doc_set, years, en_stop, l=5, max_df=0.75, min_df=5)
pickle.dump(dataset, open("Data/egc.dwe", "wb"))
inpu = pickle.load(open("Data/egc.dwe", "rb"))
def run(queryList): # stemmer = PorterStemmer() stemmer = SnowballStemmer("english") f = open("data/expanded.txt", "w+") for query in queryList: querySplitted = query.split(",") # tokenizing the query tokens = nltk.word_tokenize(querySplitted[1]) # removing stop words in the query filtered_words = [ word for word in tokens if word not in stopwords.words('english') ] # pos tagging of tokens pos = nltk.pos_tag(filtered_words) synonyms = [] # synonyms of all the tokens index = 0 # iterating through the tokens for item in filtered_words: synsets = wordnet.synsets(item) if not synsets: # stemming the tokens in the query synsets = wordnet.synsets(stemmer.stem(item)) # synonyms of the current token currentSynonyms = [] currentPOS = get_wordnet_pos(pos[index]) # iterating through the synsets for i in synsets: # first we check if token and synset have the same part of speech if str(i.pos()) == str(currentPOS): for j in i.lemmas(): if j.name() not in currentSynonyms: # if we have not currentSynonyms.append(j.name().replace("_", " ")) synonyms.append(currentSynonyms) index += 1 f.write(querySplitted[0] + ", " + querySplitted[1] + ", ") # removing duplicate lists in the synonyms list tmp = [] for elem in synonyms: if elem and elem not in tmp: tmp.append(elem) synonyms = tmp # now that we have all the synonyms for x in itertools.product(*synonyms): current = "" for item in x: current += item current += " " current += ", " f.write(current) f.write("\n")
from __future__ import unicode_literals
from nltk.corpus import stopwords
import itertools
import string
import os

stop_words_nltk = stopwords.words("english")

stop_words_extra = [
    "right", "yeah", "okay", "ourselves", "hers", "between", "yourself",
    "but", "again", "there", "about", "once", "during", "out", "very",
    "having", "with", "they", "own", "an", "be", "some",
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from scipy.sparse import hstack
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
import pickle
import numpy as np

mystem = Mystem()
# download the stopword corpus before requesting the Russian list
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

NGRAM_RANGE = (1, 5)  # word n-grams
TOKEN_MODE = 'char'
# maximum number of features
TOP_K = 1000
# minimum document frequency
MIN_DOCUMENT_FREQUENCY = 2


def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords \
              and token != " " \