def openfile(filename, output):
    """Tokenize, filter and stem a text file, writing the stems to ``output``.

    The input is read line by line, punctuation is removed, the remaining
    text is tokenized with ``word_tokenize`` and English stop words are
    dropped.  The stems are written comma-separated; a token ``"I"`` starts a
    new output line.  When ``output`` is exactly ``"documents.csv"`` the
    marker itself is also written, otherwise the marker and the two tokens
    that follow it are skipped.  Timings of every phase are printed.

    :param filename: path of the text file to read
    :param output: path of the file to write
    """
    print(filename)
    # starts run time
    start = timeit.default_timer()
    ps = PorterStemmer()
    # Used for removing punctuation from the documents
    translate_table = dict((ord(char), None) for char in string.punctuation)

    start2 = timeit.default_timer()
    # splits the lines into words and removes the punctuation
    tokens = []
    # The context manager closes the input even if tokenizing fails.
    with open(filename, "r") as file:
        for line in file:
            tokens += word_tokenize(line.translate(translate_table))
    start3 = timeit.default_timer()
    print("tokenize")
    print(start3 - start2)

    # creates a set of stop words to be removed later
    stop_words = set(stopwords.words("english"))
    start6 = timeit.default_timer()
    # keeps every word that is not a stop word
    filtered_sentence = [w for w in tokens if w not in stop_words]
    start7 = timeit.default_timer()
    print("stop word removal")
    print(start7 - start6)

    startw = timeit.default_timer()
    # stems each word and adds it to the output file in csv form
    with open(output, 'w') as f:
        if output == "documents.csv":
            for w in filtered_sentence:
                if w == "I":
                    f.write("\n")
                f.write(ps.stem(w))
                f.write(",")
        else:
            iterFilSen = iter(filtered_sentence)
            for w in iterFilSen:
                if w == "I":
                    f.write("\n")
                    # removes the I number W; the default keeps a marker at
                    # the very end of the input from raising StopIteration
                    next(iterFilSen, None)
                    next(iterFilSen, None)
                else:
                    f.write(ps.stem(w))
                    f.write(",")

    # ends run time
    stop = timeit.default_timer()
    print("writing")
    print(stop - startw)
    print("total: " + output)
    print(stop - start)
class StemmedBagOfWordsFeatureGenerator(EdgeFeatureGenerator):
    """
    Generates stemmed Bag of Words representation for each sentence that
    contains an edge, using a Porter stemmer.

    Features are only emitted while ``training_mode`` is true.

    :type feature_set: nala.structures.data.FeatureDictionary
    :type stop_words: list[str]
    :type training_mode: bool
    """

    def __init__(self, feature_set, stop_words=None, training_mode=True):
        # the feature set for the dataset
        self.feature_set = feature_set
        # whether the mode is training or testing
        self.training_mode = training_mode
        # an instance of the PorterStemmer
        self.stemmer = PorterStemmer()
        # a list of stop words; a fresh list per instance avoids sharing one
        # mutable default between all generators
        self.stop_words = [] if stop_words is None else stop_words

    def generate(self, dataset):
        """Add one ``4_bow_stem_<stem>_[0]`` feature per kept token of each edge's sentence.

        A token is kept when its stem is not in ``stop_words`` and it is not
        flagged as punctuation.
        """
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if self.training_mode:
                for token in sentence:
                    # stem once and reuse it for the filter and the feature name
                    stem = self.stemmer.stem(token.word)
                    if stem not in self.stop_words and not token.features['is_punct']:
                        feature_name = '4_bow_stem_' + stem + '_[0]'
                        self.add_to_feature_set(edge, feature_name)
def stem(string):
    """Stem a phrase, caching the result in the module-level dictionaries.

    ``stemdict`` maps each phrase to its stem and ``unstemdict`` maps each
    stem to the list of phrases that produced it.  The module-level
    ``stemmer`` is created on first use.  An empty stem is returned but not
    cached.
    """
    global stemmer
    if not stemmer:
        stemmer = Stemmer()

    # Cached phrases are answered straight from the dictionary.
    if string in stemdict:
        return stemdict[string]

    if bad_unicode(string):
        # added A. Meyers 8/28/15
        temp = stemmer.stem(remove_non_unicode(string))
    else:
        temp = stemmer.stem(string)

    if temp:
        stemdict[string] = temp
        if temp not in unstemdict:
            unstemdict[temp] = [string]
        elif string not in unstemdict[temp]:
            unstemdict[temp].append(string)
    return temp
def tokenizeTags(str, dict_items):
    """Split a sentence into stemmed tokens, skipping entries of ``dict_items``.

    The text is reduced to ASCII, split on whitespace and on ``-``, and
    stripped of punctuation.  Empty pieces and pieces found in ``dict_items``
    are dropped.  Non-numeric pieces are lower-cased and stemmed.  Non-zero
    numeric pieces are replaced by the stem of ``'NUM'`` unless they have
    four digits or lie within 1900..2015 (so years are kept).

    :param str: sentence, as bytes or text
    :param dict_items: container of tokens to leave out
    :return: list of processed tokens in input order
    """
    # Accept bytes as well as text; either way non-ASCII characters are dropped.
    if isinstance(str, bytes):
        text = str.decode('ascii', 'ignore')
    else:
        text = str.encode('ascii', 'ignore').decode('ascii')

    stemmer = PorterStemmer()
    # Compiled once instead of once per token.
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    num_token = stemmer.stem('NUM')

    last = []
    for token in text.split():
        for c in token.split('-'):
            c = punctuation.sub('', c)  # strip punctuation
            if c == '' or c in dict_items:
                continue
            try:
                value = int(c)
            except ValueError:
                # not a number: normalise the word
                c = stemmer.stem(c.lower())
            else:
                # Compare the numeric value, not the string, against the year range.
                if value and len(c) != 4 and (value > 2015 or value < 1900):  # keep years
                    c = num_token
            last.append(c)
    return last
def _synset_value(attribute):
    """Return a synset field whether it is exposed as an attribute or as a method."""
    return attribute() if callable(attribute) else attribute


def new_lesk(context_sentence, ambiguous_word, pos=None, stem=True, hyperhypo=True):
    """Pick the WordNet sense of ``ambiguous_word`` that best overlaps the context.

    For every synset of the word a signature is built from its definition and
    lemma names (plus the lemma names of its hypernyms and hyponyms when
    ``hyperhypo`` is true).  The synset whose signature shares the most
    distinct words with ``context_sentence`` wins; the first one wins ties.

    :param context_sentence: whitespace-separated context string
    :param ambiguous_word: word to disambiguate
    :param pos: optional part-of-speech tag; synsets with another tag are skipped
    :param stem: compare Porter stems instead of exact words
    :param hyperhypo: include hypernym and hyponym lemma names in the signature
    :return: the best synset, or ``None`` when nothing overlaps
    """
    ps = PorterStemmer()
    max_overlaps = 0
    lesk_sense = None

    context_words = context_sentence.split()
    if stem:
        # Stem the context exactly once; doing it inside the synset loop
        # would stem the already stemmed words again on every iteration.
        context_words = [ps.stem(i) for i in context_words]

    for ss in wn.synsets(ambiguous_word):
        # If POS is specified (compared by value, not by identity).
        if pos and _synset_value(ss.pos) != pos:
            continue

        lesk_dictionary = []
        # Includes definition.
        lesk_dictionary += _synset_value(ss.definition).split()
        # Includes lemma_names.
        lesk_dictionary += _synset_value(ss.lemma_names)

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo:
            related = ss.hypernyms() + ss.hyponyms()
            lesk_dictionary += list(chain.from_iterable(
                _synset_value(i.lemma_names) for i in related))

        if stem:
            # Matching exact words causes sparsity, so lets match stems.
            lesk_dictionary = [ps.stem(i) for i in lesk_dictionary]

        overlaps = set(lesk_dictionary).intersection(context_words)
        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
    return lesk_sense
def tokenize2_bigram(str, df_freq):
    """Return the bigrams of a sentence's processed tokens.

    The text is reduced to ASCII, split on whitespace and on ``-``, and
    stripped of punctuation.  Non-numeric pieces are lower-cased and stemmed.
    Non-zero numeric pieces are replaced by the stem of ``'NUM'`` unless they
    have four digits or lie within 1900..2015 (so years are kept).  Every
    pair of consecutive tokens forms a bigram ``"<previous> <current>"`` that
    is appended to the result and passed to ``updateDF``.

    :param str: sentence, as bytes or text
    :param df_freq: frequency structure handed to ``updateDF``
    :return: list of bigram strings in input order
    """
    # per-sentence map handed to updateDF together with df_freq
    temp_map = {}

    # Accept bytes as well as text; either way non-ASCII characters are dropped.
    if isinstance(str, bytes):
        text = str.decode('ascii', 'ignore')
    else:
        text = str.encode('ascii', 'ignore').decode('ascii')

    stemmer = PorterStemmer()
    # Compiled once instead of once per token.
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    num_token = stemmer.stem('NUM')

    last = []
    bigram_list = []
    for token in text.split():
        for c in token.split('-'):
            c = punctuation.sub('', c)  # strip punctuation
            if c == '':
                continue
            try:
                value = int(c)
            except ValueError:
                c = stemmer.stem(c.lower())
            else:
                # Compare the numeric value, not the string, against the year range.
                if value and len(c) != 4 and (value > 2015 or value < 1900):  # keep years
                    c = num_token
            last.append(c)

            # bigram generation: pair the token just added with its
            # predecessor (a counter reset to 0 here never produced any)
            if len(last) > 1:
                bigram = last[-2] + ' ' + last[-1]
                bigram_list.append(bigram)
                updateDF(temp_map, df_freq, bigram)
    return bigram_list
def tokenize2(str, df_freq):
    """Process the tokens of a sentence and report each one to ``updateDF``.

    The text is reduced to ASCII, split on whitespace and on ``-``, and
    stripped of punctuation.  Non-numeric pieces are lower-cased and stemmed.
    Non-zero numeric pieces are replaced by the stem of ``'NUM'`` unless they
    have four digits or lie within 1900..2015 (so years are kept).  Every
    resulting token is passed to ``updateDF(temp_map, df_freq, token)``.

    :param str: sentence, as bytes or text
    :param df_freq: frequency structure handed to ``updateDF``
    :return: ``None``; results are recorded through ``updateDF`` only
    """
    # temp map (for getting the local term frequency) for a sentence
    temp_map = {}

    # Accept bytes as well as text; either way non-ASCII characters are dropped.
    if isinstance(str, bytes):
        text = str.decode('ascii', 'ignore')
    else:
        text = str.encode('ascii', 'ignore').decode('ascii')

    stemmer = PorterStemmer()
    # Compiled once instead of once per token.
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    num_token = stemmer.stem('NUM')

    for token in text.split():
        for c in token.split('-'):
            c = punctuation.sub('', c)  # strip punctuation
            if c == '':
                continue
            try:
                value = int(c)
            except ValueError:
                c = stemmer.stem(c.lower())
            else:
                # Compare the numeric value, not the string, against the year range.
                if value and len(c) != 4 and (value > 2015 or value < 1900):  # keep years
                    c = num_token
            updateDF(temp_map, df_freq, c)
class IntermediateTokensFeatureGenerator(EdgeFeatureGenerator):
    """
    Generate the bag of words representation, masked text, stemmed text and
    parts of speech tag for each of the tokens present between two entities in
    an edge.

    :param feature_set: the feature set for the dataset
    :type feature_set: nala.structures.data.FeatureDictionary
    :param training_mode: indicates whether the mode is training or testing
    :type training_mode: bool
    """

    # Feature-name prefixes in emission order: word, masked text, stem, POS tag.
    _FORWARD = ('33_fwd_bow_intermediate_',
                '34_fwd_bow_intermediate_masked_',
                '35_fwd_stem_intermediate_',
                '36_fwd_pos_intermediate_')
    _BACKWARD = ('37_bkd_bow_intermediate_',
                 '38_bkd_bow_intermediate_masked_',
                 '39_bkd_stem_intermediate_',
                 '40_bkd_pos_intermediate_')
    _UNDIRECTED = ('41_bow_intermediate_',
                   '42_bow_intermediate_masked_',
                   '43_stem_intermediate_',
                   '44_pos_intermediate_')

    def __init__(self, feature_set, training_mode=True):
        # the feature set for the dataset
        self.feature_set = feature_set
        # whether the mode is training or testing
        self.training_mode = training_mode
        # an instance of PorterStemmer
        self.stemmer = PorterStemmer()

    def _add_token_features(self, edge, token, prefixes):
        """Add the word, masked-text, stem and POS features of one token."""
        bow, masked, stem, pos = prefixes
        self.add_to_feature_set(edge, bow + token.word + '_[0]')
        self.add_to_feature_set(edge, masked + token.masked_text(edge.part) + '_[0]')
        self.add_to_feature_set(edge, stem + self.stemmer.stem(token.word) + '_[0]')
        self.add_to_feature_set(edge, pos + token.features['pos'] + '_[0]')

    def generate(self, dataset):
        """Emit directed, then undirected, features for the tokens between the two head tokens."""
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            id1 = edge.entity1.head_token.features['id']
            id2 = edge.entity2.head_token.features['id']

            # The direction decides both the token span and the prefix set.
            if id1 < id2:
                first, second, directed = id1, id2, self._FORWARD
            else:
                first, second, directed = id2, id1, self._BACKWARD

            for i in range(first + 1, second):
                self._add_token_features(edge, sentence[i], directed)

            for i in range(first + 1, second):
                self._add_token_features(edge, sentence[i], self._UNDIRECTED)
class EntityHeadTokenFeatureGenerator(EdgeFeatureGenerator):
    """
    Generate features from the head token of each of the two entities of an
    edge: its text, part-of-speech tag, Porter stem and the part of the word
    that follows the stem.  Also adds, per entity, a count of the entities of
    the same class found in the edge's sentence.

    :param feature_set: the feature set for the dataset
    :type feature_set: nala.structures.data.FeatureDictionary
    :param training_mode: whether the mode is training or testing, default True
    :type training_mode: bool
    """

    def __init__(self, feature_set, training_mode=True):
        # the feature set for the dataset
        self.feature_set = feature_set
        # whether the mode is training or testing
        self.training_mode = training_mode
        # an instance of the PorterStemmer
        self.stemmer = PorterStemmer()

    def generate(self, dataset):
        """Add the count and head-token features of every edge in ``dataset``."""
        for edge in dataset.edges():
            entity1 = edge.entity1
            entity2 = edge.entity2

            self.named_entity_count('entity1_', entity1.class_id, edge)
            self.named_entity_count('entity2_', entity2.class_id, edge)

            entity1_stem = self.stemmer.stem(entity1.head_token.word)
            entity1_non_stem = entity1.head_token.word[len(entity1_stem):]
            entity2_stem = self.stemmer.stem(entity2.head_token.word)
            # The suffix of entity2 must be cut from entity2's own word; it
            # used to be sliced from entity1's word.
            entity2_non_stem = entity2.head_token.word[len(entity2_stem):]

            feature_name_1_1 = '7_entity1_txt_' + entity1.head_token.word + '_[0]'
            feature_name_2_1 = '7_entity2_txt_' + entity2.head_token.word + '_[0]'
            feature_name_1_2 = '8_entity1_pos_' + entity1.head_token.features['pos'] + '_[0]'
            feature_name_2_2 = '8_entity2_pos_' + entity2.head_token.features['pos'] + '_[0]'
            feature_name_1_3 = '9_entity1_stem_' + entity1_stem + '_[0]'
            feature_name_2_3 = '9_entity2_stem_' + entity2_stem + '_[0]'
            feature_name_1_4 = '10_entity1_nonstem_' + entity1_non_stem + '_[0]'
            feature_name_2_4 = '10_entity2_nonstem_' + entity2_non_stem + '_[0]'

            self.add_to_feature_set(edge, feature_name_1_1)
            self.add_to_feature_set(edge, feature_name_2_1)
            self.add_to_feature_set(edge, feature_name_1_2)
            self.add_to_feature_set(edge, feature_name_2_2)
            self.add_to_feature_set(edge, feature_name_1_3)
            self.add_to_feature_set(edge, feature_name_2_3)
            self.add_to_feature_set(edge, feature_name_1_4)
            self.add_to_feature_set(edge, feature_name_2_4)

    def named_entity_count(self, prefix, entity_type, edge):
        """Add a feature holding the number of ``entity_type`` entities in the edge's sentence."""
        entities = edge.part.get_entities_in_sentence(edge.sentence_id, entity_type)
        feature_name = '1_' + prefix + entity_type + '_count_[' + str(len(entities)) + ']'
        self.add_to_feature_set(edge, feature_name)
class Indexer():
    """Build global term counts and per-page postings from a crawl dump.

    ``token_dict`` maps each stem to its total count and ``postings_list``
    maps each page title to a list of ``(stem, count)`` pairs.
    """

    def __init__(self, rem_punc=True, rem_stop=True):
        # switches for the optional filtering steps
        self.rem_punc = rem_punc
        self.rem_stop = rem_stop
        # filter vocabularies
        self.stoplist = stopwords.words('english')
        self.punctunation = list(string.punctuation)
        # index state
        self.token_dict = dict()
        self.pst = PorterStemmer()
        self.postings_list = dict()

    def get_pages(self):
        """Read ``./data/ucl`` and index every chunk separated by ``visited:``."""
        with open('./data/ucl', 'r') as ifile:
            contents = ifile.read()
        for page in contents.split('visited:'):
            self.parse_page(page)

    def parse_page(self, page):
        """Tokenize one page and hand its stems to ``add_to_token_dict``.

        Pages with at most two whitespace-separated items are ignored; the
        second item is used as the page title.
        """
        page = unicode(page, errors='ignore')
        lines = page.strip().split()
        if len(lines) <= 2:
            return
        title = lines[1]

        tokens = []
        # tokenize, make lowercase, filter and stem in a single pass
        for raw in word_tokenize(str(lines[2:])):
            word = raw.lower()
            # remove punctuation
            if self.rem_punc and word in self.punctunation:
                continue
            # remove stopwords
            if self.rem_stop and word in self.stoplist:
                continue
            # stem (Porter stemmer)
            tokens.append(self.pst.stem(word))

        # add to dictionary
        self.add_to_token_dict(title, tokens[3:])

    def add_to_token_dict(self, title, tokens):
        """Count the stems of ``tokens[1:]`` globally and for ``title``."""
        if not tokens:
            return
        words = dict()
        for token in tokens[1:]:
            key = self.pst.stem(token.lower())
            self.token_dict[key] = self.token_dict.get(key, 0) + 1
            words[key] = words.get(key, 0) + 1
        self.postings_list[title] = list(words.iteritems())
def splitAndStem(inputfilename, outputfilename):
    '''
    Rewrite every recipe with its ingredients split into words, stripped of
    non-letters, stemmed and lower-cased.

    The output file starts with ``[`` and then holds one JSON object per
    line, each followed by a comma.

    :param inputfilename: file holding a literal list of recipe dicts with
        the keys ``cuisine``, ``id`` and ``ingredients``
    :param outputfilename: file to (over)write
    :return: None
    '''
    with open(outputfilename, 'w') as out:
        out.write('[\n')

    with open(inputfilename) as src:
        # NOTE(review): eval() executes whatever the input file contains;
        # only use this on trusted data.
        recipes = eval(src.read())

    stemmer = PorterStemmer()
    with open(outputfilename, 'a') as out:
        for recipe in recipes:
            stems = []
            for ingredient in recipe['ingredients']:
                for token in word_tokenize(ingredient):
                    letters_only = re.sub('[^A-Za-z]', '', token)
                    stems.append(stemmer.stem(letters_only).lower())
            entry = {
                'cuisine': recipe['cuisine'],
                'id': recipe['id'],
                'ingredients': stems,
            }
            out.write('%s,\n' % str(json.dumps(entry)))
def porter_list1(lista):
    """Return a new list holding the Porter stem of every item of ``lista``."""
    stemmer = PorterStemmer()
    return [stemmer.stem(item) for item in lista]
def parseReviews(mypath):
    """Count the stemmed words of every file in directory ``mypath``.

    Alphanumeric tokens that are not English stop words are stemmed and
    counted.  A negation word toggles a flag; the next counted word is then
    recorded with a leading ``!`` and the flag is cleared.  Any
    non-alphanumeric token also clears the flag.

    :return: dict mapping (possibly ``!``-prefixed) stems to their counts
    """
    filelist = os.listdir(mypath)
    negationList = ["no", "not", "never", "can't", "won't", "cannot", "didn't", "couldn't"]
    stopwordList = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    counts = {}
    negated = False
    for name in filelist:
        with open(mypath + "/" + name, "r") as handle:
            tokens = word_tokenize(handle.read())

        for token in tokens:
            if token in negationList:
                # a second negation cancels the first (double negative)
                negated = not negated
                continue
            if not token.isalnum():
                negated = False
                continue
            if token in stopwordList:
                continue

            stemmed = stemmer.stem(token)
            if negated:
                stemmed = "!" + stemmed
                negated = False
            counts[stemmed] = counts.get(stemmed, 0) + 1
    return counts
def prepare_data(reviews):
    """Stem, stop-word-filter and clean a collection of review dicts.

    Empty dicts are dropped.  After every stage the element at index
    ``observed_element`` is printed for inspection.

    :param reviews: iterable of dicts with the keys ``class`` and ``text``
    :return: list of processed review dicts
    """
    stemmer = PorterStemmer()

    def stem_text(x):
        # NOTE(review): the whole text is passed to the stemmer as a single
        # token, although the intent appears to be stemming every word.
        # Confirm before changing, since it alters downstream features.
        return {'class': x['class'], 'text': stemmer.stem(x['text'])}

    # Real lists are built at every stage: on Python 3 filter()/map() return
    # iterators, which cannot be indexed by the prints below.

    # clean text and remove empty items
    reviews = [review for review in reviews if review != {}]
    reviews = [stem_text(review) for review in reviews]
    print('classification: ' + reviews[observed_element]['class'] + '\n\n------------------------------------\n\n')
    print('stemming: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    # remove stopwords
    reviews = [remove_stop_words(review) for review in reviews]
    print('stopwords: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    # remove undesired patterns
    reviews = [clean_text(review) for review in reviews]
    print('elementos inuteis: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    return reviews
def deleting_stop_words_and_punctuating(text):
    """Return the lower-cased stems of ``text`` without stop words or punctuation.

    Tokens are lemmatized and then Porter-stemmed.  A token is dropped when
    either its lower-cased raw form or its stem is an English stop word, one
    of the extra noise tokens, or consists only of punctuation characters.

    :param text: raw text
    :return: list of stems in input order
    """
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    # Exact tokens to drop; set membership replaces the former substring
    # tests such as ``w in "the"``, which also matched "t", "h" and "he".
    noise = {"''", '``', 'the', 'in', "'s"}
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def is_unwanted(word):
        # empty strings, stop words, noise tokens and pure punctuation
        return (not word
                or word in stop_words
                or word in noise
                or all(char in punctuation for char in word))

    result = []
    for token in word_tokenize(text):
        # Check the raw word first: stems such as "thi" (from "this") no
        # longer match the stop-word list.
        if is_unwanted(token.lower()):
            continue
        stemmed = ps.stem(lemmatizer.lemmatize(token)).lower()
        if is_unwanted(stemmed):
            continue
        result.append(stemmed)
    return result
class Document(object):
    """A tokenized document with raw and log-scaled term frequencies.

    :param title: title of the document
    :param raw: raw text of the document
    :param stopwords: optional set of tokens to ignore (compared before
        stemming and lower-casing)
    """

    def __init__(self, title, raw, stopwords=None):
        self.title = title
        self.raw = raw
        # A fresh set per instance: a shared ``set()`` default would be
        # visible to every document if any caller mutated it.
        self.stops = set() if stopwords is None else stopwords
        self.docid = 0
        self.tokens = self._tokenize()
        self.stemmer = PorterStemmer()
        self.terms = self._get_terms()
        self.log_terms = self._log_terms()
        self.magnitude = 0

    def _tokenize(self):
        """Takes the raw terms and returns the tokenized contents"""
        return wpt(self.raw)

    def _get_terms(self):
        """Gets a freqdist of the standardized terms from the list of tokens

        Tokens that are not alphanumeric, are stop words or consist only of
        digits are skipped; the rest are stemmed and lower-cased.
        """
        stems = []
        for token in self.tokens:
            if (not token.isalnum()) or (token in self.stops) or (token.isdigit()):
                continue
            stemmed = self.stemmer.stem(token)
            stems.append(stemmed.lower())
        return FreqDist(stems)

    def _log_terms(self):
        """Return a dict mapping each term to ``1 + log2(frequency)``."""
        return {term: 1 + log(freq, 2) for term, freq in self.terms.items()}

    def __len__(self):
        """This returns the number of terms in the document.

        NOT the size of it.
        """
        return len(self.terms)
def preprocess(text):
    """Lower-case and tokenize ``text``, drop English stop words and stem the rest.

    :param text: raw text
    :return: list of Porter stems in input order
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests; the stop-word list was
    # scanned linearly for every token.
    stop = set(stopwords.words('english'))
    return [stemmer.stem(tok) for tok in word_tokenize(text.lower()) if tok not in stop]
def preprocess_document(doc):
    """Return the stems of the tokens of ``doc`` that pass the filters.

    A token is kept when it is longer than two characters and its lower-cased
    form is not an English stop word; kept tokens are lower-cased and stemmed.
    """
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stems = []
    for token in wordpunct_tokenize(doc):
        lowered = token.lower()
        if len(token) > 2 and lowered not in stopset:
            stems.append(stemmer.stem(lowered))
    return stems
def preprocessing(text, debug=False):
    """Normalize ``text`` and return its filtered words and their stems.

    Steps: lower-case, apply ``replacers.RegexpReplacer`` (e.g. ``can't`` ->
    ``cannot``), tokenize, drop English stop words and the punctuation marks
    ``. , : ;``, then Porter-stem.

    :param text: raw text
    :param debug: print the intermediate result of every step
    :return: tuple ``(words, words_stemmed)``
    """
    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # former print statements were a syntax error on Python 3.
    if debug:
        print(text)

    # lower case
    text = text.lower()
    if debug:
        print(text)

    # can't -> cannot, bya's -> bya is
    text = replacers.RegexpReplacer().replace(text)
    if debug:
        print(text)

    # word tokenize
    words = word_tokenize(text)
    if debug:
        print(words)

    # removing stopwords
    english_stops = set(stopwords.words('english'))
    english_stops_added = english_stops | {'.', ',', ':', ';'}
    words = [word for word in words if word not in english_stops_added]
    if debug:
        print(words)

    # stemming words
    stemmer = PorterStemmer()
    words_stemmed = [stemmer.stem(word) for word in words]
    if debug:
        print(words_stemmed)

    return words, words_stemmed
def extract_clean_sentences(self):
    """
    Extracts sentences from ``self.raw_text`` as lists of stemmed tokens.

    For every sentence found by ``sent_tokenize``:
        - whitespace characters (including line breaks) become spaces
        - all characters not contained in [a-zA-Z0-9 '-] become spaces
        - runs of whitespace are collapsed to a single space
        - the sentence is tokenized, and each token is lower-cased and
          Porter-stemmed

    Stop words are NOT removed here.

    :return: list with one list of stems per sentence
    """
    text = self.raw_text
    # Raw strings keep the regex escapes from being read as (invalid)
    # string escapes.
    exclude = re.compile(r"[^a-zA-Z0-9 '-]")
    linebreaks = re.compile(r'\s')
    excess_space = re.compile(r'\s+')
    stemmer = PorterStemmer()

    sentences = sent_tokenize(text)
    out = []
    for sentence in sentences:
        sentence = linebreaks.sub(' ', sentence)
        sentence = exclude.sub(' ', sentence)
        sentence = excess_space.sub(' ', sentence)
        tokens = word_tokenize(sentence)
        tokens = [stemmer.stem(t.lower()) for t in tokens]
        out.append(tokens)

    return out
def testing():
    """Print a walk-through of sentence/word tokenizing, stop-word removal and stemming."""
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))

    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)

    # the same filter, first as an explicit loop ...
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    # ... then as a comprehension
    filtered_sent = [w for w in words if not w in stop_words]
    print(filtered_sent)

    # - stemming
    ps = PorterStemmer()
    # The sample words must be string literals; as bare names they raised
    # NameError as soon as the function ran.
    example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
    # for w in example_words:
    #     print(ps.stem(w))

    new_text = "it is very important to be pothonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
def stemText(text):
    """Append the Porter stem of every token of ``text`` to ``all_words``.

    ``all_words`` is not defined in this function.
    NOTE(review): presumably a module-level list -- confirm against the
    rest of the file.
    """
    ps = PorterStemmer()
    all_words.extend(ps.stem(token) for token in word_tokenize(text))
def parseTranscript(transcript):
    """Build a positional index of the stemmed tokens of a transcript.

    :param transcript: a ``Transcript`` with the fields ``prepared``,
        ``QandA``, ``ticker`` and ``date``
    :return: dict mapping each stem to ``[doc_id, [positions]]``, where
        ``doc_id`` is ``"<ticker>-<year>-<month>-<day>"`` and the positions
        are token offsets (as strings) counted across all rows.  Tokens
        containing ``|`` are not indexed but still occupy a position.
    """
    assert isinstance(transcript, Transcript), \
        "transcript must be stored in custom namedtuple, not {}".format(type(transcript))

    # NOTE(review): this relies on ``prepared.append(...)`` RETURNING the
    # combined rows (as e.g. the old pandas Series.append did).  If
    # ``prepared`` is a plain list, append() returns None and the loop below
    # fails -- confirm the field types.
    text = transcript.prepared.append(transcript.QandA)

    doc_id = "{ticker}-{year}-{month}-{day}".format(ticker=transcript.ticker.split(':')[-1],
                                                    year=transcript.date.year,
                                                    month=transcript.date.month,
                                                    day=transcript.date.day)
    tokenizer = wordpunct_tokenize
    stemmer = PorterStemmer()
    index = dict()
    pos = 0
    for row in text:
        tokens = list(tokenizer(row.lower()))
        for i, token in enumerate(tokens):
            token = stemmer.stem(token)
            if '|' in token:
                continue
            if token not in index:
                index[token] = [doc_id, [str(pos + i)]]
            else:
                index[token][-1].append(str(pos + i))
        # Advance by this row's own token count.  Reusing the loop variable
        # after the loop added the previous row's count again for a row
        # without tokens.
        pos += len(tokens)
    return index
class Stemmer(SentenceProcesser):
    """Sentence processor that stores the Porter stem of every word on the word itself."""

    def __init__(self):
        self.stemmer = PorterStemmer()

    def process(self, sentence):
        """Set ``word.stem`` from ``word.content`` for each word and return the sentence."""
        to_stem = self.stemmer.stem
        for token in sentence.words:
            token.stem = to_stem(token.content)
        return sentence
def get_english_vocab(lemmatize=False):
    """Return the lower-cased entries of ``words.words()`` as a set.

    :param lemmatize: when true, every entry is additionally reduced with the
        Porter stemmer (despite the name, no lemmatizer is involved)
    """
    if not lemmatize:
        return {entry.lower() for entry in words.words()}

    stemmer = PorterStemmer()
    return {stemmer.stem(entry.lower()) for entry in words.words()}
def buildVocab(self):
    '''Build a vocabulary for the selected documents (from dir database).

    Reads every document named in ``self.pmidList`` from ``self.dbDir`` into
    ``self.corpus``, stems each document sentence by sentence into
    ``self.stemmed_corpus``, collects the stems in ``self.vocab`` and keeps
    only those occurring at least twice.  The stemmed corpus and the
    vocabulary are pickled to ``self.stemmed_corpusDir`` and
    ``self.vocabDir`` under the name of the first pmid.
    '''
    ## Note: The source of text should be Lucene processed field values. Lucene tokenized the text, remove stop words, and may take other unknown steps.
    ## Right now the vocabulary is built on the raw text with NLTK based stopwords removal, and tokenization. This should be improved.

    # collect contents from /database/ for each of these doc
    for pmid in self.pmidList:  # self.pmidList includes the query and the 99 most similar articles selected by BM25
        # open() works on Python 2 and 3 and the handle is closed right away
        with open(os.path.join(self.dbDir, pmid)) as doc_file:
            self.corpus.append(doc_file.read())  # corpus contains raw text (MH, title*2, abstract)

    # Loop invariants: the stop-word list used to be reloaded for every word.
    english_stopwords = set(stopwords.words("english"))
    porter_stemmer = PorterStemmer()
    tokenizer = TreebankWordTokenizer()

    for text in self.corpus:
        sent_tokenize_list = sent_tokenize(text.strip().lower(), "english")  # tokenize an article text
        stemmed_text = []
        for sent in sent_tokenize_list:
            words = tokenizer.tokenize(sent)  # tokenize the sentence
            words = [word.strip(string.punctuation) for word in words]
            words = [word for word in words if word not in english_stopwords]
            words = [word for word in words if len(word) > 1]  # remove single letters
            words = [word for word in words if re.search('[a-zA-Z]', word)]  # remove non alphabetic tokens
            words = [porter_stemmer.stem(word) for word in words]  # apply Porter stemmer
            stemmed_text.append(" ".join(words))
            self.vocab += words
        self.stemmed_corpus.append(". ".join(stemmed_text))  # append a stemmed article text

    # save stemmed corpus (binary mode, as pickle requires on Python 3)
    with open(os.path.join(self.stemmed_corpusDir, str(self.pmidList[0])), "wb") as out:
        pickle.dump(self.stemmed_corpus, out)

    # remove low frequency tokens and redundant tokens
    tokenDist = Counter(self.vocab)
    self.vocab = [token for token, count in tokenDist.items() if count >= 2]

    # save vocabulary
    with open(os.path.join(self.vocabDir, str(self.pmidList[0])), "wb") as out:
        pickle.dump(self.vocab, out)
class Tfidf_KeywordSelection:
    """Select the top tf*idf-scored candidates of every document.

    :param keyword_count: number of candidates to return per document
    :param stem: Porter-stem every word of every candidate before scoring
    """

    def __init__(self, keyword_count, stem=True):
        self.keyword_count = keyword_count
        self.stem = stem
        if self.stem:
            self.stemmer = PorterStemmer()

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def predict(self, X):
        """Return, per document, its ``keyword_count`` best candidates by tf*idf.

        :param X: list of documents, each a list of candidate strings
        :return: list of candidate lists, best score first
        """
        if self.stem:
            # Build a stemmed copy instead of overwriting the caller's
            # lists, so calling predict() again does not stem twice.
            X = [[" ".join(self.stemmer.stem(word) for word in candidate.split())
                  for candidate in document]
                 for document in X]

        corpus_tfidf, dictionary = self.score_keyphrases_by_tfidf(X)
        ypred = []
        for scores in corpus_tfidf:
            scores = sorted(scores, key=lambda x: x[1], reverse=True)[:self.keyword_count]
            ypred.append([dictionary[word_idx] for word_idx, score in scores])
        return ypred

    def score_keyphrases_by_tfidf(self, candidates):
        """Return ``(corpus_tfidf, dictionary)`` for the candidate lists."""
        # make gensim dictionary and corpus
        dictionary = gensim.corpora.Dictionary(candidates)
        corpus = [dictionary.doc2bow(candidate) for candidate in candidates]
        # transform corpus with tf*idf model
        tfidf = gensim.models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        return corpus_tfidf, dictionary
def _stemmatize(self, word):
    """Reduce ``word``: Porter-stem it when it ends in 'ing', otherwise lemmatize it."""
    # lemmatizer won't stem words ending in '-ing' unless you tell it it's a verb
    if not word.endswith('ing'):
        return WordNetLemmatizer().lemmatize(word)
    return PorterStemmer().stem(word)
def stemming():
    """Print the Porter stem of every word of a fixed sample string."""
    ps = PorterStemmer()
    input_tweet = 'testing tests trying tries'
    words = word_tokenize(input_tweet)
    for w in words:
        # stem the current word; the whole list was passed here by mistake
        print(ps.stem(w))
class Tokenizer:
    """Helpers for tokenizing, stemming, stop-word checks and character range tests."""

    def __init__(self):
        self.relative_path = os.path.join("my_class/")
        # stop words from the csv file, plus the comma
        self.stopword_list = csv_io.read_csv(self.relative_path + 'stopword.csv') + [u',']
        self.stemmer = PorterStemmer()

    def is_stop_word(self, word):
        """Return True when the lower-cased ``word`` is in the stop-word list."""
        return word.lower() in self.stopword_list

    def stemming(self, token):
        """Return the Porter stem of ``token``."""
        return self.stemmer.stem(token)

    def to_tokens(self, sentence):
        """Split ``sentence`` into tokens with ``nltk.word_tokenize``."""
        return nltk.word_tokenize(sentence)

    def is_zh(self, c):
        """Return True when the code point of ``c`` lies in one of the listed ranges."""
        code_point = ord(c)
        # unicode range
        zh_range = [[0x2e80, 0x33ff], [0xff00, 0xffef], [0x4e00, 0x9fbb],
                    [0xf900, 0xfad9], [0x20000, 0x2a6d6], [0x2f800, 0x2fa1d]]
        return any(lower <= code_point <= upper for lower, upper in zh_range)
# keep alphanumeric tokens only
word_tokens = [token for token in word_tokens if token.isalnum()]
print("tokens")
print(word_tokens)

# drop English stop words
stop_words = set(stopwords.words('english'))
filtered_sentence = [token for token in word_tokens if token not in stop_words]
print("tokens without stopwords")
print(filtered_sentence)

# frequency distribution of the remaining tokens, plotted for the top 30
fdist = FreqDist(filtered_sentence)
print(fdist)
fdist.plot(30, cumulative=False)
plt.show()

print("stemming")
stemmed_words = [ps.stem(token) for token in filtered_sentence]
print(stemmed_words)

print("lemma")
lemmed_words = [lem.lemmatize(token, "v") for token in filtered_sentence]
print(lemmed_words)

print("pos")
pos_tags = nltk.pos_tag(filtered_sentence)
print(pos_tags)
def stem(word):
    """Return the Porter stem of ``word`` using a freshly created stemmer."""
    return PorterStemmer().stem(word)
# Two-column encoding of `output`: [1, 0] where the entry equals 1, else [0, 1].
new_output = np.ones((output.shape[0], 2))
for i in range(output.shape[0]):
    new_output[i] = [1, 0] if output[i] == 1 else [0, 1]

# Replace every token of every entry of `l` by its Porter stem, in place.
ps = PorterStemmer()
for sentence in l:
    for j, token in enumerate(sentence):
        sentence[j] = ps.stem(token)

# Count how often each stem occurs.
wordlist = {}
for sentence in l:
    for token in sentence:
        wordlist[token] = wordlist.get(token, 0) + 1

# Frequencies, highest first.
freq_l = sorted(wordlist.values(), reverse=True)
# Bare lookup kept from the original; its value is discarded and it raises
# IndexError when fewer than 2000 distinct stems exist.
freq_l[1999]
new_wordlist = {}
words = [word.lower() for word in words] #Loại bỏ hư từ và ghi ra file words = [word for word in words if word not in my_stopwords] arr_word = CountFrequency(words) file_after_remove_stopword = open( path_output + '/file_after_remove_stopword_' + str(i) + '.txt', "w", encoding="utf8") for (word, fre) in arr_word.items(): file_after_remove_stopword.write(word + ':' + str(fre) + '\n') file_after_remove_stopword.close() #chuẩn hóa từ ps = PorterStemmer() words = [ps.stem(word) for word in words] list_word.append(words) #tạo mảng các document sau khi chuẩn hóa str_words = ' '.join(words) list_document_after_preprocess.append(str_words) #ghi file tổng kết len_words = len(words) file_summary.write(list_name[i] + ": " + str(len(sents_cleaned)) + ", " + str(len_words) + '\n') arr_word = CountFrequency(words) file_final = open(path_output + '/' + list_name[i] + '_word.txt', "w", encoding="utf8")
class Preprocess():
    """Word-level preprocessing pipeline for a single document.

    Each space-delimited word read by :meth:`read_file` goes through
    :meth:`preprocess_word`: URL-like words and words containing an ASCII
    digit are dropped (and recorded), punctuation is stripped, words shorter
    than three characters are dropped, the rest is Porter-stemmed and finally
    checked against the (stemmed) stop-word table.  Surviving stems are
    collected in ``stopwarded_list`` and indexed in ``word_dic``.
    """

    def __init__(self):
        self.inittoken_list = []
        self.http_dic = {"ALL": []}
        self.number_removed_list = []
        self.number_dic = {"ALL": []}  # recorded in init order
        self.stemmed_list = []
        self.punctuation_list = [
            ".", "'", '"', "?", ",", ")", "(", "@", "%", "$", "*", "-", "_",
            "/", "!", "#", "^", "&", "`", ":", ";"
        ]
        self.poter = PorterStemmer()
        self.stopward_list = []
        self.stopward_dic = {"ALL": []}
        init_stopward_list = [
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
            'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
            'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
            "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
            'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am',
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
            'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
            'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
            'through', 'during', 'before', 'after', 'above', 'below', 'to',
            'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
            'again', 'further', 'then', 'once', 'here', 'there', 'when',
            'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
            'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
            'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't',
            'can', 'will', 'just', 'don', "don't", 'should', "should've",
            'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren',
            "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
            "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
            'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
            'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
            'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
            "wouldn't"
        ]
        # The stop words are stored stemmed, because incoming words are
        # stemmed before they are compared against this table.
        self.stopward_adding(init_stopward_list)
        self.stopwarded_list = []
        self.word_dic = {}  # format: {term: [index_list]}

    def read_file(self, storage_place):
        """Read the document at ``storage_place`` and preprocess its words.

        Returns ``True`` after processing the file, ``False`` (after printing
        a hint) when ``storage_place`` is not a string, and a message string
        when ``inittoken_list`` is not empty.
        """
        if len(self.inittoken_list) != 0:
            return "you have already put some data in here"
        # Verify the type of the input.
        if not isinstance(storage_place, str):
            print(
                "you should input where you store your document in string type."
            )
            return False
        # Read the document as a list of lines.  Leading and trailing "/"
        # are stripped from the path before it is opened.
        storage_place = storage_place.strip("/")
        # `with` closes the file; it used to be left open.
        with open(storage_place, 'rt') as document:
            document_list = document.readlines()
        # Cut every line into space-delimited words and preprocess each one.
        valid_index = 0
        for line in document_list:
            start_flag = 0
            for stop_flag in range(len(line)):
                valid_flag = False
                if line[stop_flag] == ' ':
                    word = line[start_flag:stop_flag]
                    valid_flag = self.preprocess_word(word, valid_index)
                    start_flag = stop_flag + 1
                # The word after the last space is only handled when the
                # line's final character is "."; a line that still carries
                # its trailing newline never satisfies this test.
                if line[-1] == '.' and stop_flag == len(line) - 1:
                    word = line[start_flag:-1]
                    valid_flag = self.preprocess_word(word, valid_index)
                if valid_flag:
                    # The word was kept, so the next kept word gets the
                    # following index.
                    valid_index = valid_index + 1
        return True

    def preprocess_word(self, word, valid_index):
        """Run one word through the pipeline; return ``True`` if it is kept.

        A kept word has its stem appended to ``stopwarded_list`` (by
        :meth:`stopwording`) and ``valid_index`` recorded in ``word_dic``.
        """
        if self.http_remove(word):
            return False
        if self.number_remove(word):
            return False
        pun_removed = self.punctuation_remove(word)
        if self.len_filter(pun_removed):
            return False
        stemmed = self.stemming(pun_removed)
        if self.stopwording(stemmed):
            return False
        self.word_dic_create(stemmed, valid_index)
        return True

    def http_remove(self, word):
        """Return ``True`` if ``word`` starts with "http" or "www".

        Matching words are recorded in ``http_dic``.
        """
        # NOTE(review): the index is reset on every call, so the value
        # recorded below is always 0 -- confirm whether a running counter
        # was intended.
        self.http_index = 0
        if not word.startswith(("http", "www")):
            return False
        if word not in self.http_dic:
            self.http_dic["ALL"].append(word)
            self.http_dic[word] = []
        self.http_dic[word].append(self.http_index)
        self.http_index = self.http_index + 1
        return True

    def number_remove(self, word):
        """Return ``True`` if ``word`` contains an ASCII digit.

        Matching words are recorded in ``number_dic``.
        """
        has_digit = any('0' <= char <= '9' for char in word)
        # NOTE(review): reset on every call, so the recorded index is
        # always 0 -- confirm whether a running counter was intended.
        self.number_index = 0
        if not has_digit:
            return False
        if word not in self.number_dic:
            self.number_dic["ALL"].append(word)
            self.number_dic[word] = []
        self.number_dic[word].append(self.number_index)
        self.number_index = self.number_index + 1
        return True

    def punctuation_remove(self, word):
        """Return ``word`` with every entry of ``punctuation_list`` removed."""
        for pun in self.punctuation_list:
            word = word.replace(pun, '')
        return word

    def len_filter(self, word):
        """Return ``True`` if ``word`` has fewer than three characters."""
        return len(word) < 3

    def stemming(self, word):
        """Return the Porter stem of ``word``."""
        return self.poter.stem(word)

    def stopwording(self, word):
        """Return ``True`` if ``word`` is a key of ``stopward_dic``.

        Any other word is appended to ``stopwarded_list`` and ``False`` is
        returned.  The bookkeeping key "ALL" is also a key of the table.
        """
        if word in self.stopward_dic:
            return True
        # Keep the normal word.
        self.stopwarded_list.append(word)
        return False

    def word_dic_create(self, word, index):
        """Record ``index`` under ``word`` in ``word_dic``; return ``True``."""
        self.word_dic.setdefault(word, []).append(index)
        return True

    def minus_split(self):
        """Split hyphenated tokens of ``inittoken_list`` (marked incomplete).

        For each token containing "-", the list of its parts is appended to
        ``inittoken_list`` as one nested list and the token itself is
        replaced by its hyphen-free form.  Returns ``None``.
        """
        for voca_index in range(len(self.inittoken_list)):
            token = self.inittoken_list[voca_index]
            if "-" in token:
                self.inittoken_list.append(token.split("-"))
                self.inittoken_list[voca_index] = token.replace("-", "")
        return None

    def stopward_adding(self, new_ward_list):
        """Stem the given stop words and add them to the stop-word table.

        Returns ``0`` when every word was processed, and ``False`` (after
        printing a hint) when the argument is not a list or an element is not
        a string; words preceding the offending element stay added.
        """
        if not isinstance(new_ward_list, list):
            print("want a list. in stopward_adding")
            return False
        for stopward in new_ward_list:
            if not isinstance(stopward, str):
                print("want a list of string. in stopward_adding")
                return False
            # Stem, then add if not yet known.
            stemmed_stopward = self.poter.stem(stopward)
            if stemmed_stopward not in self.stopward_dic:
                self.stopward_list.append(stemmed_stopward)
                self.stopward_dic.update({stemmed_stopward: []})
        return 0

    def punctuation_adding(self, new_pun):
        """Placeholder: does nothing and returns ``0``."""
        return 0

    def save_result(self):
        """Write ``str(stopwarded_list)`` to ``R09725049_result.txt``."""
        with open("R09725049_result.txt", "w") as text_file:
            text_file.write(str(self.stopwarded_list))
        return "file saved"
# Drop every word listed in `freq` from each tweet.
train['tweet'] = train['tweet'].apply(
    lambda tweet: " ".join(tok for tok in tweet.split() if tok not in freq))
train['tweet'].head()

# Spelling correction
from textblob import TextBlob
train['tweet'][:5].apply(lambda tweet: str(TextBlob(tweet).correct()))

# Tokenization
TextBlob(train['tweet'][1]).words

# Stemming
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['tweet'][:5].apply(
    lambda tweet: " ".join(map(st.stem, tweet.split())))

# Lemmatization
from textblob import Word
train['tweet'] = train['tweet'].apply(
    lambda tweet: " ".join(Word(tok).lemmatize() for tok in tweet.split()))
train['tweet'].head()

# Advanced text processing
#
# N-grams
# N-grams are combinations of multiple words used together. N-grams with N=1
# are called unigrams; similarly, bigrams (N=2), trigrams (N=3) and so on can
# also be used. Unigrams usually do not contain as much information as bigrams
# and trigrams. The basic principle behind n-grams is that they capture the
# language structure, such as which letter or word is likely to follow a given
# one. The longer the n-gram (the higher the n), the more context you have to
# work with. The optimum length really depends on the application: if your
# n-grams are too short, you may fail to capture important differences; if
# they are too long, you may fail to capture the “general knowledge” and only
# stick to particular cases.
# Lower-case the text and strip ASCII punctuation.
input_str = input_str.lower()
input_str = input_str.translate(str.maketrans('', '', string.punctuation))
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
# Only `result` has the digits removed; the steps below keep using
# `input_str`.
result = re.sub(r"\d", "", input_str)
print(result)

from nltk.tokenize import word_tokenize
tokens = word_tokenize(input_str)
print(tokens)

from nltk import FreqDist
# Use the name imported on the line above rather than relying on a separate
# `import nltk` elsewhere in the file.
frequency_token = FreqDist(tokens)
print(frequency_token.most_common(10))

# From here on `input_str` is the token list, not the raw string.
input_str = word_tokenize(input_str)
for word in input_str:
    print(stemmer.stem(word))

from sklearn.feature_extraction.text import CountVectorizer


def bow_extractor(corpus, ngram_range=(1, 1)):
    """Fit a bag-of-words model on ``corpus``.

    Args:
        corpus: the documents handed to ``CountVectorizer.fit_transform``.
        ngram_range: ``(min_n, max_n)`` passed through to ``CountVectorizer``.

    Returns:
        ``(vectorizer, features)``: the fitted vectorizer and the result of
        ``fit_transform``.
    """
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features


bow_vectorizer, bow_features = bow_extractor(CORPUS)
features = bow_features.todense()
print(features)
# NOTE(review): scikit-learn 1.2 removed `get_feature_names` in favour of
# `get_feature_names_out` -- confirm which release this project targets.
feature_names = bow_vectorizer.get_feature_names()
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# Print the stem of each word in a small word list.
words = ['walk', 'walking', 'walked', 'walks']
for stemmed_word in map(ps.stem, words):
    print(stemmed_word)

# Print the stem of every token of a short text.
sentence = "I walked in a park. The weather was good. I saw some children playing football. And a dog was chasing a cat."
tokens = word_tokenize(sentence)
for stemmed_token in map(ps.stem, tokens):
    print(stemmed_token)
class Preprocessor():
    """Cleans parsed text entries and derives year-span labels from them.

    ``entries`` come from ``Parser.parse``; the methods below read each
    entry's ``body`` and its ``textF`` / ``textC`` / ``textM`` attributes.
    """

    def __init__(self, *filenames):
        self.entries = Parser.parse(*filenames)
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.nchars = {}  # character n-gram -> count, filled by getNChars
        self.docsLength = np.empty(len(self.entries))
        self.min_lower_year = None
        self.min_upper_year = None
        self.max_lower_year = None
        self.max_upper_year = None

    def __clean_text(self, text):
        """Return ``text`` lower-cased, without stop words, and stemmed."""
        words = self.__extract_only_words(text).lower().split()
        return " ".join(
            self.stemmer.stem(word) for word in words
            if word not in self.stop_words)

    # used in data analyzer
    def clean_entries(self):
        """Replace every entry's ``body`` by its cleaned text, in place."""
        for entry in self.entries:
            entry.body = self.__clean_text(entry.body)

    # used in main funcs
    def get_clean_data(self):
        """Return the cleaned text of every entry, leaving entries untouched.

        Returns:
            A list with one space-separated string of stems per entry.
        """
        # Document-length normalisation is not done here (``docsLength`` is
        # not filled in by this method).
        return [self.__clean_text(entry.body) for entry in self.entries]

    def get_raw_words(self):
        """Return each lower-cased body with non-alphanumerics blanked out."""
        return [
            self.__extract_only_words(entry.body.lower())
            for entry in self.entries
        ]

    def __extract_only_words(self, entry_text):
        """Replace every character that is not an ASCII letter or digit
        with a space."""
        return re.sub("[^a-zA-Z0-9]", " ", entry_text)

    def __calcNChars(self, words, sizes):
        """Add the character n-grams of ``words`` (for each n in ``sizes``)
        to the running counts in ``self.nchars``."""
        for word in words:
            for size in sizes:
                for start in range(len(word) - size + 1):
                    nchar = word[start:start + size]
                    self.nchars[nchar] = self.nchars.get(nchar, 0) + 1

    def getNChars(self, items, sizes=(2, 3), freq=1):
        """Return the character n-grams occurring more than ``freq`` times.

        Args:
            items: not used; kept so existing callers keep working.
            sizes: n-gram lengths to extract.
            freq: exclusive lower bound on the count of a returned n-gram.

        The counts accumulate in ``self.nchars`` across calls.
        """
        for entry in self.entries:
            only_words = self.__extract_only_words(entry.body)
            self.__calcNChars(only_words.split(), sizes=sizes)
        return [
            nchar for nchar, nchar_freq in self.nchars.items()
            if nchar_freq > freq
        ]

    def labels_for_years(self, year_type):
        """Return ``(lower_labels, upper_labels)`` taken from each text
        period's ``yes_time_span()``."""
        labels_lower = []
        labels_upper = []
        for text_period in self.__get_text_periods(year_type):
            time_span = text_period.yes_time_span()
            labels_lower.append(time_span[LOWER])
            labels_upper.append(time_span[UPPER])
        return labels_lower, labels_upper

    def labels_for_years_norm(self, year_type):
        """Like :meth:`labels_for_years`, but each span is first snapped to
        the generated span it overlaps most."""
        text_periods = self.__get_text_periods(year_type)
        labels_lower = []
        labels_upper = []
        time_span_length = self.__get_time_span_length(text_periods)
        custom_time_spans = self.__generate_custom_time_spans(time_span_length)
        for text_period in text_periods:
            chosen_time_span = self.__find_starting_year(
                text_period.yes_time_span(), custom_time_spans)
            labels_lower.append(chosen_time_span[LOWER])
            labels_upper.append(chosen_time_span[UPPER])
        return labels_lower, labels_upper

    def __find_starting_year(self, time_span, custom_time_spans):
        """Return the custom span with the largest overlap with
        ``time_span`` (the first one on ties)."""
        # Overlap in years; negative when the two spans are disjoint.
        intersecs = [
            min(time_span[UPPER], custom[UPPER]) -
            max(time_span[LOWER], custom[LOWER])
            for custom in custom_time_spans
        ]
        return custom_time_spans[np.argmax(intersecs)]

    def __generate_custom_time_spans(self, time_span_length):
        """Return consecutive ``(start, start + length)`` spans whose start
        runs from 1700 up to at most 2012."""
        start = 1700
        spans = []
        while start <= 2012:
            spans.append((start, start + time_span_length))
            start += time_span_length + 1
        return spans

    def __get_time_span_length(self, text_periods):
        """Return the length of the first text period's time span."""
        time_span = text_periods[0].yes_time_span()
        return time_span[1] - time_span[0]

    def __get_text_periods(self, year_type):
        """Return the ``textF`` / ``textC`` / ``textM`` attribute of every
        entry for ``year_type`` "F" / "C" / "M"; ``None`` for anything else.
        """
        # Compare by value: `is` on a string literal tests object identity.
        if year_type == "F":
            return [entry.textF for entry in self.entries]
        if year_type == "C":
            return [entry.textC for entry in self.entries]
        if year_type == "M":
            return [entry.textM for entry in self.entries]
        return None
if dataset['cEXT'][i] == 'n' and dataset['cNEU'][i] == 'n' and dataset['cAGR'][i] == 'n'and dataset['cCON'][i] == 'n' and dataset['cOPN'][i] == 'n': indices.append(i) dataset.drop(dataset.index[indices], inplace=True) dataset = dataset.reset_index(drop=True) all_essays = [] for i in range(0, len(dataset['TEXT'])): essay = re.sub('a-zA-Z', ' ', dataset['TEXT'][i]) essay = essay.lower() essay = essay.split() ps = PorterStemmer() wnl = WordNetLemmatizer() essay = [wnl.lemmatize(word) if wnl.lemmatize(word).endswith('e') else ps.stem(word) for word in essay if not word in set(stopwords.words())] essay = ' '.join(essay) all_essays.append(essay) print("Done " + str(i)) with open("essaysfinal", "wb") as fp: pickle.dump(all_essays, fp) complete_ds = [] y_req = [] y = dataset.iloc[:, 2:7].values for d in range(0, len(y)): for i in range(0, len(y[0])): if y[d][i] == 'y':
class PythonRouge(ReferenceBasedMetric):
    """Pure-Python ROUGE-N / ROUGE-L scorer.

    Summaries are normalised (non-alphanumerics blanked, lower-cased,
    optionally stop-word filtered and Porter-stemmed with the ROUGE exception
    lists) and scored against one or more references.  The stemmer exceptions
    and the stop-word list are loaded from ``rouge_data_dir``.
    """

    _non_alphanumeric_regex = re.compile('[^A-Za-z0-9]')

    def __init__(
            self,
            ngram_orders: List[int] = [1, 2],
            max_sentences: Optional[int] = None,
            max_words: Optional[int] = None,
            max_bytes: Optional[int] = None,
            use_porter_stemmer: bool = True,
            remove_stopwords: bool = False,
            compute_rouge_l: bool = False,
            rouge_data_dir: str = f'{DATA_ROOT}/metrics/ROUGE-1.5.5/data'):
        """Store the options and load the ROUGE data files.

        Raises:
            Exception: if ``rouge_data_dir`` does not exist.
        """
        super().__init__()
        # Copy, so the shared default list (or a caller's list) is never
        # aliased by this instance.
        self.ngram_orders = list(ngram_orders)
        self.max_sentences = max_sentences
        self.max_words = max_words
        self.max_bytes = max_bytes
        self.use_porter_stemmer = use_porter_stemmer
        self.remove_stopwords = remove_stopwords
        self.compute_rouge_l = compute_rouge_l

        if not os.path.exists(rouge_data_dir):
            raise Exception(
                f'Path "{rouge_data_dir}" does not exist. PythonRouge requires data files from ROUGE. '
                f'Have you setup ROUGE?')

        self.stemmer = PorterStemmer(PorterStemmer.ORIGINAL_ALGORITHM)
        self.stemmer_exceptions = self._load_stemmer_exceptions(rouge_data_dir)
        self.stopwords = self._load_stopwords(rouge_data_dir)

    def _load_stemmer_exceptions(self, root: str) -> Dict[str, str]:
        """Load the word -> replacement map from the WordNet exception files
        under ``root``."""
        exceptions = {}
        for filename in ['adj.exc', 'adv.exc', 'noun.exc', 'verb.exc']:
            file_path = os.path.join(root, 'WordNet-2.0-Exceptions', filename)
            with open(file_path, 'r') as f:
                for line in f:
                    # I think there is a bug in the original perl script
                    # to construct the exceptions database. Some of the lines
                    # have more than 2 words on them, but the script only
                    # maps the first to the second, ignoring the third.
                    columns = line.strip().split()
                    exceptions[columns[0]] = columns[1]
        return exceptions

    def _load_stopwords(self, root: str) -> Set[str]:
        """Load ``smart_common_words.txt`` under ``root`` as a set of lines."""
        file_path = os.path.join(root, 'smart_common_words.txt')
        # `with` closes the handle; it used to be left open.
        with open(file_path, 'r') as f:
            return set(f.read().splitlines())

    def normalize_and_tokenize_sentence(self, sentence: str) -> List[str]:
        """Return the normalised tokens of ``sentence``.

        Non-alphanumeric characters become spaces and the text is
        lower-cased.  Stop words are skipped when ``remove_stopwords`` is
        set.  When ``use_porter_stemmer`` is set, tokens longer than three
        characters are replaced by their exception entry if there is one,
        otherwise by their Porter stem.
        """
        sentence = PythonRouge._non_alphanumeric_regex.sub(' ', sentence)
        sentence = sentence.lower()

        tokens = []
        for token in sentence.split():
            if self.remove_stopwords and token in self.stopwords:
                continue
            if self.use_porter_stemmer and len(token) > 3:
                if token in self.stemmer_exceptions:
                    tokens.append(self.stemmer_exceptions[token])
                else:
                    tokens.append(self.stemmer.stem(token))
            else:
                tokens.append(token)
        return tokens

    def _normalize_and_tokenize_summary(
            self, summary: List[str]) -> List[List[str]]:
        """Tokenise every sentence of ``summary``."""
        return [
            self.normalize_and_tokenize_sentence(sentence)
            for sentence in summary
        ]

    def preprocess_summary(self, summary: SummaryType) -> List[List[str]]:
        """Shorten ``summary`` to the configured limits, then tokenise it."""
        summary = shorten_summary(summary, self.max_sentences, self.max_words,
                                  self.max_bytes)
        summary = self._normalize_and_tokenize_summary(summary)
        return summary

    def _count_ngrams(self, summary: SummaryType, n: int) -> Counter:
        """Count the space-joined ``n``-grams over all tokens of ``summary``.

        The sentences are flattened first, so n-grams may span sentence
        boundaries.
        """
        counts = Counter()
        if isinstance(summary, str):
            summary = [summary]
        tokens = [token for sentence in summary for token in sentence]
        for i in range(len(tokens) - n + 1):
            ngram = ' '.join(tokens[i:i + n])
            counts[ngram] += 1
        return counts

    def _calculate_intersection(
            self, reference_counts: Counter,
            summary_counts: Counter) -> Tuple[int, int, int]:
        """Return ``(reference_total, summary_total, intersection)`` where
        ``intersection`` is the clipped n-gram overlap."""
        reference_total = sum(reference_counts.values())
        summary_total = sum(summary_counts.values())
        intersection = 0
        for ngram in summary_counts:
            intersection += min(summary_counts[ngram], reference_counts[ngram])
        return reference_total, summary_total, intersection

    def _calculate_pr_f1(self, reference_total: int, summary_total: int,
                         intersection: int) -> Tuple[float, float, float]:
        """Return ``(precision, recall, f1)`` as percentages; a ratio whose
        denominator is zero is reported as ``0.0``."""
        precision = 0.0
        if summary_total != 0.0:
            precision = intersection / summary_total * 100
        recall = 0.0
        if reference_total != 0.0:
            recall = intersection / reference_total * 100
        if precision + recall == 0:
            f1 = 0.0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        return precision, recall, f1

    def _longest_common_substring(self, tokens1: List[str],
                                  tokens2: List[str],
                                  hit_mask: List[int]) -> None:
        """Mark in ``hit_mask`` the positions of ``tokens1`` that lie on a
        longest common subsequence with ``tokens2``.

        Despite the method name, the table below is the common-subsequence
        recurrence (matches need not be contiguous).  ``hit_mask`` is updated
        in place; nothing is returned.

        Raises:
            Exception: if the back-pointer table holds an unknown marker.
        """
        m, n = len(tokens1), len(tokens2)
        counter = [[0] * (n + 1) for _ in range(m + 1)]
        pointers = [[None] * (n + 1) for _ in range(m + 1)]

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if tokens1[i - 1] == tokens2[j - 1]:
                    counter[i][j] = counter[i - 1][j - 1] + 1
                    pointers[i][j] = '\\'
                elif counter[i - 1][j] >= counter[i][j - 1]:
                    counter[i][j] = counter[i - 1][j]
                    pointers[i][j] = '^'
                else:
                    counter[i][j] = counter[i][j - 1]
                    pointers[i][j] = '<'

        # Walk the back-pointers from the bottom-right corner and mark
        # every matched position of tokens1.
        i, j = m, n
        while i != 0 and j != 0:
            if pointers[i][j] == '\\':
                i -= 1
                j -= 1
                hit_mask[i] = 1
            elif pointers[i][j] == '^':
                i -= 1
            elif pointers[i][j] == '<':
                j -= 1
            else:
                raise Exception(f'Unknown pointer: {pointers[i][j]}')

    def _calculate_rouge_l(self, references: List[SummaryType],
                           summary: SummaryType):
        """Return ROUGE-L ``(precision, recall, f1)`` as percentages for
        ``summary`` against all ``references``."""
        model_unigrams = self._count_ngrams(summary, 1)
        num_model_unigrams = sum(count for count in model_unigrams.values())

        if isinstance(summary, str):
            summary = [summary]
        references = [[reference] if isinstance(reference, str) else reference
                      for reference in references]

        total_hit = 0
        total_base = 0
        for reference in references:
            temp_model_unigrams = Counter(model_unigrams)
            gold_unigrams = self._count_ngrams(reference, 1)

            hit, base = 0, 0
            for ref_sentence in reference:
                hit_mask = [0] * len(ref_sentence)
                base += len(ref_sentence)
                # Union of the matches against every summary sentence.
                for model_sentence in summary:
                    self._longest_common_substring(ref_sentence,
                                                   model_sentence, hit_mask)

                for i, token in enumerate(ref_sentence):
                    if hit_mask[i] != 1:
                        continue
                    # Both tables are Counters, so a missing token reads as
                    # 0 and no KeyError handling is needed.  Each hit uses up
                    # one occurrence on both sides.
                    if temp_model_unigrams[token] > 0 and gold_unigrams[
                            token] > 0:
                        hit += 1
                        temp_model_unigrams[token] -= 1
                        gold_unigrams[token] -= 1

            total_hit += hit
            total_base += base

        precision = 0.0
        if (num_model_unigrams * len(references)) != 0.0:
            precision = total_hit / (num_model_unigrams *
                                     len(references)) * 100
        recall = 0.0
        if total_base != 0.0:
            recall = total_hit / total_base * 100
        if (precision + recall) != 0.0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0
        return precision, recall, f1

    def score_multi_all(
        self, summaries_list: List[List[SummaryType]],
        references_list: List[List[ReferenceType]]
    ) -> List[List[MetricsDict]]:
        """Score every summary of every group against that group's references.

        Returns:
            One list per group with one ``MetricsDict`` per summary, holding
            ``python-rouge-{n}`` for each configured order and, when
            ``compute_rouge_l`` is set, ``python-rouge-l``.
        """
        summaries_list = [[
            self.preprocess_summary(summary) for summary in summaries
        ] for summaries in summaries_list]
        references_list = [[
            self.preprocess_summary(reference) for reference in references
        ] for references in references_list]

        metrics_lists = []
        for summaries, references in zip(summaries_list, references_list):
            metrics_list = [MetricsDict() for _ in summaries]

            for n in self.ngram_orders:
                reference_ngrams_list = [
                    self._count_ngrams(reference, n)
                    for reference in references
                ]

                for i, summary in enumerate(summaries):
                    total_reference_count = 0
                    total_summary_count = 0
                    total_intersection = 0

                    summary_ngrams = self._count_ngrams(summary, n)
                    # Totals are summed over all references before the
                    # ratios are taken.
                    for reference_ngrams in reference_ngrams_list:
                        reference_total, summary_total, intersection = \
                            self._calculate_intersection(
                                reference_ngrams, summary_ngrams)

                        total_reference_count += reference_total
                        total_summary_count += summary_total
                        total_intersection += intersection

                    precision, recall, f1 = self._calculate_pr_f1(
                        total_reference_count, total_summary_count,
                        total_intersection)
                    metrics_list[i][f'python-rouge-{n}'] = {
                        'precision': precision,
                        'recall': recall,
                        'f1': f1,
                    }

            if self.compute_rouge_l:
                for i, summary in enumerate(summaries):
                    precision, recall, f1 = self._calculate_rouge_l(
                        references, summary)
                    metrics_list[i]['python-rouge-l'] = {
                        'precision': precision,
                        'recall': recall,
                        'f1': f1
                    }

            metrics_lists.append(metrics_list)
        return metrics_lists
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Print the Porter stem of 'happiness'.
stemmer_output = PorterStemmer()
stemmed = stemmer_output.stem('happiness')
print(stemmed)

# Print the WordNet lemma of 'happiness'.
lemmatizer_output = WordNetLemmatizer()
lemma = lemmatizer_output.lemmatize('happiness')
print(lemma)
def stemming_text(tokenized_text):
    """Return a list with the Porter stem of each item of
    ``tokenized_text``, in order."""
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokenized_text]
# This statement closes a filtering loop whose header lies above this excerpt.
tweets_clean.append(word)

print('removed stop words and punctuation:')
print(tweets_clean)

print('\033[92m')
print(tweets_clean)
print('\033[94m')

# Instantiate stemming class
stemmer = PorterStemmer()

# Stem of every cleaned word, in order.
tweets_stem = [stemmer.stem(clean_word) for clean_word in tweets_clean]

print('stemmed words:')
print(tweets_stem)


def process_tweet(tweet):
    """Process tweet function.

    Args:
        tweet: a string containing a tweet.

    Returns:
        tweets_clean: a list of words containing the processed tweet.
    """
    # The rest of this function lies beyond this excerpt.
    stemmer = PorterStemmer()
def run_cleaner0():
    """Tokenise, stem and POS-tag every raw speech and store the words.

    For each distinct ``filename`` in ``iwords.raw`` the rows are loaded,
    each row's text is split into ``\\w+`` tokens, stemmed, filtered against
    the English stop words and POS-tagged (universal tagset); every resulting
    ``(word, pos)`` pair is inserted into ``iwords.clean0``.  Consecutive
    words of a file get timestamps 0.26 seconds apart, starting at the
    file's ``speech_dt``.
    """
    party_aff = [('obama', 'D'), ('clinton', 'D'), ('bush', 'R'),
                 ('gwbush', 'R')]

    # Loop invariants, built once instead of once per text row.  The stop
    # words are a set so the membership test does not scan a list.
    sw_en = set(stopwords.words('english'))
    stemming = PorterStemmer()
    pattern = r"\w+"

    file_nms_rows = session.execute(
        "SELECT DISTINCT filename FROM iwords.raw")
    file_nms = [row.filename for row in file_nms_rows]

    for fnm in file_nms:
        # NOTE(review): this statement and the INSERT below are built by
        # string formatting, so a value containing a quote breaks (or
        # alters) the statement -- switch to the driver's bound parameters
        # once the driver behind `session` is confirmed.
        sub_stmt = """SELECT * FROM iwords.raw WHERE filename = '{}';""".format(
            fnm)
        tmp_sub = session.execute(sub_stmt)

        # Pull each row and organise the data.
        data = [{
            'filename': row.filename,
            'line_num': row.line_num,
            'doc_num': row.doc_num,
            'pres': row.pres,
            'speech_title': row.speech_title,
            'speech_dt': row.speech_dt,
            'in_office': row.in_office,
            'text': row.text
        } for row in tmp_sub]

        # Per-file values are taken from the first row.
        df0 = pd.DataFrame(data)
        time_input = df0['speech_dt'][0]
        # NOTE(review): when speech_dt is missing, `talk_time` keeps the
        # previous file's value (or is undefined for the first file) --
        # confirm the intended handling.
        if time_input is not None:
            talk_time = pendulum.parse(str(time_input))
        pres = df0['pres'][0]
        title = df0['speech_title'][0]
        prty = [p for n, p in party_aff if n == pres][0]
        io_val = df0['in_office'][0]

        for txt_str in df0['text']:
            arr0 = regexp_tokenize(txt_str, pattern)
            # Stop words are filtered after stemming, i.e. the stems are
            # compared against the unstemmed stop-word list.
            arr2 = [
                stem for stem in map(stemming.stem, arr0)
                if stem not in sw_en
            ]
            pos0 = nltk.pos_tag(arr2, tagset='universal', lang='eng')
            for word, pos in pos0:
                input_time = str(talk_time)
                if len(input_time) > 30:
                    # Keep the first 23 characters and append a UTC offset.
                    input_time = str(talk_time)[0:23] + "+00:00"
                else:
                    input_time = talk_time
                stmt = (
                    "INSERT INTO iwords.clean0 (filename, talk_time, pres, "
                    "party, in_office, word, pos) VALUES "
                    "('{}', '{}', '{}', '{}', {}, '{}', '{}');").format(
                        fnm, input_time, pres, prty, io_val, word, pos)
                session.execute(stmt)
                talk_time = talk_time.add(seconds=0.26)
# example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)

# Tokens of `example_sent` that are not English stop words.
filtered_sentence = [tok for tok in word_tokens if tok not in stop_words]

# tokenizing
print(sent_tokenize(example_sent))
print("----------------hasil tokenizing--------------")
print(word_tokenize(example_sent))

# filtering
print("----------------hasil filtering--------------")
# print(word_tokens)
print(filtered_sentence)

print("----------------hasil stemming--------------")
# stemming
words = word_tokenize(example_sent)
for w in filtered_sentence:
    stemmed = ps.stem(w)
    print(stemmed)
for docid, termid in forward_index.keys(): if doc_id == str(docid): distinct_terms += 1 total_terms += len(forward_index[(docid, termid)]) i += 1 return total_terms load_term_ids() load_doc_ids() if (len(sys.argv) == 3): if (sys.argv[1] == '--term'): term = sys.argv[2] stemmer = PorterStemmer() stemmed = stemmer.stem(term) if termids.has_key(stemmed): load_term_info() term_id = termids[stemmed] print 'Listing for term: ' + term show_term_info(term_id) else: print "List for " + term + " not present." elif (sys.argv[1] == '--doc'): doc_name = sys.argv[2] if docids.has_key(doc_name): load_doc_info() doc_id = docids[doc_name] print 'Listing for doc: ' + doc_name show_doc_info(doc_id) else:
def get_stem(word):
    """Return the Porter stem of ``word``."""
    stemmer = PorterStemmer()
    stemmed = stemmer.stem(word)
    return stemmed
import nltk
from nltk.stem import PorterStemmer

paragraph = "John does his work intelligently. John is an intelligent man. John is always working"

# Despite its name, `sentences` holds the word tokens of the paragraph.
sentences = nltk.word_tokenize(paragraph)
stemmer = PorterStemmer()  # Create Object

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(sentences)
for w in sentences:
    # Apply the format string with `%`; previously the format string and its
    # arguments were printed as separate items, so "%s" was never filled in.
    print("Actual %s || Stem: %s" % (w, stemmer.stem(w)))
takeOutStopWords = [] for txt in tokenized: temp=[] for word in txt: if word.lower() not in stop_words: temp.append(word) takeOutStopWords.append(temp) #apply stemming techniques to find the words’ roots #PorterStemmer PorterStemmer=[] sentence=[] for txt in takeOutStopWords: for word in txt: sentence.append(ps.stem(word)) PorterStemmer.append(sentence) #SnowballStemmer SnowballStemmer=[] sentence=[] for txt in takeOutStopWords: for word in txt: sentence.append(ss.stem(word)) SnowballStemmer.append(sentence) #list of all words words=[] for txt in PorterStemmer: for word in txt:
# Join all documents into one string, keep only letters and spaces, and
# lower-case the result.
corpus = " ".join(new_doc)
import re
corpus = re.sub("[^A-Za-z ]+", "", corpus).lower()

# Drop a small custom stop-word list.
stop_words2 = ["rt", "the", "today", "we", "i", "so", "space"]
corpus = " ".join(w for w in corpus.split(" ") if w not in stop_words2)

# Porter-stem every remaining word.
from nltk.stem import PorterStemmer
lst = PorterStemmer()
corpus = " ".join(map(lst.stem, corpus.split(" ")))

# Count matrix with one row per word of the corpus.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
count_vect = vect.fit_transform(corpus.split(" "))
count_vect.shape
names = vect.get_feature_names()
report = pd.DataFrame(count_vect.toarray(), columns=names)

# Total count per feature (column sums).
file = {column: report[column].sum() for column in report.columns}
print()
print("Stemmers")
ps = PorterStemmer()
ls = LancasterStemmer()
ss = SnowballStemmer("english")

print("Languages Supported By Snowball Stemmer")
for language in ss.languages:
    print(language)
print()

# Table with the three stems of every word.
print(stem_format.format('Input', *stemmers))
print(stem_format.format('=' * 10, '=' * 16, '=' * 16, '=' * 16))
# [print(stem_format.format(x,ps.stem(x),ls.stem(x),ss.stem(x))) for x in wpt if len(x) > 1]
for word in words:
    print(stem_format.format(word, ps.stem(word), ls.stem(word),
                             ss.stem(word)))
print()

print("Stopwords Finder")
stop_words = set(stopwords.words('english'))
for token in wpt:
    if token in stop_words:
        print(token)
print()

print("Lemmatizers")
wnl = WordNetLemmatizer()
print(lemma_format.format('Input', *lemmatizers))
print(lemma_format.format('=' * 10, '=' * 25, '=' * 25, '=' * 25))
# [print(lemma_format.format(x,wnl.lemmatize(x),wnl.lemmatize(x,pos="v"))) for x in wpt if len(x) > 1]
import urllib3
# Silences urllib3's InsecureRequestWarning for the request below.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup

# Download the article and join the text of its <div class="Normal"> blocks.
http = urllib3.PoolManager()
response = http.request(
    'GET',
    "https://timesofindia.indiatimes.com/india/rbi-governor-met-pm-modi-fm-jaitley-last-week-in-bid-to-heal-rift/articleshow/66597760.cms"
)
soup = BeautifulSoup(response.data, 'html.parser')
Text = ". ".join([p.text for p in soup.find_all('div', {'class': 'Normal'})])
print(Text)

psm = PorterStemmer()
st = LancasterStemmer()
lmtzr = WordNetLemmatizer()

# Load the stop-word list once; it used to be reloaded for every token
# inside the comprehension below.
english_stopwords = stopwords.words('english')
english_stopword_set = set(english_stopwords)

# Lower-case each token, drop stop words, stem the rest.
paras_stemmedSansStopWords = [
    psm.stem(token)
    for token in (word.lower() for word in word_tokenize(Text))
    if token not in english_stopword_set
]
print(paras_stemmedSansStopWords)

stemmed_para = " ".join(paras_stemmedSansStopWords)
stemmed_para

# Second pass: additionally drop punctuation and a few quote characters.
custom = set(english_stopwords + list(punctuation) + ["'", '"', "“", '’'])
stemmed_words = [
    word for word in paras_stemmedSansStopWords if word not in custom
]
print(stemmed_words)
print(english_stopwords)
from collections import Counter
# Split the text into words (tokenised once; the result is reused below).
tokens = word_tokenize(text)
# Tokens that are neither stop words nor punctuation.
stop_text = [
    token for token in tokens
    if token not in download_stopwords and token not in punktuation
]
# Split the text into sentences.
tok_sent = sent_tokenize(text)

# Porter stemmer: keep every non-empty stem.
porter = PorterStemmer()
stemsPorter = [stem for stem in map(porter.stem, tokens) if stem != ""]

# Snowball stemmer (Russian): keep non-empty stems that are not punctuation.
stemmer = SnowballStemmer("russian")
stems = [
    stem for stem in map(stemmer.stem, tokens)
    if stem != "" and stem not in punktuation
]

# Interleave the space-split words with the stem at the same position.
# NOTE(review): `stems` comes from word_tokenize and has punctuation removed,
# so its length need not match `text_split`; `stems[i]` can raise IndexError
# or pair a word with the wrong stem -- confirm the intended alignment.
result = []
text_split = text.split(" ")
for i, piece in enumerate(text_split):
    result.append(piece)
    if stems[i] not in punktuation:
        result.append(stems[i])
def my_fun(self):
    """Train a linear SVM on TF-IDF features and predict the test rows.

    Reads the CSV files at ``self.train_path`` and ``self.test_path``; the
    text is taken from column 1 of both files and the labels from column 2
    of the training file.  Each text is reduced to its letters, words of at
    most three characters are dropped and the rest is Porter-stemmed before
    the TF-IDF vectoriser is fitted on the training and test texts together.

    Returns:
        The predictions of the classifier for the test rows.
    """
    ds1 = pd.read_csv(self.train_path)
    train_labels = ds1.iloc[:, 2]
    train_row = ds1.shape[0]
    ds2 = pd.read_csv(self.test_path)

    corpus1 = ds1.iloc[:, 1].to_numpy()
    corpus2 = ds2.iloc[:, 1].to_numpy()
    corpus = np.concatenate([corpus1, corpus2], axis=0)

    # Built once instead of once per document.
    non_letters = re.compile('[^a-zA-Z]')
    stemmer = PorterStemmer()

    for i in range(len(corpus)):
        # 1. Replace everything that is not an ASCII letter by a space.
        letters_only = non_letters.sub(' ', corpus[i])
        # 2. Keep only the words longer than three characters.
        long_words = "".join(" " + word for word in letters_only.split(' ')
                             if len(word) > 3)
        # 3. Stem every remaining token.  Each piece keeps the leading
        #    space the string concatenation used to produce.
        corpus[i] = "".join(" " + stemmer.stem(token)
                            for token in word_tokenize(long_words))

    my_stop_words = text.ENGLISH_STOP_WORDS
    vectorizer = TfidfVectorizer(stop_words=my_stop_words)
    X = vectorizer.fit_transform(corpus)
    X = X.toarray()

    # The first `train_row` rows belong to the training file.
    train_data = X[:train_row]
    test_data = X[train_row:]

    clf = svm.SVC(kernel='linear', C=1)
    clf.fit(train_data, train_labels)
    prediction = clf.predict(test_data)
    return prediction
# Hide the axis values:
plt.axis("off")
plt.show()

# Stemming example (Porter stemmer):
# Import the stemming library:
from nltk.stem import PorterStemmer
porter = PorterStemmer()
# Word list for stemming:
word_list = ["Study", "Studying", "Studies", "Studied"]
for stemmed in map(porter.stem, word_list):
    print(stemmed)

# Stemming example (Snowball stemmer):
# Import the stemming library:
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer("english")
# Word list for stemming:
word_list = ["Study", "Studying", "Studies", "Studied"]
for stemmed in map(snowball.stem, word_list):
    print(stemmed)

# Stemming example:
def stemming_tweets(self, tweet):
    """Return ``tweet`` after one pass through a fresh ``PorterStemmer``.

    The argument is handed to ``PorterStemmer.stem`` unchanged, in a single
    call; no tokenisation happens here.

    NOTE(review): if callers pass a whole multi-word tweet rather than a
    single token, the stemmer presumably treats it as one word -- confirm
    against the call sites.
    """
    return PorterStemmer().stem(tweet)
if elem in m: return elem return 'no problemo' # In[96]: data['final_Combined'] = data['final_Combined'].apply(strip_space) # In[97]: ps = PorterStemmer() data['final_Combined']= data['final_Combined'].apply(lambda x: [ps.stem(elem) for elem in x] ) flag_words = [ps.stem(elem) for elem in flag_words] # In[98]: data['type']=data['final_Combined'].apply(check_flag_words) data = data[data['type']!='no problemo'] # data.groupby('type').size().plot.bar() # In[100]: data.groupby('score').size()
def stemData(word):
    """Return the Porter stem of ``word``, using a newly created stemmer."""
    return PorterStemmer().stem(word)
class SearchEngine: def __init__(self, file): self.file = file self.stopwords = set(stopwords.words('english')) self.index = defaultdict(lambda: defaultdict(int)) self.tokenizer = RegexpTokenizer(r'\w+') self.stemmer = PorterStemmer() self.totalDocs = 0 self.results = defaultdict(float) def get_files(self): try: with open(self.file,encoding='utf-8') as json_file: data = json.load(json_file) for key in data: if key == "39/373": continue if len(data[key]) <= 300: print(key) self.find_text(key,data[key]) except Exception as e: print(e) def create_tokens(self,words): tokens = [] for word in words: word = word.lower() if word not in self.stopwords and len(word) <= 40 and len(word) > 1 and (re.match("^[a-z]+$",word) or re.match("^[0-9]+$",word)): tokens.append(self.stemmer.stem(word)) return tokens def find_text(self,path,url): frequencies = defaultdict(int) self.totalDocs += 1 soup = BeautifulSoup(open("WEBPAGES_RAW/"+path), "lxml") headers = [] #tokenized list for headerWords in soup.find_all(['h1','h2','h3','b','strong']): headers += self.create_tokens(self.tokenizer.tokenize(headerWords.text)) content = [] #TOTAL VISIBLE CONTENT ON PAGE, tokenized list for b in soup.find_all('body'): content += self.create_tokens(self.tokenizer.tokenize(b.text)) #adding to index... 
#form is word: url: tfidf*weight for c in content: if c in headers: #more weight self.index[c][url] = 1.5 else: self.index[c][url] = 1.0 frequencies[c] += 1 #adding term frequency total = len(content) for (word,frequency) in frequencies.items(): entry = self.index[word][url] self.index[word][url] = (frequency / total) * entry def query(self,query): query = self.create_tokens(self.tokenizer.tokenize(query)) return query def insertIDF(self): for (word, urls) in self.index.items(): for (url, docinfo) in urls.items(): entry = docinfo tfidf = entry * math.log10((self.totalDocs + .001)/(len(urls) + .001)) #accounts for dividing by 0 self.index[word][url] = tfidf def insertDB(self): for x in self.index: collection.insert({"token": str(x), "value": self.index[x]}, check_keys=False) def run(self): self.get_files() #run on all files in corpus here self.insertIDF() print("length: ", len(self.index)) print("numbers of documents: ", self.totalDocs) collection.remove() # Clears the existing database before creation self.insertDB() # Inserts index in to db def search(self,query): #while True: #Retrieves Query from DB self.results = defaultdict(float) #tempInput = str(input("Input Query: ")) tempInput = query if tempInput == "quit": #break return query = self.query(tempInput) if len(query) > 1: for q in query: temp = {"token": q} found = collection.find_one(temp) try: values = found["value"] for v,k in values.items(): #v is url, k is tfidf if v not in self.results: self.results[v] = k else: #multiple words in same document self.results[v] += k except Exception as e: continue else: temp = {"token": query[0]} found = collection.find_one(temp) try: values = found["value"] self.results.update(values) except Exception as e: #continue return '''print("Showing 20 results out of ", len(self.results))