def __init__(self, weight_gpop=0, stemm=True, stemmer='porter', tokenize=True, clean=True,
             synonyms=False, fuzzy=True, fuzz_thres=0, add_artists=False, add_albums=False,
             return_num_predictions=500):
    '''
    Constructor
    '''
    self.weight_gpop = weight_gpop
    self.return_num_predictions = return_num_predictions
    self.add_artists = add_artists
    self.add_albums = add_albums
    self.stemm = stemm
    self.tokenize = tokenize
    self.clean = clean
    self.fuzzy = fuzzy
    self.fuzz_thres = fuzz_thres

    if stemmer == 'wn':
        self.stemmer = stem.WordNetLemmatizer()
    elif stemmer == 'porter':
        self.stemmer = stem.PorterStemmer()
    elif stemmer == 'snowball':
        self.stemmer = stem.SnowballStemmer('english')
    self.stemmers = stemmer

    self.synonyms = synonyms

def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles an already-instantiated StemmerI stemmer
    # (a lemmatizer would not reproduce the stemmed target above)
    stemmer = nls.SnowballStemmer('english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

def get_training_features(text_file):
    '''
    Takes a compiled text file of utterances from oral arguments and returns a
    list of tuples [(feature_dict, label)]. It ignores which Justice is
    speaking, using them all as training data.
    '''
    tokenizer = RegexpTokenizer(r'\b\w\w+\b')
    stemmer = stem.snowball.EnglishStemmer()
    wnl = stem.WordNetLemmatizer()
    stopper = stopwords.words('english')

    features = []
    with open(text_file, 'r') as f:
        for line in f:
            sl = line.strip().split(' +++$+++ ')

            ## Skip things the lawyers say
            if sl[4] == 'NOT JUSTICE':
                continue

            vote = sl[5]
            presenter = sl[6]

            ## Skip the phrase if the justice did not vote ('NA') or it was an announcement ('')
            if vote == 'NA' or presenter == '':
                #print('*'*30, line, vote)
                continue

            ## Determine its polarity (for or against)
            polarity = 'pos' if vote == presenter else 'neg'
            phrase = sl[7]
            #print(vote, presenter, polarity)

            ## phrase --> cleaned list of words --> feature dictionary
            #word_feature_dict = extract_word_features(phrase, tokenizer, stemmer, stopper)
            word_feature_dict = extract_word_features2(phrase, tokenizer, wnl, stopper)

            ## Create a tuple of (dict, polarity) and add it to the list of tuples
            features.append((word_feature_dict, polarity))

    return features

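# A minimal usage sketch (not part of the original source): the (feature_dict,
# label) tuples returned above are already in the format expected by NLTK's
# classifier API. The file name 'oral_arguments.txt' is a placeholder.
import nltk

train_set = get_training_features('oral_arguments.txt')
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(10)
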
def doTheThing(fileContents):
    # TOKENIZATION
    tokenizedWords = tokenize.word_tokenize(fileContents)

    # STOPWORDS
    filteredWords = []
    stop_words = set(corpus.stopwords.words('english'))
    for w in tokenizedWords:
        if w not in stop_words:
            filteredWords.append(w)

    # FREQUENCY DISTRIBUTION
    freqDist = probability.FreqDist(tokenizedWords)

    # STEMMING
    ps = stem.PorterStemmer()
    stemmedWords = []
    for w in filteredWords:
        stemmedWords.append(ps.stem(w))

    # LEMMATIZATION
    wnl = stem.WordNetLemmatizer()
    lemmatizedWords = []
    for w in filteredWords:
        lemmatizedWords.append(wnl.lemmatize(w, "v"))

    return [tokenizedWords, filteredWords, freqDist, stemmedWords, lemmatizedWords]

class WordNetLemmatizer(BaseNormalizer):
    name = 'WordNet Lemmatizer'
    normalizer = stem.WordNetLemmatizer().lemmatize

    @wait_nltk_data
    def __init__(self):
        super().__init__()

def stem_token(word):
    # `stemmer_name` is expected to be defined in the enclosing/global scope.
    stem_token = ""
    if stemmer_name == "Porter-Stemmer":
        #print("Performing Porter Stemming")
        stemmer = stem.PorterStemmer()
        phrase_array_token = word.split()
        stem_token = ""
        for s in phrase_array_token:
            stem_token = stem_token + stemmer.stem(s) + " "
        stem_token = stem_token.strip(" ")
        word = stem_token
    elif stemmer_name == "Lancaster-Stemmer":
        #print("Performing Lancaster Stemming")
        stemmer = stem.LancasterStemmer()
        phrase_array_token = word.split()
        stem_token = ""
        for s in phrase_array_token:
            stem_token = stem_token + stemmer.stem(s) + " "
        stem_token = stem_token.strip(" ")
        word = stem_token
    elif stemmer_name == "WordNet-Lemmatizer":
        #print("Performing Wordnet Lemmatization")
        stemmer = stem.WordNetLemmatizer()
        phrase_array_token = word.split()
        stem_token = ""
        for s in phrase_array_token:
            stem_token = stem_token + stemmer.lemmatize(s) + " "
        stem_token = stem_token.strip(" ")
        word = stem_token
        #stopword[count] = stemmer.lemmatize(stopword[count])
    return word

def generate(model: base.BaseEstimator, sentences: List[List[str]]) -> None:
    """Tag the sentences with the given model.

    Parameters
    ----------
    model : base.BaseEstimator
        Trained tagging model used to predict a tag for each token.
    sentences : list
        List of lists of strings representing the sentences to tag.
    """
    print(f"Tagging {len(sentences)} sentences.")

    # Since the models were trained on the lemmatized version of the words,
    # we also lemmatize them when tagging unlabeled sentences.
    lemmatizer = stem.WordNetLemmatizer()
    for sentence in sentences:
        # Convert to the lemmatized versions
        lemmatized = [lemmatizer.lemmatize(w.lower()) for w in sentence]

        # Convert to conllu.TokenList because models expect that.
        # Since they are essentially dicts, we build them that way.
        tags = model.predict([[{"lemma": w} for w in lemmatized]])

        print("Word\tTag")
        for w, t in zip(sentence, tags[0]):
            print(f"{w}\t{t}")
        print()

def lemmalize_list(word_list):
    lemmatizer = stem.WordNetLemmatizer()
    result = []
    for word in word_list:
        # print(lemmatizer.lemmatize(word))
        result.append(lemmatizer.lemmatize(word))
    return result

def createList(self, Text):
    with open(os.path.dirname(os.path.realpath(__file__)) + '/StopWords', "r") as f:
        # strip trailing newlines so the stopword membership test below works
        stopwords = [line.strip() for line in f.readlines()]
    for word in Text.split(' '):
        word = word.lower()
        wnl = lem.WordNetLemmatizer()
        # lemmatization
        word = wnl.lemmatize(word)
        for x in self.symbols:
            if x in word:
                word = word.replace(x, '')
        # removing stopwords
        if word in stopwords:
            continue
        # skip anything that parses as a number
        try:
            a = float(word)
            #print(a)
            continue
        except ValueError:
            pass
        if word not in self.wordlist:
            self.wordlist.append(word)

def lemmatizeText(self, text):
    """Apply lemmatization to a string."""
    l = stem.WordNetLemmatizer()
    bow = text.split(" ")  # this creates a bag of words
    result = []
    for word in bow:
        result.append(l.lemmatize(word))
    return ' '.join(result)

def lemmat(text):
    lem = stem.WordNetLemmatizer()
    words = word_tokenize(text)
    new_text = ""
    for word in words:
        w = lem.lemmatize(word)
        new_text = new_text + " " + w
    return new_text

def lemmatize_sentence(sentence):
    """Applies lemmatization to sentence.

    `sentence` is a list of (word, pos_tag) pairs; each pos_tag is expected to
    be a WordNet POS ('n', 'v', 'a', 'r' or 's'), since it is passed straight
    to WordNetLemmatizer.lemmatize. Words with an empty tag are left unchanged.
    """
    sentence = [(stem.WordNetLemmatizer().lemmatize(word, pos_tag), pos_tag)
                if pos_tag else (word, pos_tag)
                for word, pos_tag in sentence]
    return sentence

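# A hedged helper sketch (not part of the original source): if the tagged
# sentence comes from nltk.pos_tag, its Penn Treebank tags must be mapped to
# WordNet POS tags before calling lemmatize_sentence above. The helper name
# penn_to_wordnet is a placeholder.
from nltk import pos_tag, word_tokenize

def penn_to_wordnet(tag):
    # Map the first letter of a Penn Treebank tag to a WordNet POS;
    # unknown tags map to '' and are left unlemmatized by lemmatize_sentence.
    return {'J': 'a', 'N': 'n', 'V': 'v', 'R': 'r'}.get(tag[:1], '')

tagged = pos_tag(word_tokenize("The striped bats were hanging on their feet"))
print(lemmatize_sentence([(w, penn_to_wordnet(t)) for w, t in tagged]))
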
def Stem_qu(qu_doc):
    WordNetStem = stem.WordNetLemmatizer()
    for i, word in enumerate(qu_doc):
        qu_doc[i] = WordNetStem.lemmatize(word, "n")
    for i, word in enumerate(qu_doc):
        qu_doc[i] = WordNetStem.lemmatize(word, "v")
    #processed_docs[idx] = words
    qu_doc = [token for token in qu_doc if token not in STOPWORDS]
    return qu_doc

def Stem_voca(processed_docs):
    WordNetStem = stem.WordNetLemmatizer()
    for idx, words in enumerate(processed_docs):
        for i, word in enumerate(words):
            words[i] = WordNetStem.lemmatize(word, "n")
        for i, word in enumerate(words):
            words[i] = WordNetStem.lemmatize(word, "v")
        #processed_docs[idx] = words
        processed_docs[idx] = [token for token in words if token not in STOPWORDS]
    return processed_docs

def analysis(filename):
    fr = open(filename)
    domain = filename.split('_')[-1].split('.')[0]
    data = fr.readlines()
    fr.close()

    cat = []
    m_cat = []
    cat_s = {}
    for line in data:
        line = line.strip()
        listfromline = line.split()
        cat.append(listfromline[-1])
        c = listfromline[-1]
        c = c.split(';')
        # m_cat.append('#'.join(c))
        # if len(c) == 2:
        #     # print(c)
        #     if c[0] != 'LAPTOP':
        #         key = c[0]
        #     else:
        #         key = c[1]
        for key in c:
            import nltk.stem as ns
            lemmatizer = ns.WordNetLemmatizer()
            sent = [lemmatizer.lemmatize(lemmatizer.lemmatize(word, 'n'), 'v')
                    for word in listfromline[:-1] if word not in ['#num', 'was']]
            if 'wa' in sent:
                print('wa in sent')
            if key not in cat_s:
                cat_s[key] = ' '.join(sent)
            else:
                cat_s[key] += ' ' + ' '.join(sent)

    for key, strings in cat_s.items():
        # print(key)
        import collections
        obj = collections.Counter(strings.split())
        tuples = obj.most_common(10)
        for word in tuples:
            print(word)
        print('---------------------------------')
        wordcloud = WordCloud(background_color='white', width=800, height=600,
                              margin=2).generate(strings)
        import matplotlib.pyplot as plt
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.title(key)
        # print(strings)
        # plt.show()
        plt.savefig(domain + '_' + key + '.eps', format='eps')

def english_token(sentence, tokenize_flag=1, is_filter_stopword=1, stem_flag=1, lemma_flag=1):
    # Two English tokenization options; option 2 works better
    if tokenize_flag == 1:
        source_tokens = word_tokenize(sentence)
    elif tokenize_flag == 2:
        tokenizer = tokenize.WordPunctTokenizer()
        source_tokens = tokenizer.tokenize(sentence)
    # print(source_tokens)

    # Remove punctuation tokens
    for token in source_tokens[::-1]:
        if len(token) == 1 and token[0].isalpha() == False:
            source_tokens.remove(token)

    # Filter stopwords
    if is_filter_stopword:
        list_stopWords = list(set(corpus.stopwords.words('english')))
        filtered_stop_words = [w for w in source_tokens if not w in list_stopWords]
    else:
        filtered_stop_words = source_tokens
    # print(filtered_stop_words)

    # Two stemming tools; option 2 works better
    stem_tokens = []
    if stem_flag == 1:
        porterStemmer = stem.PorterStemmer()
        for word in filtered_stop_words:
            stem_tokens.append(porterStemmer.stem(word))
    elif stem_flag == 2:
        snowballStemmer = stem.SnowballStemmer('english')
        for word in filtered_stop_words:
            stem_tokens.append(snowballStemmer.stem(word))

    # Lemmatize nouns and verbs back to their base forms; option 2 works better
    lemma_tokens = []
    if lemma_flag == 1:
        lemmatizer = stem.WordNetLemmatizer()
        for word in stem_tokens:
            # Reduce nouns to their singular form
            n_lemma = lemmatizer.lemmatize(word, pos='n')
            # Reduce verbs to their base form
            v_lemma = lemmatizer.lemmatize(n_lemma, pos='v')
            # print('%8s %8s %8s' % (word, n_lemma, v_lemma))
            lemma_tokens.append(v_lemma)
    elif lemma_flag == 2:
        lemmatizer = stem.wordnet.WordNetLemmatizer()
        tagged_corpus = pos_tag(stem_tokens)
        for token, tag in tagged_corpus:
            if tag[0].lower() in ['n', 'v']:
                lemma_tokens.append(lemmatizer.lemmatize(token, tag[0].lower()))
            else:
                lemma_tokens.append(token)

    return lemma_tokens

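# A minimal usage sketch (the sample sentence is a placeholder): running the
# pipeline above with the options its comments describe as better (WordPunct
# tokenization, Snowball stemming, POS-tag-based lemmatization).
tokens = english_token("The striped bats were hanging on their feet.",
                       tokenize_flag=2, is_filter_stopword=1,
                       stem_flag=2, lemma_flag=2)
print(tokens)
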
def word_lemma(word_input, pos_input=None):
    if pos_input in ["NN", "NNP", "NNS", "NNPS", "CD", "DT", "FW"]:
        pos_sign = 'n'
    elif pos_input in ["VB", "VBD", "VBG", "VBP", "VBZ"]:
        pos_sign = 'v'
    elif pos_input in ["JJ", "JJR", "JJS"]:
        pos_sign = 'a'
    elif pos_input in ["RB", "RBR", "RBS", "RP"]:
        pos_sign = 'r'
    else:
        pos_sign = None

    try:
        if pos_sign is not None:
            word_root = stem.WordNetLemmatizer().lemmatize(word_input, pos=pos_sign)
        else:
            word_root = stem.WordNetLemmatizer().lemmatize(word_input)
    except Exception as err:  # StandardError no longer exists in Python 3
        print(err)
        word_root = word_input  # fall back to the original word on failure
    return word_root

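# A minimal usage sketch (not part of the original source): feeding Penn
# Treebank tags produced by nltk.pos_tag into word_lemma above.
from nltk import pos_tag, word_tokenize

for word, tag in pos_tag(word_tokenize("the cats were running quickly")):
    print(word, tag, word_lemma(word, tag))
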
def solve(word_list):
    wnl = stem.WordNetLemmatizer()
    porter = stem.porter.PorterStemmer()
    a = [porter.stem(word) for word in word_list]
    b = [wnl.lemmatize(word) for word in word_list]
    lemmatizer = Lemmatizer()
    c = [lemmatizer.lookup(word) for word in word_list]
    res = {}
    res['a'] = a
    res['b'] = b
    res['c'] = c
    return res

def filtered_sentiment():
    lemmatizer = stem.WordNetLemmatizer()
    lines = []
    with open('sentiment.txt', 'r') as f:
        for line in f:
            words = line.split(' ')
            filtered_words = []
            for word in words:
                if not stop_word.exists(word):
                    feature = lemmatizer.lemmatize(word)
                    filtered_words.append(feature)
            lines.append(filtered_words)
    return lines

def _split_sentence(self, text, parts, to_stem=False):
    info_of_words = nltk.word_tokenize(text)
    words = []
    if to_stem:
        stemmer = stem.WordNetLemmatizer()
        for w in info_of_words:
            w = w.lower()
            w = stemmer.lemmatize(w)
            words.append(w)
    else:
        for w in info_of_words:
            w = w.lower()
            words.append(w)
    return words

def lemmatisation(inp, outp):
    '''
    Lemmatisation of strings
    '''
    print("# lemmatisation")
    wnl = stem.WordNetLemmatizer()
    for line in inp:
        sline = line.strip()
        tokens = sline.split()
        lemmas = []
        for token in tokens:
            lemma = wnl.lemmatize(token)
            lemmas.append(lemma)
        outp.write(' '.join(lemmas))
        outp.write('\n')
    outp.close()

def calculate_overlap(claim, headline):
    wordnet_lemmatizer = stem.WordNetLemmatizer()
    puncts = ".,;?:!"
    lemmas = {0: [], 1: []}
    item = 0
    for sentence in [claim, headline]:
        lemmas[item] = [wordnet_lemmatizer.lemmatize(word).lower()
                        for word in nltk.word_tokenize(sentence)
                        if word not in puncts]
        item = item + 1
    common_lemma = set(lemmas[0]).intersection(lemmas[1])
    union_lemma = set(lemmas[0]).union(lemmas[1])
    overlap = float(len(common_lemma) / len(union_lemma))
    return overlap

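# A minimal usage sketch (the example strings are placeholders): the function
# above returns the Jaccard overlap of the two lemma sets, so identical
# sentences score 1.0 and sentences with no shared lemmas score 0.0.
claim = "The cat sat on the mat"
headline = "A cat was sitting on a mat"
print(calculate_overlap(claim, headline))
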
def stem_token(stemmer_name, stopword):
    if stemmer_name == "Porter-Stemmer":
        #print("Performing Porter Stemming")
        stemmer = stem.PorterStemmer()
        for count in range(len(stopword)):
            stopword[count] = stemmer.stem(stopword[count])
    elif stemmer_name == "Lancaster-Stemmer":
        #print("Performing Lancaster Stemming")
        stemmer = stem.LancasterStemmer()
        for count in range(len(stopword)):
            stopword[count] = stemmer.stem(stopword[count])
    elif stemmer_name == "WordNet-Lemmatizer":
        #print("Performing Wordnet Lemmatization")
        stemmer = stem.WordNetLemmatizer()
        for count in range(len(stopword)):
            stopword[count] = stemmer.lemmatize(stopword[count])
    return stopword

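# A minimal usage sketch (the word list is a placeholder): running the same
# words through each stemmer/lemmatizer name accepted above. The input list is
# modified in place, so a copy is passed each time.
words = ["studies", "studying", "cries", "cried"]
for name in ["Porter-Stemmer", "Lancaster-Stemmer", "WordNet-Lemmatizer"]:
    print(name, stem_token(name, list(words)))
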
def norm(df: pd.DataFrame) -> pd.Series:
    ddf = df.copy(deep=False)
    lemmatizer = ns.WordNetLemmatizer()
    topics = []
    for t_item, o_item in zip(ddf["topics"], ddf["origin"]):
        # author topics first select
        if o_item == "ieee":
            if "Author" in t_item:
                ts = re.split(",", re.split(":", t_item)[-1])
            elif "IEEE" in t_item:
                ts = re.split(",", re.search("IEEE Keywords:(.*?);", t_item).groups()[0])
            else:
                try:
                    ts = re.split(",", re.search("INSPEC: Controlled Indexing:(.*?);",
                                                 t_item).groups()[0])
                except:
                    ts = re.split(",", str(t_item))
        else:
            ts = re.split(",", str(t_item))

        # topic of one paper process
        ts = hero.remove_html_tags(hero.lowercase(pd.Series(ts)))
        topic = []
        for t in ts:
            t = t.replace(" - ", "-")  # str.replace returns a new string, so assign it
            if len(re.split("and", t)) == 2 and "-" not in t:
                topic += re.split("and", t)
                continue
            if len(re.split("/", t)) == 2:
                topic += re.split("/", t)
                continue
            if "blockchain" in t and len(re.split(" ", t)) >= 2:
                t = re.split(" ", t)[-1]
            if t != "":
                topic.append(t.replace("\xa0", ""))
        topics.append(",".join([similar_replace(stem(remove_chore(t), lemmatizer))
                                for t in topic]))
        # topics.append(",".join([stem(remove_chore(t), lemmatizer) for t in topic]))
    return pd.Series(topics)

def predict():
    data = request.get_json(force=True)
    test_file = data

    # prepare the text
    # clean up text: remove extra white space, new line marks, blank lines
    body_text = test_file.strip().replace('  ', ' ')
    body_text = body_text.replace('\n', ' ').replace('\r', '')
    # delete digits
    body_text = sub(pattern=r"\d", repl=r" ", string=body_text)
    # remove punctuation - updated
    translator = str.maketrans(' ', ' ', string.punctuation)
    body_text = body_text.translate(translator)
    body_text = os.linesep.join([s for s in body_text.splitlines() if s])

    # further processing
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = stem.WordNetLemmatizer()
    test_text = []
    raw = body_text.lower()
    raw = normalize(raw)
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [i for i in tokens if not i in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(i) for i in stopped_tokens]
    test_text.extend(lemmatized_tokens)

    doc = raw
    doc = doc.lower()
    doc = doc.split()
    vec_bow = original.doc2bow(doc)
    vec_lda = lda_model[vec_bow]

    # return list of topics
    print(vec_lda)
    result = dict(vec_lda)
    # s1 = json.dumps(vec_lda.astype(float))
    listToStr = ' '.join([str(elem) for elem in vec_lda])
    return jsonify(listToStr)

def doTheThing(fileContents, mode):
    result = []

    # TOKENIZATION
    if mode >= 0:
        tokenizedWords = tokenize.word_tokenize(fileContents)
        print('Tokenization...')
        result.append(tokenizedWords)

    # STOPWORDS
    if mode >= 1:
        print('Stopwords...')
        filteredWords = []
        stop_words = set(get_stop_words('polish'))
        for w in tokenizedWords:
            if w not in stop_words:
                filteredWords.append(w)
        result.append(filteredWords)

    # FREQUENCY DISTRIBUTION
    if mode >= 2:
        print('FrequencyDistribution...')
        freqDist = probability.FreqDist(filteredWords)
        result.append(freqDist)

    # STEMMING
    if mode >= 3:
        print('Stemming...')
        ps = stem.PorterStemmer()
        stemmedWords = []
        for w in filteredWords:
            stemmedWords.append(ps.stem(w))
        result.append(stemmedWords)

    # LEMMATIZATION
    if mode >= 4:
        print('Lemmatization...')
        wnl = stem.WordNetLemmatizer()
        lemmatizedWords = []
        for w in filteredWords:
            lemmatizedWords.append(wnl.lemmatize(w, "v"))
        result.append(lemmatizedWords)

    return result

def tokens_to_csv():
    dic = defaultdict(list)
    for filename in glob.glob("*.txt"):
        with open(filename, 'r') as f:
            tokens = word_tokenize(f.read().lower())
            tokens = set([nltk.WordNetLemmatizer().lemmatize(token) for token in tokens])
            tokens = [w for w in tokens if w not in stop_words]
            for word in tokens:
                dic[word].append(filename)

    with open('Tokens.csv', 'w') as f:
        header = "Token,PostingList\n"
        f.write(header)
        for key, value in dic.items():
            data = ""
            for e in value:
                data += str(e) + " "
            f.write(f"{key},{data}\n")

def clean_string(text):
    '''
    Takes a string, lowercases and tokenizes it, ASCII-folds the tokens, and
    returns a single string again. (Lemmatization and stopword removal are
    currently commented out.)
    '''
    from nltk import stem
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus import stopwords
    import unicodedata

    tokenizer = RegexpTokenizer(r'\b\w\w+\b')
    #stemmer = stem.snowball.EnglishStemmer()
    lemmer = stem.WordNetLemmatizer()
    #stopper = stopwords.words('english')

    # if type(text) == unicode:
    #     text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    words = text.lower()
    words = tokenizer.tokenize(words)
    # Every Python 3 str is already unicode, so just ASCII-fold each token
    words = [unicodedata.normalize('NFKD', w).encode('ascii', 'ignore').decode('ascii')
             for w in words]
    #words = [lemmer.lemmatize(w) for w in words]
    #words = [i for i in words if i not in stopper]
    return ' '.join(words)

class Preprocessor:
    stopWords = "stop_words"
    letters = "letters"
    numbers = "numbers"
    stemming = "stemming"
    lemming = "lemming"

    stops = stopwords.words('english')
    # compiled patterns use distinct names so they do not shadow the
    # "letters"/"numbers" method-name constants above
    letters_re = re.compile("^[a-z][A-Z]$")
    number = re.compile("^[-+]?[0-9]+$")
    ps = stem.PorterStemmer()
    lm = stem.WordNetLemmatizer()

    def __init__(self, methods=[stopWords, letters, numbers, lemming]):
        self.methods = methods

    def preprocess(self, text):
        words = re.findall(r'\w+', text.lower())
        if self.stopWords in self.methods:
            words = [w for w in words if w not in self.stops]
        if self.letters in self.methods:
            words = [w for w in words if not self.letters_re.match(w)]
        if self.numbers in self.methods:
            words = [num2words(w) if self.number.match(w) else w for w in words]
        if self.stemming in self.methods:
            words = [self.ps.stem(w) for w in words]
        if self.lemming in self.methods:
            words = [self.lm.lemmatize(w, pos="v") for w in words]
        return words

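# A minimal usage sketch (the sample sentence is a placeholder): building a
# Preprocessor with its default method list (stop_words, letters, numbers,
# lemming) and preprocessing a short sentence.
pre = Preprocessor()
print(pre.preprocess("The cats were running quickly through the garden"))
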
def frequency_counter(domain):
    if domain == 'laptop':
        suffix = '_2014.pkl'
    else:
        suffix = '_2016_2.pkl'
    data_file = './pkl/data_' + domain + suffix
    fr = open(data_file, 'rb')
    data = pickle.load(fr)
    fr.close()

    sents = data['raw_sentence']
    import nltk.stem as ns
    lemmatizer = ns.WordNetLemmatizer()
    sents = [[lemmatizer.lemmatize(lemmatizer.lemmatize(word, 'n'), 'v') for word in sent]
             for sent in sents]

    word_c = Counter()
    for sent in sents:
        word_c += Counter(sent)

    stop_words = list(set(stopwords.words('english')))
    print(stop_words)
    common_words = word_c.most_common(200)
    for word, count in common_words:
        if word not in stop_words:
            print(word, count)
    # print(word_c.most_common(10))

    labels = data['labels']
    term_c = Counter()
    for label, sent in zip(labels, sents):
        temp = [w for l, w in zip(label, sent) if l != 0]
        # terms.append(temp)
        term_c += Counter(temp)
    print('----------------------------------------------')