def init_feature_sentences(self, total_content):
    t = Tokenizer()
    p = POSTagger()
    wnl = WordNetLemmatizer()
    sentences = t.sent_tokenize(total_content.lower())
    for sentence in sentences:
        tagged_sentence = p.nltk_tag(t.word_tokenize(sentence))
        # Initializing the feature sentence dictionary
        feature_sentence = {}
        feature_sentence['sentence'] = sentence
        feature_sentence['tags'] = tagged_sentence
        feature_sentence['nouns'] = []
        feature_sentence['noun_phrases'] = []
        # Finding the nouns/noun phrases in the tagged sentence
        for i in range(0, len(tagged_sentence)):
            (word, tag) = tagged_sentence[i]
            # Chunking: treat consecutive common nouns as a noun phrase
            if tag.startswith('N') and tag != 'NNP':
                if i > 0 and len(feature_sentence['nouns']) > 0 and tagged_sentence[i - 1][0] == feature_sentence['nouns'][-1] and feature_sentence['sentence'].find(feature_sentence['nouns'][-1] + ' ' + word) > -1:
                    feature_sentence['noun_phrases'].append(wnl.lemmatize(feature_sentence['nouns'].pop() + ' ' + word))
                else:
                    feature_sentence['nouns'].append(wnl.lemmatize(word))
        self.feature_sentences.append(feature_sentence)
def write_clean_turian_unigrams():
    """
    Extracts unigram embeddings from Socher's binary distribution. These can be used by other composers.

    There are only 50k embeddings (presumably for the most frequent tokens in the corpus). The words have
    not been processed: there are punctuation-only tokens, uppercased words and non-lemmatized words, and
    there is no PoS-tag filtering either, so function words like "to", "while" and "there" are included.

    I remove punctuation, then lowercase and lemmatize each entry. Multiple entries may map to the same
    canonical form. I select the shortest original entry (ties are broken by giving preference to words
    that are already lowercased). This could have been done better. Only vectors for the selected entries
    are kept.

    There are 33k canonical forms left, many of which are not nouns/adjs/verbs. We don't have a PoS tag
    for the canonical forms. I get around the problem by creating 3 copies of each canonical form and
    expand "cat" to cat/N, cat/J and cat/V, which all share the same vector.
    """
    logging.info('Writing Turian unigrams to %s', turian_unigram_vectors_file)
    mat = loadmat(socher_unigram_embedding_matlab)
    words = [w[0] for w in mat['words'].ravel()]
    df = pd.DataFrame(mat['We'].T, index=words)

    lmtzr = WordNetLemmatizer()
    clean_to_dirty = defaultdict(list)  # canonical -> [non-canonical]
    dirty_to_clean = dict()  # non-canonical -> canonical
    to_keep = set()  # which non-canonical forms we will keep
    # todo this can be done based on frequency or something

    for w in words:
        if set(w).intersection(set(string.punctuation).union(set('0123456789'))):
            # not a real word - contains digits or punctuation
            continue
        lemma = lmtzr.lemmatize(w.lower())
        clean_to_dirty[lemma].append(w)
        dirty_to_clean[w] = lemma

    # decide which of possibly many non-canonical forms with the same lemma to keep
    # prefer shorter and lowercased non-canonical forms
    for lemma, dirty_list in clean_to_dirty.items():
        if len(dirty_list) > 1:
            best_lemma = min(dirty_list, key=lambda w: (len(w), not w.islower()))
        else:
            best_lemma = dirty_list[0]
        to_keep.add(best_lemma)

    # remove non-canonical forms we don't want
    idx_to_drop = [i for i, w in enumerate(df.index) if w not in to_keep]
    ddf = df.drop(df.index[idx_to_drop])
    # canonicalize whatever is left
    ddf.index = [lmtzr.lemmatize(w.lower()) for w in ddf.index]
    # we don't know what the PoS tags of the canonical forms are, so make them all of the same tag
    # e.g. expand "cat" to cat/N, cat/J and cat/V, which all share the same vector
    new_index = ['%s/%s' % (w, pos) for pos in 'NJV' for w in ddf.index]
    new_data = np.vstack([ddf.values] * 3)
    ddf = pd.DataFrame(new_data, index=new_index)
    dv = DenseVectors(ddf, allow_lexical_overlap=True)
    dv.to_tsv(turian_unigram_vectors_file)
    logging.info('Done')
def __init__(self, text, product_name): self.candidate_features = [] self.feature_sentences = [] self.product_name = product_name.lower().split('-')[0].split('_') t = Tokenizer() sents = t.sent_tokenize(text.lower()) p = POSTagger() wnl = WordNetLemmatizer() for sent in sents: tagged_sent = p.nltk_tag(t.word_tokenize(sent)) feature_sent = {} feature_sent['sentence'] = sent feature_sent['tags'] = tagged_sent feature_sent['nouns'] = [] feature_sent['noun_phrases'] = [] for i in range(0, len(tagged_sent)): (word, tag) = tagged_sent[i] #Don't include proper nouns if tag.startswith('N') and tag != 'NNP': """ Consecutive nouns might form a feature phrase. Eg. Picture quality is a phrase. Meaningless phrases like 'quality digital' are removed later as their frequeny of occurence is low. """ if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1] and feature_sent['sentence'].find(feature_sent['nouns'][-1] + ' ' + word) > -1: feature_sent['noun_phrases'].append(wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word)) else: feature_sent['nouns'].append(wnl.lemmatize(word)) self.feature_sentences.append(feature_sent)
def lemmatize(tokens):
    # lemmatize words. try both noun and verb lemmatizations
    lmtzr = WordNetLemmatizer()
    for i in range(0, len(tokens)):
        res = lmtzr.lemmatize(tokens[i])
        if res == tokens[i]:
            tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
        else:
            tokens[i] = res
    return tokens
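# A quick illustration of why the noun-then-verb fallback above matters: WordNetLemmatizer
# defaults to the noun PoS, so verb forms only get reduced when 'v' is passed explicitly.
# (Standalone sketch; it assumes nothing beyond nltk and its wordnet data being installed.)
from nltk.stem import WordNetLemmatizer

_lmtzr = WordNetLemmatizer()
print(_lmtzr.lemmatize('meetings'))       # 'meeting'  -- noun lemmatization strips the plural
print(_lmtzr.lemmatize('running'))        # 'running'  -- unchanged under the default noun PoS
print(_lmtzr.lemmatize('running', 'v'))   # 'run'      -- the verb PoS triggers the fallback above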
class Lemmatizer(): def __init__(self): self.lemmatizer = WordNetLemmatizer() self.stemmer = SnowballStemmer("english", ignore_stopwords=True) ''' Lemmatizes every word in a sentence and then tokenizes it. sentence: str ''' def lemmatize(self, sentence): tokens = word_tokenize(sentence) lemmas = self.lemmatizeTokens(tokens) return " ".join(lemmas) ''' Turns phrase tokens into lemmatized tokens, which means into some standard format as determined by the nltk lemmatizer. "Dogs" to "dog", "went" to "go", etc. tokens: list of str ''' def lemmatizeTokens(self, tokens): tokens_tagged = pos_tag(tokens) #Get simple POS tags. tokens_simpleTags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in tokens_tagged] #Actually lemmatize. lemmas = [] for token, tag in tokens_simpleTags: lemmatized = "" if tag == "VERB": lemmatized = self.lemmatizer.lemmatize(token, pos='v') elif tag == "ADJ": lemmatized = self.lemmatizer.lemmatize(token, pos='a') elif tag == "ADV": lemmatized = self.lemmatizer.lemmatize(token, pos='r') else: lemmatized = self.lemmatizer.lemmatize(token) #pos = 'n' lemmas.append(lemmatized.encode("utf-8")) return lemmas ''' Reduce this word down to its most basic form by removing suffixes or common ending and finding the "root" or "stem" of the word. Example: "response," "responsive," and "responsivity" all stem from "respons," or something similar. ''' def stem(self, tokens): stemmed = [] for token in tokens: stem = self.stemmer.stem(token) stemmed.append(stem.encode("utf-8")) return stemmed
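# Hedged usage sketch for the Lemmatizer class above (nothing assumed beyond nltk with its
# tagger and wordnet data; note the .encode("utf-8") calls mark this as Python 2-era code,
# so under Python 3 the joined output would contain bytes objects instead of plain strings).
lemmatizer = Lemmatizer()
print(lemmatizer.lemmatize("The dogs went home quickly"))
# roughly: "The dog go home quickly"
print(lemmatizer.stem(["response", "responsive", "responsivity"]))
# stems along the lines of ["respons", "respons", "respons"]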
def process_data(sentence):
    # Reference: http://stackoverflow.com/questions/20827741/nltk-naivebayesclassifier-training-for-sentiment-analysis
    # Reference: https://blog.cambridgecoding.com/2016/01/25/implementing-your-own-spam-filter/
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(sentence)]
def tokenize(data_list): from nltk.tokenize import TweetTokenizer from nltk.corpus import stopwords tt = TweetTokenizer() stopwords = set(stopwords.words('english')) climate_list = [] for d in data_list: d = str(d) data_dict = {} list_words = tt.tokenize(d) list_words = [ w.lower() for w in list_words if isinstance(w, str) == True ] list_words = [w for w in list_words if w.isalpha()] filtered_words = [w for w in list_words if not w in stopwords] wnl = WordNetLemmatizer() filtered_words = [wnl.lemmatize(w, 'n') for w in filtered_words] filtered_words = [wnl.lemmatize(w, 'v') for w in filtered_words] stemmer = PorterStemmer() filtered_words = [stemmer.stem(w) for w in filtered_words] for i in filtered_words: key = data_dict.get(i) if key == None: data_dict[i] = 1 else: data_dict[i] += 1 climate_list.append(data_dict) return climate_list
def _process_descr_text(self):
    # Preprocessing of text data
    # Convert the whole string to lower case
    proc_text = self.descr_text.lower()
    # \S+ means anything that is not an empty space
    proc_text = re.sub('http\S*', '', proc_text)
    # \s+ means all empty space (\n, \r, \t)
    proc_text = re.sub('\s+', ' ', proc_text)
    proc_text = re.sub('[^\w\s]', '', proc_text)
    # Adding domain-based stop words to the general English stop word list and ignoring these in the data
    # (note the comma after "much"; without it the two literals concatenate into "muchfeature")
    stop = stopwords.words('english') + ["festival", "event", "festiv", "day", "week", "month", "year", "much",
                                         "feature", "celebration", "celebrate", "featuring", "featurin", "include",
                                         "weekend", "event", "featuring", "enjoy", "fest", "cotopaxi", "questival",
                                         "around", "best", "including", "great", "first", "come", "throughout", "area",
                                         "festivals", "events", "fairs", "days", "celebrations", "fests", "includes",
                                         "features", "celebrating", "areas"]
    proc_text = " ".join(word for word in proc_text.split() if word not in stop)
    # Tokenizes and lemmatizes words
    proc_text = word_tokenize(proc_text)
    lemztr = WordNetLemmatizer()
    proc_text = ' '.join([lemztr.lemmatize(word) for word in proc_text])
    # self.proc_text = proc_text
    return proc_text
class TextAnalyser(object): def __init__(self): self.threshold = 0.99 self.__rake = Rake() self.__stemmer = LancasterStemmer() self.__lemma = WordNetLemmatizer() self.__stopwords = ['alt'] pass def extract(self, text): self.__rake.extract_keywords_from_text(text.strip()) scores = self.__rake.get_ranked_phrases_with_scores() keywords = self.unpack_keywords(scores) words = filter(lambda x: x[1] not in self.__stopwords and x[1].isalnum(), keywords) filtered_words = map(lambda x: x[1], filter(lambda x: x[0] > self.threshold, words)) lemms = map(lambda x: self.__lemma.lemmatize(x), filtered_words) stems = map(lambda x: self.__stemmer.stem(x), lemms) return stems @staticmethod def unpack_keywords(keywords): words = [] for k in keywords: for p in k[1].split(' '): words.append((k[0], p)) return words
def data_cleaning(data):
    data["essay"] = data["essay"].str.lower()
    # Tokenize
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    data["essay_token"] = data["essay"].apply(tokenizer.tokenize)
    # Remove stop words
    # stop_words = stopwords.words('english')
    # data["essay_token"] = data["essay_token"].apply(lambda x: [word for word in x if word not in stop_words])
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    data["essay_token"] = data["essay_token"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    # Stem
    stemmer = nltk.stem.SnowballStemmer('english')
    data["essay_token"] = data["essay_token"].apply(lambda x: [stemmer.stem(word) for word in x])
    data = data.fillna(6)
    essays = []
    for essay, score in zip(data['essay_token'], data['score']):
        essays.append((' '.join(essay), int(score)))
    return essays
def preprocess(text_data):
    nltk.download("wordnet")
    processed_text = []
    word_lemmatizer = WordNetLemmatizer()
    url_pattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern = "@[^\s]+"
    alpha_pattern = "[^a-zA-Z0-9]"
    sequence_pattern = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"
    for tweet in text_data:
        tweet = tweet.lower()
        # Replace all URLs with 'URL'
        tweet = re.sub(url_pattern, ' URL', tweet)
        # Replace all emojis
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
        tweet = re.sub(user_pattern, ' USER', tweet)
        tweet = re.sub(alpha_pattern, " ", tweet)
        tweet = re.sub(sequence_pattern, seq_replace_pattern, tweet)
        tweet_words = ''
        for word in tweet.split():
            if len(word) > 1:
                word = word_lemmatizer.lemmatize(word)
                tweet_words += (word + ' ')  # trailing space keeps the lemmas separated
        processed_text.append(tweet_words)
    return processed_text
def lemma():
    file1 = open('inputtext.txt', 'r')
    text = file1.readline()
    lem = []
    data = ""
    wnl = WordNetLemmatizer()
    while text != "":
        data = data + text
        tags = pos_tag(wordpunct_tokenize(text))
        # lemmatize each word using its PoS tag (WordNet only accepts 'n', 'v', 'a', 'r')
        for word, pos in tags:
            wn_pos = {'V': 'v', 'J': 'a', 'R': 'r'}.get(pos[0], 'n')
            lem.append(wnl.lemmatize(word, wn_pos))
        text = file1.readline()
    # bi-grams
    bgram = {}
    for item in ngrams(data.split(), 2):
        if item in bgram:
            bgram[item] += 1
        else:
            bgram[item] = 1
    # top 10 bigrams
    top_bigrams = sorted(bgram, key=bgram.get, reverse=True)[:10]
    for item in top_bigrams:
        print(item, bgram[item])
    # print every line that contains one of the top bigrams
    file1.seek(0)
    for line in file1:
        for first, second in top_bigrams:
            if first + ' ' + second in line:
                print(line)
                break
    file1.close()
def quantify_negativity(lyrics, emolex):
    wnl = WordNetLemmatizer()
    negative_pct = {}
    stop_words = set(stopwords.words('english'))
    print('[+] Calculating lyrics negativity')
    for song in lyrics:
        tokens = [t.lower() for t in word_tokenize(lyrics[song])
                  if t not in stop_words and t.isalpha()]
        tokens = set([wnl.lemmatize(t) for t in tokens])
        tagged = pos_tag(tokens)
        tokens = [x[0] for x in tagged if x[1] not in unwanted_pos_tags]
        len_tokens = len(tokens)
        total_tokens = len_tokens if len_tokens != 0 else 1
        negative_tokens = 0
        for token in tokens:
            for emotion in negative_emotions:
                if token.lower() in emolex[emotion]:
                    negative_tokens += 1
                    break
        negative_pct[song] = negative_tokens / total_tokens
    return negative_pct
def prepare_vocab(self, word_corpus, word_embedding, topk, num_vocab): lemmatizer = WordNetLemmatizer() lemmaed_count_1w = Counter() with open(word_corpus, 'r') as f: for line in f: word, count = line.strip().split('\t') lemmaed_count_1w[lemmatizer.lemmatize(word)] += int(count) topk_vocab = heapq.nlargest(topk, lemmaed_count_1w, key=lemmaed_count_1w.get) topk_vocab_vec = OrderedDict.fromkeys(topk_vocab) n_non_empty = 0 with open(word_embedding, 'r') as f: for line in tqdm(f): word, *vec = line.rstrip().split(' ') if word in topk_vocab_vec: topk_vocab_vec[word] = np.array(vec, dtype=float) n_non_empty += 1 print('Num of non empty vectors: ', n_non_empty) vocab_vec = np.zeros([300, num_vocab]) vocab = [] num = 0 for k, v in iter(topk_vocab_vec.items()): if v is not None: vocab_vec[:, num] = v vocab.append(k) num += 1 if num >= num_vocab: break np.save('vocab.npy', vocab) np.save('vocab_vec.npy', vocab_vec)
def tokenize(document): lemmatizer = WordNetLemmatizer() "Break the document into sentences" for sent in sent_tokenize(document): "Break the sentence into part of speech tagged tokens" for token, tag in pos_tag(wordpunct_tokenize(sent)): "Apply preprocessing to the token" token = token.lower() # Convert to lower case token = token.strip() # Strip whitespace and other punctuations token = token.strip('_') # remove _ if any token = token.strip('*') # remove * if any "If stopword, ignore." if token in stopwords.words('english'): continue "If punctuation, ignore." if all(char in string.punctuation for char in token): continue "If number, ignore." if token.isdigit(): continue # Lemmatize the token and yield # Note: Lemmatization is the process of looking up a single word form # from the variety of morphologic affixes that can be applied to # indicate tense, plurality, gender, etc. lemma = lemmatizer.lemmatize(token) # all_lema.append(lemma) yield lemma
def rq3(): path_base = "commits_" for filename in ["big", "small"]: logs = get_logs(path_base + filename) text = " ".join([log for log in logs]) #text = "Life is like a box of chocolates. You never know what you're gonna get." raw_words = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", text.lower()) table = str.maketrans('', '', string.punctuation) filtered_words = [ word.translate(table) for word in raw_words if word not in stopwords.words('english') and len(word) > 1 ] #snowball_stemmer = SnowballStemmer("english") #words_stem = [snowball_stemmer.stem(filtered_word) for filtered_word in filtered_words] wordnet_lematizer = WordNetLemmatizer() words_lema = [ wordnet_lematizer.lemmatize(filtered_word) for filtered_word in filtered_words ] c = Counter() for word_lema in words_lema: c[word_lema] += 1 words = c.most_common(100) output_keywords(filename, words)
def returnKeywordFromList(convertpath): token_dict = {} i=0 #nltk.download() wnl = WordNetLemmatizer() fileName = {} #print file #print str(i)+ file #file_path = subdir + os.path.sep + file shakes = open(convertpath, 'r') text = shakes.read() lowers = "".join(map(lambda l:l.decode('unicode_escape').encode('ascii','ignore'),text)) no_punctuation = re.sub(r'[?|$|.|!0-9()=+-\/\'\"\|]',r'',lowers) d = {v:True for v in no_punctuation.split()} for token in d.keys(): no_punctuation = no_punctuation.replace(token, wnl.lemmatize(token)) fileName[i] = file token_dict[i] = no_punctuation.replace("\n"," ").replace("\r","") #break #this can take some time ##print token_dict.values() tfidf_vect = TfidfVectorizer(stop_words =stops, ngram_range=(1, 2)) # # # count_vect.stop_words = stops # X_train_counts = tfidf_vect.fit_transform(token_dict.values()) #print tfidf_vect.get_feature_names() #print(sortSparseMatrix(X_train_counts.getrow(0),rev=False, only_indices=False)) sortedMatrix = sortSparseMatrix(X_train_counts.getrow(0),rev=True, only_indices=False)[0] x = map(lambda (x,y):x,sortedMatrix) result = getKeywordAlgorithms(1,sortedMatrix) return map(lambda key:tfidf_vect.get_feature_names()[key],result)
def preprocess(x):
    """Preprocesses the combined thread and author and converts to tfidf features"""
    # change shoe brands to 'shoebrand'
    x.replace('(?i)asics|nike|adidas|hoka|brooks|puma|new balance|oiselle|saucony', 'shoebrand', regex=True, inplace=True)
    # change interval descriptions to 'intervals'
    x.replace('(?i)[0-9]+x([0-9]{2,4}|[a-z]+)', 'intervals', regex=True, inplace=True)
    # change times to 'time'
    x.replace('[0-9]{0,2}((:[0-9]{2})|(\\.[0-9]+))', 'time', regex=True, inplace=True)
    # change common race distances to 'distance'
    x.replace('(?i)[0-9]+(m|yd|km|mi|k)', 'distance', regex=True, inplace=True)
    # remove special characters
    x.replace('\\W', ' ', regex=True, inplace=True)
    # remove single characters
    x.replace('\\s+[a-zA-Z]\\s+', ' ', regex=True, inplace=True)
    # remove single character from start
    x.replace('^[a-zA-Z]\\s+', ' ', regex=True, inplace=True)
    # remove multiple spaces
    x.replace('\\s+', ' ', regex=True, inplace=True)
    # to lowercase
    x = x.str.lower()
    # lemmatize (the result of apply must be assigned back, otherwise this step is a no-op)
    stemmer = WordNetLemmatizer()
    x = x.apply(lambda text: ' '.join(stemmer.lemmatize(word) for word in text.split()))
    return x
def feature_extractor_top_words_weights(data): data = data.decode('utf-8') top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel', 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort', 'good', 'cebu', 'island'] features = {} lemmatizer = WordNetLemmatizer() stop_words = stopwords.words('english') words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)] for word in words: if word not in stop_words: if word in features: if word in top_words: features[word] += 1.5 else: features[word] += 1 else: if word in top_words: features[word] = 1.5 else: features[word] = 1 return features
def feature_extractor_top_words_weights(data): """ Extract features using the top words with weights method parameter: data (tweet) returns: returns features of the given data """ data = data.decode('utf-8') # top 15 frequently-ocurring words from the tourism-related twitter corpus top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel', 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort', 'good', 'cebu', 'island'] features = {} lemmatizer = WordNetLemmatizer() stop_words = stopwords.words('english') # preprocessing: tokenize, convert to lowercase and lemmatize words words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)] # remove stop words and add words and their frequencies as features for word in words: if word not in stop_words: if word in features: # if word is found in the top words list, increase by 1.5 or preferred weight if word in top_words: features[word] += 1.5 else: features[word] += 1 else: if word in top_words: features[word] = 1.5 else: features[word] = 1 return features
def text2sents(text, lemmatize=False, stemmer=None):
    """
    Converts a text into a list of sentences consisting of normalized words
    :param text: string to process
    :param lemmatize: if True, words will be lemmatized, otherwise stemmed
    :param stemmer: stemmer to be used; if None, PorterStemmer is used. Only applied if lemmatize == False
    :return: list of lists of words
    """
    sents = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    if lemmatize:
        normalizer = WordNetLemmatizer()
        tagger = PerceptronTagger()
    elif stemmer is None:
        normalizer = PorterStemmer()
    else:
        normalizer = stemmer
    sents_normalized = []
    for sent in sents:
        sent_tokenized = tokenizer.tokenize(sent)
        if lemmatize:
            sent_tagged = tagger.tag(sent_tokenized)
            sent_normalized = [normalizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in sent_tagged]
        else:
            sent_normalized = [normalizer.stem(w) for w in sent_tokenized]
        sents_normalized.append(sent_normalized)
    return sents_normalized
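# text2sents above relies on a get_wordnet_pos() helper that is not shown here. A minimal
# sketch of what such a helper typically looks like (the mapping below is an assumption,
# not the original author's code): it converts a Penn Treebank tag into one of the PoS
# constants that WordNetLemmatizer.lemmatize() accepts, defaulting to nouns.
from nltk.corpus import wordnet


def get_wordnet_pos(treebank_tag):
    # map the first letter of the Penn Treebank tag to a WordNet PoS constant
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN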
def lemmatizing(line_list):
    """
    Input: line_list (list of strings (sentences/documents)) - e.g. dataset.data

    Iterates over all terms in the lines and lemmatizes them using WordNetLemmatizer()

    Return: lemmatized_list (list of strings whose terms have been lemmatized)
    """
    lemmatized_list = []
    lemmatizer = WordNetLemmatizer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation
        # the method below would simply strip punctuation, but introduces mistakes such as amazon.com => amazoncom
        # nopunct_line = ''.join([c for c in line if re.match("[a-z\-\' \n\t]", c)])
        # this solves the problem above:
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # lemmatize
        lemmatized_line = []
        for term in line_token:
            term = lemmatizer.lemmatize(term)
            lemmatized_line.append(term)
        # back to a sentence as a string
        lemmatized_sentence = ' '.join(lemmatized_line)
        lemmatized_list.append(lemmatized_sentence)
    return lemmatized_list
def preprocess(original_str):
    # lemmatizer
    wnl = WordNetLemmatizer()
    # pos tagging
    original_str = unicode(original_str, errors='ignore')
    print type(original_str)
    article_tok = pos_tag(word_tokenize(original_str))
    print type(article_tok)
    print "token: "
    print article_tok
    # keep nouns and adjectives only
    str_noun = ''
    for word, tag in article_tok:
        if ("NN" in tag) or ("JJ" in tag):
            # print(word, ":", tag)
            # print(wnl.lemmatize(word))
            try:
                stemming_word = wnl.lemmatize(word)
                print stemming_word
                if len(word) > 1:
                    str_noun = str_noun + stemming_word + " "
            except UnicodeDecodeError as e:
                print "error: " + word
    # result
    # final_doc.append(str_noun)
    # print "return_preprocess : " + str_noun
    return str_noun
def lemmstem(sentences):
    '''
    This function is responsible for performing the lemmatization and stemming of the words
    Input: A list of trees containing the sentences. All words are classified by their NE type
    Output: Lemmatized/stemmed sentences
    '''
    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    dic = {'VB': wordnet.VERB, 'NN': wordnet.NOUN, 'JJ': wordnet.ADJ, 'RB': wordnet.ADV}
    for sent in sentences:
        lvsidx = sent.treepositions('leaves')
        for pos in lvsidx:
            word = sent[pos][0]
            tag = sent[pos][1]
            rtag = tag[0:2]
            if rtag in dic:
                lemm = lmtzr.lemmatize(word, dic[rtag])
                stem = st.stem(lemm)
                # print word, lemm, stem  # damned line
                sent[pos] = (word, tag, stem)
            else:
                sent[pos] = (word, tag, word)
    return sentences
def feature_extractor_tripadvisor_top_words_weights(data): data = data.decode('utf-8') top_file = open('scraper/top_words.txt', 'r') top_words = [word.replace('\n', '') for word in top_file] places_file = open('scraper/places.txt', 'r') for place in places_file: place = place.replace('\n', '') for word in place.split(' '): if word != '-': top_words.append(word) features = {} lemmatizer = WordNetLemmatizer() stop_words = stopwords.words('english') words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)] for word in words: if word not in stop_words: if word in features: if word in top_words: features[word] += 1.5 else: features[word] += 1 else: if word in top_words: features[word] = 1.5 else: features[word] = 1 return features
class NLTKPreprocessor(BaseEstimator, TransformerMixin): def __init__(self, stopwords=None, punct=None, lower=True, strip=True): # print("Inside NLTk cosnt") self.lower = lower self.strip = strip self.stopwords = stopwords or set(sw.words('english')) self.punct = punct or set(string.punctuation) self.lemmatizer = WordNetLemmatizer() def fit(self, X, y=None): #print("In fit") return self def inverse_transform(self, X): #print("In inverse") return [" ".join(doc) for doc in X] def transform(self, X): #print("In transorm") return [list(self.tokenize(doc)) for doc in X] def tokenize(self, document): # Break the document into sentences #print("In tokenize") for sent in sent_tokenize(document): print("sent", sent) # Break the sentence into part of speech tagged tokens try: for token, tag in pos_tag(wordpunct_tokenize(sent)): # Apply preprocessing to the token token = token.lower() if self.lower else token token = token.strip() if self.strip else token token = token.strip('_') if self.strip else token token = token.strip('*') if self.strip else token # If stopword, ignore token and continue # if token in self.stopwords: # continue # If punctuation, ignore token and continue if all(char in self.punct for char in token): continue # Lemmatize the token and yield lemma = self.lemmatize(token, tag) yield lemma except: print("In token tag") def lemmatize(self, token, tag): #print("In leammarize") tag = { 'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ }.get(tag[0], wn.NOUN) return self.lemmatizer.lemmatize(token, tag)
class Preprocessor(BaseEstimator, TransformerMixin): def __init__(self, stopwords=None, punct=None, lower=True, strip=True): self.stopwords = set(stopwords) if stopwords else set( sw.words('english')) self.punct = punct if punct else set(string.punctuation) self.lower = lower self.strip = strip self.lemmatizer = WordNetLemmatizer() def fit(self, X, y=None): return self def inverse_transform(self, X): return X def transform(self, X): return [list(self.tokenize(doc)) for doc in X] def tokenize(self, document): """ Returns a normalized, lemmatized list of tokens from a document by applying segmentation (breaking into sentences), then word/punctuation tokenization, and finally part of speech tagging. It uses the part of speech tags to look up the lemma in WordNet, and returns the lowercase version of all the words, removing stopwords and punctuation. """ # Break the document into sentences for sentence in sent_tokenize(document): # Break the sentence into part of speech tagged token for token, tag in pos_tag(wordpunct_tokenize(sentence)): # Applying preprocessing to the token token = token.lower() if self.lower else token token = token.strip() if self.strip else token token = token.strip('_') if self.strip else token token = token.strip('*') if self.strip else token # If punctuation of stopword, ignore the token and continue if token in self.stopwords or all(char in self.punct for char in token): continue # Lemmatize the token and yield lemma = self.lemmatize(token, tag) yield lemma def lemmatize(self, token, tag): """ Converts the Penn Treebank tag to a WordNet POS tag, then uses that tag to perform much more accurate WordNet lemmatization. """ tag = { 'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ }.get(tag[0], wn.NOUN) return self.lemmatizer.lemmatize(token, tag)
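# Hedged usage sketch for the Preprocessor above: because it implements fit/transform, it can
# sit in front of a vectorizer inside an sklearn Pipeline. The TfidfVectorizer settings below
# are illustrative assumptions, not part of the original code; since the transformer already
# yields token lists, the vectorizer is told not to tokenize or lowercase again.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('preprocess', Preprocessor()),
    ('tfidf', TfidfVectorizer(tokenizer=lambda doc: doc, preprocessor=lambda doc: doc, lowercase=False)),
])
tfidf_matrix = pipeline.fit_transform(["The cats were sitting on the mats."])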
def process_words(self, text):
    result = []
    lem = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    for word in word_tokens:
        if word not in stop_words:
            result.append(lem.lemmatize(word))
    return ','.join(result).replace(',', ' ')
def preprocess(sentence):
    """
    Preprocess the data by splitting it into words and linking the different forms of the same word
    """
    tokens = word_tokenize(sentence)
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in tokens]
def get_lemmatized_text(corpus):
    import nltk
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    return [
        ' '.join([lemmatizer.lemmatize(word) for word in review.split()])
        for review in corpus
    ]
def lemmatize_verbs(words):
    """Lemmatize verbs in a list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas
def preprocess(text):
    lemma = WordNetLemmatizer()
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token.isalpha()]
    tokens = [lemma.lemmatize(word.lower(), pos="v") for word in tokens]
    tokens = [lemma.lemmatize(word.lower(), pos="n") for word in tokens]
    return tokens
def preprocess(sentence):
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    return [
        lemmatizer.lemmatize(word.lower())
        for word in tokenizer.tokenize(unicode(sentence, errors='ignore'))
        if not word.startswith('/')
    ]
class NLTKPreprocessor(BaseEstimator, TransformerMixin): def __init__(self, stopwords=[], punct=[], lower=True, strip=True, lemmatize=True, ignore_type=[]): self.lower = lower self.strip = strip self.ignore_type = ignore_type self.stopwords = stopwords self.punct = punct self.do_lemmatize = lemmatize self.lemmatizer = WordNetLemmatizer() def fit(self, X, y=None): return self def inverse_transform(self, X): return [" ".join(doc) for doc in X] def transform(self, X): return [list(self.tokenize(doc)) for doc in X] def tokenize(self, document): for sent in sent_tokenize(document): for token, tag in pos_tag(wordpunct_tokenize(sent)): token = token.lower() if self.lower else token token = token.strip() if self.strip else token token = token.strip('_') if self.strip else token token = token.strip('*') if self.strip else token if token in self.stopwords: continue if all(char in self.punct for char in token): continue if self.do_lemmatize: lemma = self.lemmatize(token, tag, self.ignore_type) yield lemma else: yield token def lemmatize(self, token, tag, ignore_type=['N']): tag = { 'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ }.get(tag[0], wn.NOUN) # Ignore nouns by default to account for plurals if tag in ignore_type: return token else: return self.lemmatizer.lemmatize(token, tag)
def tokenize_cleaned(document, th_tokenizer, thai_char, stopwords_en, stopwords_th, keywords):
    """
    Tokenize and lemmatize tokens in document.
    :param document: Document in string
    :param th_tokenizer: Thai tokenizer function (returns list)
    :param thai_char: re.compile pattern containing all Thai alphabets.
    :param stopwords_en: set() of English stop words
    :param stopwords_th: set() of Thai stop words
    :param keywords: set() of keywords.
    :return: list of tokens.
    """
    from copy import deepcopy
    from nltk import WordNetLemmatizer
    import nltk
    if './Resource/nltk_data' not in nltk.data.path:
        nltk.data.path.append('./Resource/nltk_data')
    if '../Resource/nltk_data' not in nltk.data.path:
        nltk.data.path.append('../Resource/nltk_data')

    def test_all_en_alpha(text):
        # test if all characters in the string are English alphabet
        # (ranges end at 91/123 so that 'Z' and 'z' are included)
        roman_alpha = [chr(alpha) for alpha in range(65, 91)] + \
                      [chr(alpha) for alpha in range(97, 123)]
        for alpha in text:
            if alpha not in roman_alpha:
                return False
        return True

    word_stem_func = WordNetLemmatizer()  # declare English lemmatizer.
    document = deepcopy(document)
    document = document.split(' ')  # split to form a list of phrases which are separated by '\s'
    # remove English stop words.
    document = [token.lower() for token in document if token not in stopwords_en]
    # Lemmatize English tokens.
    document = [word_stem_func.lemmatize(token)
                if test_all_en_alpha(token) and token not in keywords  # do not lemmatize keywords.
                else token for token in document]
    # tokenize Thai phrases.
    tokenized = []
    for token in document:
        if thai_char.search(token):  # check if phrase is in Thai
            tokenized.extend(th_tokenizer(token))  # extend to include a list of Thai tokens
        else:
            tokenized.append(token)  # append non-Thai tokens
    # remove Thai stop words
    for token_index in reversed(range(len(tokenized))):  # iterate backward
        if tokenized[token_index] in stopwords_th:  # if token is a Thai stop word
            tokenized.pop(token_index)  # remove Thai stop word from the token list
    return tokenized
def Preprocessing(df, contractions): pd.options.mode.chained_assignment = None contractionsDict = {} for i in contractions['data']: contractionsDict[i[0]] = i[1] # remove url df['sentence'] = df['sentence'].str.replace('http\S+|www.\S+', '', case=False) # remove number df['sentence'] = df['sentence'].str.replace('\d+', '') # remove hashtags df['sentence'] = df['sentence'].str.replace('#(\w+)', '') # change all text with contraction for index, row in df.iterrows(): row[1] = ' '.join([ str(x) for x in [ contractionsDict[t] if t in contractionsDict.keys() else t for t in [e.lower() for e in row[1].split()] ] ]) # remove stopword stop_words = [] for word in stopwords.words('english'): stop_words.append(word) if ('not' not in word and 'no' not in word) else stop_words # remove punctuation tokenizer = RegexpTokenizer(r'\w+') for index, row in df.iterrows(): word_tokens = tokenizer.tokenize(row[1]) row[1] = ' '.join( [w for w in word_tokens if not w.lower() in stop_words]) # using lemmetizer wordnet_lemmatizer = WordNetLemmatizer() for index, row in df.iterrows(): row[1] = ' '.join( wordnet_lemmatizer.lemmatize(t) for t in row[1].split()) # remove non-english word english_words = set(nltk.corpus.words.words()) for index, row in df.iterrows(): word_tokens = tokenizer.tokenize(row[1]) row[1] = " ".join(w for w in word_tokens if w.lower() in english_words or not w.isalpha()) # remove non-alphabetic characters for index, row in df.iterrows(): word_tokens = tokenizer.tokenize(row[1]) row[1] = " ".join(w for w in word_tokens if w.isalpha()) return df
class NLTKPreprocessor(BaseEstimator, TransformerMixin): #it loads a variety of corpora and models for use in tokenization. #By default the set of english stopwords from NLTK is used, and the WordNetLemmatizer #looks up data from the WordNet lexicon. Note that this takes a noticeable amount of time, #and should only be done on instantiation of the transformer. def __init__(self, stopwords=None, punct=None, lower=True, strip=True): self.lower = lower self.strip = strip self.stopwords = stopwords or set(sw.words('english')) self.punct = punct or set(string.punctuation) self.lemmatizer = WordNetLemmatizer() def fit(self, X, y=None): return self def inverse_transform(self, X): return [" ".join(doc) for doc in X] def transform(self, X): return [list(self.tokenize(doc)) for doc in X] #The tokenize method breaks raw strings into sentences, #then breaks those sentences into words and punctuation, #and applies a part of speech tag. The token is then normalized: #made lower case, then stripped of whitespace and other types of punctuation that may be appended. #If the token is a stopword or if every character is punctuation, the token is ignored. #If it is not ignored, the part of speech is used to lemmatize the token, which is then yielded def tokenize(self, document): # Break the document into sentences for sent in sent_tokenize(document): # Break the sentence into part of speech tagged tokens for token, tag in pos_tag(wordpunct_tokenize(sent)): # Apply preprocessing to the token token = token.lower() if self.lower else token token = token.strip() if self.strip else token token = token.strip('_') if self.strip else token token = token.strip('*') if self.strip else token # If stopword, ignore token and continue if token in self.stopwords: continue # If punctuation, ignore token and continue if all(char in self.punct for char in token): continue # Lemmatize the token and yield lemma = self.lemmatize(token, tag) yield lemma def lemmatize(self, token, tag): tag = { 'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ }.get(tag[0], wn.NOUN) return self.lemmatizer.lemmatize(token, tag)
def pre_tokenize(train, dev, test): from nltk.tokenize import TweetTokenizer from nltk.corpus import stopwords tt = TweetTokenizer() stopwords = set(stopwords.words('english')) x_train = [] x_dev = [] x_test = [] wnl = WordNetLemmatizer() stemmer = PorterStemmer() for w in train: list_words = tt.tokenize(w) list_words = [ w.lower() for w in list_words if isinstance(w, str) == True ] list_words = [w for w in list_words if w.isalpha()] filtered_words = [w for w in list_words if not w in stopwords] filtered_words = [wnl.lemmatize(w, 'n') for w in filtered_words] filtered_words = [wnl.lemmatize(w, 'v') for w in filtered_words] filtered_words = [stemmer.stem(w) for w in filtered_words] x_train.append(' '.join(filtered_words)) for w in dev: list_words = tt.tokenize(w) list_words = [ w.lower() for w in list_words if isinstance(w, str) == True ] list_words = [w for w in list_words if w.isalpha()] filtered_words = [w for w in list_words if not w in stopwords] filtered_words = [wnl.lemmatize(w, 'n') for w in filtered_words] filtered_words = [wnl.lemmatize(w, 'v') for w in filtered_words] filtered_words = [stemmer.stem(w) for w in filtered_words] x_dev.append(' '.join(filtered_words)) for w in test: list_words = tt.tokenize(w) list_words = [ w.lower() for w in list_words if isinstance(w, str) == True ] list_words = [w for w in list_words if w.isalpha()] filtered_words = [w for w in list_words if not w in stopwords] filtered_words = [wnl.lemmatize(w, 'n') for w in filtered_words] filtered_words = [wnl.lemmatize(w, 'v') for w in filtered_words] filtered_words = [stemmer.stem(w) for w in filtered_words] x_test.append(' '.join(filtered_words)) return x_train, x_dev, x_test
class WordComplexityLexicon: def __init__(self, lexicon): word_ratings = {} for line in open(lexicon): tokens = [t.strip() for t in line.strip().split('\t')] word_ratings[tokens[0].lower()] = float(tokens[1]) self.word_ratings = word_ratings self.lemmatizer = WordNetLemmatizer() self.lancaster_stemmer = LancasterStemmer(strip_prefix_flag=True) self.snowball_stemmer = SnowballStemmer("english") def get_feature(self, words): phrase = max(words, key=len) if phrase in self.word_ratings: return [self.word_ratings[phrase], 1.0] else: ratings = [] lemman = self.lemmatizer.lemmatize(phrase, pos='n') lemmav = self.lemmatizer.lemmatize(phrase, pos='v') lemmaa = self.lemmatizer.lemmatize(phrase, pos='a') lemmar = self.lemmatizer.lemmatize(phrase, pos='r') stem_lan = self.lancaster_stemmer.stem(phrase) try: stem_snow = self.snowball_stemmer.stem(phrase) except TypeError: stem_snow = "" if lemman in self.word_ratings: ratings.append(self.word_ratings[lemman]) elif lemmav in self.word_ratings: ratings.append(self.word_ratings[lemmav]) elif lemmaa in self.word_ratings: ratings.append(self.word_ratings[lemmaa]) elif lemmar in self.word_ratings: ratings.append(self.word_ratings[lemmar]) elif stem_snow in self.word_ratings: ratings.append(self.word_ratings[stem_snow]) elif stem_lan in self.word_ratings and abs(len(stem_lan) - len(phrase)) <= 2: ratings.append(self.word_ratings[stem_lan]) if len(ratings) > 0: return [max(ratings)*1.0, 1.0] return [0.0, 0.0]
def process(comment):
    lemma = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(comment)
    filtered_sentence = ""
    for w in tokens:
        if w not in stop_words:
            filtered_sentence = filtered_sentence + lemma.lemmatize(w) + " "
    return filtered_sentence
def words_and_types(text):
    tokens = [word.lower() for word in word_tokenize(text) if word.isalpha()]
    wordtypes_ordered = nltk.pos_tag(tokens, tagset='universal')
    wordtypes = dict(wordtypes_ordered)
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word, pos=wordnet_tag(wordtypes[word])) for word in tokens]
    wordtypes = dict([(tokens[i], wordtypes_ordered[i][1]) for i in range(len(tokens))])
    sys.stdout.flush()
    return tokens, wordtypes
def word_extractor2(text):
    wordlemmatizer = WordNetLemmatizer()
    text = re.sub(r'([a-z])\1+', r'\1\1', text)  # collapse runs of a repeated letter down to two
    words = ""
    wordtokens = [wordlemmatizer.lemmatize(word.lower())
                  for word in word_tokenize(text.decode('utf-8', 'ignore'))]
    for word in wordtokens:
        words += " " + word
    return words
def Check(mArray):
    # what am I checking?
    item = mArray[1]
    lmtzr = WordNetLemmatizer()
    item = lmtzr.lemmatize(item)
    # converts to a string
    return ''.join(item)
class Preprocessor(object): def __init__(self): self.tokenizer = RegexpTokenizer(r'\w+') self.stopwords_eng = stopwords.words('english') self.lemmatizer = WordNetLemmatizer() def __call__(self, doc): return [ self.lemmatizer.lemmatize(t) for t in self.tokenizer.tokenize(doc) ] def process(self, text): tokens = self.tokenizer.tokenize(text.lower()) tokens_processed = [] for t in tokens: if t in self.stopwords_eng: continue tokens_processed.append(self.lemmatizer.lemmatize(t)) return tokens_processed
def lemmatize(tokens_list):
    """ Uses the WordNet lemmatizer to lemmatize """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_list = []
    for i in tokens_list:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(i, get_pos(i)))
    return lemmatized_list
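# lemmatize() above calls a get_pos() helper that is not shown. A plausible sketch (an
# assumption, not the original author's code): tag the single token with nltk.pos_tag and
# map the Penn Treebank tag onto the PoS constants WordNetLemmatizer expects.
from nltk import pos_tag
from nltk.corpus import wordnet


def get_pos(word):
    # pos_tag wants a list of tokens, so wrap the single word
    tag = pos_tag([word])[0][1]
    return {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}.get(tag[0], wordnet.NOUN)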
def Check(mArray):
    # what am I checking?
    # Taking the 2nd item in the array since Popen puts the file path as the first item.
    item = mArray[1]
    lmtzr = WordNetLemmatizer()
    item = lmtzr.lemmatize(item, get_wordnet_pos(item))
    # converts to a string
    return ''.join(item)
def add_lemmatizer(): in_fp = open(word_topic_file) out_fp = open(word_topic_lexeme_file, 'w') wnl = WordNetLemmatizer() ### line = '' line_num = 0 while 1 and line_num < max_line_num: line = in_fp.readline() line = line.strip() line_words = line.split(' ') line_write = '' for words in line_words: word_topic = words.split(':') word_id = word_topic[0] topic_id = word_topic[1] line_write += word_id line_write += ':' line_write += topic_id line_write += ':' ## if id_word_dict.has_key(word_id): word = id_word_dict[word_id] if word_lexeme_id_dict.has_key(word): line_write += word_lexeme_id_dict[word] line_write += ' ' else: word_list = [] word_list.append(word) pos = pt(word_list) tag = pos[0][1] lexeme = wnl.lemmatize(word, penn_to_wn(tag)) #print ': ', word, lexeme if word_id_dict.has_key(lexeme): lexeme_id = word_id_dict[lexeme] word_lexeme_id_dict[word] = lexeme_id line_write += lexeme_id line_write += ' ' else: word_lexeme_id_dict[word] = word_id line_write += word_id line_write += ' ' ## line_write = line_write.strip() out_fp.write(line_write) if line_num < max_line_num -1: out_fp.write('\n') line_num += 1 if line_num%1000 ==0: print 'line: ', line_num ### in_fp.close() out_fp.close()
class NLTKPreprocessor(BaseEstimator, TransformerMixin): def __init__(self, stopwords=None, punct=None, lower=True, strip=True): self.lower = lower self.strip = strip #self.stopwords = stopwords or set(sw.words('english')) self.punct = punct or set(string.punctuation) self.lemmatizer = WordNetLemmatizer() def fit(self, X, y=None): return self def inverse_transform(self, X): return [" ".join(doc) for doc in X] def transform(self, X): return [ list(self.tokenize(doc)) for doc in X ] def tokenize(self, document): # Break the document into sentences for sent in sent_tokenize(document): # Break the sentence into part of speech tagged tokens for token, tag in pos_tag(wordpunct_tokenize(sent)): # Apply preprocessing to the token token = token.lower() if self.lower else token token = token.strip() if self.strip else token token = token.strip('_') if self.strip else token token = token.strip('*') if self.strip else token # If stopword, ignore token and continue # if token in self.stopwords: # continue # If punctuation, ignore token and continue if all(char in self.punct for char in token): continue # Lemmatize the token and yield lemma = self.lemmatize(token, tag) yield lemma def lemmatize(self, token, tag): tag = { 'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ }.get(tag[0], wn.NOUN) return self.lemmatizer.lemmatize(token, tag)
def check_whole_collocation(self, sentence):
    """
    Generate part1 + part2 possible variants
    :param sentence: sentence to search for the collocation in
    :return: the collocation as a tuple (col1, col2), or False
    """
    collocate_set = set()
    lmtzr = WordNetLemmatizer()
    lemmas1 = lmtzr.lemmatize(self.first_col)
    lemmas2 = lmtzr.lemmatize(self.second_col)
    try:
        for lem1 in self.lemma_dictionary[lemmas1]:
            for lem2 in self.lemma_dictionary[lemmas2]:
                collocate_set.add((lem1, lem2))
    except KeyError:
        # lemma not present in the dictionary, fall back to the lemmatized pair itself
        collocate_set.add((lemmas1, lemmas2))
    for col in collocate_set:
        my_col = col[0] + " " + col[1]
        if my_col in sentence:
            return col
    return False
def wn_lemmatize(lemma):
    """
    Auxiliary function for pos_lemmatizing (below)

    Lemmatize the supplied (word, pos) pair using nltk.stem.WordNetLemmatizer.
    If the tag corresponds to a WordNet tag, then we convert to that one and
    use it, else we just use the string for lemmatizing.
    """
    string, tag = lemma
    string = string.lower()
    tag = tag.lower()
    wnl = WordNetLemmatizer()
    if tag.startswith('v'):
        tag = 'v'
    elif tag.startswith('n'):
        tag = 'n'
    elif tag.startswith('j'):
        tag = 'a'
    elif tag.startswith('rb'):
        tag = 'r'
    if tag in ('a', 'n', 'r', 'v'):
        return wnl.lemmatize(string, tag)
    else:
        return wnl.lemmatize(string)
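# Hedged usage sketch for wn_lemmatize() above: the (word, tag) pairs it expects are the
# ones produced by nltk.pos_tag; the pos_lemmatizing() wrapper it mentions is not shown
# here, so this loop only illustrates the expected input shape.
from nltk import pos_tag, word_tokenize

tagged = pos_tag(word_tokenize("The cats were running quickly"))
print([wn_lemmatize(pair) for pair in tagged])
# something like ['the', 'cat', 'be', 'run', 'quickly']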
def review_to_words(raw_review, need_to_lemmatize=False):
    # Function to convert a raw review to a string of words
    # optional lemmatization
    meaningful_words = review_to_wordlist(raw_review)
    if need_to_lemmatize:
        wnl = WordNetLemmatizer()
        meaningful_words = [wnl.lemmatize(w) for w in meaningful_words]
    # Join the words back into one string separated by spaces
    return " ".join(meaningful_words)
def feature_extractor(data):
    data = data.decode('utf-8')
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]
    for word in words:
        if word not in stop_words:
            if word in features:
                features[word] += 1
            else:
                features[word] = 1
    return features
def get_words(document):
    ''' Return a frequency distribution of the cleaned, lemmatized words in document '''
    regex1 = re.compile('\W')           # match non-alphanumeric
    regex2 = re.compile('&(#)*(\w)*;')  # match html entities
    regex3 = re.compile('( ){2,}')      # match 2 or more spaces
    lemmatizer = WordNetLemmatizer()
    tokenizer = WhitespaceTokenizer()
    # lowercase document, remove punctuation and html entities
    document = regex3.sub(' ', regex2.sub(' ', regex1.sub(' ', document.lower())))
    words = [
        lemmatizer.lemmatize(word)
        for word in tokenizer.tokenize(document)
        if word not in STOPWORDS and len(word) > 2
    ]
    return FreqDist(words)
def feature_extractor_tripadvisor_top_words_weights(data): """ Extract features using the top words with weights method using words from TripAdvisor parameter: data (tweet) returns: returns features of the given data """ data = data.decode('utf-8') # retrieve file of top 100 frequently-occurring words from TripAdvisor comments top_file = open('classifier/top_words.txt', 'r') top_words = [word.replace('\n', '') for word in top_file] # retrieve file of 100 places from TripAdvisor places_file = open('classifier/places.txt', 'r') # clean places file for place in places_file: place = place.replace('\n', '') for word in place.split(' '): if word != '-': top_words.append(word) features = {} lemmatizer = WordNetLemmatizer() stop_words = stopwords.words('english') # preprocessing: tokenize, convert to lowercase and lemmatize words words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)] # remove stop words and add words and their frequencies as features # if word is found in the top words list, increase by 1.5 or preferred weight for word in words: if word not in stop_words: if word in features: if word in top_words: features[word] += 1.5 else: features[word] += 1 else: if word in top_words: features[word] = 1.5 else: features[word] = 1 return features
def __word_cleaner(self, sentence):
    """ Removes the unwanted words in the sentence. """
    lematizer = WordNetLemmatizer()
    # get individual words from the text
    words = [lematizer.lemmatize(word.lower()) for word in word_tokenize(sentence)]
    final_words = []
    for word in words:
        word = word.encode('utf-8', 'ignore')
        if len(word) > 1:
            # check if the word is not a stop word
            if word not in stopwords.stop_words:
                final_words.append(word)
    return ' '.join(final_words)
def _lemmatize_words(text):
    """Lemmatize all words in the text."""
    lemmatizer = WordNetLemmatizer()
    lemmatizations = {}
    tokens = text.split()
    for word in tokens:
        if word not in lemmatizations:
            lemmatizations[word] = lemmatizer.lemmatize(word)
    for i in xrange(5):  # Need to repeat several times to be safe
        tokens = text.split()
        for j in xrange(len(tokens)):
            try:
                tokens[j] = lemmatizations[tokens[j]]
            except KeyError:
                # During the last pass, words were turned into their lemmas,
                # which don't have entries in lemmatizations
                pass
        text = ' '.join(tokens)
    return text
def feature_extractor(d):
    features = {}
    lematizer = WordNetLemmatizer()
    # get individual words from the text
    words = [lematizer.lemmatize(word.lower()) for word in word_tokenize(d)]
    for word in words:
        word = word.encode('utf-8', 'ignore')
        if len(word) > 1:
            # check if the word is not a stop word
            if word not in stopwords.stop_words:
                # check that the word is not a url or @person
                if not re.match('http://.*|@.*', word):
                    if word in features:
                        features[word] += 1
                    else:
                        features[word] = 1
    return features
def word_extractor2(text, sw):
    wordlemmatizer = WordNetLemmatizer()
    # Get the English stop word list
    commonwords = stopwords.words('english')
    text = re.sub(r'([a-z])\1+', r'\1\1', text)
    words = ""
    # Lower-case and lemmatize each token
    wordtokens = [wordlemmatizer.lemmatize(word.lower())
                  for word in word_tokenize(text.decode('utf-8', 'ignore'))]
    # Drop tokens that belong to the stop word set when sw == True
    if sw == True:
        for word in wordtokens:
            if word not in commonwords:
                words += " " + word
    else:
        for word in wordtokens:
            words += " " + word
    return words
def tokens(document, lowercase=True, tokenize='no_digits', stopwords=False, stemming=None, lemmatize=False): """Tokenize a raw string based on passed tokenization options.""" raw_doc = unicode(document, errors='ignore') # adjust case if lowercase: raw_doc = raw_doc.lower() # tokenize according to specifications if tokenize == 'symbols': tokenizer = RegexpTokenizer(r'[\'\w\-]+') if tokenize == 'no_symbols': tokenizer = RegexpTokenizer(r'\w+') if tokenize == 'no_digits': tokenizer = RegexpTokenizer(r'[A-Za-z]+') tokens = tokenizer.tokenize(raw_doc) if not stopwords: stop = sw.words('english') tokens = [word for word in tokens if word not in stop] if stemming and lemmatize: print ('Error: can only choose one of stemming or lemmatize. ' 'Choosing stemming') lemmatize = False if stemming: if stemming == 'porter': stemmer = PorterStemmer() if stemming == 'lancaster': stemmer = LancasterStemmer() if stemming == 'snowball': stemmer = SnowballStemmer('english') tokens = [stemmer.stem(word) for word in tokens] if lemmatize: wnl = WordNetLemmatizer() tokens = [wnl.lemmatize(word) for word in tokens] return tokens
def tokenize(article):
    '''
    INPUT string
    OUTPUT list

    This is a tokenizer to replace the default tokenizer in TfidfVectorizer
    '''
    stop = stop_words()
    tokens = [word.lower() for word in word_tokenize(article)]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    # now remove stop words
    tokens = [word for word in tokens if word not in stop]
    # remove words with fewer than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    return tokens