def tokenize(multi_word_queries, text):
    """Returns a list of words that make up the text.

    Params: {multi_word_queries: List, text: String}
    Returns: List
    """
    lower_case = text.lower()
    tokenizer = RegexpTokenizer(
        r'not\s+very\s+[a-z]+|not\s+[a-z]+|no\s+[a-z]+|[a-z]+')
    result = tokenizer.tokenize(lower_case)
    multi_tokenizer = MWETokenizer([('working', 'out'), ('coffee', 'shops'),
                                    ('average', 'prices'), ('union', 'square'),
                                    ('real', 'estate'), ('ice', 'cream'),
                                    ('whole', 'foods'), ('co', 'op'),
                                    ('wall', 'street'), ('world', 'trade'),
                                    ('high', 'school'), ('dim', 'sum'),
                                    ('empire', 'state'), ('high', 'rise'),
                                    ('walk', 'ups')])
    if len(multi_word_queries) > 0:
        for tok in multi_word_queries:
            if len(tok.split('_')) > 1:
                multi_tokenizer.add_mwe(tuple(tok.split('_')))
    # add neighborhood names
    for n in neighborhood_name_phrases:
        multi_tokenizer.add_mwe(tuple(n.split('_')))
    result2 = multi_tokenizer.tokenize(result)
    return result2
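# A minimal usage sketch for tokenize() above, assuming the nltk imports and
# the module-level neighborhood_name_phrases list it reads; the phrase values
# here are hypothetical.
from nltk.tokenize import MWETokenizer, RegexpTokenizer

neighborhood_name_phrases = ['east_village', 'upper_west_side']  # assumed global

print(tokenize(['dog_park'],
               'No parking near the dog park in the East Village'))
# negation spans matched by the regex come back as single space-joined tokens
# (e.g. 'no parking'), while added MWEs are merged with underscores:
# ['no parking', 'near', 'the', 'dog_park', 'in', 'the', 'east_village']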
def trim_bio(text):
    # keywords to return
    keywords = []
    # load from file after custom edit
    df_keyword = pd.read_csv(local_data + "data/keywords/df.csv")
    # convert df to list
    important_words = df_keyword["Unnamed: 0"].tolist()
    # format important words so that they can be registered to the tokenizer
    important_words = [x.split() for x in important_words]
    # initialize tokenizer and register the important words
    tokenizer = MWETokenizer()
    for iw in important_words:
        tokenizer.add_mwe(iw)
    # tokenize bio
    tokens = tokenizer.tokenize([word.lower() for word in text.split()])
    # find important words among the tokens and append them to keywords
    for iw in important_words:
        iw_joined = "_".join(iw)
        if iw_joined in tokens:
            keywords.append(iw_joined)
    return keywords
def tokenization(docs):
    documents = {}
    for doc in docs:
        document_plain = docs[doc]
        document_plain = document_plain.replace("/", "").replace("-", " ")
        # drop parenthesized numbers; re.sub returns a new string, so assign it
        document_plain = re.sub(r'\([0-9]*\)', '', document_plain)
        relevant_words = []
        mwetokenizer = MWETokenizer()
        document_ner = spacy_nlp(document_plain)
        for element in document_ner.ents:
            # don't consider numbers
            if element.label_ != "CARDINAL":
                relevant_words.append(element)
        # for each relevant word, if whitespace is present,
        # create a single token out of all the words
        for word in relevant_words:
            token = str(word).split()
            if len(token) > 1:
                mwetokenizer.add_mwe(tuple(token))
        document_tokenized = word_tokenize(document_plain)
        document_retokenized = mwetokenizer.tokenize(document_tokenized)
        documents[doc] = document_retokenized
    return documents
def get_context(self, query_str, text, k=10):
    if query_str in text:
        tokenizer = MWETokenizer()
        query_str_tokens = tuple(query_str.split())
        query_str_dashed = "_".join(query_str_tokens)
        tokenizer.add_mwe(query_str_tokens)
        text_token = tokenizer.tokenize(text.split())
        try:
            t_start = text_token.index(query_str_dashed)
        except ValueError:
            return None, None, None
        t_end = t_start + 1
        start_index = max(t_start - k, 0)
        end_index = min(t_end + k, len(text_token))
        text_token_query = (text_token[start_index:t_start] +
                            text_token[t_end + 1:end_index])
        context = " ".join(text_token_query)
        context_mention = (text_token[start_index:t_start] + [query_str] +
                           text_token[t_end + 1:end_index])
        context_mention = " ".join(context_mention)
        return context, text_token_query, context_mention
    else:
        logging.info('error, query not in text')
        return None, None, None
def init_base_order_tokenizer():
    p = nltk.PorterStemmer()
    food_tokenizer = MWETokenizer()
    food_items = {}
    prices_items = {}
    image_items = {}
    cal_items = {}
    with open('sheet1.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            food_item = row['Menu Item'].replace(' ', '_').lower()
            price = float(row['Price'])
            image = row['Image']
            cal = float(row['Calories'])
            image_items[food_item] = image
            food_items[food_item] = 0
            prices_items[food_item] = price
            cal_items[food_item] = cal
            items_stem = [p.stem(i) for i in row['Menu Item'].lower().split(' ')]
            if len(items_stem) > 1:
                food_tokenizer.add_mwe(tuple(items_stem))
    with open('mwe.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            items_stem = [p.stem(i) for i in row['Menu Item'].lower().split(' ')]
            if len(items_stem) > 1:
                food_tokenizer.add_mwe(tuple(items_stem))
    return food_tokenizer, food_items, prices_items, cal_items, image_items
def initialize_known_phrase_tokenization(phrases):
    from nltk.tokenize import MWETokenizer
    tokenizer = MWETokenizer()
    for phrase in phrases:
        if phrase:
            phrase_as_list = phrase.replace("_", " ").split()
            tokenizer.add_mwe(phrase_as_list)
    return tokenizer
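# A minimal sketch of how initialize_known_phrase_tokenization() might be
# used; the phrase list is hypothetical. Underscore- and space-delimited
# phrases are both accepted, since underscores are normalized to spaces
# before splitting.
tokenizer = initialize_known_phrase_tokenization(['machine_learning', 'new york'])
print(tokenizer.tokenize('she studies machine learning in new york'.split()))
# -> ['she', 'studies', 'machine_learning', 'in', 'new_york']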
def phrase_eval(params):
    list_phrases, unigram_set, target_token, idf, agg_score, pid = params
    idf_list = [*idf]
    idf_set = set(idf_list)
    tokenizer = MWETokenizer(separator=' ')
    for e in unigram_set:
        tokenizer.add_mwe(nltk.word_tokenize(e))
    phrases_score = {}
    for phrase in tqdm(list_phrases,
                       desc='phrase-eval-{}'.format(pid),
                       mininterval=10):
        score = 0
        tokens = nltk.word_tokenize(phrase)
        if not set(tokens).issubset(idf_set):
            continue
        nonstop_tokens = [token for token in tokens if token not in stop]
        if len(nonstop_tokens) / len(tokens) <= 0.5:
            continue
        raw_tokenized = tokenizer.tokenize(tokens)
        tokenized_set = set(raw_tokenized)
        keywords = tokenized_set.intersection(unigram_set)
        for token in keywords:
            score += agg_score[token]
        score /= (1 + np.log(len(nonstop_tokens)))
        vocab = set(target_token).union(set(tokens))
        vocab = list(vocab.intersection(idf_set))
        target_vec = [0] * len(vocab)
        phrase_vec = [0] * len(vocab)
        target_token_freq = dict(Counter(target_token))
        target_token_subset = list(set(vocab).intersection(set(target_token)))
        for token in target_token_subset:
            index = vocab.index(token)
            target_vec[index] = (target_token_freq[token] /
                                 len(target_token) * idf[token])
        phrase_token_freq = dict(Counter(tokens))
        for token in tokens:
            index = vocab.index(token)
            phrase_vec[index] = (phrase_token_freq[token] /
                                 len(tokens) * idf[token])
        tfidf_sim = 1 - spatial.distance.cosine(target_vec, phrase_vec)
        phrases_score.update({phrase: {'score': score, 'eval': tfidf_sim}})
    rearrange = {}
    for k, v in phrases_score.items():
        rearrange.update({k: v['score']})
    top_10 = nlargest(10, rearrange, key=rearrange.get)
    return {key: phrases_score[key] for key in top_10}
def LoadTokenizer():
    global tokenizer
    tokenizer = MWETokenizer(separator=' ')
    for spword in WordDict:
        if ' ' in spword:
            tupleword = tuple(spword.split(' '))
            tokenizer.add_mwe(tupleword)
        if ':' in spword:
            tupleword = tuple(re.split(r"(:)", spword))
            tokenizer.add_mwe(tupleword)
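# A minimal sketch of LoadTokenizer() in action, assuming the module-level
# WordDict it reads and the imports shown; the entries are hypothetical.
import re
from nltk.tokenize import MWETokenizer

WordDict = ['ice cream', 'key:value']  # assumed global
LoadTokenizer()
print(tokenizer.tokenize(['ice', 'cream', 'is', 'great']))
# -> ['ice cream', 'is', 'great'] (separator=' ' rejoins the MWE with a space)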
def multi_word_tokenizer(relevant_words, text):
    mwetokenizer = MWETokenizer()
    # add each relevant word (possibly multi-word) as an MWE tuple
    for word in relevant_words:
        mwetokenizer.add_mwe(tuple(str(word).split()))
    # execute the multi-word tokenization
    return mwetokenizer.tokenize(text)
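# A quick sketch of multi_word_tokenizer(); note that `text` is expected to
# be a pre-split list of tokens, not a raw string.
print(multi_word_tokenizer(['New York', 'ice cream'],
                           ['I', 'ate', 'ice', 'cream', 'in', 'New', 'York']))
# -> ['I', 'ate', 'ice_cream', 'in', 'New_York']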
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()

    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    entityset = set(raw_list.split('\n'))
    tokenizer = MWETokenizer(separator=' ')
    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []
        with open('{}/{}'.format(args.input_dir, fname), 'r') as f:
            doc = f.readlines()

        for item in tqdm(doc, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                parsed = nlp(item_dict['text'])
                item_dict.update({'entityMentioned': mentioned_entity})
                unigram = [token.text
                           for token in textacy.extract.ngrams(
                               parsed, n=1, filter_nums=True,
                               filter_punct=True, filter_stops=True)]
                item_dict['unigram'] = unigram
                tokens = [token.text for token in parsed]
                pos = [token.pos_ for token in parsed]
                phrases = phrasemachine.get_phrases(tokens=tokens, postags=pos)
                item_dict['phrases'] = list(phrases['counts'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
def phrase_eval(params):
    list_phrases, unigram_set, target_token, idf, pid = params
    idf_list = [*idf]
    idf_set = set(idf_list)
    tokenizer = MWETokenizer(separator=' ')
    for e in unigram_set:
        tokenizer.add_mwe(nltk.word_tokenize(e))
    phrases_score = []
    for phrase in list_phrases:
        tokens = nltk.word_tokenize(phrase)
        if not set(tokens).issubset(idf_set):
            continue
        nonstop_tokens = [token for token in tokens if token not in stop]
        if len(nonstop_tokens) / len(tokens) <= 0.5:
            continue
        vocab = set(target_token).union(set(tokens))
        vocab = list(vocab.intersection(idf_set))
        target_vec = [0] * len(vocab)
        phrase_vec = [0] * len(vocab)
        target_token_freq = dict(Counter(target_token))
        target_token_subset = list(set(vocab).intersection(set(target_token)))
        for token in target_token_subset:
            index = vocab.index(token)
            target_vec[index] = (target_token_freq[token] /
                                 len(target_token) * idf[token])
        phrase_token_freq = dict(Counter(tokens))
        for token in tokens:
            index = vocab.index(token)
            phrase_vec[index] = (phrase_token_freq[token] /
                                 len(tokens) * idf[token])
        tfidf_sim = 1 - spatial.distance.cosine(target_vec, phrase_vec)
        phrases_score.append([phrase, tfidf_sim])
    return phrases_score
def __get_tokenizer(self):
    f = open(entity_path)
    mwetokenizer = MWETokenizer([], separator=' ')
    i = 30
    while True:
        i = i - 1
        if i <= 0:
            break
        line = f.readline()
        if not line:
            break
        indexed_token = line.lower().split()
        token = indexed_token
        token.pop(0)  # drop the leading index column
        token = tuple(token)
        mwetokenizer.add_mwe(token)
    f.close()
    return mwetokenizer
def trim_bio(text):
    # keywords to return
    keywords = []
    # important words were originally hard-coded (e.g. "data science",
    # "machine learning", "phd", "founder", "computer science");
    # they are now loaded from file after a custom edit
    df_keyword = pd.read_csv("data/keywords/df.csv")
    # convert df to list
    important_words = df_keyword["Unnamed: 0"].tolist()
    # format important words so that they can be registered to the tokenizer
    important_words = [x.split() for x in important_words]
    # initialize tokenizer and register the important words
    tokenizer = MWETokenizer()
    for iw in important_words:
        tokenizer.add_mwe(iw)
    # tokenize bio
    tokens = tokenizer.tokenize([word.lower() for word in text.split()])
    # find important words among the tokens and append them to keywords
    for iw in important_words:
        iw_joined = "_".join(iw)
        if iw_joined in tokens:
            keywords.append(iw_joined)
    return keywords
def tokenizer_sent(dataset):
    tokenizer = MWETokenizer()
    aspect_tokenized = []
    sentence_tokenized = []
    # register every aspect term as a multi-word expression;
    # add_mwe() returns None, so record the tuple itself
    for i in range(0, len(dataset.index)):
        aspect_split = tuple(dataset['aspect_term'][i].lower().split())
        tokenizer.add_mwe(aspect_split)
        aspect_tokenized.append(aspect_split)
    # tokenize and POS-tag each sentence, indexing with the loop variable
    for j in range(0, len(dataset.index)):
        tok = nltk.pos_tag(
            tokenizer.tokenize(dataset['text'][j].lower().split()))
        sentence_tokenized.append(tok)
    return aspect_tokenized, sentence_tokenized
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()

    entityset = set(raw_list.split('\n'))
    tokenizer = MWETokenizer(separator=' ')
    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []
        with open('{}/{}'.format(args.input_dir, fname), 'r') as f:
            doc = f.readlines()

        for item in tqdm(doc, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            if set(item_dict['nsubj']).issubset(pronoun) or item_dict['nsubj'] == []:
                continue
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                item_dict.update({'entityMentioned': mentioned_entity})
                item_dict['iid'] = '{}{}{}'.format(item_dict['did'],
                                                   item_dict['pid'],
                                                   item_dict['sid'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
def text_process(text):
    # number removal
    if text == -2:
        return ''
    body = re.sub(r'\d+', '', text)

    # punctuation removal, i.e. [!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~]
    punc = "/-'?!,#$%\'()*+-/:;<=>@\\^_`{|}~[]" + '""“”’'
    body = body.translate(body.maketrans(punc, " " * len(punc)))

    # lowercase the text
    body = body.lower()

    # multi-word tokenization
    multi_word_list = [('north', 'korea'), ('south', 'korea'),
                       ('north', 'korean'), ('south', 'korean'),
                       ('kim', 'jong', 'un'), ('park', 'geun', 'hye')]
    tokenizer = MWETokenizer()
    for mw in multi_word_list:
        tokenizer.add_mwe(mw)
    text = tokenizer.tokenize(body.split())

    # stopword removal
    stopset = set(stopwords.words('english'))
    text = [x for x in text if x not in stopset]
    text = [word for word in text if len(word) > 3]

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    lemma_text = [lemmatizer.lemmatize(x) for x in text]
    return lemma_text
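# A short sketch of text_process() above; it assumes the NLTK stopwords and
# wordnet corpora have been downloaded.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

print(text_process('North Korea fired 2 missiles toward South Korea today'))
# digits and punctuation are stripped, 'north_korea'/'south_korea' come back
# as single tokens, and stopwords and words of length <= 3 are dropped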
def Tokenize(text):
    tokenizer = MWETokenizer(category.all())
    # register every multi-word entry; note ' ' in word is the correct test
    # (str.find returns -1 when absent, which is truthy)
    for word in category:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in sub_category:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in brand:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in article:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    token = tokenizer.tokenize(text.split())
    tokens = []
    for word in token:
        word = word.replace("_", " ")
        tokens.append(word)
    return tokens
def main_thrd(query, num_process, input_dir, target):
    start_time = time.time()
    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    ##### sentence search #####
    input_files = os.listdir(input_dir)
    tasks = list(split(input_files, num_process))
    inputs = [(tasks[i], query, input_dir) for i in range(num_process)]

    with Pool(num_process) as p:
        search_results = p.map(sent_search, inputs)

    search_merge = search_results[0]['context']
    count_merge = search_results[0]['freq']

    for pid in range(1, len(search_results)):
        tmp_context = search_results[pid]['context']
        tmp_freq = search_results[pid]['freq']
        for ent in query:
            search_merge[ent] += tmp_context[ent]
            count_merge[ent]['total'] += tmp_freq[ent]['total']
            tmp_freq[ent].pop('total', None)
            count_merge[ent].update(tmp_freq[ent])

    for ent in query:
        for index in range(len(search_merge[ent])):
            search_merge[ent][index]['doc_score'] = count_merge[ent][
                search_merge[ent][index]['did']] / count_merge[ent]['total']

    print("--- search use %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    start_time = time.time()
    unigrams = []
    for ent in query:
        for sent in search_merge[ent]:
            unigrams += sent['unigram']
    unigram_set = set(unigrams)

    N = 0
    cnt = Counter()
    for ent in query:
        N += len(search_merge[ent])
        for sent in search_merge[ent]:
            cnt.update(sent['tokens'])
    cnt = dict(cnt)

    for ent in query:
        unigram_set.discard(ent)

    idf = {}
    for key in cnt.keys():
        idf.update({key: np.log(N / cnt[key])})

    unigram_sents = {}
    for ent in query:
        unigram_sents.update({ent: {}})
        for sent in search_merge[ent]:
            unigram = set(sent['unigram'])
            unigram_intersect = unigram.intersection(unigram_set)
            for item in unigram_intersect:
                if item in unigram_sents[ent].keys():
                    unigram_sents[ent][item].append(sent)
                else:
                    unigram_sents[ent].update({item: [sent]})

    score_dist = {}
    for ug in unigram_set:
        score_dist.update({ug: {}})
        for ent in query:
            score_dist[ug].update({ent: 0})
            if ug in unigram_sents[ent].keys():
                did = set()
                for sent in unigram_sents[ent][ug]:
                    score_dist[ug][ent] += sent['doc_score'] * idf[ug]
                    did.add(sent['did'])

    # use ranks to score the unigrams
    score_redist = {}
    for ent in query:
        score_redist.update({ent: dict.fromkeys(unigram_set, 0)})
        for ug in unigram_set:
            score_redist[ent][ug] = score_dist[ug][ent]
        sorted_score = sorted(score_redist[ent].items(),
                              key=lambda item: item[1],
                              reverse=True)
        rank, count, previous, result = 0, 0, None, {}
        for key, num in sorted_score:
            count += 1
            if num != previous:
                rank += count
                previous = num
                count = 0
            result[key] = 1.0 / rank
        score_redist[ent] = result

    for ug in unigram_set:
        for ent in query:
            score_dist[ug][ent] = score_redist[ent][ug]

    query_weight = []
    for ent in query:
        query_weight.append(
            1 / skew([sent['doc_score'] for sent in search_merge[ent]]))

    agg_score = {}
    for ug in score_dist.keys():
        tmp_res = [item[1] for item in score_dist[ug].items()]
        wgmean = np.exp(sum(query_weight * np.log(tmp_res)) / sum(query_weight))
        agg_score.update({ug: wgmean})

    score_sorted = sorted(agg_score.items(), key=lambda x: x[1], reverse=True)
    print("--- unigram score %s seconds ---" % (time.time() - start_time))
    print(score_sorted[:10])
    sys.stdout.flush()

    start_time = time.time()
    tokenizer = MWETokenizer(separator=' ')
    for ent in query:
        tokenizer.add_mwe(nltk.word_tokenize(ent))

    mined_phrases = []
    query_set = set(query)
    for ent in query:
        for sent in search_merge[ent]:
            for phrase in sent['phrases']:
                tokens = nltk.word_tokenize(phrase)
                raw_tokenized = tokenizer.tokenize(tokens)
                tokenized_set = set(raw_tokenized)
                if tokenized_set.intersection(query_set) == set():
                    mined_phrases.append(phrase)

    print("--- phrase mining %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    start_time = time.time()
    idf_list = [*idf]
    target_doc = nlp(target)
    target_vec = [0] * len(idf_list)
    target_token = [token.lemma_ for token in target_doc if not token.is_punct]

    list_phrases = list(set(mined_phrases))
    tasks = list(split(list_phrases, num_process))
    print('target_token', target_token)
    inputs = [(tasks[i], unigram_set, target_token, idf, agg_score, i)
              for i in range(num_process)]

    phrases_score = {}
    with Pool(num_process) as p:
        eval_results = p.map(phrase_eval, inputs)
    for tmp_res in eval_results:
        phrases_score.update(tmp_res)

    phrases_sorted = sorted(phrases_score.items(),
                            key=lambda x: x[1]['score'],
                            reverse=True)

    print("--- phrase eval use %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    return phrases_sorted
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
words = ["resource", "resources", "company", "companies",
         "run", "ran", "like", "likes"]
for w in words:
    print("{} = {}".format(w, lemmatizer.lemmatize(w)))

from nltk.tokenize import MWETokenizer

tokenizer = MWETokenizer()
tokenizer.add_mwe(('open', 'source'))
words = tokenizer.tokenize('The governance of open source projects'.split())
print(" ".join(words))
    'BG', 'RG', 'GG', 'WP', 'UP', 'BP', 'RP', 'GP',
]

# create tokenizers
no_space_mwe_tokenizer = MWETokenizer(separator='')
space_mwe_tokenizer = MWETokenizer(separator=' ')
for token in no_space_mwes:
    no_space_mwe_tokenizer.add_mwe(token)
for token in space_mwes:
    space_mwe_tokenizer.add_mwe(token)


def add_starts_stops(token_list):
    return ['<START>'] + token_list + ['<STOP>']


# tokenize sentences
def tokenize(representation):
    line = representation
    for seq in seqs_to_insert_spaces_for:
        if seq in line:
            line = (' ' + seq + ' ').join(line.split(seq))
    word_level = word_tokenize(line)
for sent in morphed_names:
    list2 = []
    for w in sent:
        if w != 'COGS':
            w = w.lower()
        list2.append(w)
    morphed_names2.append(list2)

final_name_list = []
for sent in morphed_names2:
    sent = ' '.join(sent)
    final_name_list.append(sent)

for strngs in morphed_names2:
    if len(strngs) > 1:
        tokenizer.add_mwe(strngs)

df.columns = final_name_list

aggregate_words = ["total", "sum", "overall", "aggregate"]
ranking_words = {
    "biggest": 1,
    "highest": 1,
    "peak": 1,
    "top": 1,
    "smallest": 0,
    "least": 0,
    "lowest": 0,
    "bottom": 0
}
gap_words = ["gap", "difference"]
wd_permutations = []
for x in range(min_perm, max_perm):
    for foods in permutations(words, x):
        wd_permutations.append(foods)

list_wd_permutations = list(wd_permutations)

# join the tuples back together into strings and add them to the result set
for list_wd_permutation in list_wd_permutations:
    join_wd_permutations = ' '.join(list_wd_permutation)
    list_join_wd_permutations.add(join_wd_permutations)
    # if the permutation has more than one word, add it to the list of MWEs
    if len(list_wd_permutation) > 1:
        mwe_tokenizer.add_mwe(list_wd_permutation)

print("\n+++++++++++\n")
print("Complete list of permutations:")
print(list_join_wd_permutations)
print("\n+++++++++++\n")
print("Complete list of multi-word expressions:")
print(mwe_tokenizer._mwes)
print("\n+++++++++++\n")
import glob
import math
import pickle

import nltk
from nltk.tokenize import MWETokenizer

# restore model from collocation.nb
model_1 = pickle.load(
    open('/home/hrrathod/project/collocations/collocation.nb', 'rb'))
# restore model from train_collocation.nb
model_2 = pickle.load(
    open('/home/hrrathod/project/collocations/train_collocation.nb', 'rb'))

# create an empty tokenizer for the multi-word expression tokens
tokenizer = MWETokenizer([])

# convert each collocation into a multi-word expression token
for w1, w2 in model_1:
    tokenizer.add_mwe((w1, w2))

# REPLACE PATH
# get the list of files in the test set
test_files = sorted(glob.glob('/home/hrrathod/project/test/*.txt'))

# number of tokens in the positive reviews
pos_total_tokens = model_2['pos_fd'].N()
# number of tokens in the neutral reviews
neu_total_tokens = model_2['neu_fd'].N()
# number of tokens in the negative reviews
neg_total_tokens = model_2['neg_fd'].N()

# combine all frequency distributions
fd = model_2['pos_fd'] + model_2['neu_fd'] + model_2['neg_fd']
import csv
import pickle

import wikipedia
from nltk import pos_tag, RegexpParser
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import MWETokenizer, word_tokenize
from textblob import Word

train = []
reader = csv.reader(open('Topic_set_train.csv', 'r'))
tokenizer = MWETokenizer()
for row in reader:
    print("Data : " + str(row))
    title, category = row
    tokenizer.add_mwe(title.split())
    wiki_page = wikipedia.page(title)
    wiki_content = str.lower(wiki_page.summary)
    tokens = tokenizer.tokenize(wiki_content.split())
    tokens_content = " ".join(tokens)
    word_tokens = word_tokenize(tokens_content)
    bigger_words = [
        k for k in word_tokens if len(k) >= 3 and not k.startswith('===')
    ]
    stop = set(stopwords.words('english'))
    stopwords_cleaned_list = [k for k in bigger_words if k not in stop]
    lemmatized_tokens = []
    for word in stopwords_cleaned_list:
        w = Word(word)
        lemmatized_tokens.append(w.lemmatize())
)
print("\n")

# read the words of interest
words = open("emotion_words.txt").read().lower().split("\n")
sentiment_bag = set()

# collect the multi-word expressions and add each to the sentiment_bag
mwe = set(filter(lambda a: " " in a, words))
print("Multi-word expressions in emotion words: {}".format(",".join(mwe)))

# create the MWE tokenizer
mwe_tokenizer = MWETokenizer()
for s in mwe:
    print("Add mwe ", s)
    mwe_tokenizer.add_mwe(s.split(" "))
    sentiment_bag.add(s.replace(" ", "_"))

lmtzr = WordNetLemmatizer()
st = LancasterStemmer()
ps = PorterStemmer()

print("Stemming:")
for word in filter(lambda a: " " not in a, words):
    print("{} => {} / {} / {}".format(word, lmtzr.lemmatize(word),
                                      st.stem(word), ps.stem(word)))
    sentiment_bag.add(word)
    sentiment_bag.add(st.stem(word))  # I like this one the best

# process all the lists
for (label, files) in sorted(makecloud.TRANSCRIPTS.items()):
    scores = []
    print("{}:\n{}=".format(label, "=" * len(label)))
def tokenize_with_mwe(text):
    mwe_tokenizer = MWETokenizer([('Republic', 'Day')])
    mwe_tokenizer.add_mwe(('Indian', 'Army'))
    return mwe_tokenizer.tokenize(text.split())
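# A quick demonstration of tokenize_with_mwe():
print(tokenize_with_mwe('The Indian Army marches on Republic Day'))
# -> ['The', 'Indian_Army', 'marches', 'on', 'Republic_Day']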
def main():
    parser = argparse.ArgumentParser(
        description="group sentence by cooccurrence")
    parser.add_argument('--input_dir', type=str, default='',
                        help='autophrase parsed directory')
    parser.add_argument('--query_string', type=str, default='',
                        help='search query')
    parser.add_argument('--num_process', type=int, default=2,
                        help='number of parallel')
    args = parser.parse_args()
    query = args.query_string.split(',')
    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    print(query)
    sys.stdout.flush()

    ##### sentence search #####
    start_time = time.time()
    input_dir = os.listdir(args.input_dir)
    tasks = list(split(input_dir, args.num_process))
    inputs = [(tasks[i], args) for i in range(args.num_process)]

    with Pool(args.num_process) as p:
        search_results = p.map(sent_search, inputs)

    search_merge = search_results[0]['context']
    count_merge = search_results[0]['freq']

    for pid in range(1, len(search_results)):
        tmp_context = search_results[pid]['context']
        tmp_freq = search_results[pid]['freq']
        for ent in query:
            search_merge[ent] += tmp_context[ent]
            count_merge[ent]['total'] += tmp_freq[ent]['total']
            tmp_freq[ent].pop('total', None)
            count_merge[ent].update(tmp_freq[ent])

    for ent in query:
        for index in range(len(search_merge[ent])):
            search_merge[ent][index]['doc_score'] = count_merge[ent][
                search_merge[ent][index]['did']] / count_merge[ent]['total']

    fid = 1
    for ent in query:
        with open('retrieved-{}.txt'.format(fid), "w+") as f:
            for sent in search_merge[ent]:
                f.write(json.dumps(sent) + '\n')
        fid += 1

    print("--- search use %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    start_time = time.time()
    unigrams = []
    for ent in query:
        for sent in search_merge[ent]:
            unigrams += sent['unigram']
    unigram_set = set(unigrams)
    for ent in query:
        unigram_set.discard(ent)

    unigram_sents = {}
    for ent in query:
        unigram_sents.update({ent: {}})
        for sent in search_merge[ent]:
            unigram = set(sent['unigram'])
            unigram_intersect = unigram.intersection(unigram_set)
            for item in unigram_intersect:
                if item in unigram_sents[ent].keys():
                    unigram_sents[ent][item].append(sent)
                else:
                    unigram_sents[ent].update({item: [sent]})

    score_dist = {}
    for ug in unigram_set:
        score_dist.update({ug: {}})
        for ent in query:
            score_dist[ug].update({ent: 0})
            if ug in unigram_sents[ent].keys():
                did = set()
                for sent in unigram_sents[ent][ug]:
                    score_dist[ug][ent] += sent['doc_score']
                    did.add(sent['did'])

    agg_score = {}
    for ug in score_dist.keys():
        tmp_res = [item[1] for item in score_dist[ug].items()]
        agg_score.update({ug: np.mean(tmp_res) - np.std(tmp_res)})

    score_sorted = sorted(agg_score.items(), key=lambda x: x[1], reverse=True)
    print("--- unigram score %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    ### phrase hard match ###
    start_time = time.time()
    mined_phrases = {}
    for ent in query:
        mined_phrases.update({ent: []})
        for sent in search_merge[ent]:
            mined_phrases[ent] += sent['phrases']

    coo_phrases = set(mined_phrases[query[0]])
    for ent in query:
        coo_phrases = coo_phrases.intersection(set(mined_phrases[ent]))

    print(coo_phrases)
    print(mined_phrases)

    tokenizer = MWETokenizer(separator=' ')
    for ph in coo_phrases:
        tokenizer.add_mwe(nltk.word_tokenize(ph))

    search_refetch = {}
    for ent in query:
        search_refetch.update({ent: []})
        for sent in search_merge[ent]:
            sent_tok = nltk.word_tokenize(sent['text'])
            raw_tokenized = tokenizer.tokenize(sent_tok)
            tokenized_set = set(raw_tokenized)
            mentioned_phrase = list(tokenized_set.intersection(coo_phrases))
            if len(mentioned_phrase) != 0:
                sent['phrases'] = mentioned_phrase
                search_refetch[ent].append(sent)

    mined_phrases = {}
    fid = 0
    for ent in query:
        phrase_cnt = Counter()
        mined_phrases.update({ent: []})
        for sent in search_refetch[ent]:
            mined_phrases[ent] += sent['phrases']
            phrase_cnt.update(sent['phrases'])
        # drop the query entities themselves from the counts
        # (a separate loop variable avoids shadowing the outer ent)
        for ent2 in query:
            phrase_cnt.pop(ent2, None)
        with open('phrase-mined-{}.txt'.format(fid), "w+") as f:
            for pair in sorted(phrase_cnt.items(), key=lambda kv: kv[1]):
                f.write('{} {} \n'.format(pair[0], pair[1]))
        fid += 1

    coo_phrases = set(mined_phrases[query[0]])
    for ent in query:
        coo_phrases = coo_phrases.intersection(set(mined_phrases[ent]))

    print('number of cooccurred phrase: ', len(coo_phrases))
    print("--- phrase eval use %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    start_time = time.time()
    phrase_sents = {}
    for ent in query:
        phrase_sents.update({ent: {}})
        for sent in search_refetch[ent]:
            interc = set(sent['phrases']).intersection(coo_phrases)
            for item in interc:
                if item in phrase_sents[ent].keys():
                    phrase_sents[ent][item].append(sent)
                else:
                    phrase_sents[ent].update({item: [sent]})

    fid = 1
    for ent in query:
        with open('phrase-sent-dist-{}.txt'.format(fid), "w+") as f:
            for key, value in phrase_sents[ent].items():
                meta = {key: len(value)}
                f.write(json.dumps(meta) + '\n')
        fid += 1
class LyricsCleaner:
    """cleans and tokenizes a song's lyrics in preparation for embedding"""

    def __init__(self, filename):
        """initializes a LyricsCleaner object"""
        self._filename = filename
        self._tokenizer = MWETokenizer()
        for word in SIGNAL_WORDS:
            self._tokenizer.add_mwe(('[', word, ']'))
        self._stemmer = LancasterStemmer()

    def tokenizeSong(self):
        """breaks up the lyrics into tokens using the nltk tokenizer,
        stemming, and various normalization techniques"""
        with open(NAMES_CSV) as nameFile:
            read = csv.reader(nameFile, delimiter=",")
            names = []
            for name in read:
                names.append(name[0])
        with open(self._filename) as songJSON:
            rawData = json.load(songJSON)
        # get the lyrics from the json file
        lyrics = rawData["songs"][0]["lyrics"]
        if lyrics is not None:
            # preserve the newline for prediction; we want to predict the
            # newline character
            preserveNewline = lyrics.replace("\n", " **NEWLINE** ")
            # tokenize the lyrics
            tokenizedLyrics = nltk.word_tokenize(preserveNewline)
            # replace people's names with a general name token
            for k in range(len(tokenizedLyrics)):
                if tokenizedLyrics[k] in names:
                    tokenizedLyrics[k] = "**NAME_VAR**"
            # (filtering of non-English words has been removed)
            # bring the multi-word expressions back together
            # ([CHORUS], [VERSE], etc.)
            tokenizedLyrics = self._tokenizer.tokenize(tokenizedLyrics)
            # add start token
            newLyrics = ['START']
            # normalize the labels for the parts of the song
            i = 0
            while i < len(tokenizedLyrics):
                word = tokenizedLyrics[i]
                if word == "[":
                    if tokenizedLyrics[i + 1] in SIGNAL_WORDS:
                        j = i + 2
                        while tokenizedLyrics[j] != "]" and j < len(tokenizedLyrics) - 1:
                            j += 1
                        word = word + "_" + tokenizedLyrics[i + 1] + "_" + tokenizedLyrics[j]
                        newLyrics += [word.lower()]
                        i = j
                # if word is not a stopword, keep it
                elif word not in nltk.corpus.stopwords.words("english"):
                    if not word[2:len(word) - 2] == SIGNAL_WORDS:
                        # lowercase everything because capitalization doesn't
                        # really matter in songs; add the stem
                        newLyrics += [self._stemmer.stem(word.lower())]
                        if word.lower() != self._stemmer.stem(word.lower()):
                            # if the stem matches the original word except for
                            # the last letter, add that last letter back
                            if word.lower()[:len(word) - 1] == self._stemmer.stem(word.lower()):
                                newLyrics += word.lower()[len(word) - 1:]
                            # same, except for the last two letters
                            elif word.lower()[:len(word) - 2] == self._stemmer.stem(word.lower()):
                                newLyrics += word.lower()[len(word) - 2:]
                            # same, except for the last three letters
                            elif word.lower()[:len(word) - 3] == self._stemmer.stem(word.lower()):
                                newLyrics += word.lower()[len(word) - 3:]
                            # words like "once" or "since", whose stems are
                            # "ont" or "sint": add the "ce" as a token
                            elif (word.lower()[len(word) - 3:len(word) - 1] == "ce"
                                  and self._stemmer.stem(word.lower())[-1] == "t"):
                                newLyrics += word.lower()[len(word) - 3:len(word) - 1]
                i += 1
            # add end token to the end of a song
            newLyrics += ['END']
            return newLyrics
import json
import string

from nltk.tokenize import TweetTokenizer, MWETokenizer
import nltk.corpus as corpus

# Twitter tokenizer
tweettk = TweetTokenizer()
# multi-word tokenizer
mwetk = MWETokenizer()

# load movie titles into the multi-word tokenizer
movie_titles = []
with open('movies.json', 'r') as file:
    movie_titles = [movie['name'].split() for movie in json.load(file)]
for title in movie_titles:
    # movies whose titles are one word don't need to be included
    if len(title) < 2:
        continue
    mwetk.add_mwe(tuple(title))
    # include a lowercased version as well
    mwetk.add_mwe(tuple([s.lower() for s in title]))

stop_words = set(corpus.stopwords.words('english'))
printable = set(string.printable)


def tokenizeTweets(tweets):
    for tweet in tweets:
        # remove any non-ASCII chars
        text = ''.join([x for x in tweet['text'] if x in printable])
        text = tweettk.tokenize(text)
        text = mwetk.tokenize(text)
        text = [token for token in text if token not in stop_words]
        tweet['original_text'] = tweet['text']
# 4. Whitespace Tokenizer
# 5. Word Punct Tokenizer

# 1. Tweet tokenizer
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer()
tweet_tokenizer.tokenize(sentence5)

# 2. MWE Tokenizer (Multi-Word Expression)
from nltk.tokenize import MWETokenizer

# declare a set of words that are to be treated as one entity
mwe_tokenizer = MWETokenizer([('Republic', 'Day')])
# add more words to the set
mwe_tokenizer.add_mwe(('Indian', 'Army'))
# 'Indian Army' should be treated as a single token, but here "Army!" is one
# token, so the MWE is not merged
mwe_tokenizer.tokenize(sentence5.split())
# after stripping the '!', "Army" matches and the MWE is merged
mwe_tokenizer.tokenize(sentence5.replace('!', '').split())

# 3. Regexp Tokenizer
from nltk.tokenize import RegexpTokenizer

reg_tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
reg_tokenizer.tokenize(sentence5)

# 4. Whitespace Tokenizer
print(phrases)
print("=" * 20)

# tokenize words
words = word_tokenize(text)
print(words)
print("=" * 20)

# POS-tag the words
word_tags = nltk.pos_tag(words)
print(word_tags)
print("=" * 20)

# select specified words (proper nouns here)
for word_tag in word_tags:
    if word_tag[1] == 'NNP':
        print(word_tag[0])

# import the MWETokenizer class from nltk
from nltk.tokenize import MWETokenizer

mwe = MWETokenizer()

# register multi-word expressions
mwe.add_mwe(('All', 'work', 'and'))
mwe.add_mwe(('New', 'York'))

# tokenize with the MWE tokenizer
mwe_words = mwe.tokenize(words)
print(mwe_words)
from spacy.en import English

nlp = English()

# This is for multi-word phrases.
MWE = []
path = "/".join(os.path.realpath(__file__).split("/")[:-2]) + '/input/'
print 'path', path
with open(path + 'STREUSLE2.1-mwes.tsv') as f:
    for line in f.readlines():
        multiword_expression = line.split('\t')[0].split()[1:]
        MWE.append(multiword_expression)
MWE_tokenizer = MWETokenizer(MWE, separator='-')
# Add whatever additional custom multi-word expressions.
MWE_tokenizer.add_mwe(('dive', 'bar'))

# Stopwords
stops = set(stopwords.words("english") + stopwords.words("spanish"))
keep_list = ['after', 'during', 'not', 'between', 'other', 'over', 'under',
             'most', 'without', 'nor', 'no', 'very', 'against', 'don', 'aren']
stops = set([word for word in stops if word not in keep_list])

table = string.maketrans("", "")
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
whitespace_tokenizer = WhitespaceTokenizer()