from nltk.tokenize import MWETokenizer


def get_route(msg):
    """Return a dict with keys 'origin' and 'destination'."""
    tokenizer = MWETokenizer(CITY_TOKENS)
    route = {'origin': None, 'destination': None}
    tokens = tokenizer.tokenize(msg.lower().split(' '))

    def lookahead(start_idx):
        """Return a slice of the tokens list starting at index start_idx."""
        end_idx = min(start_idx + TOKEN_LOOKAHEAD, len(tokens))
        words = ['from', 'to', 'on']
        for i in xrange(start_idx + 1, end_idx):
            if tokens[i] in words:
                end_idx = i
                break
        return tokens[start_idx:end_idx]

    for i in xrange(len(tokens) - 1):
        if tokens[i] in ['from', 'to']:
            city_tokens = lookahead(i + 1)
            city = determine_city(city_tokens)
            if city is None:
                print "City not recognized: {}".format(' '.join(city_tokens))
            elif tokens[i] == 'from':
                route['origin'] = city
            elif tokens[i] == 'to':
                route['destination'] = city
    return route
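A minimal sketch of how get_route might be exercised. CITY_TOKENS, TOKEN_LOOKAHEAD, and determine_city are not defined in the snippet above, so the stand-ins below are assumptions for illustration only.

# Hypothetical stand-ins for the names get_route expects; not from the original project.
CITY_TOKENS = [('new', 'york'), ('san', 'francisco')]  # assumed shape: MWE tuples for city names
TOKEN_LOOKAHEAD = 3                                    # assumed: how far to scan past 'from'/'to'
KNOWN_CITIES = {'new_york': 'New York', 'san_francisco': 'San Francisco', 'boston': 'Boston'}


def determine_city(city_tokens):
    """Return the first recognized city in the token slice, else None (illustrative only)."""
    for token in city_tokens:
        if token in KNOWN_CITIES:
            return KNOWN_CITIES[token]
    return None


route = get_route("Flying from New York to Boston on Friday")
# route == {'origin': 'New York', 'destination': 'Boston'}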
from nltk.tokenize import MWETokenizer


def mwe_tokenize(tokens, bigrams):
    """Retokenize tokenized text, combining MWEs from the list of most common bigrams."""
    # The bigram list can alternatively be loaded from a pickle:
    # with open('bigrams_MWEs.pkl', 'rb') as fid:
    #     bigrams = cPickle.load(fid)
    tokenizer = MWETokenizer(mwes=bigrams[:100], separator='+')
    tokens = tokenizer.tokenize(tokens)
    return tokens
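One way the bigrams argument could be built is with NLTK's collocation finder; this is only a sketch, since the original list came from a pickle whose contents are not shown.

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

tokens = ("the new york times reported that new york city "
          "officials met in new york").split()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)  # keep only bigrams seen at least twice
bigrams = finder.nbest(BigramAssocMeasures().raw_freq, 100)  # most frequent first
print(mwe_tokenize(tokens, bigrams))
# ['the', 'new+york', 'times', 'reported', 'that', 'new+york', 'city',
#  'officials', 'met', 'in', 'new+york']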
import dask.bag as bag
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import MWETokenizer


def tokenize():
    """
    Pull phrases from the corpus, for example to distinguish between orange
    color and orange flavor, or to capture the strength of the flavor
    (light_citrus) or the carbonation (strong_carbonation).
    """
    global review_df
    from phrases import phrase_map, phrases, synonym_map

    phrase_tokenizer = MWETokenizer(phrases)
    stop_word_list = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def process(x):
        return [
            phrase_map.get(word, word)
            for word in phrase_tokenizer.tokenize([
                lemmatizer.lemmatize(synonym_map.get(word, word))
                for word in x if word not in stop_word_list
            ])
        ]

    b = bag.from_sequence(review_df['review_pp1'].str.split())
    mapped = b.map(process)
    review_df['review_pp1'] = pd.Series(mapped.compute()).str.join(' ')
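The phrases module imported above is not shown; the sketch below is a guess at the shapes it provides, with all contents invented for illustration.

# Hypothetical phrases.py: shapes assumed from how tokenize() uses them.
# MWEs for MWETokenizer to join into single tokens (default separator is '_'):
phrases = [('light', 'citrus'), ('strong', 'carbonation'), ('orange', 'flavor')]
# Canonical replacements applied to tokens after MWE joining:
phrase_map = {'orange_flavor': 'citrus_flavor'}
# Word-level synonyms applied before lemmatization:
synonym_map = {'fizz': 'carbonation', 'carbonated': 'carbonation'}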
import sys
import sqlite3 as sql
import nltk
import string
import csv
import time
import re
import os
from collections import Counter
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize.mwe import MWETokenizer

multi_word_tokenizer = MWETokenizer()
# multi_word_tokenizer.add_mwe(("Multiple", "Sklerose"))

con = None
rows = []
chapter_ids = [
    "section1", "section2", "section3", "section4", "section5", "section6",
    "section7", "section8", "section9", "section10", "section11", "section12",
    "section13", "section14", "section15", "section16", "Section7000",
    "Section7050", "Section7100", "Section7150", "Section7200", "Section7250",
    "Section7350", "Section7400", "Section7450", "Section7500", "Section7550",
    "Section7600", "Section7650", "Section7700"
]
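To illustrate the commented-out add_mwe call: once the MWE is registered, the tokenizer joins the two words into a single token (the sample sentence is invented).

multi_word_tokenizer.add_mwe(("Multiple", "Sklerose"))
print(multi_word_tokenizer.tokenize("Diagnose Multiple Sklerose gesichert".split()))
# ['Diagnose', 'Multiple_Sklerose', 'gesichert']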
import getopt
import sqlite3 as sql
import nltk
import string
import csv
import time
import re
import os
import hashlib
from collections import Counter
from bs4 import BeautifulSoup
# from nltk.corpus import stopwords
from nltk.tokenize.mwe import MWETokenizer

multi_word_tokenizer = MWETokenizer()
# multi_word_tokenizer.add_mwe(("Multiple", "Sklerose"))

chapter_ids = [
    "section1", "section2", "section3", "section4", "section5", "section6",
    "section7", "section8", "section9", "section10", "section11", "section12",
    "section13", "section14", "section15", "section16", "Section7000",
    "Section7050", "Section7100", "Section7150", "Section7200", "Section7250",
    "Section7350", "Section7400", "Section7450", "Section7500", "Section7550",
    "Section7600", "Section7650", "Section7700"
]

list_of_stopwords = []


def is_integer(s):
    """Return True if s can be parsed as an integer (body assumed; the original was truncated)."""
    try:
        int(s)
        return True
    except ValueError:
        return False
def mwe_tokenize(tokens, bigrams):
    tokenizer = MWETokenizer(mwes=bigrams[:100], separator='+')
    tokens = tokenizer.tokenize(tokens)
    return tokens
import nltk
from nltk.tokenize import MWETokenizer


def keyword_frequent(all_text):
    # all_text: a list of document strings
    synonyms = {}
    feature_1 = ["English", "Cantonese", "Chinese"]
    feature_2 = ["Supervise", "Coach", "Team", "Staff"]
    feature_3 = [
        "Digital Marketing", "Digital Media Buy", "Search Engine Marketing",
        "Search Engine Optimization", "Mobile", "Social Media",
        "Content Calendar", "Performance Marketing", "Channel", "Paid Social",
        "Programmatic Display", "Remarketing", "Social Campaign",
        "Website Content", "KOLs", "Content Marketing", "Digital Analytics"
    ]
    feature_4 = [
        "Analysis", "Budget", "ROI", "KPI", "Forecasting", "Program",
        "Competitor Analysis"
    ]
    feature_5 = ["University", "College"]
    feature = {
        "Language": feature_1,
        "Product_Experience": feature_2,
        "Functional_Experience": feature_3,
        "Digital_Marketing_Strategy": feature_4,
        "Education": feature_5
    }
    synonyms["Chinese"] = ["Mandarin", "Putonghua"]
    synonyms["Team"] = ["Team building"]
    synonyms["Digital Marketing"] = [
        "Online", "eDM", "Electronic Direct Marketing"
    ]
    synonyms["Digital Media Buy"] = ["banner ads", "landing page"]
    synonyms["Search Engine Marketing"] = ["SEM"]
    synonyms["Search Engine Optimization"] = ["SEO"]
    synonyms["Social Media"] = [
        "Facebook", "WeChat", "Twitter", "Instagram", "IG", "Snapchat",
        "Line", "Myspace", "Flickr", "LinkedIn", "Xing"
    ]
    synonyms["KOLs"] = ["Key Opinion Leaders"]
    synonyms["ROI"] = ["Return on investment"]

    all_keywords = []
    for key in feature:
        all_keywords += feature[key]

    # Tokenize the whole corpus, merging multi-word keywords into single tokens
    tokenizer = MWETokenizer([tuple(x.lower().split()) for x in all_keywords])
    all_frequency = nltk.FreqDist(
        tokenizer.tokenize(nltk.word_tokenize("\n".join(all_text).lower())))

    # Sum the frequency of each keyword and its synonyms, grouped by feature
    all_keywords_frequency = {}
    for key in feature.keys():
        freq_dict = {}
        for keyword in feature[key]:
            freq = all_frequency["_".join(keyword.lower().split())]
            # print keyword, expFreq["_".join(keyword.split())]
            if keyword in synonyms.keys():
                for syn in synonyms[keyword]:
                    # print keyword, syn, expFreq["_".join(syn.split())]
                    freq += all_frequency["_".join(syn.lower().split())]
            freq_dict[keyword] = freq
        all_keywords_frequency[key] = freq_dict
    return all_keywords_frequency
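A hedged usage sketch: given a list of job-ad strings (the two below are invented), the function returns per-feature keyword counts with synonym hits folded in.

sample_ads = [
    "We use SEO and SEM to drive our digital marketing campaigns.",
    "Manage social media channels such as Facebook and Instagram.",
]
counts = keyword_frequent(sample_ads)
# counts['Functional_Experience']['Search Engine Optimization'] -> 1 (via the SEO synonym)
# counts['Functional_Experience']['Social Media'] -> 3 (1 direct + Facebook + Instagram)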
def __init__(self):
    NltkTokenizer.__init__(self)
    _MWETokenizer.__init__(self)
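Neither parent class is shown in this excerpt; presumably the __init__ belongs to a wrapper that mixes a project-local NltkTokenizer base with NLTK's MWETokenizer imported under the _MWETokenizer alias. The sketch below fills in that context with assumed names.

from nltk.tokenize import MWETokenizer as _MWETokenizer


class NltkTokenizer(object):
    """Hypothetical stand-in for the project-local base class."""
    def __init__(self):
        pass


class MweTokenizerWrapper(NltkTokenizer, _MWETokenizer):  # hypothetical class name
    def __init__(self):
        NltkTokenizer.__init__(self)
        _MWETokenizer.__init__(self)


wrapper = MweTokenizerWrapper()
wrapper.add_mwe(('New', 'York'))
print(wrapper.tokenize('She moved to New York last spring'.split()))
# ['She', 'moved', 'to', 'New_York', 'last', 'spring']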
class TextAnalysor:
    '''
    Text

    This is a class for processing the raw text information, extracting
    useful features and also providing a user-friendly data API.

    Essential public attributes:
    * dt_matrix: documents-term (documents-feature) matrix
    * labels:    labels for each of the documents
    '''

    INI_PATH = 'conf/text.ini'
    WORD_MIN_LEN = 2
    ANCHOR_MIN_SIM = 0.5
    PHRASE_MIN_SIM = 0.8

    def __init__(self):
        print >> sys.stderr, '[TEXT]\t%s\t*** Initializing Text Object ***' % arrow.now()
        # Read configuration from ini file
        conf = Config(self.INI_PATH)
        phrases_extractor_path = conf.config_section_map('Model')['n_gram']
        word2vec_model_path = conf.config_section_map('Model')['word2vec']
        words_category_path = conf.config_section_map('Corpus')['key_words']
        # Variable initialization
        # - key words and their related words
        self.words_category = None
        with open(words_category_path, 'rb') as f:
            self.words_category = json.load(f)
        # - all of the related words in the words_category
        print >> sys.stderr, '[TEXT]\t%s\tLoading n-Gram model ...' % arrow.now()
        self.interested_phrases = list(
            set([
                item
                for sublist in self.words_category.values()  # Get sublists
                for item in sublist                          # Merge sublists
                if isPhrase(item)                            # Filter non-phrases
            ]))
        # - word2vec model
        print >> sys.stderr, '[TEXT]\t%s\tLoading word2vec model ...' % arrow.now()
        self.word2vec_model = Word2Vec.load_word2vec_format(
            word2vec_model_path, binary=True)
        print >> sys.stderr, '[TEXT]\t'
        # - phrases extractor (n-gram kernel)
        self.phrases_extractor = PhrasesExtractor(
            phrases_extractor_path,
            interested_phrases=self.interested_phrases)
        # - MWE tokenizer
        self.mwe = MWETokenizer()
        # Init words analysor
        self.words_analysor = WordsAnalysor()
        # Document-term vectors
        self.dt_matrix = []
        # Labels for documents
        self.labels = []

    def save_variables(self, file_path):
        '''
        SAVE VARIABLES

        This method saves the text analysor to two files: a .npy file that
        stores the documents-term matrix and a text file that stores the
        labels.
        '''
        # Save the document-term matrix
        np.save(file_path, self.dt_matrix)
        # Save the labels information
        labels = [
            '#'.join(multiple_labels) + '\n'
            for multiple_labels in self.labels
        ]
        with open(file_path + '.txt', 'w') as f:
            try:
                f.writelines(labels)
            except:
                print >> sys.stderr, '[ERROR] Saving failed. Invalid file path: %s' % file_path

    def load_variables(self, file_path):
        '''
        LOAD VARIABLES

        This method loads two files (.txt for the labels information and
        .npy for the documents-term matrix) from the local file system to
        initialize a text analysor instance.
        '''
        if not os.path.exists(file_path + '.txt') or not os.path.exists(file_path + '.npy'):
            print >> sys.stderr, '[WARN] Loading failed. Invalid file path: %s' % file_path
            return
        # Load the document-term matrix
        self.dt_matrix = np.load(file_path + '.npy').tolist()
        # Load the labels information
        with open(file_path + '.txt', 'r') as f:
            try:
                labels = f.readlines()
                self.labels = [
                    list(set(label.strip('\n').split('#')))
                    for label in labels
                ]
            except:
                print >> sys.stderr, '[ERROR] Loading failed. Unknown error'

    ####################################
    # Global Analysis
    ####################################

    def fuzzy_LSA(self, n_components_for_svd=2):
        print >> sys.stderr, '[TEXT]\t%s\tFuzzy LSA ...' % arrow.now()
        # Tf-idf transformation
        tfidf = TfidfTransformer()
        tfidf_matrix = tfidf.fit_transform(self.dt_matrix).toarray()
        # SVD
        # n_components is recommended to be 100 by the sklearn documentation for LSA:
        # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
        svd = TruncatedSVD(n_components=n_components_for_svd)
        svd_matrix = svd.fit_transform(tfidf_matrix)
        # print >> sys.stderr, tfidf_matrix
        # print >> sys.stderr, svd_matrix
        feature_matrix = svd_matrix.tolist()
        return feature_matrix, \
            self._sort_by_labels(feature_matrix), \
            tfidf_matrix.tolist()

    def regular_LSA(self, n_components_for_svd=2):
        print >> sys.stderr, '[TEXT]\t%s\tRegular LSA ...' % arrow.now()
        self.words_analysor.LSA(n_components_for_svd=n_components_for_svd)
        feature_matrix = self.words_analysor.svd_matrix.tolist()
        return feature_matrix, \
            self._sort_by_labels(feature_matrix), \
            self.words_analysor.tfidf_matrix.tolist(), \
            self.words_analysor.dt_matrix.tolist(), \
            self.words_analysor.feature_names

    # def fuzzy_LDA(self, n_topics_for_lda=2):
    #     print >> sys.stderr, '[TEXT]\t%s\tFuzzy LDA ...' % arrow.now()
    #     feature_matrix = LatentDirichletAllocation(
    #         n_topics=n_topics_for_lda,
    #         max_iter=5,
    #         learning_method='online',
    #         learning_offset=50.,
    #         random_state=0
    #     ).fit_transform(self.dt_matrix).tolist()
    #     return feature_matrix, \
    #         self._sort_by_labels(feature_matrix)

    # def regular_LDA(self, n_topics_for_lda=2):
    #     print >> sys.stderr, '[TEXT]\t%s\tRegular LDA ...' % arrow.now()
    #     self.words_analysor.LDA(n_topics=n_topics_for_lda)
    #     feature_matrix = self.words_analysor.lda_matrix.tolist()
    #     return feature_matrix, \
    #         self._sort_by_labels(feature_matrix), \
    #         self.words_analysor.dt_matrix.tolist(), \
    #         self.words_analysor.feature_names

    def _sort_by_labels(self, feature_matrix):
        # Get the set of all the labels that appeared
        labels_set = list(
            set([item for sublist in self.labels for item in sublist]))
        label_feature_dict = {}
        for label_in_set in labels_set:
            label_feature_dict[label_in_set] = []
            for i in range(len(self.labels)):
                for label_for_feature in self.labels[i]:
                    if label_for_feature == label_in_set:
                        label_feature_dict[label_in_set].append(
                            feature_matrix[i])
        return label_feature_dict

    def set_text(self, text, label):
        # Init
        self._initialize_temporal_variables()
        # Raw text
        self.text = text
        # Init words analysor
        self.words_analysor.add_document(text)
        # Tokenize the raw text
        # print >> sys.stderr, '[TEXT]\t%s\tTokenizing ...' % arrow.now()
        self._tokenize()
        # Get the structure of the tokenized text
        # print >> sys.stderr, '[TEXT]\t%s\tGetting Structure ...' % arrow.now()
        self._get_structure()
        # Anchor the locations of keywords in the text
        # print >> sys.stderr, '[TEXT]\t%s\tAnchoring Keywords ...' % arrow.now()
        # self._anchor_keywords()
        # Find the K-nearest tokens from the text to the tokens in the words_category
        # print >> sys.stderr, '[TEXT]\t%s\tFinding K nearest tokens ...' % arrow.now()
        self._find_k_nearest_tokens()
        self.dt_matrix.append(self.term_vector)
        self.labels.append(label)

    def _initialize_temporal_variables(self):
        self.sents_by_tokens = []
        self.sents_by_words = []
        self.phrases_count = {}
        self.filtered_phrases = {}
        self.length_of_sents = []
        self.length_of_text = -1
        self.structure = {}
        self.anchors = {}

    ####################################
    # Processing for A Single Document
    ####################################

    def _tokenize(self):
        self.sents_by_tokens = []
        self.sents_by_words = self.words_analysor.cur_sents_by_words
        # Take interested phrases from the text into consideration
        self.phrases_count = self.phrases_extractor.phrases_count(
            self.text)  # Get all possible phrases from the text
        self.filtered_phrases = self._phrases_filter(self.phrases_count.keys())
        # Add the filtered phrases into the MWE tokenizer
        for p in self.filtered_phrases.keys():
            self.mwe.add_mwe(str(p).split('_'))
        # Tokenize by MWE
        for sent in self.sents_by_words:
            # Text by tokens
            sent_by_tokens = [token for token in self.mwe.tokenize(sent)]
            self.sents_by_tokens.append(sent_by_tokens)

    def _get_structure(self):
        self.length_of_sents = [len(sents) for sents in self.sents_by_tokens]
        self.length_of_text = sum(self.length_of_sents)
        self.structure = defaultdict(
            lambda: {
                # The indices of the token in the whole text
                'text_indexs': [],
                # The indices of the sentences in the whole text
                'sent_indexs': [],
                # The indices of the token in their sentences
                'inner_indexs': []
            })
        text_i = 0
        sent_i = 0
        inner_i = 0
        for sent in self.sents_by_tokens:
            # Tokens structure info
            for token in sent:
                if token not in stopwords.words(
                        'english') and len(token) > self.WORD_MIN_LEN:
                    self.structure[token]['text_indexs'].append(text_i)
                    self.structure[token]['sent_indexs'].append(sent_i)
                    self.structure[token]['inner_indexs'].append(inner_i)
                text_i += 1
                inner_i += 1
            sent_i += 1
            inner_i = 0

    def _anchor_keywords(self):
        self.anchors = {}
        for categories in self.words_category.keys():
            category_list = categories.strip().split('/')
            similar_tokens_info = defaultdict(lambda: 0)
            for category in category_list:
                for token in self.structure.keys():
                    sim = self._phrases_similarity(category, token)
                    if sim > self.ANCHOR_MIN_SIM and sim > similar_tokens_info[token]:
                        similar_tokens_info[token] = sim
            self.anchors[categories] = similar_tokens_info
        # print >> sys.stderr, json.dumps(self.anchors, indent=4)

    def _find_k_nearest_tokens(self, K=10):
        self.k_nearest_tokens = {}
        for category in self.words_category.keys():
            self.k_nearest_tokens[category] = []
        # Calculate the distance between every word/phrase in the text and each category
        for category, words_in_category in self.words_category.iteritems():
            tokens_in_text = self.structure.keys()
            tokens_in_category = map(lambda x: x.encode('ascii', 'ignore'),
                                     words_in_category)
            # Calculate the matrix of distances between
            # words_in_text & words_in_category
            len_i_t = len(tokens_in_text)
            len_i_c = len(tokens_in_category)
            dist_mat = np.zeros((len_i_t, len_i_c))
            for i in range(len_i_t):
                for j in range(len_i_c):
                    if isPhrase(tokens_in_text[i]) and isPhrase(tokens_in_category[j]):
                        dist_mat[i, j] = self._phrases_similarity(
                            tokens_in_text[i], tokens_in_category[j])
                    elif (not isPhrase(tokens_in_text[i])) and (not isPhrase(tokens_in_category[j])):
                        dist_mat[i, j] = self._words_similarity(
                            tokens_in_text[i], tokens_in_category[j])
                    else:
                        dist_mat[i, j] = 0
            # Find the best matched token in the text for each token under the category
            best_matched_indexs = dist_mat.argmax(
                axis=0)  # The index of the best matched token for each word in the category
            best_matched_dists = []  # The distance between the best matched words and the words in the text
            for j in range(len(best_matched_indexs)):
                best_matched_dists.append(dist_mat[best_matched_indexs[j], j])
            best_matched_dists = np.array(best_matched_dists)
            # Find the K-nearest words (to the current category) in the text
            for k in range(K):
                j = best_matched_dists.argmax()  # The index of the word in the text with the highest similarity
                i = best_matched_indexs[j]
                # If the current best matched distance is lower than 0, then abandon it.
                if best_matched_dists[j] <= 0:
                    break
                best_matched_dists[j] = -1  # Remove the largest value in best_matched_dists
                self.k_nearest_tokens[category].append({
                    'in_text': tokens_in_text[i],
                    'in_category': tokens_in_category[j],
                    'count': len(self.structure[tokens_in_text[i]]['text_indexs']),
                    'distance': dist_mat[i, j]
                    # 'rate': self._rate_token_candidates(category, tokens_in_text[i])
                })
        # Convert the term dict to a numerical term vector
        self.term_vector = self._term_dict2term_vector(self.k_nearest_tokens)
        # print >> sys.stderr, json.dumps(self.k_nearest_tokens, indent=4)

    ####################################
    # Utilities
    ####################################

    def _rate_token_candidates(self, category, candidate_token):
        if not bool(self.anchors[category]):
            return 0
        else:
            dist = np.array([
                self._tokens_min_distance(candidate_token, anchor_token)
                for anchor_token in self.anchors[category].keys()
            ]).astype('float')
            # anchor_sim = np.array([self.anchors[category][anchor_token] for anchor_token in self.anchors[category].keys()]).astype('float')
            anchor_sim = np.array(
                self.anchors[category].values()).astype('float')
            # Rate: determines which token candidate under a category in the text is the most
            # informative and most accurate item for the category.
            # rate = max(anchor_sim * ((1.0 - dist[:,0] / self.length_of_text) ** dist[:,1]))
            rate = max(
                (1.0 - dist[:, 0] / self.length_of_text) ** (dist[:, 1] + 1.0))
            return rate

    def _phrases_filter(self, phrases):
        filtered_phrases = {}
        for p in phrases:
            sims = [
                self._phrases_similarity(p, p_i)
                for p_i in self.interested_phrases
            ]
            # Remove irrelevant phrases according to the interested phrases list
            if max(sims) > self.PHRASE_MIN_SIM:
                filtered_phrases[p] = {}
                filtered_phrases[p]['similar_phrase'] = self.interested_phrases[np.argmax(sims)]
                filtered_phrases[p]['similarity'] = max(sims)
        return filtered_phrases

    def _words_similarity(self, word_A, word_B):
        try:
            similarity = self.word2vec_model.similarity(word_A, word_B)
        except KeyError, m:
            # TODO
            if word_A == word_B:
                similarity = 1
            else:
                similarity = 0
        return similarity
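A hypothetical driver for the class above, showing how it appears intended to be used; the documents, labels, and output path are invented, and the config and model files referenced in __init__ must already exist.

analysor = TextAnalysor()
docs = [
    ('first sample document text', ['label_a']),
    ('second sample document text', ['label_b', 'label_c']),
]
for text, labels in docs:
    analysor.set_text(text, labels)            # builds one term vector per document
features, features_by_label, tfidf = analysor.fuzzy_LSA(n_components_for_svd=2)
analysor.save_variables('data/text_features')  # writes a .npy matrix and a .txt label file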