def build_postings(self, node):
    if node.searchable:
        # give extra weight to the question text by repeating it
        # (joined with spaces so that tokens do not get fused together)
        # TODO make this generic. weights should be incorporated in the graph
        # TODO a default weight system should be used in case weights are not put in the graph
        searchable_text = " ".join(node.searchable) + " " + " ".join([node.searchable[0]] * 2)

        # get lemmatized tokens
        lemmatized_tokens = utils.lemmatize_text(searchable_text.lower())
        # get stemmed tokens
        stemmed_tokens = utils.stem_text(searchable_text.lower())

        # merge the stemmed tokens into lemmatized_tokens:
        # every stemmed token not already present is appended once per occurrence in the text
        lemmatized_tokens_set = set(lemmatized_tokens)
        for token in stemmed_tokens:
            if token not in lemmatized_tokens_set:
                lemmatized_tokens.append(token)

        # remove stop words
        # lemmatized_tokens = utils.remove_stop_words(lemmatized_tokens, input_type="list")

        # count the frequency of every token
        token_frequencies = dict()
        for token in lemmatized_tokens:
            token_frequency = token_frequencies.get(token, 0)
            token_frequencies[token] = token_frequency + 1

        # put token and frequency info in the postings
        for token in token_frequencies:
            self.search_postings.add_document_for_token(
                token, node.id, {"tf": token_frequencies[token]})
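The Postings container behind search_postings is not shown in these snippets. A minimal, purely illustrative stand-in that supports the calls made here (add_document_for_token, get_token, get_vocabulary) might look like the sketch below; all names in it are assumptions, not the project's actual implementation.

# Illustrative sketch only: a minimal inverted-index container compatible with
# the calls made in these snippets. The real Postings class may differ.
class SimpleToken(object):
    def __init__(self):
        self.doc_list = []   # [{"id": ..., "tf": ...}, ...]
        self.features = {}   # e.g. {"idf": ...}, filled in a separate pass (not shown)

class SimplePostings(object):
    def __init__(self):
        self.tokens = {}
        self.doc_term_tf_idf = {}  # doc_id -> dense tf-idf vector (not built here)

    def add_document_for_token(self, token, doc_id, features=None):
        # append one document entry to the token's posting list
        token_obj = self.tokens.setdefault(token, SimpleToken())
        doc = {"id": doc_id}
        doc.update(features or {})
        token_obj.doc_list.append(doc)

    def get_token(self, token):
        return self.tokens.get(token)

    def get_vocabulary(self, return_type="list"):
        # sorted vocabulary, either as a list or as a token -> index mapping
        vocab = sorted(self.tokens)
        if return_type == "dict":
            return {token: index for index, token in enumerate(vocab)}
        return vocab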
def get_top_k_suggestions(data, intent_utils, search_postings, k=2):
    query = data["message"]
    query_tokens = utils.lemmatize_text(query.lower())
    query_tokens_set = set(query_tokens)
    stemmed_query_tokens = utils.stem_text(query.lower())
    for token in stemmed_query_tokens:
        if token not in query_tokens_set:
            query_tokens = query_tokens + [token]
    print("lemmatized, stemmed and stripped query tokens: " + json.dumps(query_tokens))

    # remove stop words
    # query_tokens = utils.remove_stop_words(query_tokens, input_type="list")

    results = []
    # the trigger controls whether to use cosine similarity or just a sum of scores
    trigger = True
    if trigger:
        # initializations
        unique_q_tokens_with_frequencies = dict()
        postings_vocab = search_postings.get_vocabulary()
        postings_word_mapping = search_postings.get_vocabulary(return_type="dict")
        query_vector = [0] * len(postings_vocab)
        doc_set = set()

        # get tf in the query and build the candidate document set
        for q_token in query_tokens:
            freq = unique_q_tokens_with_frequencies.get(q_token, 0)
            unique_q_tokens_with_frequencies[q_token] = freq + 1
            if search_postings.get_token(q_token):
                doc_set = doc_set.union(
                    set(map(lambda x: x["id"],
                            search_postings.get_token(q_token).doc_list)))

        for q_token in query_tokens:
            # for this token, get the idf
            token_obj = search_postings.get_token(q_token)
            if token_obj:
                # compute tf-idf
                idf = token_obj.features["idf"]
                q_tf_idf = unique_q_tokens_with_frequencies[q_token] * idf
                # store in the query vector
                query_vector[postings_word_mapping[q_token]] = q_tf_idf

        # compute cosine similarity for each candidate doc
        for doc_id in list(doc_set):
            results.append([doc_id,
                            utils.cosine_similarity(
                                search_postings.doc_term_tf_idf[doc_id],
                                query_vector)])

    # return the top k results
    sorted_results = sorted(results, key=lambda x: x[1], reverse=True)[:k]
    return [x[0] for x in sorted_results]
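utils.cosine_similarity is not shown in this snippet. The sketch below is a plain-Python version of the standard formula it is presumed to implement over the dense document and query vectors; the function name and zero-vector handling are assumptions.

import math

# Assumed helper: a straightforward cosine similarity over two equal-length vectors.
def cosine_similarity(vec_a, vec_b):
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))
    if norm_a == 0 or norm_b == 0:
        # guard against empty vectors; the real helper may behave differently
        return 0.0
    return dot / (norm_a * norm_b)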
def lemmatize_sent(sent):
    return utils.lemmatize_text(
        utils.remove_unalphabetic_words(nltk.word_tokenize(sent)))
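For reference, a quick usage sketch; the exact lemmas returned depend on the lemmatizer behind utils.lemmatize_text.

# Usage sketch: tokenize, drop non-alphabetic tokens, then lemmatize.
tokens = lemmatize_sent("The cats were sitting on 2 mats.")
# remove_unalphabetic_words should drop "2" and the punctuation, so the result
# is expected to look roughly like ["The", "cat", "be", "sit", "on", "mat"]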
def build_extraction_postings(self, db_object, redis_object, extraction_indices):
    if self.db_info and self.db_info.get("mappings"):
        map_names = self.db_info.get("mappings", []) or []
        for map_name in map_names:
            # initializations
            mapping = dict()
            postings_object = Postings()

            # skip the DB lookup if an entry for the map already exists in redis
            map_value = redis_object.get(map_name)
            if not map_value:
                # the entry has not been populated before: get the mapping from the DB
                mapping = db_object["mappings"].find_one({"name": map_name})
                mapping.pop("_id")
                # store the mapping in redis
                redis_object.set(map_name, json.dumps(mapping))
            else:
                mapping = json.loads(map_value)

            entries = mapping.get("map")
            tokenized_entries = []
            fields_to_index = mapping.get("toIndex")

            # build postings
            for i, entry in enumerate(entries):
                # only index active entries
                if entry.get("active"):
                    # flatten all indexable field values of the entry into one list,
                    # wrapping scalar values in a single-element list and dropping empties
                    field_values = []
                    for field in fields_to_index:
                        value = entry.get(field, []) or []
                        if not isinstance(value, list):
                            value = [str(entry[field])]
                        field_values.extend(value)
                    field_values = [value for value in field_values if value]

                    # merge all texts
                    stripped_text = utils.remove_non_alpha_num_chars(
                        " ".join(field_values))[0]

                    # generate tokens and add them to the postings
                    if stripped_text:
                        for token in set(utils.lemmatize_text(stripped_text.lower())):
                            postings_object.add_document_for_token(token, i)

                    if not map_value:
                        # tokenize every constituent of the entry so the tokenized
                        # mapping can be cached in redis below
                        tokenized_elements = [
                            sorted(utils.lemmatize_text(
                                utils.remove_non_alpha_num_chars(value)[0]))
                            for value in field_values
                        ]
                        tokenized_entries.append(tokenized_elements)
                else:
                    if not map_value:
                        tokenized_entries.append(None)

            extraction_indices[map_name] = postings_object
            if not map_value:
                # cache the tokenized mappings in redis if they were not already there
                redis_object.set("tokenized" + map_name, json.dumps(tokenized_entries))
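To make the field-flattening step above concrete, here is a toy, self-contained illustration; the entry and field names are made up and no DB or redis is involved.

# Toy illustration of the flattening step: list fields are spread out,
# scalar fields are wrapped in a single-element list, empties are dropped.
entry = {
    "active": True,
    "names": ["washing machine", "washer"],
    "brand": "Acme",
    "notes": None,
}
fields_to_index = ["names", "brand", "notes"]

field_values = []
for field in fields_to_index:
    value = entry.get(field, []) or []
    if not isinstance(value, list):
        value = [str(entry[field])]
    field_values.extend(value)
field_values = [value for value in field_values if value]

print(field_values)            # ['washing machine', 'washer', 'Acme']
print(" ".join(field_values))  # 'washing machine washer Acme'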
from pickle import load

import nltk
from nltk.probability import FreqDist

import utils

ruta_archivos = "..\\corpus\\SFU_Spanish_Review_Corpus\\lavadoras"
sustantivos = []

# load the unigram tagger previously trained on the cess_esp corpus
inputt = open('UnigramTagger_cess_esp.pkl', 'rb')
unigram_tagger = load(inputt)
inputt.close()

for file_name in utils.find_all_files_in_path('*.txt', ruta_archivos):
    oraciones = nltk.sent_tokenize(open(file_name).read().replace('\n', '.'))
    palabras_etiquetas = unigram_tagger.tag(nltk.word_tokenize(oraciones[-1]))
    # keep only nouns (cess_esp noun tags start with 'n'); unknown words are tagged None
    sustantivos_archivo = [
        sustantivo for sustantivo, tag in palabras_etiquetas
        if tag and tag.startswith('n')
    ]
    sustantivos_archivo = utils.lemmatize_text(sustantivos_archivo)
    sustantivos += sustantivos_archivo

# print(sustantivos)
fd = FreqDist(sustantivos)
print([word for word, freq in fd.most_common(10)])

sustantivos = []
errs = 0
for file_name in utils.find_all_files_in_path('*.txt', ruta_archivos):
    oraciones = nltk.sent_tokenize(open(file_name).read().replace('\n', '.'))
    palabras_etiquetas = unigram_tagger.tag(nltk.word_tokenize(oraciones[-1]))
    sustantivos_archivo = [
        sustantivo for sustantivo, tag in palabras_etiquetas
        if tag and tag.startswith('n')
    ]
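utils.find_all_files_in_path is not shown in this snippet. A likely equivalent, assuming it recursively yields every file under a directory that matches a glob pattern, is sketched below; the name and signature mirror the call above but are otherwise an assumption.

import fnmatch
import os

# Assumed helper: walk a directory tree and yield paths matching a glob pattern.
def find_all_files_in_path(pattern, path):
    for root, _dirs, files in os.walk(path):
        for name in fnmatch.filter(files, pattern):
            yield os.path.join(root, name)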
def manual_classes_classifier(data_path, column, language, lemmatize, manual_mappings,
                              manual_classes, predicted_classes_filename,
                              should_upload_db, account_key_path):
    print("Build classifier...")
    with open(manual_classes, encoding="utf8") as json_data:
        manual_classes_dict = json.load(json_data)
    classifier = Classifier(manual_classes_dict, language)
    print("Classifier built")
    print()

    print("Loading data...")
    data_df = load_data(data_path, column)
    print("Loaded data sample")
    print(data_df.head())
    print()

    print("Cleaning data...")
    data_df[column] = clean_data(data_df[column])
    print("Clean data sample")
    print(data_df.head())
    print()

    print("Removing stopwords...")
    data_df[column] = remove_stopwords(data_df[column], language)
    print("Data sample")
    print(data_df.head())
    print()

    if lemmatize:
        print("Lemmatizing data...")
        data_df[column] = lemmatize_text(data_df[column], language)
        print("Lemmatized data sample")
        print(data_df.head())
        print()

    if manual_mappings:
        print("Applying manual mappings...")
        data_df[column] = apply_manual_mappings(data_df[column], manual_mappings)
        print("Manually mapped data sample")
        print(data_df.head())
        print()

    print("Predict classes...")
    predicted_classes = predict(classifier, data_df[column])
    save_classes(predicted_classes, predicted_classes_filename)
    print("Predicted classes saved to:", predicted_classes_filename)
    print()

    if should_upload_db:
        db_client = connect_db(account_key_path)
        print("Uploading predicted classes to db...")
        upload_db(
            db_client, 'predicted_classes', {
                column: json.loads(
                    pd.DataFrame(predicted_classes).to_json(orient='index',
                                                            force_ascii=False))
            })
        print('Done')
        print()
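The Classifier and predict used above are not shown in this snippet. Purely as an illustration, a minimal keyword-matching stand-in, assuming the manual classes JSON maps class names to lists of keywords, could look like this; the names KeywordClassifier and predict_with_keywords are hypothetical.

# Hypothetical stand-in for the project's Classifier; real behavior may differ.
class KeywordClassifier(object):
    def __init__(self, classes_to_keywords, language):
        self.language = language
        # normalize keywords to lowercase sets for fast membership tests
        self.classes_to_keywords = {
            cls: set(keyword.lower() for keyword in keywords)
            for cls, keywords in classes_to_keywords.items()
        }

    def classify(self, text):
        # return every class whose keywords overlap with the text's tokens
        tokens = set(text.lower().split())
        return [cls for cls, keywords in self.classes_to_keywords.items()
                if tokens & keywords]

def predict_with_keywords(classifier, texts):
    return [classifier.classify(text) for text in texts]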
def text_analysis(
        data_path,
        column,
        groups,
        language,
        lemmatize,
        ngram_range,
        num_topics,
        num_words,
        manual_mappings,
        generate_word_cloud,
        word_cloud_filename,
        frequent_words_filename,
        frequent_words_plot_filename,
        top_tfidf_words_filename,
        top_tfidf_words_plot_filename,
        predict_topics,
        topics_filename,
        predicted_topics_filename,
        ldavis_filename_prefix,
        predict_sentiment,
        predicted_sentiment_filename,
        should_upload_db,
        account_key_path
):
    print("Loading data...")
    data_df = load_data(data_path, column, groups)
    print("Loaded data sample")
    print(data_df.head())
    print()

    print("Cleaning data...")
    data_df[column] = clean_data(data_df[column])
    print("Clean data sample")
    print(data_df.head())
    print()

    print("Removing stop words from data...")
    data_df[column] = remove_stopwords(data_df[column], language)
    print("Data sample")
    print(data_df.head())
    print()

    if lemmatize:
        print("Lemmatizing data...")
        data_df[column] = lemmatize_text(data_df[column], language)
        print("Lemmatized data sample")
        print(data_df.head())
        print()

    if manual_mappings:
        print("Applying manual mappings...")
        data_df[column] = apply_manual_mappings(data_df[column], manual_mappings)
        print("Manually mapped data sample")
        print(data_df.head())
        print()

    if generate_word_cloud:
        print("Generating word cloud...")
        plot_word_cloud(data_df[column], word_cloud_filename, language)
        print("word_cloud saved to:", word_cloud_filename)
        print()

    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
        data_df[column], language, ngram_range
    )
    all_word_count_pair_list = most_frequent_words(
        count_data, count_vectorizer, count_data.shape[0] + 1
    )
    word_count_pair_list = all_word_count_pair_list[:num_words]

    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
        data_df[column], language, ngram_range
    )
    all_tfidf_pair_list = most_frequent_words(
        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
    )
    tfidf_pair_list = all_tfidf_pair_list[:num_words]

    print("Saving frequent words...")
    save_words(
        all_word_count_pair_list,
        frequent_words_filename
    )
    print("Frequent words saved to:", frequent_words_filename)
    print()

    if should_upload_db:
        db_client = connect_db(account_key_path)
    else:
        db_client = None

    if should_upload_db:
        print("Uploading frequent words to db...")
        upload_db(db_client, 'frequent_words', {
            column: {w: int(c) for w, c in word_count_pair_list}
        })
        print('Done')
        print()

    print("Generating frequent word plot...")
    plot_top_words(word_count_pair_list, frequent_words_plot_filename)
    print("Frequent word plot saved to:", frequent_words_plot_filename)
    print()

    print("Saving top tfidf words...")
    save_words(
        all_tfidf_pair_list,
        top_tfidf_words_filename
    )
    print("Top tfidf words saved to:", top_tfidf_words_filename)
    print()

    if should_upload_db:
        print("Uploading top tfidf words to db...")
        upload_db(db_client, 'top_tfidf', {
            column: {w: int(c) for w, c in tfidf_pair_list}
        })
        print('Done')
        print()

    print("Generating top tfidf word plot...")
    plot_top_words(tfidf_pair_list, top_tfidf_words_plot_filename)
    print("Top tfidf word plot saved to:", top_tfidf_words_plot_filename)
    print()

    if groups:
        group_unique_vals = {}
        for group in groups:
            group_unique_vals[group] = data_df[group].unique()

        splits = {}
        for group, unique_vals in group_unique_vals.items():
            for val in unique_vals:
                splits[(group, val)] = data_df[group] == val

        for i in range(len(groups) - 1):
            splits = concat_splits(splits)

        grouped_words_counts = {}
        grouped_words_tfidf = {}

        for key, split_idcs in splits.items():
            split = data_df[split_idcs]
            split_texts = split[column]

            if len(split_texts) > 0 and any(split_texts.str.len() > 0):
                word_cloud_filename_val = add_prefix_to_filename(
                    word_cloud_filename, key
                )
                frequent_words_filename_val = add_prefix_to_filename(
                    frequent_words_filename, key
                )
                frequent_words_plot_filename_val = add_prefix_to_filename(
                    frequent_words_plot_filename, key
                )
                top_tfidf_words_filename_val = add_prefix_to_filename(
                    top_tfidf_words_filename, key
                )
                top_tfidf_words_plot_filename_val = add_prefix_to_filename(
                    top_tfidf_words_plot_filename, key
                )

                if generate_word_cloud:
                    print("Generating word cloud...")
                    plot_word_cloud(split_texts, word_cloud_filename_val, language)
                    print("word_cloud saved to:", word_cloud_filename_val)
                    print()

                try:
                    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
                        split_texts, language, ngram_range
                    )
                    all_word_count_pair_list = most_frequent_words(
                        count_data, count_vectorizer, count_data.shape[0] + 1
                    )
                    word_count_pair_list = all_word_count_pair_list[:num_words]

                    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
                        split_texts, language, ngram_range
                    )
                    all_tfidf_pair_list = most_frequent_words(
                        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
                    )
                    tfidf_pair_list = all_tfidf_pair_list[:num_words]

                    print("Saving frequent words...")
                    save_words(
                        all_word_count_pair_list,
                        frequent_words_filename_val
                    )
                    print("Frequent words saved to:", frequent_words_filename_val)
                    print()

                    print("Generating frequent word plot...")
                    plot_top_words(word_count_pair_list,
                                   frequent_words_plot_filename_val)
                    print("Frequent word plot saved to:",
                          frequent_words_plot_filename_val)
                    print()

                    print("Saving top tfidf words...")
                    save_words(
                        all_tfidf_pair_list,
                        top_tfidf_words_filename_val
                    )
                    print("Top tfidf words saved to:", top_tfidf_words_filename_val)
                    print()

                    print("Generating top tfidf word plot...")
                    plot_top_words(tfidf_pair_list,
                                   top_tfidf_words_plot_filename_val)
                    print("Top tfidf word plot saved to:",
                          top_tfidf_words_plot_filename_val)
                    print()

                    grouped_words_counts[key[1::2]] = {
                        w: int(c) for w, c in all_word_count_pair_list
                    }
                    grouped_words_tfidf[key[1::2]] = {
                        w: int(c) for w, c in all_tfidf_pair_list
                    }
                except Exception:
                    print("Error processing", key,
                          "skipping it. texts are probably all stopwords")

        print("Saving grouped frequent words...")
        group_frequent_words_filename = add_prefix_to_filename(
            frequent_words_filename, groups
        )
        remapped_grouped_words_counts = remap_keys(grouped_words_counts, groups)
        with open(group_frequent_words_filename, 'w', encoding="utf8") as f:
            json.dump(remapped_grouped_words_counts, f, ensure_ascii=False)
        print("Frequent words saved to:", group_frequent_words_filename)
        print()

        if should_upload_db:
            print("Uploading grouped_words_counts to db...")
            upload_db(db_client, 'grouped_words_counts', {
                column: remap_to_dict(remapped_grouped_words_counts)
            })
            print('Done')
            print()

        print("Saving grouped top tfidf words...")
        group_top_tfidf_words_filename = add_prefix_to_filename(
            top_tfidf_words_filename, groups
        )
        remapped_grouped_words_tfidf = remap_keys(grouped_words_tfidf, groups)
        with open(group_top_tfidf_words_filename, 'w', encoding="utf8") as f:
            json.dump(remapped_grouped_words_tfidf, f, ensure_ascii=False)
        print("Top tfidf words saved to:", group_top_tfidf_words_filename)
        print()

        if should_upload_db:
            print("Uploading grouped_words_tfidf to db...")
            upload_db(db_client, 'grouped_words_tfidf', {
                column: remap_to_dict(remapped_grouped_words_tfidf)
            })
            print('Done')
            print()

    if predict_topics:
        print("Calculating topic model...")
        lda, predicted_topics = learn_topic_model(tfidf_data, num_topics)
        print("Topics found via LDA:")
        print_topics(lda, tfidf_vectorizer, num_words)
        print("Saving topics...")
        save_topics(lda, tfidf_vectorizer, topics_filename)
        print("Topics saved to:", topics_filename)
        print()

        print("Saving predicted topics...")
        save_predicted_topics(predicted_topics, predicted_topics_filename)
        print("Predicted topics saved to:", predicted_topics_filename)
        print()

        if should_upload_db:
            print("Uploading predicted topics to db...")
            upload_db(db_client, 'predicted_topics', {
                column: json.loads(pd.DataFrame(predicted_topics).to_json(
                    orient='index', force_ascii=False
                ))
            })
            print('Done')
            print()

        print("Generating LDA visualization...")
        visualize_topic_model(lda, count_data, tfidf_vectorizer,
                              num_topics, ldavis_filename_prefix)
        print("LDA visualization saved to:", ldavis_filename_prefix)
        print()

    if predict_sentiment:
        if language == 'it':
            print("Predict sentiment...")
            predicted_sentiment = predict_sentiment_with_sentita(data_df[column])
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predict sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                print("Uploading predicted sentiment to db...")
                upload_db(db_client, 'predicted_sentiment', {
                    column: json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    ))
                })
                print('Done')
                print()
        elif language == 'en':
            print("Predict sentiment...")
            predicted_sentiment = predict_sentiment_with_paralleldots(data_df)
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predict sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                print("Uploading predicted sentiment to db...")
                upload_db(db_client, 'predicted_sentiment', {
                    column: json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    ))
                })
                print('Done')
                print()
        else:
            print("Sentiment analysis on {} language is not supported".format(language))
            print()
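remap_keys and remap_to_dict are not shown in this snippet. Since json.dump cannot serialize the tuple keys used for grouped results, a plausible shape for these helpers is sketched below; the names remap_keys_sketch and remap_to_dict_sketch, and the record layout, are assumptions for illustration only.

# Assumed helpers: convert tuple-keyed grouped results into JSON-friendly records.
def remap_keys_sketch(grouped_values, groups):
    """Turn {(val1, val2, ...): words} into a list of {"group": ..., "words": ...} records."""
    return [
        {"group": dict(zip(groups, key)), "words": words}
        for key, words in grouped_values.items()
    ]

def remap_to_dict_sketch(remapped_values):
    """Re-key the records by a string join of the group values, for upload."""
    return {
        "_".join(str(value) for value in record["group"].values()): record["words"]
        for record in remapped_values
    }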