Example #1
    def build_postings(self, node):
        if node.searchable:
            # extra weight to the question text
            # TODO make this generic. weights should be incorporated in the graph
            # TODO a default weight system should be used in case weights are not put in the graph
            searchable_text = " ".join(
                node.searchable + [node.searchable[0]] * 2)
            # get lemmatized tokens
            lemmatized_tokens = utils.lemmatize_text(searchable_text.lower())
            # get stemmed tokens
            stemmed_tokens = utils.stem_text(searchable_text.lower())

            # merge the stemmed tokens into lemmatized_tokens:
            # each stemmed token not already present as a lemma is appended once per occurrence in the text
            lemmatized_tokens_set = set(lemmatized_tokens)
            for token in stemmed_tokens:
                if token not in lemmatized_tokens_set:
                    lemmatized_tokens.append(token)

            # remove stop words
            # lemmatized_tokens = utils.remove_stop_words(lemmatized_tokens, input_type="list")
            token_frequencies = dict()
            # count frequency for every lemmatized token
            for token in lemmatized_tokens:
                token_frequency = token_frequencies.get(token, 0)
                token_frequencies[token] = token_frequency + 1
            # put token and frequency info in postings
            for token in token_frequencies:
                self.search_postings.add_document_for_token(
                    token, node.id, {"tf": token_frequencies[token]})
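
The Postings object behind search_postings is not shown in these examples. As a hedged sketch only (class name, attributes, and method signatures here are assumptions rather than the project's actual API), an inverted index consistent with the add_document_for_token and get_token calls used above could look like this:

class TokenEntry:
    def __init__(self):
        self.doc_list = []   # e.g. [{"id": doc_id, "tf": 2}, ...]
        self.features = {}   # e.g. {"idf": ...}, filled in a later pass


class Postings:
    def __init__(self):
        self.tokens = {}     # token string -> TokenEntry

    def add_document_for_token(self, token, doc_id, features=None):
        # register doc_id (plus optional per-document features) under token
        entry = self.tokens.setdefault(token, TokenEntry())
        doc_info = {"id": doc_id}
        doc_info.update(features or {})
        entry.doc_list.append(doc_info)

    def get_token(self, token):
        # returns None for tokens that were never indexed
        return self.tokens.get(token)
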
Example #2
def get_top_k_suggestions(data, intent_utils, search_postings, k=2):
    query = data["message"]
    query_tokens = utils.lemmatize_text(query.lower())
    query_tokens_set = set(query_tokens)
    stemmed_query_tokens = utils.stem_text(query.lower())
    for token in stemmed_query_tokens:
        if token not in query_tokens_set:
            query_tokens.append(token)
    print("lemmatized, stemmed and stripped query tokens: " + json.dumps(query_tokens))
    # remove stop words
    # query_tokens = utils.remove_stop_words(query_tokens, input_type="list")
    results = []

    # the trigger controls whether to use cosine similarity or just a sum of scores
    # (only the cosine-similarity branch is implemented here)
    trigger = True

    if trigger:
        # initializations
        unique_q_tokens_with_frequencies = dict()
        postings_vocab = search_postings.get_vocabulary()
        postings_word_mapping = search_postings.get_vocabulary(return_type="dict")
        query_vector = [0] * len(postings_vocab)
        doc_set = set()

        # get tf in query
        # and get a doc set
        for q_token in query_tokens:
            freq = unique_q_tokens_with_frequencies.get(q_token, 0)
            unique_q_tokens_with_frequencies[q_token] = freq + 1
            if search_postings.get_token(q_token):
                doc_set = doc_set.union(set(map(lambda x: x["id"], search_postings.get_token(q_token).doc_list)))

        for q_token in query_tokens:
            # for this token, get the idf
            token_obj = search_postings.get_token(q_token)
            if token_obj:
                # compute tf-idf
                idf = token_obj.features["idf"]
                q_tf_idf = unique_q_tokens_with_frequencies[q_token] * idf
                # store in query vector
                query_vector[postings_word_mapping[q_token]] = q_tf_idf

        # compute cosine similarity for each doc
        for doc_id in list(doc_set):
            results.append([doc_id, utils.cosine_similarity(search_postings.doc_term_tf_idf[doc_id], query_vector)])

        # return the top k results
        sorted_results = sorted(results, key=lambda x: x[1], reverse=True)[:k]
        return [doc_id for doc_id, score in sorted_results]
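
utils.cosine_similarity is not shown on this page. A minimal sketch of a cosine similarity over two equal-length vectors (an assumption about what that helper computes, not its actual implementation) could look like this:

import math

def cosine_similarity(vec_a, vec_b):
    # cosine of the angle between two equal-length numeric vectors;
    # returns 0.0 when either vector is all zeros
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)
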
Example #3
def lemmatize_sent(sent):
    return utils.lemmatize_text(
        utils.remove_unalphabetic_words(nltk.word_tokenize(sent)))
Example #4
    def build_extraction_postings(self, db_object, redis_object,
                                  extraction_indices):
        if self.db_info and self.db_info.get("mappings"):
            map_names = self.db_info.get("mappings", []) or []
            for map_name in map_names:
                # initializations
                mapping = dict()
                postings_object = Postings()
                # check whether an entry for this map already exists in redis
                map_value = redis_object.get(map_name)
                # in case the entry has not been populated before
                if not map_value:
                    # get mapping from DB
                    mapping = db_object["mappings"].find_one(
                        {"name": map_name})
                    mapping.pop("_id")
                    # store mapping in Redis
                    redis_object.set(map_name, json.dumps(mapping))
                else:
                    mapping = json.loads(map_value)
                entries = mapping.get("map")
                tokenized_entries = []
                fields_to_index = mapping.get("toIndex")
                # build postings
                for i, entry in enumerate(entries):
                    # use active entries only
                    if entry.get("active"):
                        # collect the texts of all indexed fields;
                        # scalar values are wrapped as single-element lists
                        field_texts = []
                        for field in fields_to_index:
                            value = entry.get(field, []) or []
                            if type(value) != list:
                                value = [str(entry[field])]
                            field_texts += [x for x in value if x]
                        # merge all texts and strip non-alphanumeric characters
                        stripped_text = utils.remove_non_alpha_num_chars(
                            " ".join(field_texts))[0]
                        # generate tokens and register this entry in the postings
                        if stripped_text:
                            for token in set(utils.lemmatize_text(
                                    stripped_text.lower())):
                                postings_object.add_document_for_token(token, i)
                        if not map_value:
                            # tokenize every constituent of the entry so it can
                            # be stored in redis if not already there
                            tokenized_elements = [
                                sorted(utils.lemmatize_text(
                                    utils.remove_non_alpha_num_chars(x)[0]))
                                for x in field_texts
                            ]
                            tokenized_entries.append(tokenized_elements)
                    else:
                        if not map_value:
                            tokenized_entries.append(None)
                extraction_indices[map_name] = postings_object
                if not map_value:
                    # set tokenized mappings in redis if not already there
                    redis_object.set("tokenized" + map_name,
                                     json.dumps(tokenized_entries))
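
For orientation, the mapping documents this method reads look roughly like the dict below; the concrete values are invented for illustration, only the keys ("name", "map", "toIndex", "active") come from the code above:

example_mapping = {
    "name": "cities",                  # looked up via map_name
    "toIndex": ["label", "synonyms"],  # fields whose text gets indexed
    "map": [
        {"active": True, "label": ["New York"], "synonyms": ["NYC", "Big Apple"]},
        {"active": False, "label": ["Old entry"]},  # inactive entries are skipped
    ],
}
# Each active entry is lemmatized and added to a Postings object under its
# position i; its tokenized constituents are cached in redis under
# "tokenized" + map_name.
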
Example #5
import nltk
import utils  # the project's local helper module used throughout these examples
from pickle import load  # assumption: the tagger file below is a standard pickle
from nltk.probability import FreqDist

ruta_archivos = "..\\corpus\\SFU_Spanish_Review_Corpus\\lavadoras"
sustantivos = []
inputt = open('UnigramTagger_cess_esp.pkl', 'rb')
unigram_tagger = load(inputt)
inputt.close()

for file_name in utils.find_all_files_in_path('*.txt', ruta_archivos):
    oraciones = nltk.sent_tokenize(open(file_name).read().replace('\n', '.'))
    palabras_etiquetas = unigram_tagger.tag(nltk.word_tokenize(oraciones[-1]))
    sustantivos_archivo = [
        sustantivo for sustantivo, tag in palabras_etiquetas
        if tag and tag.startswith('n')  # guard against None tags on unseen words
    ]
    sustantivos_archivo = utils.lemmatize_text(sustantivos_archivo)
    sustantivos += sustantivos_archivo

#print(sustantivos)
fd = FreqDist(sustantivos)
print([word for word, freq in fd.most_common(10)])

sustantivos = []
errs = 0
for file_name in utils.find_all_files_in_path('*.txt', ruta_archivos):
    oraciones = nltk.sent_tokenize(open(file_name).read().replace('\n', '.'))
    palabras_etiquetas = unigram_tagger.tag(nltk.word_tokenize(oraciones[-1]))
    sustantivos_archivo = [
        sustantivo for sustantivo, tag in palabras_etiquetas
        if tag and tag.startswith('n')  # guard against None tags on unseen words
    ]
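
The pickled tagger loaded above is not built in this snippet. A plausible sketch of how UnigramTagger_cess_esp.pkl could have been produced (an assumption based on the filename) is the standard NLTK pattern of training a unigram tagger on the cess_esp corpus and pickling it:

import nltk
from nltk.corpus import cess_esp  # requires nltk.download('cess_esp')
from pickle import dump

# train a unigram tagger on the Spanish cess_esp treebank and pickle it
unigram_tagger = nltk.UnigramTagger(cess_esp.tagged_sents())
with open('UnigramTagger_cess_esp.pkl', 'wb') as output:
    dump(unigram_tagger, output, -1)
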
Example #6
def manual_classes_classifier(data_path, column, language, lemmatize,
                              manual_mappings, manual_classes,
                              predicted_classes_filename, should_upload_db,
                              account_key_path):
    print("Build classifier...")
    with open(manual_classes, encoding="utf8") as json_data:
        manual_classes_dict = json.load(json_data)
    classifier = Classifier(manual_classes_dict, language)
    print("Classifier built")
    print()

    print("Loading data...")
    data_df = load_data(data_path, column)
    print("Loaded data sample")
    print(data_df.head())
    print()

    print("Cleaning data...")
    data_df[column] = clean_data(data_df[column])
    print("Clean data sample")
    print(data_df.head())
    print()

    print("Removing stopwors...")
    data_df[column] = remove_stopwords(data_df[column], language)
    print("Data sample")
    print(data_df.head())
    print()

    if lemmatize:
        print("Lemmatizing data...")
        data_df[column] = lemmatize_text(data_df[column], language)
        print("Lemmatized data sample")
        print(data_df.head())
        print()

    if manual_mappings:
        print("Applying manual mappings...")
        data_df[column] = apply_manual_mappings(data_df[column],
                                                manual_mappings)
        print("Manually mapped data sample")
        print(data_df.head())
        print()

    print("Predict classes...")
    predicted_classes = predict(classifier, data_df[column])
    save_classes(predicted_classes, predicted_classes_filename)
    print("Predicted classes saved to:", predicted_classes_filename)
    print()

    if should_upload_db:
        db_client = connect_db(account_key_path)
        print("Uploading predicted classes to db...")
        upload_db(
            db_client, 'predicted_classes', {
                column:
                json.loads(
                    pd.DataFrame(predicted_classes).to_json(orient='index',
                                                            force_ascii=False))
            })
        print('Done')
        print()
Example #7
def text_analysis(
        data_path,
        column,
        groups,
        language,
        lemmatize,
        ngram_range,
        num_topics,
        num_words,
        manual_mappings,
        generate_word_cloud,
        word_cloud_filename,
        frequent_words_filename,
        frequent_words_plot_filename,
        top_tfidf_words_filename,
        top_tfidf_words_plot_filename,
        predict_topics,
        topics_filename,
        predicted_topics_filename,
        ldavis_filename_prefix,
        predict_sentiment,
        predicted_sentiment_filename,
        should_upload_db,
        account_key_path
):
    print("Loading data...")
    data_df = load_data(data_path, column, groups)
    print("Loaded data sample")
    print(data_df.head())
    print()

    print("Cleaning data...")
    data_df[column] = clean_data(data_df[column])
    print("Clean data sample")
    print(data_df.head())
    print()

    print("Removing stop words from data...")
    data_df[column] = remove_stopwords(data_df[column], language)
    print("Data sample")
    print(data_df.head())
    print()

    if lemmatize:
        print("Lemmatizing data...")
        data_df[column] = lemmatize_text(data_df[column], language)
        print("Lemmatized data sample")
        print(data_df.head())
        print()

    if manual_mappings:
        print("Applying manual mappings...")
        data_df[column] = apply_manual_mappings(data_df[column], manual_mappings)
        print("Manually mapped data sample")
        print(data_df.head())
        print()

    if generate_word_cloud:
        print("Generating word cloud...")
        plot_word_cloud(data_df[column], word_cloud_filename, language)
        print("word_cloud saved to:", word_cloud_filename)
        print()

    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
        data_df[column], language, ngram_range
    )
    all_word_count_pair_list = most_frequent_words(
        count_data, count_vectorizer, count_data.shape[0] + 1
    )
    word_count_pair_list = all_word_count_pair_list[:num_words]

    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
        data_df[column], language, ngram_range
    )
    all_tfidf_pair_list = most_frequent_words(
        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
    )
    tfidf_pair_list = all_tfidf_pair_list[:num_words]

    print("Saving frequent words...")
    save_words(
        all_word_count_pair_list,
        frequent_words_filename
    )
    print("Frequent words saved to:", frequent_words_filename)
    print()

    if should_upload_db:
        db_client = connect_db(account_key_path)
    else:
        db_client = None

    if should_upload_db:
        print("Uploading frequent words to db...")
        upload_db(db_client, 'frequent_words', {
            column: {w: int(c) for w, c in word_count_pair_list}
        })
        print('Done')
        print()

    print("Generating frequent word plot...")
    plot_top_words(word_count_pair_list, frequent_words_plot_filename)
    print("Frequent word plot saved to:", frequent_words_plot_filename)
    print()

    print("Saving top tfidf words...")
    save_words(
        all_tfidf_pair_list,
        top_tfidf_words_filename
    )
    print("Top tfidf words saved to:", top_tfidf_words_filename)
    print()

    if should_upload_db:
        print("Uploading frequent words to db...")
        upload_db(db_client, 'top_tfidf', {
            column: {w: int(c) for w, c in tfidf_pair_list}
        })
        print('Done')
        print()

    print("Generating top tfidf word plot...")
    plot_top_words(tfidf_pair_list, top_tfidf_words_plot_filename)
    print("Top tfidf word plot saved to:", top_tfidf_words_plot_filename)
    print()

    if groups:
        group_unique_vals = {}
        for group in groups:
            group_unique_vals[group] = data_df[group].unique()

        splits = {}
        for group, unique_vals in group_unique_vals.items():
            for val in unique_vals:
                splits[(group, val)] = data_df[group] == val

        for i in range(len(groups) - 1):
            splits = concat_splits(splits)

        grouped_words_counts = {}
        grouped_words_tfidf = {}

        for key, split_idcs in splits.items():
            split = data_df[split_idcs]
            split_texts = split[column]

            if len(split_texts) > 0 and any(split_texts.str.len() > 0):
                word_cloud_filename_val = add_prefix_to_filename(
                    word_cloud_filename, key
                )
                frequent_words_filename_val = add_prefix_to_filename(
                    frequent_words_filename, key
                )
                frequent_words_plot_filename_val = add_prefix_to_filename(
                    frequent_words_plot_filename, key
                )
                top_tfidf_words_filename_val = add_prefix_to_filename(
                    top_tfidf_words_filename, key
                )
                top_tfidf_words_plot_filename_val = add_prefix_to_filename(
                    top_tfidf_words_plot_filename, key
                )

                if generate_word_cloud:
                    print("Generating word cloud...")
                    plot_word_cloud(split_texts, word_cloud_filename_val, language)
                    print("word_cloud saved to:", word_cloud_filename_val)
                    print()

                try:
                    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
                        split_texts, language, ngram_range
                    )
                    all_word_count_pair_list = most_frequent_words(
                        count_data, count_vectorizer, count_data.shape[0] + 1
                    )
                    word_count_pair_list = all_word_count_pair_list[:num_words]

                    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
                        split_texts, language, ngram_range
                    )
                    all_tfidf_pair_list = most_frequent_words(
                        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
                    )
                    tfidf_pair_list = all_tfidf_pair_list[:num_words]

                    print("Saving frequent words...")
                    save_words(
                        all_word_count_pair_list,
                        frequent_words_filename_val
                    )
                    print("Frequent words saved to:", frequent_words_filename_val)
                    print()

                    print("Generating frequent word plot...")
                    plot_top_words(word_count_pair_list, frequent_words_plot_filename_val)
                    print("Frequent word plot saved to:", frequent_words_plot_filename_val)
                    print()

                    print("Saving top tfidf words...")
                    save_words(
                        all_tfidf_pair_list,
                        top_tfidf_words_filename_val
                    )
                    print("Top tfidf words saved to:", top_tfidf_words_filename_val)
                    print()

                    print("Generating top tfidf word plot...")
                    plot_top_words(tfidf_pair_list, top_tfidf_words_plot_filename_val)
                    print("Top tfidf word plot saved to:", top_tfidf_words_plot_filename_val)
                    print()

                    grouped_words_counts[key[1::2]] = {
                        w: int(c) for w, c in all_word_count_pair_list
                    }
                    grouped_words_tfidf[key[1::2]] = {
                        w: int(c) for w, c in all_tfidf_pair_list
                    }
                except Exception:
                    print("Error processing", key,
                          "- skipping it; the texts are probably all stopwords")

        print("Saving grouped frequent words...")
        group_frequent_words_filename = add_prefix_to_filename(
            frequent_words_filename, groups
        )
        remapped_grouped_words_counts = remap_keys(grouped_words_counts, groups)
        with open(group_frequent_words_filename, 'w', encoding="utf8") as f:
            json.dump(remapped_grouped_words_counts, f, ensure_ascii=False)
        print("Frequent words saved to:", group_frequent_words_filename)
        print()

        if should_upload_db:
            print("Uploading grouped_words_counts to db...")
            upload_db(db_client, 'grouped_words_counts', {
                column: remap_to_dict(remapped_grouped_words_counts)
            })
            print('Done')
            print()

        print("Saving grouped top tfidf words...")
        group_top_tfidf_words_filename = add_prefix_to_filename(
            top_tfidf_words_filename, groups
        )
        remapped_grouped_words_tfidf = remap_keys(grouped_words_tfidf, groups)
        with open(group_top_tfidf_words_filename, 'w', encoding="utf8") as f:
            json.dump(remapped_grouped_words_tfidf, f, ensure_ascii=False)
        print("Top tfidf words saved to:", group_top_tfidf_words_filename)
        print()

        if should_upload_db:
            print("Uploading grouped_words_tfidf to db...")
            upload_db(db_client, 'grouped_words_tfidf', {
                column: remap_to_dict(remapped_grouped_words_tfidf)
            })
            print('Done')
            print()

    if predict_topics:
        print("Calculating topic model...")
        lda, predicted_topics = learn_topic_model(tfidf_data, num_topics)
        print("Topics found via LDA:")
        print_topics(lda, tfidf_vectorizer, num_words)
        print("Saving topics...")
        save_topics(lda, tfidf_vectorizer, topics_filename)
        print("Topics saved to:", topics_filename)
        print()

        print("Saving predicted topics...")
        save_predicted_topics(predicted_topics, predicted_topics_filename)
        print("Predicted topics saved to:", predicted_topics_filename)
        print()

        if should_upload_db:
            print("Uploading predicted topics to db...")
            upload_db(db_client, 'predicted_topics', {
                column: json.loads(pd.DataFrame(predicted_topics).to_json(
                    orient='index', force_ascii=False
                ))
            })
            print('Done')
            print()

        print("Generating LDA visualization...")
        visualize_topic_model(lda, count_data, tfidf_vectorizer,
                              num_topics, ldavis_filename_prefix)
        print("LDA visualization saved to:", ldavis_filename_prefix)
        print()

    if predict_sentiment:
        if language == 'it':
            print("Predict sentiment...")
            predicted_sentiment = predict_sentiment_with_sentita(data_df[column])
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predict sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                print("Uploading predicted sentiment to db...")
                upload_db(db_client, 'predicted_sentiment', {
                    column: json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    ))
                })
                print('Done')
                print()

        elif language == 'en':
            print("Predict sentiment...")
            predicted_sentiment = predict_sentiment_with_paralleldots(data_df)
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predict sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                print("Uploading predicted sentiment to db...")
                upload_db(db_client, 'predicted_sentiment', {
                    column: json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    ))
                })
                print('Done')
                print()
        else:
            print("Sentiment analysis on {} language is not supported")
            print()
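
Helpers such as add_prefix_to_filename are not shown on this page. Purely as an illustration of the per-group filename handling used above, a hypothetical implementation might look like the following (the real project may do this differently):

import os

def add_prefix_to_filename(filename, key):
    # hypothetical sketch: turn the group key (a string, or a tuple such as
    # ("gender", "f")) into a prefix and prepend it to the base filename
    if isinstance(key, (list, tuple)):
        prefix = "_".join(str(part) for part in key)
    else:
        prefix = str(key)
    directory, base = os.path.split(filename)
    return os.path.join(directory, prefix + "_" + base)

# e.g. add_prefix_to_filename("out/frequent_words.json", ("gender", "f"))
#      -> "out/gender_f_frequent_words.json"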