def get_word_overlap_score(actual_word_lists, generated_word_lists):
    # lexicon_helper is a project-local module that supplies stopword and
    # sentiment-word sets.
    english_stopwords = lexicon_helper.get_stopwords()
    sentiment_words = lexicon_helper.get_sentiment_words()

    # Jaccard overlap of content words, averaged over all sentence pairs.
    scores = list()
    for word_list_1, word_list_2 in zip(actual_word_lists, generated_word_lists):
        words_1 = set(word_list_1)
        words_2 = set(word_list_2)

        # Drop sentiment words and stopwords so the score reflects content
        # preservation rather than style.
        words_1 -= sentiment_words
        words_1 -= english_stopwords
        words_2 -= sentiment_words
        words_2 -= english_stopwords

        word_intersection = words_1 & words_2
        word_union = words_1 | words_2
        if word_union:
            score = len(word_intersection) / len(word_union)
            scores.append(score)

    word_overlap_score = statistics.mean(scores) if scores else 0

    del english_stopwords
    del sentiment_words

    return word_overlap_score
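A minimal usage sketch, assuming lexicon_helper exposes get_stopwords() and get_sentiment_words() returning sets; the token lists below are invented for illustration:

# Hypothetical reference and generated token lists.
actual = [["the", "food", "was", "amazing"], ["the", "service", "is", "slow"]]
generated = [["the", "food", "was", "terrible"], ["quick", "service", "here"]]

# Averaged Jaccard overlap of content words; stopwords and sentiment words are
# removed first, so "amazing" vs. "terrible" does not affect the score.
print(get_word_overlap_score(actual, generated))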
Example #2
def build_word_statistics(text_file_path, label_file_path):
    # Build the vocabulary with a Keras tokenizer, then drop stopwords.
    text_tokenizer = tf.keras.preprocessing.text.Tokenizer()
    with open(text_file_path) as text_file:
        text_tokenizer.fit_on_texts(text_file)
    vocab = set(text_tokenizer.word_index.keys())
    del text_tokenizer
    stopwords = lexicon_helper.get_stopwords()
    vocab -= stopwords
    logger.debug(vocab)

    labels = set()
    with open(label_file_path) as label_file:
        for label_line in label_file:
            label = label_line.strip()
            labels.add(label)
    logger.debug(labels)

    empty_template = dict()
    for label in labels:
        empty_template[label] = 0
    logger.debug(empty_template)

    # Count, per word, how often it co-occurs with each label. Words of
    # length <= 3 and out-of-vocabulary tokens are skipped.
    word_occurrences = dict()
    with open(text_file_path) as text_file, open(label_file_path) as label_file:
        for text_line, label_line in zip(text_file, label_file):
            words = text_line.strip().split()
            label = label_line.strip()
            for word in words:
                if len(word) > 3 and word in vocab:
                    if word not in word_occurrences:
                        word_occurrences[word] = empty_template.copy()
                    occurrence = word_occurrences[word]
                    occurrence[label] += 1
    logger.debug(word_occurrences)

    # Score each word against each label. As written, the score is
    # count_with_label * log(count_with_label) / log(count_with_other_labels);
    # zero counts (or an other-label count of 1) raise inside the loop and the
    # word is simply skipped for that label.
    label_word_scores = dict()
    for label in labels:
        word_scores = list()
        for word in word_occurrences:
            try:
                occurrence = word_occurrences[word]
                positive_count = occurrence[label]
                negative_count = sum(occurrence.values()) - positive_count
                kld = positive_count * (math.log(positive_count) / math.log(negative_count))
                word_scores.append((kld, word))
            except Exception:
                logger.debug("error while processing word '{}' for label '{}'".format(word, label))
        label_word_scores[label] = word_scores

    logger.debug(label_word_scores)

    # Report the 100 highest-scoring words for each label.
    for label in label_word_scores:
        word_scores = label_word_scores[label]
        word_scores.sort(key=lambda x: x[0], reverse=True)
        most_correlated = [x[1] for x in word_scores[:100]]
        logger.info("For label '{}'".format(label))
        logger.info("Most correlated words: {}".format(most_correlated))
Example #3
def populate_word_blacklist(word_index):
    # Words to exclude from the BoW representation: the predefined entries
    # plus, optionally, sentiment words and stopwords.
    blacklisted_words = set()
    blacklisted_words |= set(global_config.predefined_word_index.values())
    if global_config.filter_sentiment_words:
        blacklisted_words |= lexicon_helper.get_sentiment_words()
    if global_config.filter_stopwords:
        blacklisted_words |= lexicon_helper.get_stopwords()

    # Map each surviving vocabulary index to a compact BoW position in the
    # module-level bow_filtered_vocab_indices dict.
    global bow_filtered_vocab_indices
    allowed_vocab = word_index.keys() - blacklisted_words
    for i, word in enumerate(allowed_vocab):
        vocab_index = word_index[word]
        bow_filtered_vocab_indices[vocab_index] = i

    global_config.bow_size = len(allowed_vocab)
    logger.info("Created word index blacklist for BoW")
    logger.info("BoW size: {}".format(global_config.bow_size))
Example #4
def build_word_statistics(text_file_path, label_file_path, no_sentiment_words=True):
    if no_sentiment_words:
        logger.info("Also excluding sentiment words!")

    # Build the vocabulary with NLTK's TweetTokenizer over lowercased text,
    # then drop stopwords (and, optionally, sentiment words).
    text_tokenizer = TweetTokenizer()
    vocab = set()
    with open(text_file_path) as text_file:
        for line in text_file:
            vocab.update(text_tokenizer.tokenize(line.lower()))
    stopwords = lexicon_helper.get_stopwords()
    vocab -= stopwords
    if no_sentiment_words:
        sentiment_words = lexicon_helper.get_sentiment_words()
        vocab -= sentiment_words
    logger.debug(vocab)
    logger.info("Vocab size after filtering: " + str(len(vocab)))

    labels = set()
    with open(label_file_path) as label_file:
        for label_line in label_file:
            label = label_line.strip()
            labels.add(label)
    logger.debug(labels)

    empty_template = dict()
    for label in labels:
        empty_template[label] = 0
    logger.debug(empty_template)

    # Count, per word, how often it co-occurs with each label. Words of
    # length <= 3 and out-of-vocabulary tokens are skipped.
    word_occurrences = dict()
    with open(text_file_path) as text_file, open(label_file_path) as label_file:
        for text_line, label_line in zip(text_file, label_file):
            # Lowercase here as well so tokens match the lowercased vocabulary.
            words = text_tokenizer.tokenize(text_line.strip().lower())
            label = label_line.strip()
            for word in words:
                if len(word) > 3 and word in vocab:
                    if word not in word_occurrences:
                        word_occurrences[word] = empty_template.copy()
                    word_occurrences[word][label] += 1
    logger.debug(word_occurrences)

    # Score each word against each label. As written, the score is
    # count_with_label * log(count_with_label) / log(count_with_other_labels);
    # zero counts (or an other-label count of 1) raise inside the loop and the
    # word is simply skipped for that label.
    label_word_scores = dict()
    for label in labels:
        word_scores = list()
        for word in word_occurrences:
            try:
                occurrence = word_occurrences[word]
                positive_count = occurrence[label]
                negative_count = sum(occurrence.values()) - positive_count
                kld = positive_count * (math.log(positive_count) / math.log(negative_count))
                word_scores.append((kld, word))
            except Exception:
                logger.debug("error while processing word '{}' for label '{}'".format(word, label))
        label_word_scores[label] = word_scores

    logger.debug(label_word_scores)

    # Report the 100 highest-scoring words for each label.
    for label in label_word_scores:
        word_scores = label_word_scores[label]
        word_scores.sort(key=lambda x: x[0], reverse=True)
        most_correlated = [x[1] for x in word_scores[:100]]
        logger.info("For label '{}'".format(label))
        logger.info("Most correlated words: {}".format(most_correlated))