def get_word_overlap_score(actual_word_lists, generated_word_lists):
    english_stopwords = lexicon_helper.get_stopwords()
    sentiment_words = lexicon_helper.get_sentiment_words()

    scores = list()
    for word_list_1, word_list_2 in zip(actual_word_lists, generated_word_lists):
        score = 0
        words_1 = set(word_list_1)
        words_2 = set(word_list_2)

        # Compare only content words: strip sentiment words and stopwords
        # from both sides before measuring overlap.
        words_1 -= sentiment_words
        words_1 -= english_stopwords
        words_2 -= sentiment_words
        words_2 -= english_stopwords

        word_intersection = words_1 & words_2
        word_union = words_1 | words_2
        if word_union:
            # Jaccard similarity of the two content-word sets.
            score = len(word_intersection) / len(word_union)
        scores.append(score)

    word_overlap_score = statistics.mean(scores) if scores else 0

    del english_stopwords
    del sentiment_words

    return word_overlap_score
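
# Hedged usage sketch (not part of the original module; the token lists are
# hypothetical). Each argument is a list of sentences, each sentence given as a
# list of tokens; the call returns the mean Jaccard overlap of the content
# words that survive stopword and sentiment-word filtering.
def _demo_word_overlap_score():
    actual_words = [["the", "food", "arrived", "quickly", "and", "hot"]]
    generated_words = [["food", "arrived", "slowly", "but", "hot"]]
    return get_word_overlap_score(actual_words, generated_words)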
def build_word_statistics(text_file_path, label_file_path):
    text_tokenizer = tf.keras.preprocessing.text.Tokenizer()
    with open(text_file_path) as text_file:
        text_tokenizer.fit_on_texts(text_file)

    vocab = set(text_tokenizer.word_index.keys())
    del text_tokenizer

    stopwords = lexicon_helper.get_stopwords()
    vocab -= stopwords
    logger.debug(vocab)

    labels = set()
    with open(label_file_path) as label_file:
        for label_line in label_file:
            label = label_line.strip()
            labels.add(label)
    logger.debug(labels)

    empty_template = dict()
    for label in labels:
        empty_template[label] = 0
    logger.debug(empty_template)

    # Count, per word, how often it occurs under each label.
    word_occurrences = dict()
    with open(text_file_path) as text_file, open(label_file_path) as label_file:
        for text_line, label_line in zip(text_file, label_file):
            words = text_line.strip().split()
            label = label_line.strip()
            for word in words:
                if len(word) > 3 and word in vocab:
                    if word not in word_occurrences:
                        word_occurrences[word] = empty_template.copy()
                    occurrence = word_occurrences[word]
                    occurrence[label] += 1
    logger.debug(word_occurrences)

    label_word_scores = dict()
    for label in labels:
        word_scores = list()
        for word in word_occurrences:
            try:
                occurrence = word_occurrences[word]
                positive_count = occurrence[label]
                negative_count = sum(occurrence.values()) - positive_count
                # Ratio-of-logs score used to rank label-correlated words;
                # zero counts raise and are skipped by the except below.
                kld = positive_count * (math.log(positive_count) / math.log(negative_count))
                word_scores.append((kld, word))
            except Exception:
                logger.debug("error while processing word '{}' for label '{}'".format(word, label))
        label_word_scores[label] = word_scores
    logger.debug(label_word_scores)

    for label in label_word_scores:
        word_scores = label_word_scores[label]
        word_scores.sort(key=lambda x: x[0], reverse=True)
        most_correlated = [x[1] for x in word_scores[:100]]
        logger.info("For label '{}'".format(label))
        logger.info("Most correlated words: {}".format(most_correlated))
def populate_word_blacklist(word_index):
    blacklisted_words = set()
    blacklisted_words |= set(global_config.predefined_word_index.values())
    if global_config.filter_sentiment_words:
        blacklisted_words |= lexicon_helper.get_sentiment_words()
    if global_config.filter_stopwords:
        blacklisted_words |= lexicon_helper.get_stopwords()

    global bow_filtered_vocab_indices
    allowed_vocab = word_index.keys() - blacklisted_words
    for i, word in enumerate(allowed_vocab):
        vocab_index = word_index[word]
        bow_filtered_vocab_indices[vocab_index] = i

    global_config.bow_size = len(allowed_vocab)
    logger.info("Created word index blacklist for BoW")
    logger.info("BoW size: {}".format(global_config.bow_size))
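
# Hedged sketch (not part of the original module) of how the index map built
# above could be consumed: converting one sentence, given as a sequence of raw
# vocabulary indices, into a normalized bag-of-words vector of size
# global_config.bow_size. Assumes numpy is imported as np; `sequence` is a
# hypothetical input.
def _sequence_to_bow(sequence):
    bow_representation = np.zeros(shape=global_config.bow_size, dtype=np.float32)
    for word_index in sequence:
        if word_index in bow_filtered_vocab_indices:
            bow_representation[bow_filtered_vocab_indices[word_index]] += 1
    total = np.sum(bow_representation)
    return bow_representation / total if total else bow_representation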
def build_word_statistics(text_file_path, label_file_path, no_sentiment_words=True):
    if no_sentiment_words:
        logger.info("Also excluding sentiment words!")

    text_tokenizer = TweetTokenizer()
    vocab = set()
    with open(text_file_path) as text_file:
        for line in text_file:
            vocab.update(text_tokenizer.tokenize(line.lower()))

    stopwords = lexicon_helper.get_stopwords()
    vocab -= stopwords
    if no_sentiment_words:
        sentiment_words = lexicon_helper.get_sentiment_words()
        vocab -= sentiment_words
    logger.debug(vocab)
    logger.info("Vocab size after filtering: " + str(len(vocab)))

    labels = set()
    with open(label_file_path) as label_file:
        for label_line in label_file:
            label = label_line.strip()
            labels.add(label)
    logger.debug(labels)

    empty_template = dict()
    for label in labels:
        empty_template[label] = 0
    logger.debug(empty_template)

    # Count, per word, how often it occurs under each label.
    word_occurrences = dict()
    with open(text_file_path) as text_file, open(label_file_path) as label_file:
        for text_line, label_line in zip(text_file, label_file):
            words = text_tokenizer.tokenize(text_line.strip())
            label = label_line.strip()
            for word in words:
                if len(word) > 3 and word in vocab:
                    if word not in word_occurrences:
                        word_occurrences[word] = empty_template.copy()
                    word_occurrences[word][label] += 1
    logger.debug(word_occurrences)

    label_word_scores = dict()
    for label in labels:
        word_scores = list()
        for word in word_occurrences:
            try:
                occurrence = word_occurrences[word]
                positive_count = occurrence[label]
                negative_count = sum(occurrence.values()) - positive_count
                # Ratio-of-logs score used to rank label-correlated words;
                # zero counts raise and are skipped by the except below.
                kld = positive_count * (math.log(positive_count) / math.log(negative_count))
                word_scores.append((kld, word))
            except Exception:
                logger.debug("error while processing word '{}' for label '{}'".format(word, label))
        label_word_scores[label] = word_scores
    logger.debug(label_word_scores)

    for label in label_word_scores:
        word_scores = label_word_scores[label]
        word_scores.sort(key=lambda x: x[0], reverse=True)
        most_correlated = [x[1] for x in word_scores[:100]]
        logger.info("For label '{}'".format(label))
        logger.info("Most correlated words: {}".format(most_correlated))
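
# Hedged usage sketch (hypothetical file paths, not part of the original
# module). The two files are expected to be line-aligned: one sentence per line
# in the text file and one label per line in the label file. The ranked word
# lists are written to the logger rather than returned.
def _demo_build_word_statistics():
    build_word_statistics("data/reviews-text.txt", "data/reviews-labels.txt",
                          no_sentiment_words=True)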