import sys
from collections import OrderedDict

import nltk
import pandas as pd
import textstat
import torch
from past.utils import old_div
from pattern.en import Sentence, modality, mood, parse, sentiment
from pattern.en import sentiment as pattern_sentiment
from textstat import flesch_kincaid_grade
from transformers import AutoModel, AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared VADER analyzer instance. The lexicon lists (presup, hedges, liwc_*, ...)
# and the count_* / check_* / get_caster helpers referenced below are module-level
# objects that are not shown in this file.
vader_sentiment_analysis = SentimentIntensityAnalyzer()


def preprocess(sentences):
    # Tokenize sentences
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/ce-ms-marco-TinyBERT-L-4")
    model = AutoModel.from_pretrained("sentence-transformers/ce-ms-marco-TinyBERT-L-4")
    encoded_input = tokenizer(sentences.to_list(), padding=True, truncation=True,
                              max_length=128, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling. In this case, mean pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Pattern.en polarity and subjectivity per sentence
    sentiment_train = sentences.apply(lambda x: sentiment(x))
    sentiment_train = pd.DataFrame(sentiment_train.values.tolist(),
                                   columns=['polarity', 'subjectivity'],
                                   index=sentences.index)

    # Pattern.en modality (certainty) per sentence
    parse_s = sentences.apply(lambda x: parse(x, lemmata=True))
    sent = parse_s.apply(lambda x: Sentence(x))
    modality_s = pd.DataFrame(sent.apply(lambda x: modality(x)))

    # Combine hand-crafted features with the transformer embeddings
    meta_df = sentiment_train.merge(modality_s, left_index=True, right_index=True)
    input_matrix = pd.concat([meta_df.reset_index(drop=True),
                              pd.DataFrame(sentence_embeddings)], axis=1)
    return input_matrix
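# `preprocess` above calls a `mean_pooling` helper that is not defined in this
# file. A minimal sketch is given below, assuming the standard masked mean
# pooling used with sentence-transformers models: average the token embeddings,
# weighting each token by its attention mask so padding does not contribute.
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element holds the token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9)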
def extract_bias_features(text, do_get_caster=False):
    features = OrderedDict()
    if sys.version_info < (3, 0):
        # ignore conversion errors between utf-8 and ascii
        text = text.decode('ascii', 'ignore')
    text_nohyph = text.replace("-", " ")  # preserve hyphenated words as separate tokens
    txt_lwr = str(text_nohyph).lower()
    words = ''.join(ch for ch in txt_lwr
                    if ch not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~').split()
    unigrams = sorted(list(set(words)))
    bigram_tokens = find_ngrams(words, 2)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = find_ngrams(words, 3)
    trigrams = [" ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))]

    ## SENTENCE LEVEL MEASURES
    # word count
    features['word_cnt'] = len(words)
    # unique word count
    features['unique_word_cnt'] = len(unigrams)
    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = flesch_kincaid_grade(text)
    # compound sentiment score using VADER sentiment analysis package
    vader_sentiment = vader_sentiment_analysis.polarity_scores(text)
    vader_negative_proportion = vader_sentiment['neg']
    vader_compound_sentiment = vader_sentiment['compound']
    features['vader_sentiment'] = vader_compound_sentiment
    features['vader_senti_abs'] = abs(vader_compound_sentiment)
    # negative-perspective
    features['neg_persp'] = check_neg_persp(words, vader_negative_proportion,
                                            vader_compound_sentiment)
    # modality (certainty) score and mood using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['certainty'] = round(modality(sentence_obj), 4)
    # quoted material
    quote_dict = check_quotes(text)
    features["has_quotes"] = quote_dict["has_quotes"]
    features["quote_length"] = quote_dict["mean_quote_length"]
    features["nonquote_length"] = quote_dict["mean_nonquote_length"]

    ## LEXICON LEVEL MEASURES
    # presupposition markers
    count = count_feature_freq(presup, words, txt_lwr)
    features['presup_cnt'] = count
    features['presup_rto'] = round(old_div(float(count), float(len(words))), 4)
    # doubt markers
    count = count_feature_freq(doubt, words, txt_lwr)
    features['doubt_cnt'] = count
    features['doubt_rto'] = round(old_div(float(count), float(len(words))), 4)
    # partisan words and phrases
    count = count_feature_freq(partisan, words, txt_lwr)
    features['partisan_cnt'] = count
    features['partisan_rto'] = round(old_div(float(count), float(len(words))), 4)
    # subjective value laden word count
    count = count_feature_freq(value_laden, words, txt_lwr)
    features['value_cnt'] = count
    features['value_rto'] = round(old_div(float(count), float(len(words))), 4)
    # figurative language markers
    count = count_feature_freq(figurative, words, txt_lwr)
    features['figurative_cnt'] = count
    features['figurative_rto'] = round(old_div(float(count), float(len(words))), 4)
    # attribution markers
    count = count_feature_freq(attribution, words, txt_lwr)
    features['attribution_cnt'] = count
    features['attribution_rto'] = round(old_div(float(count), float(len(words))), 4)
    # self reference pronouns
    count = count_feature_freq(self_refer, words, txt_lwr)
    features['self_refer_cnt'] = count
    features['self_refer_rto'] = round(old_div(float(count), float(len(words))), 4)

    # Contextual Aspect Summary and Topical-Entity Recognition (CASTER)
    if do_get_caster:
        # May incur a performance cost in time to process
        caster_dict = get_caster(text)
        features['caster_dict'] = caster_dict

    return features
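# `find_ngrams` is used above (and in the later variant below) but is not defined
# here. A minimal sketch, assuming the common sliding-window zip idiom; the callers
# only need an iterable of n-gram tuples.
def find_ngrams(input_list, n):
    # e.g. find_ngrams(['a', 'b', 'c'], 2) -> ('a', 'b'), ('b', 'c')
    return zip(*[input_list[i:] for i in range(n)])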
def extract_bias_features(text):
    features = {}
    if sys.version_info < (3, 0):
        # Python 2 only: coerce to unicode, ignoring conversion errors
        text = unicode(text, errors='ignore') if not isinstance(text, unicode) else text
    txt_lwr = str(text).lower()
    words = nltk.word_tokenize(txt_lwr)
    words = [w for w in words if len(w) > 0 and w not in '.?!,;:\'s"$']
    unigrams = sorted(list(set(words)))
    bigram_tokens = nltk.bigrams(words)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = nltk.trigrams(words)
    trigrams = [" ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))]

    # word count
    features['word_cnt'] = len(words)
    # unique word count
    features['unique_word_cnt'] = len(unigrams)
    # coherence marker count
    count = count_feature_list_freq(coherence, words, bigrams, trigrams)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)
    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)
    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)
    # factive verb count
    count = count_feature_list_freq(factives, words, bigrams, trigrams)
    features['factive_cnt'] = count
    features['factive_rto'] = round(float(count) / float(len(words)), 4)
    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)
    # implicative verb count
    count = count_feature_list_freq(implicatives, words, bigrams, trigrams)
    features['implicative_cnt'] = count
    features['implicative_rto'] = round(float(count) / float(len(words)), 4)
    # bias words and phrases count
    count = count_feature_list_freq(biased, words, bigrams, trigrams)
    features['bias_cnt'] = count
    features['bias_rto'] = round(float(count) / float(len(words)), 4)
    # opinion word count
    count = count_feature_list_freq(opinionLaden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)
    # weak subjective word count
    count = count_feature_list_freq(subj_weak, words, bigrams, trigrams)
    features['subj_weak_cnt'] = count
    features['subj_weak_rto'] = round(float(count) / float(len(words)), 4)
    # strong subjective word count
    count = count_feature_list_freq(subj_strong, words, bigrams, trigrams)
    features['subj_strong_cnt'] = count
    features['subj_strong_rto'] = round(float(count) / float(len(words)), 4)
    # compound sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(text)['compound']
    features['vader_sentiment'] = compound_sentiment
    # subjectivity score using Pattern.en
    pattern_subjectivity = pattern_sentiment(text)[1]
    features['subjectivity'] = round(pattern_subjectivity, 4)
    # modality (certainty) score and mood using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['modality'] = round(modality(sentence_obj), 4)
    features['mood'] = mood(sentence_obj)
    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = textstat.flesch_kincaid_grade(text)
    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)
    # liwc auxiliary verb count
    count = count_liwc_list_freq(liwc_aux, words)
    features['liwc_aux_cnt'] = count
    features['liwc_aux_rto'] = round(float(count) / float(len(words)), 4)
    # liwc adverb count
    count = count_liwc_list_freq(liwc_adv, words)
    features['liwc_adv_cnt'] = count
    features['liwc_adv_rto'] = round(float(count) / float(len(words)), 4)
    # liwc preposition count
    count = count_liwc_list_freq(liwc_prep, words)
    features['liwc_prep_cnt'] = count
    features['liwc_prep_rto'] = round(float(count) / float(len(words)), 4)
    # liwc conjunction count
    count = count_liwc_list_freq(liwc_conj, words)
    features['liwc_conj_cnt'] = count
    features['liwc_conj_rto'] = round(float(count) / float(len(words)), 4)
    # liwc discrepancy word count
    count = count_liwc_list_freq(liwc_discr, words)
    features['liwc_discr_cnt'] = count
    features['liwc_discr_rto'] = round(float(count) / float(len(words)), 4)
    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)
    # liwc certainty word count
    count = count_liwc_list_freq(liwc_cert, words)
    features['liwc_cert_cnt'] = count
    features['liwc_cert_rto'] = round(float(count) / float(len(words)), 4)
    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)
    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)
    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)
    return features
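# `count_feature_list_freq` and `count_liwc_list_freq` are referenced throughout
# but not defined in this file. Minimal sketches follow, under the assumptions
# that the lexicons are plain collections of strings and that LIWC-style entries
# may end in '*' as a prefix wildcard (e.g. "certain*"); the real helpers may differ.
def count_feature_list_freq(feat_list, words, bigrams, trigrams):
    # How many of the text's unigrams/bigrams/trigrams appear in the lexicon.
    return sum(1 for token in list(words) + list(bigrams) + list(trigrams)
               if token in feat_list)


def count_liwc_list_freq(liwc_list, words):
    # Word-level matches against a LIWC-style category list.
    cnt = 0
    for w in words:
        for entry in liwc_list:
            if entry.endswith('*'):
                if w.startswith(entry[:-1]):
                    cnt += 1
            elif w == entry:
                cnt += 1
    return cnt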
def extract_bias_features(text):
    features = OrderedDict()
    text_nohyph = text.replace("-", " ")  # preserve hyphenated words as separate tokens
    txt_lwr = str(text_nohyph).lower()
    words = ''.join(ch for ch in txt_lwr
                    if ch not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~').split()
    unigrams = sorted(list(set(words)))
    bigram_tokens = find_ngrams(words, 2)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = find_ngrams(words, 3)
    trigrams = [" ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))]

    # word count
    features['word_cnt'] = len(words)
    # unique word count
    features['unique_word_cnt'] = len(unigrams)
    # presupposition verb count
    count = count_feature_list_freq(presup, words, bigrams, trigrams)
    features['presup_cnt'] = count
    features['presup_rto'] = round(float(count) / float(len(words)), 4)
    # coherence marker count
    count = count_phrase_freq(coherence, txt_lwr)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)
    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)
    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)
    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)
    # partisan words and phrases count
    count = count_feature_list_freq(partisan, words, bigrams, trigrams)
    features['partisan_cnt'] = count
    features['partisan_rto'] = round(float(count) / float(len(words)), 4)
    # subjective value laden word count
    count = count_feature_list_freq(value_laden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)
    # compound sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(text)['compound']
    features['vader_sentiment'] = compound_sentiment
    features['vader_senti_abs'] = abs(compound_sentiment)
    # modality (certainty) score and mood using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['modality'] = round(modality(sentence_obj), 4)
    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = flesch_kincaid_grade(text)
    # figurative count
    count = count_phrase_freq(figurative, txt_lwr)
    features['figurative_cnt'] = count
    features['figurative_rto'] = round(float(count) / float(len(words)), 4)
    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)
    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)
    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)
    # liwc self reference pronoun count
    count = count_liwc_list_freq(liwc_self, words)
    features['liwc_self_cnt'] = count
    features['liwc_self_rto'] = round(float(count) / float(len(words)), 4)
    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)
    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)
    # handle quoted material in text
    quote_dict = check_quotes(text)
    features["has_quotes"] = quote_dict["has_quotes"]
    features["mean_quote_length"] = quote_dict["mean_quote_length"]
    features["mean_nonquote_length"] = quote_dict["mean_nonquote_length"]
    return features
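# Example call (the sentence below is a made-up illustration). Each variant of
# extract_bias_features returns a flat mapping of feature name -> value, so the
# results can be inspected directly or stacked into a feature matrix.
sample_text = "Critics say the so-called reform will obviously fail, but supporters disagree."
for name, value in extract_bias_features(sample_text).items():
    print(name, value)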
def get_modality_mood(text):
    # Pattern.en modality (certainty) and grammatical mood for a piece of text
    t = parse(text, lemmata=True)
    t = Sentence(t)
    return modality(t), mood(t)
def get_modality_by_line(poem):
    # Per-line Pattern.en modality (certainty) scores, rounded to one decimal place
    return [round(modality(Sentence(parse(line, lemmata=True))), 1) for line in poem]
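# Example (hypothetical two-line "poem"): returns one modality score per line,
# each rounded to one decimal place; exact values depend on Pattern.en.
poem_lines = ["I might go out tonight.", "I will certainly return."]
print(get_modality_by_line(poem_lines))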