def get_ngram_feature_vector(post, n, ngram_word_corpus: dict):
    """
    Create the n-gram feature vector of a post.

    :arg post: The post we want to extract the n-gram features from
    :arg n: The n in n-gram
    :arg ngram_word_corpus: The entire feature vector initialized with zeroes
    :return: The final n-gram feature vector of the specified post
    """
    # Copy the initialized feature vector to avoid changing the shared
    # template by reference (values are ints, so a shallow copy suffices).
    ngram_feature_vector = ngram_word_corpus.copy()
    # Find the post's n-grams
    grams = ngrams(utils.article(post).split(), n)
    for g in grams:
        # Strip special characters and lowercase the joined n-gram so it
        # matches the normalization used when the corpus was built.
        k = re.sub(r'[^a-zA-Z0-9 ]+', '', " ".join(g)).lower()
        # Count only n-grams that survived the corpus pruning
        # (direct membership test instead of the redundant `.keys()` call).
        if k in ngram_feature_vector:
            ngram_feature_vector[k] += 1
    # Return the post's feature vector
    return ngram_feature_vector
def get_no_of_characters_features(post):
    """
    Calculates the "Number of characters" features. 4 features are
    calculated in total.

    :return: a list that contains the features
    """
    # Character counts of the four textual fields, in a fixed order:
    # post title, article title, article description, article keywords.
    fields = (
        utils.title(post),
        utils.article(post),
        utils.description(post),
        utils.keywords(post),
    )
    return [utils.len_characters(field) for field in fields]
def get_slang_words_feature(post):
    """
    Checks whether the post's text and article's title contain slang words.

    :param post: the current post
    :return: a list of 0s, 1s indicating whether the fields contain slang words
    """
    # Run the slang detector over both text fields, post text first.
    return [
        has_slang_words(utils.title(post)),
        has_slang_words(utils.article(post)),
    ]
def get_common_clickbait_phrases_feature(post):
    """
    Checks whether the post's text and article's title contain common
    words/phrases.

    :param post: the current post
    :return: a list of 0s, 1s indicating whether the fields contain common
             words/phrases
    """
    # Run the common-phrase detector over both text fields, post text first.
    return [
        has_common_phrases(utils.title(post)),
        has_common_phrases(utils.article(post)),
    ]
def get_ngram_corpus(n, lower_t, upper_t):
    """
    Initialize the dataset's n-gram feature vectors for a given n.

    :arg n: The n in n-gram
    :arg lower_t: The lower threshold that removes n-grams with lower counts
                  than the threshold in the dataset
    :arg upper_t: The upper threshold that removes n-grams with higher counts
                  than the threshold in the dataset
    :return: The final n-gram feature vectors after the upper and lower
             pruning, initialized to zero
    """
    counts = {}  # The dictionary that holds the n-gram occurrences
    # For every post in the dataset
    with open('dataset/instances.jsonl', 'rb') as f:
        for post in json_lines.reader(f):
            grams = ngrams(utils.article(post).split(), n)  # Get the post's n-grams
            # For every n-gram
            for g in grams:
                # Remove special characters
                k = re.sub(r'[^a-zA-Z0-9 ]+', '', " ".join(g))
                # If the n-gram is NNP (proper noun) don't take it into account
                if utils.POS_counts(k)['NNP'] == 0:
                    k = k.lower()  # make it lowercase
                    # dict.get avoids the double lookup of the
                    # "if k in counts.keys()" pattern
                    counts[k] = counts.get(k, 0) + 1
    # Keep only n-grams whose count lies strictly between the two thresholds
    # (chained comparison; same semantics as "v > lower_t and not v >= upper_t")
    ng = {k: 0 for k, v in counts.items() if lower_t < v < upper_t}
    # Write the raw frequencies into a csv in order to plot the n-gram
    # distributions afterwards
    pd.DataFrame(counts.items(), columns=['gram', 'count']).to_csv(
        "dataset/" + str(n) + "-gram_frequencies.csv", index=False)
    # Return the final feature vector with 0 values
    return ng
def get_hyperbolic_words_feature(connection, post):
    """
    Checks if there are any hyperbolic words in the provided post's texts and
    article's title.
    NOTE! This needs the NLP Stanford server to be up and running.

    :param connection: the connection to the stanford local server
    :param post: the current post
    :return: a list with 1s or 0s based on whether the post text/article
             title contains at least one hyperbolic word or not
    """
    # Query the detector for both text fields, post text first.
    texts = (utils.title(post), utils.article(post))
    return [has_hyperbolic_words(connection, text) for text in texts]
def get_no_of_characters_ratio_features(post):
    """
    Calculates the "Number of characters ratio". 6 features are calculated
    in total.

    :return: a list that contains the features
    """
    # Character lengths of the four text fields, in the fixed order the
    # pairwise-ratio helper expects.
    lengths = [
        utils.len_characters(utils.title(post)),
        utils.len_characters(utils.article(post)),
        utils.len_characters(utils.description(post)),
        utils.len_characters(utils.keywords(post)),
    ]
    return get_ratio_features_list(lengths)
def get_diff_between_no_of_characters_features(post):
    """
    Calculates the "Difference between number of chars" features. 6 features
    are calculated in total.

    :return: a list that contains the features
    """
    # Character lengths of the four text fields, in the fixed order the
    # pairwise-difference helper expects.
    lengths = [
        utils.len_characters(utils.title(post)),
        utils.len_characters(utils.article(post)),
        utils.len_characters(utils.description(post)),
        utils.len_characters(utils.keywords(post)),
    ]
    return get_difference_features_list(lengths)
def get_sentiment_polarity_feature(post):
    """
    Calculates the compound score of the post's text and the article's title.

    :param post: the current post
    :return: a list with the compound scores
    """
    def _as_text(value):
        # Some fields arrive wrapped in a one-element list; unwrap them.
        return value[0] if isinstance(value, list) else value

    post_text = _as_text(utils.title(post))
    article_title = _as_text(utils.article(post))
    return [
        analyser.polarity_scores(post_text)["compound"],
        analyser.polarity_scores(article_title)["compound"],
    ]
def main():
    # Entry point: streams the dataset, extracts per-post features and writes
    # them to dataset/features.csv.
    #
    # NOTE(review): this function has been mangled by an automated credentials
    # redaction — everything between the StanfordCoreNLP(...) call and the
    # middle of the feature_headers CSV-header string literal is missing (see
    # the flagged span below). Recover the original from version control; the
    # code as-is does not parse.

    # Creating label dictionary
    labels = utils.get_label_dict()
    with open('dataset/instances.jsonl', 'rb') as f:
        headers = False
        count = 0  # elements processed
        for post in json_lines.reader(f):
            count += 1
            print('Sample', count)
            # Reading post/article elements
            post_id = utils.post_id(post)
            post_title = utils.title(post)
            article_title = utils.article(post)
            # Extracting sample label
            post_label = labels[post_id]
            # Presence of image in a post
            has_image = imf.image_presence(post)
            # Number of characters
            len_chars_post_title, len_chars_article_title, len_chars_article_desc, len_chars_article_keywords = \
                laf.get_no_of_characters_features(post)
            # Difference between number of characters
            diff_chars_post_title_article_title, diff_chars_post_title_article_desc, diff_chars_post_title_article_keywords, \
                diff_chars_article_title_article_desc, diff_chars_article_title_article_keywords, diff_chars_article_desc_article_keywords = \
                laf.get_diff_between_no_of_characters_features(post)
            # Number of characters ratio
            ratio_chars_post_title_article_title, ratio_chars_post_title_article_desc, ratio_chars_post_title_article_keywords, \
                ratio_chars_article_title_article_desc, ratio_chars_article_title_article_keywords, ratio_chars_article_desc_article_keywords = \
                laf.get_no_of_characters_ratio_features(post)
            # Number of Words
            # NOTE(review): this calls get_no_of_characters_features, not a
            # words variant — looks like a copy-paste bug; confirm intent.
            len_words_post_title, len_words_article_title, len_words_article_desc, len_words_article_keywords = \
                laf.get_no_of_characters_features(post)
            # Difference between number of words
            diff_words_post_title_article_title, diff_words_post_title_article_desc, diff_words_post_title_article_keywords, \
                diff_words_article_title_article_desc, diff_words_article_title_article_keywords, diff_words_article_desc_article_keywords = \
                laf.get_diff_between_no_of_words_features(post)
            # Number of words ratio
            ratio_words_post_title_article_title, ratio_words_post_title_article_desc, ratio_words_post_title_article_keywords, \
                ratio_words_article_title_article_desc, \
                ratio_words_article_title_article_keywords, ratio_words_article_desc_article_keywords = \
                laf.get_no_of_words_ratio_features(post)
            # Post creation hour
            post_creation_hour = adf.get_post_creation_hour(post)
            # Number of signs
            post_title_no_signs = adf.get_no_signs(post_title)
            # Number of hashtags
            post_title_no_hashtags = adf.get_no_hashtags(post_title)
            # Number of exclamations
            post_title_no_exclamations = adf.get_no_exclamations(post_title)
            article_title_no_exclamations = adf.get_no_exclamations(
                article_title)
            # Number of question marks
            post_title_no_questionmarks = adf.get_no_question_marks(post_title)
            article_title_no_questionmarks = adf.get_no_question_marks(
                article_title)
            # Number of abbreviations
            post_title_no_abbreviations = adf.get_no_abbreviations(post_title)
            article_title_no_abbreviations = adf.get_no_abbreviations(
                article_title)
            # Number of ellipses
            post_title_no_ellipses = adf.get_no_ellipses(post_title)
            article_title_no_ellipses = adf.get_no_ellipses(article_title)
            # Number of dots
            post_title_no_dots = adf.get_no_dots(post_title)
            article_title_no_dots = adf.get_no_dots(article_title)
            # Begins with interrogative
            post_title_begins_with_interrogative = adf.get_begins_with_interrogative(
                post_title)
            article_title_begins_with_interrogative = adf.get_begins_with_interrogative(
                article_title)
            # Begins with number
            post_title_begins_with_number = adf.get_begins_with_number(
                post_title)
            article_title_begins_with_number = adf.get_begins_with_number(
                article_title)
            # Contains determiners and possessives
            post_title_determiners, post_title_possessives = laf.get_det_poses(
                post_title)
            article_title_determiners, article_title_possessives = laf.get_det_poses(
                article_title)
            # Contains hyperbolic words
            # NOTE(review): REDACTION-DAMAGED SPAN BEGINS HERE. The server URL
            # (with embedded credentials) and the code up to the middle of the
            # feature_headers literal were removed; the string fragments below
            # belong to the CSV header, not to the StanfordCoreNLP call. The
            # call is never closed, so this block is not valid Python as-is.
            try:
                nlp = StanfordCoreNLP('http://*****:*****@,Post_Title_No_#,' \
                    'Post_Title_No_Exclam,Article_Title_No_Exclam,Post_Title_No_Question,Article_Title_No_Question,Post_Title_No_Abbrev,' \
                    'Article_Title_No_Abbrev,Post_Title_No_Ellipses,Article_Title_No_Ellipses,Post_Title_No_Dots,Article_Title_No_Dots'
            # NOTE(review): the names used below (counts_post_title_POS,
            # counts_article_title_POS, unigrams, bigrams, trigrams,
            # feature_headers, feature_output) are assigned inside the lost
            # span; the nesting shown here is a best-effort reconstruction.
            # Append one header column per POS tag seen in each title.
            for key, value in counts_post_title_POS.items():
                feature_headers += ',Post_Title_' + key
            for key, value in counts_article_title_POS.items():
                feature_headers += ',Article_Title_' + key
            feature_headers += ',Post_Title_NNPV,Post_Title_NNPT'
            feature_headers += ',Article_Title_NNPV,Article_Title_NNPT'
            # One header column per pruned n-gram (n = 1, 2, 3).
            for key, value in unigrams.items():
                feature_headers += ',' + key
            for key, value in bigrams.items():
                feature_headers += ',' + key
            for key, value in trigrams.items():
                feature_headers += ',' + key
            # Writing file headlines
            with open('dataset/features.csv', encoding='utf8', mode='w', newline='') as features_file:
                features_writer = csv.writer(features_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                features_writer.writerow([feature_headers])
            headers = True
            # Append this sample's feature row.
            with open('dataset/features.csv', encoding='utf8', mode='a', newline='') as features_file:
                features_writer = csv.writer(features_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                features_writer.writerow([feature_output])