Ejemplo n.º 1
0
def get_no_of_characters_features(post):
    """
    Build the four "Number of characters" features of a post.

    Each feature is ``utils.len_characters`` applied to one field of the
    post; the fields are read, in order, with ``utils.title``,
    ``utils.article``, ``utils.description`` and ``utils.keywords``.
    :param post: the current post
    :return: a list with the four values, in the order given above
    """
    field_getters = (
        utils.title,
        utils.article,
        utils.description,
        utils.keywords,
    )
    return [utils.len_characters(getter(post)) for getter in field_getters]
Ejemplo n.º 2
0
def get_slang_words_feature(post):
    """
    Run the slang-word check on the post's text and on the article's title.

    The post's text is read with ``utils.title`` and the article's title
    with ``utils.article``; each is passed to ``has_slang_words``.
    :param post: the current post
    :return: a two-element list of the ``has_slang_words`` results
    (documented as 0/1 flags): post text first, article title second
    """
    texts = (utils.title(post), utils.article(post))
    return [has_slang_words(text) for text in texts]
Ejemplo n.º 3
0
def get_common_clickbait_phrases_feature(post):
    """
    Run the common words/phrases check on the post's text and on the
    article's title.

    The post's text is read with ``utils.title`` and the article's title
    with ``utils.article``; each is passed to ``has_common_phrases``.
    :param post: the current post
    :return: a two-element list of the ``has_common_phrases`` results
    (documented as 0/1 flags): post text first, article title second
    """
    texts = [utils.title(post), utils.article(post)]
    return list(map(has_common_phrases, texts))
Ejemplo n.º 4
0
def get_hyperbolic_words_feature(connection, post):
    """
    Run the hyperbolic-word check on the post's text and on the article's
    title. NOTE! This needs the NLP Stanford server to be up and running.

    The post's text is read with ``utils.title`` and the article's title
    with ``utils.article``; each is passed, together with ``connection``,
    to ``has_hyperbolic_words``.
    :param connection: the connection to the stanford local server
    :param post: the current post
    :return: a two-element list of the ``has_hyperbolic_words`` results
    (documented as 1 when at least one hyperbolic word is present, else 0):
    post text first, article title second
    """
    texts = (utils.title(post), utils.article(post))
    return [has_hyperbolic_words(connection, text) for text in texts]
Ejemplo n.º 5
0
def get_no_of_characters_ratio_features(post):
    """
    Build the "Number of characters ratio" features of a post
    (6 features in total).

    The character lengths of the post title, article title, article
    description and article keywords are collected in that order and handed
    to ``get_ratio_features_list``, whose result is returned unchanged.
    :param post: the current post
    :return: the list produced by ``get_ratio_features_list``
    """
    lengths = []
    for read_field in (utils.title, utils.article, utils.description,
                       utils.keywords):
        lengths.append(utils.len_characters(read_field(post)))
    return get_ratio_features_list(lengths)
Ejemplo n.º 6
0
def get_diff_between_no_of_characters_features(post):
    """
    Build the "Difference between number of chars" features of a post
    (6 features in total).

    The character lengths of the post title, article title, article
    description and article keywords are collected in that order and handed
    to ``get_difference_features_list``, whose result is returned unchanged.
    :param post: the current post
    :return: the list produced by ``get_difference_features_list``
    """
    return get_difference_features_list([
        utils.len_characters(utils.title(post)),        # post title
        utils.len_characters(utils.article(post)),      # article title
        utils.len_characters(utils.description(post)),  # article description
        utils.len_characters(utils.keywords(post)),     # article keywords
    ])
Ejemplo n.º 7
0
def _first_text(value):
    """
    Collapse a field value to a single string.

    :param value: either a string or a list of strings
    :return: the first element when ``value`` is a non-empty list, an empty
    string when it is an empty list, and ``value`` itself otherwise
    """
    if isinstance(value, list):
        # An empty list used to raise IndexError on ``value[0]``; fall back
        # to an empty string so the analyser still receives text.
        return value[0] if value else ""
    return value


def get_sentiment_polarity_feature(post):
    """
    Calculates the compound score of the post's text and the
    article's title.

    The post's text is read with ``utils.title`` and the article's title
    with ``utils.article``. When either value is a list, only its first
    element is scored; an empty list is scored as an empty string
    (presumably neutral for the analyser -- confirm against the analyser
    in use).
    :param post: the current post
    :return: a list with the two compound scores: post text first,
    article title second
    """
    post_text = _first_text(utils.title(post))
    article_title = _first_text(utils.article(post))

    scores_post_text = analyser.polarity_scores(post_text)
    scores_article_title = analyser.polarity_scores(article_title)
    return [scores_post_text["compound"], scores_article_title["compound"]]
Ejemplo n.º 8
0
def main():
    """
    Extract features for every post in ``dataset/instances.jsonl`` and
    write them to ``dataset/features.csv``.

    For each post read through ``json_lines.reader`` the function looks up
    the post's label in the dictionary returned by
    ``utils.get_label_dict()``, computes features through the ``imf``,
    ``laf`` and ``adf`` namespaces, and appends one row to the CSV file
    (opened in ``'a'`` mode). The header row is written with the file opened
    in ``'w'`` mode, after which ``headers`` is set to True.

    NOTE(review): the text of this function is corrupted. The statement that
    starts with ``nlp = StanfordCoreNLP(`` runs straight into what looks
    like the tail of the ``feature_headers`` string, so the ``except``
    matching the ``try``, the condition guarding the header block, and the
    definitions of ``counts_post_title_POS``, ``counts_article_title_POS``,
    ``unigrams``, ``bigrams``, ``trigrams`` and ``feature_output`` are not
    visible here. Restore the missing code from the original file before
    relying on this function.
    """
    # Creating label dictionary
    labels = utils.get_label_dict()
    with open('dataset/instances.jsonl', 'rb') as f:
        # Flag flipped to True once the header row has been written
        headers = False
        count = 0  # elements processed
        for post in json_lines.reader(f):
            count += 1
            print('Sample', count)
            # Reading post/article elements
            post_id = utils.post_id(post)
            post_title = utils.title(post)
            article_title = utils.article(post)
            # Extracting sample label
            post_label = labels[post_id]
            # Presence of image in a post
            has_image = imf.image_presence(post)
            # Number of characters
            len_chars_post_title, len_chars_article_title, len_chars_article_desc, len_chars_article_keywords = \
                laf.get_no_of_characters_features(post)
            # Difference between number of characters
            diff_chars_post_title_article_title, diff_chars_post_title_article_desc, diff_chars_post_title_article_keywords, \
            diff_chars_article_title_article_desc, diff_chars_article_title_article_keywords, diff_chars_article_desc_article_keywords = \
                laf.get_diff_between_no_of_characters_features(post)
            # Number of characters ratio
            ratio_chars_post_title_article_title, ratio_chars_post_title_article_desc, ratio_chars_post_title_article_keywords, \
            ratio_chars_article_title_article_desc, ratio_chars_article_title_article_keywords, ratio_chars_article_desc_article_keywords = \
                laf.get_no_of_characters_ratio_features(post)
            # Number of Words
            # NOTE(review): this calls the *characters* helper, so the
            # len_words_* variables receive character counts. The two steps
            # below use "..._words_..." helpers, so a words counterpart is
            # presumably intended here -- confirm and fix.
            len_words_post_title, len_words_article_title, len_words_article_desc, len_words_article_keywords = \
                laf.get_no_of_characters_features(post)
            # Difference between number of words
            diff_words_post_title_article_title, diff_words_post_title_article_desc, diff_words_post_title_article_keywords, \
            diff_words_article_title_article_desc, diff_words_article_title_article_keywords, diff_words_article_desc_article_keywords = \
                laf.get_diff_between_no_of_words_features(post)
            # Number of words ratio
            ratio_words_post_title_article_title, ratio_words_post_title_article_desc, ratio_words_post_title_article_keywords, \
            ratio_words_article_title_article_desc, ratio_words_article_title_article_keywords, ratio_words_article_desc_article_keywords = \
                laf.get_no_of_words_ratio_features(post)
            # Post creation hour
            post_creation_hour = adf.get_post_creation_hour(post)
            # Number of signs
            post_title_no_signs = adf.get_no_signs(post_title)
            # Number of hashtags
            post_title_no_hashtags = adf.get_no_hashtags(post_title)
            # Number of exclamations
            post_title_no_exclamations = adf.get_no_exclamations(post_title)
            article_title_no_exclamations = adf.get_no_exclamations(
                article_title)
            # Number of question marks
            post_title_no_questionmarks = adf.get_no_question_marks(post_title)
            article_title_no_questionmarks = adf.get_no_question_marks(
                article_title)
            # Number of abbreviations
            post_title_no_abbreviations = adf.get_no_abbreviations(post_title)
            article_title_no_abbreviations = adf.get_no_abbreviations(
                article_title)
            # Number of ellipses
            post_title_no_ellipses = adf.get_no_ellipses(post_title)
            article_title_no_ellipses = adf.get_no_ellipses(article_title)
            # Number of dots
            post_title_no_dots = adf.get_no_dots(post_title)
            article_title_no_dots = adf.get_no_dots(article_title)
            # Begins with interrogative
            post_title_begins_with_interrogative = adf.get_begins_with_interrogative(
                post_title)
            article_title_begins_with_interrogative = adf.get_begins_with_interrogative(
                article_title)
            # Begins with number
            post_title_begins_with_number = adf.get_begins_with_number(
                post_title)
            article_title_begins_with_number = adf.get_begins_with_number(
                article_title)
            # Contains determiners and possessives
            post_title_determiners, post_title_possessives = laf.get_det_poses(
                post_title)
            article_title_determiners, article_title_possessives = laf.get_det_poses(
                article_title)
            # Contains hyperbolic words
            # NOTE(review): the source is corrupted from the next statement
            # on. The StanfordCoreNLP(...) call is fused with the tail of a
            # comma-separated header string, and the rest of the try block
            # (including its except clause) is missing. The code as shown
            # does not parse.
            try:
                nlp = StanfordCoreNLP('http://*****:*****@,Post_Title_No_#,' \
                                  'Post_Title_No_Exclam,Article_Title_No_Exclam,Post_Title_No_Question,Article_Title_No_Question,Post_Title_No_Abbrev,' \
                                  'Article_Title_No_Abbrev,Post_Title_No_Ellipses,Article_Title_No_Ellipses,Post_Title_No_Dots,Article_Title_No_Dots'
                # Append one header column per POS tag counted in the post
                # title and in the article title
                for key, value in counts_post_title_POS.items():
                    feature_headers += ',Post_Title_' + key
                for key, value in counts_article_title_POS.items():
                    feature_headers += ',Article_Title_' + key
                feature_headers += ',Post_Title_NNPV,Post_Title_NNPT'
                feature_headers += ',Article_Title_NNPV,Article_Title_NNPT'
                # Append one header column per unigram, bigram and trigram key
                for key, value in unigrams.items():
                    feature_headers += ',' + key
                for key, value in bigrams.items():
                    feature_headers += ',' + key
                for key, value in trigrams.items():
                    feature_headers += ',' + key
                # Writing file headlines
                # Mode 'w' truncates any existing features file
                with open('dataset/features.csv',
                          encoding='utf8',
                          mode='w',
                          newline='') as features_file:
                    features_writer = csv.writer(features_file,
                                                 delimiter=',',
                                                 quotechar='"',
                                                 quoting=csv.QUOTE_MINIMAL)
                    # NOTE(review): the comma-joined string is passed as a
                    # single-element row, so the writer treats it as ONE
                    # field and, since it contains the delimiter, will quote
                    # it -- confirm this is the intended file layout.
                    features_writer.writerow([feature_headers])
                headers = True
            # Append this post's feature row; the file is reopened for every
            # post
            with open('dataset/features.csv',
                      encoding='utf8',
                      mode='a',
                      newline='') as features_file:
                features_writer = csv.writer(features_file,
                                             delimiter=',',
                                             quotechar='"',
                                             quoting=csv.QUOTE_MINIMAL)
                # NOTE(review): feature_output is defined in the missing part
                # of the source; like the header it is written as a single
                # field -- verify.
                features_writer.writerow([feature_output])