from gabry_dataset_parser import get_labeled_instances
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from nltk.parse.corenlp import CoreNLPDependencyParser
from pycorenlp import StanfordCoreNLP
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import csv
import numpy as np
import matplotlib.pyplot as plt

# Load the labeled training instances and split them by class.
labeled_instances = get_labeled_instances(
    "./train_set/instances_converted_small.pickle",
    "./train_set/truth_converted_small.pickle")
clickbait_df = labeled_instances[labeled_instances.truthClass == 'clickbait']
no_clickbait_df = labeled_instances[labeled_instances.truthClass == 'no-clickbait']
print(clickbait_df.columns)


def get_slang_words_list():
    """Read the slang dictionary (backtick-delimited) into a list of rows."""
    slang_data = []
    with open('slang_dict.doc', 'r') as exRtFile:
        exchReader = csv.reader(exRtFile, delimiter='`', quoting=csv.QUOTE_NONE)
        for row in exchReader:
            slang_data.append(row)
    return slang_data
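
# Sketch (assumption, not part of the original script): one way the imports above
# could be combined is to score each post with VADER and count slang tokens,
# comparing the two classes. The assumption that each slang row's first field is
# the term, and that postText is a one-element list, follows the other scripts.
analyzer = SentimentIntensityAnalyzer()
slang_words = {row[0].lower() for row in get_slang_words_list() if row}


def sentiment_and_slang(post_text):
    """Return the VADER compound score and the number of slang tokens in a post."""
    tokens = word_tokenize(post_text.lower())
    compound = analyzer.polarity_scores(post_text)['compound']
    slang_count = sum(1 for tok in tokens if tok in slang_words)
    return compound, slang_count


# Example: score the clickbait posts of the small training set.
clickbait_scores = [sentiment_and_slang(p[0]) for p in clickbait_df.postText]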
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_classif

from gabry_dataset_parser import get_labeled_instances

DATASET = 'big'
path = "../features/{}/pos_features_{}_targetTitle_normalized{}.csv"
POS_FEAT_PATH = path.format(DATASET, DATASET, "")

# Load the precomputed POS features and the labels, keeping only the columns
# needed to join them.
feat_data = pd.read_csv(POS_FEAT_PATH)
data_df = get_labeled_instances(
    "../train_set/instances_converted_{}.pickle".format(DATASET),
    "../train_set/truth_converted_{}.pickle".format(DATASET))[['id', 'truthClass']]
print(f"Labeled instances loaded. Shape: {data_df.shape}. Only 'id' and 'truthClass' kept.")

# Join features and labels on the post id.
feat_data['id'] = feat_data['id'].astype(str)
data_df = pd.merge(data_df, feat_data, on=['id'])

# Encode the class labels, flipping them so that 'clickbait' maps to 1.
le = preprocessing.LabelEncoder()
label_encoded = le.fit_transform(data_df['truthClass'])
label_encoded = [1 if lab == 0 else 0 for lab in list(label_encoded)]
print(f"Labels encoded. Class '{data_df['truthClass'][0]}' --> label '{label_encoded[0]}'")
label_encoded = pd.DataFrame(label_encoded, columns=['label'])

# Drop the columns that are not features.
data_df = data_df.drop(['id', 'truthClass'], axis=1)
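
# Sketch (assumption, not part of the original script): `mutual_info_classif` is
# imported above but not used in this excerpt; it would typically be applied like
# this to rank the POS features against the encoded labels.
mi_scores = mutual_info_classif(data_df, label_encoded['label'], random_state=0)
mi_ranking = pd.Series(mi_scores, index=data_df.columns).sort_values(ascending=False)
print(mi_ranking.head(10))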
import nltk

from gabry_dataset_parser import get_labeled_instances

# TODO: big targetTitle normalized --> Done
# TODO: big targetTitle no-normalized --> Running
DATASET = 'big'  # 'small' or 'big'
target = "targetTitle"  # "postText" or "targetTitle"
prefix = "PT" if target == "postText" else "TA"
NORMALIZE = False
FEATURES_DATA_PATH = "../features/pos_features_{}_{}_{}.csv".format(
    DATASET, target, 'normalized' if NORMALIZE else "no-normalized")
print(f"Generating POS features... it might take a while :P\n Path: '{FEATURES_DATA_PATH}' | {target} | {prefix}")

labeled_instances = get_labeled_instances(
    "../train_set/instances_converted_{}.pickle".format(DATASET),
    "../train_set/truth_converted_{}.pickle".format(DATASET))

# Stanford NER tagger and the Penn Treebank tagset (one feature per POS tag).
tagger = nltk.tag.StanfordNERTagger(
    '../ner/english.all.3class.distsim.crf.ser.gz',
    '../ner/stanford-ner.jar',
    encoding='utf-8')
tagset = nltk.data.load("help/tagsets/upenn_tagset.pickle")
possible_tags = list(tagset.keys())

ids = list(labeled_instances.id)
if target == 'postText':
    # postText is stored as a single-element list per instance.
    texts = [txt[0] for txt in list(labeled_instances.postText)]
else:
    texts = list(labeled_instances.targetTitle)
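
# Sketch (assumption, not part of the original script): the POS features written to
# FEATURES_DATA_PATH would typically be per-text counts of each Penn Treebank tag,
# optionally divided by the token count when NORMALIZE is True. The helper name and
# the "{prefix}_{tag}" column naming are illustrative.
def pos_tag_counts(text):
    """Count occurrences of each Penn Treebank tag in one text."""
    tokens = nltk.word_tokenize(text)
    counts = {"{}_{}".format(prefix, tag): 0 for tag in possible_tags}
    for _, tag in nltk.pos_tag(tokens):
        key = "{}_{}".format(prefix, tag)
        if key in counts:
            counts[key] += 1
    if NORMALIZE and tokens:
        counts = {k: v / len(tokens) for k, v in counts.items()}
    return counts


# Example feature row for the first text, keyed by id for the later merge step.
example_row = dict(pos_tag_counts(texts[0]), id=ids[0])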
# Accumulate the n-gram counts of the no-clickbait posts into the overall counts.
for k in ngrams_no_clickbait.keys():
    for ngram in ngrams_no_clickbait[k]:
        all_ngrams[ngram] = all_ngrams.get(ngram, 0) + ngrams_no_clickbait[k][ngram]

# Keep only n-grams that occur at least THRESHOLD times across the corpus.
THRESHOLD = 5
filtered_ngrams = {}
for ngram in all_ngrams:
    if all_ngrams[ngram] >= THRESHOLD:
        filtered_ngrams[ngram] = all_ngrams[ngram]

labeled_instances = get_labeled_instances(
    "../train_set/instances_converted_big.pickle",
    "../train_set/truth_converted_big.pickle")[['truthClass', 'postText', 'id']]
postTexts = list(labeled_instances.postText)
ids = list(labeled_instances.id)

# Build one feature dictionary per post: the count of each retained n-gram.
dict_list = []
for idx, post_text in enumerate(postTexts):
    print(idx)
    post_text = post_text[0]
    post_dict = {x: 0 for x in filtered_ngrams.keys()}
    post_dict['id'] = ids[idx]
    ngrams = get_all_ngrams_for_post(post_text)
    for ngram in ngrams:
        if ngram in post_dict:
            post_dict[ngram] += 1
    dict_list.append(post_dict)
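
# Sketch (assumption, not part of the original script): the per-post n-gram counts
# would typically be materialized as a DataFrame and written next to the other
# feature files; the output filename below is illustrative.
import pandas as pd

ngram_feat_df = pd.DataFrame(dict_list).fillna(0)
ngram_feat_df.to_csv("../features/ngram_features_big_postText.csv", index=False)
print(ngram_feat_df.shape)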