Example no. 1
def read_jsonl_folder(json_folder):
    """
    Read the instance.jsonl and truth.jsonl the folder
    json_folder: the path to the folder that contain the two files
    write_folder: the path to the folder that contain the outfile
    Return the name of the outfile
    """
    inst_columns = ['id',"targetTitle","targetParagraphs"]#, 'postMedia','postText']
    truth_columns = ["id","truthClass"]#, "truthMode","truthJudgments"]
    path_inst_file = json_folder+"/instances.jsonl"
    path_truth_file = json_folder+"/truth.jsonl"

    merged_df = prepare_json_data(path_inst_file, path_truth_file, inst_columns, truth_columns)

    merged_df["targetTitle"] = merged_df["targetTitle"].progress_map(lambda x: str(x).strip("[").strip(']').strip("\'").strip('\"'))
    #merged_df['postText'] = merged_df['postText'].progress_map(lambda x: ' '.join(map(str, x)))
    #merged_df['postMedia'] = merged_df['postMedia'].progress_map(lambda x: 0 if x == "[]" else 1)
    merged_df['targetParagraphs'] = merged_df['targetParagraphs'].progress_map(lambda x: ' '.join(map(str, x)))
    #merged_df["truthScale"] = merged_df["truthMode"].progress_map(lambda x: "non" if x == 0.0 else ("slightly" if 0.3<x<0.6 else ("considerable" if 0.6<x<1 else "heavy")))
    merged_df["truthClass"] = merged_df["truthClass"].progress_map(lambda x: "CB" if x == "clickbait" else "Non")

    drop_df = merged_df[~merged_df.targetTitle.str.contains("Sections Shows Live Yahoo!")]
    final_df = drop_df[~drop_df.targetTitle.str.contains("Top stories Top stories")]


    write_csv_file(final_df, json_folder)
    pk_file = save_pk_file(final_df, json_folder)
    #split_json_data(final_df, save_to)
    print(final_df[:3])

    return pk_file
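The prepare_json_data helper is not included in this excerpt. A minimal sketch of what it might look like, assuming each .jsonl file holds one JSON record per line and the two frames are joined on the id column; this is an illustration, not the original implementation.

# Hypothetical sketch of prepare_json_data: read the two JSONL files,
# keep only the requested columns, and merge them on "id".
import pandas as pd

def prepare_json_data(path_inst_file, path_truth_file, inst_columns, truth_columns):
    inst_df = pd.read_json(path_inst_file, lines=True)[inst_columns]
    truth_df = pd.read_json(path_truth_file, lines=True)[truth_columns]
    return inst_df.merge(truth_df, on="id")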
Example no. 2
def read_gz_folder(gz_folder):
    """
    read .gz files and return a dataframe contain the data in the file
    gz_folder: path to folder containing .gz files
    """
    df_list = []
    for read_file in tqdm(glob.glob(os.path.join(gz_folder, '*.gz'))):
        file_name = read_file.replace(".gz", ".txt")
        gz_to_txt(read_file, file_name)
        df = read_txt(file_name)
        df_list.append(df)
    merged_df = pd.concat(df_list)

    write_csv_file(merged_df, gz_folder)
    pk_file = save_pk_file(merged_df, gz_folder)
    print(merged_df[:5])

    return pk_file
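The gz_to_txt helper used in read_gz_folder is not shown either. A minimal sketch, assuming it only decompresses the archive to a plain-text file with the given name.

# Hypothetical sketch of gz_to_txt: decompress a .gz file to a text file.
import gzip
import shutil

def gz_to_txt(gz_path, txt_path):
    with gzip.open(gz_path, "rb") as f_in, open(txt_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)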
def headline_features(in_folder, w2v_model):
    """
    Extract features from headlines
    in_folder: path to folder containing data file
    w2v_model: word embedding model
    """

    sty_vecs = []
    w2v_vecs = []
    labels = []

    for file_name in glob.glob(os.path.join(in_folder, '*.pk')):
        print(f"\nReading {file_name}\n")
        df = load_pk_file(file_name)
        sty_vec = extract_sty_feat(df)
        sty_vecs.append(sty_vec)
        w2v_vec = create_w2v(w2v_model, df["targetTitle"])
        w2v_vecs.append(w2v_vec)
        labels.append(df["truthClass"])

    print("Concatenating feature vectors")
    X_sty = np.concatenate(sty_vecs, axis=0)
    print(f"Stylometric: {X_sty.shape}")
    X_w2v = np.concatenate(w2v_vecs, axis=0)
    print(f"Word2vec: {X_w2v.shape}")
    X_cmb = np.concatenate((X_w2v, X_sty), axis=1)
    print(f"Combined: {X_cmb.shape}")
    y = np.asarray(pd.concat(labels))

    print("Splitting data")
    print("Stylometric")
    sty_file = "Vector/sty"
    save_pk_file((X_sty, y), sty_file)

    print("Word2vec")
    w2v_file = "Vector/w2v"
    save_pk_file((X_w2v, y), w2v_file)

    print("Combined")
    cmb_file = "Vector/cmb"
    save_pk_file((X_cmb, y), cmb_file)

    print("Done")
Example no. 4
def preprocess(in_folder, out_folder):
    """
    Preprocess the data file in in_folder and return the processed data to out_folder
    in_folder: path to folder containing the data file
    out_folder: path to folder in which the processed data file is saved
    
    """
    processed_folder = create_folder_path(out_folder)

    for file_name in glob.glob(os.path.join(in_folder, '*.pk')):
        print(f"Reading {file_name}")
        load_data = load_pk_file(file_name)
        headlines = load_data["targetTitle"]
        processed_headlines = preprocess_text(headlines)
        processed_data = pd.concat([headlines, processed_headlines], axis=1)
        processed_data["truthClass"] = load_data["truthClass"]

        if len(load_data.columns) > 2:
            processed_data["targetParagraphs"] = load_data["targetParagraphs"]
            docs = processed_data["targetParagraphs"].progress_map(
                lambda text: [] if text == [] else Text_process(text))
            processed_data["cont_sent"] = docs.progress_map(lambda doc: [
            ] if doc == [] else Text_process.tokenised_sentencier(doc))
            processed_data["cont_num_sent"] = processed_data[
                "cont_sent"].progress_map(lambda x: len(x))
            processed_data["cont_avr_sent_len"] = processed_data[
                "cont_sent"].progress_map(lambda x: 0 if len(
                    x) == 0 else round(sum(len(i) for i in x) / len(x)))
            processed_data["cont_token"] = docs.progress_map(
                lambda doc: [] if doc == [] else Text_process.tokenizer(doc))
            processed_data["cont_num_token"] = processed_data[
                "cont_token"].progress_map(lambda x: len(x))
            processed_data["cont_avr_token_len"] = processed_data[
                "cont_token"].progress_map(lambda x: 0 if len(
                    x) == 0 else round(sum(len(i) for i in x) / len(x)))

            processed_data["cont_arg"] = docs.progress_map(
                lambda doc: Text_process.get_arg(doc))
            processed_data["cont_root"] = docs.progress_map(
                lambda doc: Text_process.get_root(doc))

            processed_data["cont_ent"] = docs.progress_map(
                lambda doc: Text_process.get_ent(doc))
            processed_data["cont_ent_label"] = docs.progress_map(
                lambda doc: Text_process.get_ent_label(doc))

            processed_data["cont_senti_score"] = docs.progress_map(
                lambda doc: Text_process.senti_score(doc))

            sim_scores = []
            for i, row in processed_data[["token", "cont_sent"]].iterrows():
                if row["cont_sent"] == []:
                    score = "NA"
                    sim_scores.append(score)
                else:
                    sim_score = similarity_calculator(row["token"],
                                                      row["cont_sent"])
                    sim_scores.append(sim_score)
                #processed_data.at[i,'sim_score'] = sim_score

            processed_data["sim_score"] = sim_scores
            processed_data["avr_sim_score"] = processed_data[
                "sim_score"].progress_map(
                    lambda score: "NA"
                    if score == "NA" else float(np.sum(score) / len(score)))
            processed_data["sim_pct"] = processed_data[
                "sim_score"].progress_map(
                    lambda score: "NA" if score == "NA" else round(
                        np.count_nonzero(score) / len(score) * 100))

            processed_data = processed_data.drop(columns=['sim_score'])

        save_file_name = processed_folder + '/' + os.path.basename(
            file_name).replace(".pk", "")
        #write_csv_file(processed_data,save_file_name)
        save_pk_file(processed_data, save_file_name)
        print(save_file_name)
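similarity_calculator is expected to return one score per content sentence, since avr_sim_score and sim_pct are computed from that list. A minimal sketch using simple token overlap, assuming the headline tokens and the content sentences are already tokenised; the real implementation may well use embeddings instead.

# Hypothetical sketch of similarity_calculator: Jaccard overlap between the
# headline tokens and each tokenised content sentence, one score per sentence.
def similarity_calculator(headline_tokens, content_sentences):
    headline_set = set(headline_tokens)
    scores = []
    for sentence in content_sentences:
        sentence_set = set(sentence)
        union = headline_set | sentence_set
        scores.append(len(headline_set & sentence_set) / len(union) if union else 0.0)
    return scores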
Example no. 5
import glob
import os

import pandas as pd
from sklearn.utils import shuffle

from helpers import load_pk_file, save_pk_file, create_folder_path

if __name__ == "__main__":
    ## Reduce the number of non-clickbait samples in the dataset by randomly selecting
    ## n non-clickbait samples, where n is the number of clickbait samples
    folder = create_folder_path("Train")
    Potthast_corpus = []
    Chakraborty_corpus = []

    for file_name in glob.glob(os.path.join("Processed_data", '*.pk')):
        df = load_pk_file(file_name)
        cb = df.loc[df['truthClass'] == "CB"]
        non = df.loc[df['truthClass'] == "Non"]
        
        if "clickbait17" in file_name:
            Potthast_corpus.append(cb)
            Potthast_corpus.append(non.sample(n = len(cb)))
        else:
            Chakraborty_corpus.append(cb.sample(n = 5000))
            Chakraborty_corpus.append(non.sample(n = 5000))

    save_Potthast = pd.concat(Potthast_corpus)
    save_Chakraborty = pd.concat(Chakraborty_corpus)

    Potthast_data = shuffle(save_Potthast, random_state=5)
    Chakraborty_data = shuffle(save_Chakraborty, random_state=5)

    save_pk_file(Potthast_data, folder+"/Potthast_data")
    save_pk_file(Chakraborty_data, folder+"/Chakraborty_data")
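The helpers module itself is not part of these excerpts. A minimal sketch of what create_folder_path, save_pk_file, load_pk_file and write_csv_file might look like, assuming plain pickle serialisation with a .pk extension; the signatures are inferred from the calls above and the exact file-naming convention is not recoverable from these excerpts.

# Hypothetical sketches of the helpers used throughout these examples.
import os
import pickle

def create_folder_path(folder):
    # create the folder if it does not exist and return its path
    os.makedirs(folder, exist_ok=True)
    return folder

def save_pk_file(data, file_name):
    # pickle the object and return the path of the .pk file
    path = file_name if file_name.endswith(".pk") else file_name + ".pk"
    with open(path, "wb") as f:
        pickle.dump(data, f)
    return path

def load_pk_file(file_name):
    with open(file_name, "rb") as f:
        return pickle.load(f)

def write_csv_file(df, file_name):
    df.to_csv(file_name + ".csv", index=False)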
def content_features(in_folder, w2v_model, d2v_model):
    """
    Extract features from contents
    in_folder: path to folder containing data file
    w2v_model: word embedding model
    d2v_model: document embedding model
    """

    sty_vecs = []
    w2v_vecs = []
    d2v_vecs = []
    labels = []

    for file_name in glob.glob(os.path.join(in_folder, '*.pk')):
        print(f"\nReading {file_name}\n")
        df = load_pk_file(file_name)
        if "targetParagraphs" in df.columns:
            filtered_df = df[df["targetParagraphs"].apply(
                lambda x: len(x) > 0)]
            sty_vec = extract_sty_feat(filtered_df)
            sty_vecs.append(sty_vec)
            w2v_vec = create_w2v(w2v_model, filtered_df["targetTitle"])
            w2v_vecs.append(w2v_vec)
            labels.append(filtered_df["truthClass"])
            sent_vec = filtered_df["cont_sent"].progress_map(
                lambda x: np.asarray([create_d2v(d2v_model, i) for i in x]))
            avr_sent_vec = sent_vec.progress_map(lambda x: np.mean(x, axis=0))
            d2v_vec = np.asarray(list(avr_sent_vec))

            features = []
            for i, row in filtered_df.iterrows():
                feat = dict()
                feat["cont_num_token"] = row["cont_num_token"]
                feat["cont_avr_token_len"] = row["cont_avr_token_len"]
                feat["cont_senti_score"] = row["cont_senti_score"]
                feat["avr_sim_score"] = row["avr_sim_score"]
                feat["sim_pct"] = row["sim_pct"]
                features.append(feat)

            dict_vtrz = DictVectorizer(sparse=False)
            dict_vect = dict_vtrz.fit_transform(features)

            d2v_vecs.append(np.concatenate((d2v_vec, dict_vect), axis=1))

    print("Concatenating feature vectors")
    X_sty = np.concatenate(sty_vecs, axis=0)
    print(f"Stylometry: {X_sty.shape}")
    X_w2v = np.concatenate(w2v_vecs, axis=0)
    print(f"Word2vec: {X_w2v.shape}")
    X_d2v = np.concatenate(d2v_vecs, axis=0)
    print(f"Doc2vec: {X_d2v.shape}")
    X_cmb = np.concatenate((X_w2v, X_sty, X_d2v), axis=1)
    print(f"Combined: {X_cmb.shape}")
    y = list(pd.concat(labels))

    print("Splitting data")
    print("Doc2vec")
    cmb_file = "Vector/d2v"
    save_pk_file((X_cmb, y), cmb_file)

    print("Done")