path = "preprocessed_datasets/final_other_dataset.csv" print("") print("PREPROCESSING:") print("") ''' FAKE NEWS DATASET ''' print("INPUT:") print("(TYPE: ", type(fake), ")") print(fake.head(10)) preprocesser_fake = Preprocessing( fake, date, time, analysis=analysis, news_type="fake", language="es") # here you can set the configuration data_fake = preprocesser_fake.run_pipeline() print("") print("FINAL OUTPUT:") if preprocesser_fake.aggregation: print("(TYPE: ", type(data_fake.aggregated), ")") print(data_fake.aggregated) else: print("(TYPE: ", type(data_fake.docvectors), ")") print(data_fake.docvectors) ''' REAL NEWS DATASET ''' print("INPUT:") print("(TYPE: ", type(true), ")") print(true.head(10))
analysis="text", news_type="generated", duplicate_rows_removal=False, lowercasing=False, tokenization=False, lemmatization=False, noise_removal=False, stemming=False, stopword_removal=False, entity_recognition=False, data_augmentation=False, word2vec=True, doc2vec=False, aggregation=True) # here you can set the configuration gen = pp_generated.run_pipeline() dataframe = pd.DataFrame(gen.aggregated, columns=["text"]) dataframe["membership"] = generated["membership"] dataset = pp_generated.shuffle(dataframe).reset_index() dataset.columns = ["old index", "text", "membership"] dataset.index.name = "index" cardinality = len(dataset) outdir = "generated_datasets/" + date + "_" + time outname = "generated_dataset_" + str(cardinality) + ".csv" if not os.path.exists(outdir): os.makedirs(outdir) fullname = os.path.join(outdir, outname)