# Example #1 (0)
# Destination for the preprocessed "other" dataset.
path = "preprocessed_datasets/final_other_dataset.csv"

print("")
print("PREPROCESSING:")
print("")
''' FAKE NEWS DATASET '''

# Echo the raw fake-news dataframe before the pipeline touches it.
print("INPUT:")
print("(TYPE: ", type(fake), ")")
print(fake.head(10))

# here you can set the configuration
preprocesser_fake = Preprocessing(
    fake, date, time, analysis=analysis, news_type="fake",
    language="es")
data_fake = preprocesser_fake.run_pipeline()

print("")
print("FINAL OUTPUT:")

# Report either the aggregated documents or the per-document vectors,
# depending on how the pipeline was configured.
output_fake = (data_fake.aggregated if preprocesser_fake.aggregation
               else data_fake.docvectors)
print("(TYPE: ", type(output_fake), ")")
print(output_fake)
''' REAL NEWS DATASET '''

# Echo the raw real-news dataframe before preprocessing.
# NOTE(review): `true` is presumably a pandas DataFrame loaded earlier in
# the file — confirm against the loading code (not visible in this chunk).
print("INPUT:")
print("(TYPE: ", type(true), ")")
print(true.head(10))
    # NOTE(review): the opening of this Preprocessing(...) call — something
    # like `pp_generated = Preprocessing(generated, date, time,` — is missing
    # from this chunk, leaving these keyword arguments orphaned. Restore the
    # call head before running this file.
    analysis="text",
    news_type="generated",
    # Every transformation stage is disabled; only word2vec + aggregation run.
    duplicate_rows_removal=False,
    lowercasing=False,
    tokenization=False,
    lemmatization=False,
    noise_removal=False,
    stemming=False,
    stopword_removal=False,
    entity_recognition=False,
    data_augmentation=False,
    word2vec=True,
    doc2vec=False,
    aggregation=True)  # here you can set the configuration

# Run the pipeline on the generated data and assemble the aggregated texts
# into a labelled, shuffled dataset with a stable "index" column.
gen = pp_generated.run_pipeline()
dataframe = pd.DataFrame(gen.aggregated, columns=["text"])
# NOTE(review): assumes `generated["membership"]` is index-aligned with
# `gen.aggregated` — confirm the pipeline preserves row order.
dataframe["membership"] = generated["membership"]
dataset = pp_generated.shuffle(dataframe).reset_index()
dataset.columns = ["old index", "text", "membership"]
dataset.index.name = "index"

cardinality = len(dataset)

# One output directory per (date, time) run; file name encodes the row count.
# Use os.path.join for consistency with `fullname` below.
outdir = os.path.join("generated_datasets", date + "_" + time)
outname = "generated_dataset_" + str(cardinality) + ".csv"

# exist_ok=True avoids the check-then-create race of the previous
# `if not os.path.exists(outdir): os.makedirs(outdir)` pattern.
os.makedirs(outdir, exist_ok=True)

fullname = os.path.join(outdir, outname)