def main(args):
    """Load a dataset, build features from co-occurrence files, and save them.

    The dataset at ``args.filepath`` is read into a DataFrame, the paths
    returned by ``get_cooc_filenames`` for the ``cooc/`` directory are handed
    to ``Feature`` together with that DataFrame, and the DataFrame produced by
    ``Feature.make_df_from_dataset`` is written to ``result_0~10.csv``.

    Args:
        args: Options object. The following attributes are read:
            ``savepath`` -- output prefix; it is concatenated directly with
            file names, so it should end with a path separator.
            ``filepath`` -- path of the dataset file.
            ``data_type`` -- ``'csv'`` reads a comma-separated file; any
            other value reads the file as tab-separated.
    """
    print("Get cooc of each doc from corpus")
    # NOTE(review): the instance is not used in this function; the call is
    # kept in case constructing Processing has side effects -- confirm.
    Processing()

    # savepath = "sample_data/"
    savepath = args.savepath
    coocpath = savepath + 'cooc/'
    filepath = args.filepath

    # Create the directory without going through a shell: portable, creates
    # missing parents, and is a no-op when the directory already exists.
    os.makedirs(coocpath, exist_ok=True)

    if args.data_type == 'csv':
        df = pd.read_csv(filepath)
    else:
        df = pd.read_csv(filepath, sep='\t')

    print("Creation Finished.. Starts new job")
    print(" ")
    print("Make a graph")

    cooc_path_list = get_cooc_filenames(coocpath)
    feature_model = Feature(doc_path_list=cooc_path_list, dataframe=df)

    print("Make all features and load all to dataframe ")
    df = feature_model.make_df_from_dataset()
    df.to_csv(savepath + 'result_0~10.csv')  # change name
    print("Completed")
def main(args):
    """Create co-occurrence files for a dataset, then build and save features.

    The input layout is selected by ``args.data_type``:

    * ``'csv'`` / ``'tsv'`` -- ``args.filepath`` is read with pandas
      (comma- or tab-separated). Each entry of the ``text`` column inside a
      fixed row window is passed to ``Processing.cooc``; any exception raised
      for an entry is appended to ``no_processed_index.txt`` and processing
      continues. The feature DataFrame is saved as ``result.csv``.
    * ``'txt'`` / ``'text'`` -- document paths are obtained from
      ``get_doc_filenames`` for the ``data/fake`` and ``data/true``
      directories, labelled 0 and 1 respectively, and each document is passed
      to ``Processing.cooc``. The feature DataFrame is saved as
      ``data/result.csv``.

    All output goes under the hard-coded ``sample_data/`` prefix.

    Args:
        args: Options object. ``data_type`` is always read; ``filepath`` is
            read as the dataset path for the csv/tsv layout.

    Raises:
        ValueError: If ``args.data_type`` is not one of the values above.
    """
    print("Get cooc of each doc from corpus")
    cooc_model = Processing()
    savepath = "sample_data/"
    coocpath = savepath + 'cooc/'
    filepath = args.filepath

    # Create the directory without going through a shell: portable, creates
    # missing parents, and is a no-op when the directory already exists.
    os.makedirs(coocpath, exist_ok=True)

    # Membership tests are required here: `x == 'csv' or 'tsv'` is always
    # truthy, which previously made the text branch unreachable.
    if args.data_type in ('csv', 'tsv'):
        if args.data_type == 'csv':
            # Original note (translated, truncated in source):
            # "path is currently a dir, but ..."
            df = pd.read_csv(filepath)
        else:
            df = pd.read_csv(filepath, sep='\t')

        # Row window to process -- change index here.
        start, stop = 18087, 18200
        texts = df['text'][start:stop]

        with tqdm(total=len(texts)) as pbar:
            # Append mode keeps the failure log from earlier runs.
            with open(savepath + "no_processed_index.txt", 'a',
                      encoding='utf-8') as f:
                f.write("Not process index:\n")
                # Enumerate from `start` so file names and log entries carry
                # the row position within the full dataset.
                for idx, text in enumerate(texts, start):
                    pbar.update(1)
                    try:
                        cooc_model.cooc(
                            text=text,
                            savepath="{0}/{1}.csv".format(coocpath, idx))
                    except Exception as e:
                        # Best effort: record the failing row and continue.
                        f.write("{}, index:{}\n".format(e, idx))

        print(" ")
        print("Creation Finished.. Starts new job")
        print(" ")
        print("Make a graph")

        # NOTE(review): this passes the directory string, whereas the text
        # branch passes a list from get_cooc_filenames -- confirm which form
        # Feature expects for doc_path_list.
        feature_model = Feature(doc_path_list=coocpath, dataframe=df)

        print("Make all features and load all to dataframe ")
        df = feature_model.make_df_from_dataset()
        df.to_csv(savepath + 'result.csv')
        print("Completed")

    elif args.data_type in ('txt', 'text'):
        path_fake = savepath + '/data/fake'
        path_true = savepath + '/data/true'

        doc_path_list_f = get_doc_filenames(path_fake)
        doc_path_list_t = get_doc_filenames(path_true)

        # Fake documents are labelled 0 and true documents 1, in that order.
        doc_label = [0] * len(doc_path_list_f) + [1] * len(doc_path_list_t)
        df = pd.DataFrame(doc_label, columns=['label'])

        jobs = (
            ("co-occurrence matrix creation - fake news",
             doc_path_list_f, path_fake),
            ("co-occurrence matrix creation - true news",
             doc_path_list_t, path_true),
        )
        for desc, doc_paths, out_dir in jobs:
            with tqdm(total=len(doc_paths), desc=desc) as pbar:
                for idx, doc_path in enumerate(doc_paths):
                    pbar.update(1)
                    cooc_model.cooc(
                        filepath=doc_path,
                        savepath="{0}/{1}.csv".format(out_dir, idx))

        print(" ")
        print("Creation Finished.. Starts new job")
        print(" ")
        print("Make a graph")

        cooc_f_list = get_cooc_filenames(document_path=path_fake)
        cooc_t_list = get_cooc_filenames(document_path=path_true)
        cooc_path_list = cooc_f_list + cooc_t_list
        feature_model = Feature(doc_path_list=cooc_path_list, dataframe=df)

        print("Make all features and load all to dataframe ")
        df = feature_model.make_df_from_dataset()
        df.to_csv(savepath + '/data/' + 'result.csv')
        print("Completed")

    else:
        raise ValueError(
            "Unsupported data_type: {!r} (expected 'csv', 'tsv', 'txt' or "
            "'text')".format(args.data_type))