def processCSV(name, path, hub):
    """Analyze one CSV file and fold its statistics into the module totals.

    The file at *path* is loaded with pandas and handed to ``Analyzer``
    together with *hub*. The analyzer's ``remove_duplicates``,
    ``remove_owner_comments`` and ``remove_nan_data`` steps are run, then
    ``analyze_data`` produces the resulting frame. A non-empty result is
    written to ``./processed/<name>``; an empty one is reported on stdout.

    Processing is best-effort: a failure in any stage is printed and the
    function returns normally, so a batch run over many files keeps going.
    The module-level ``tot_*`` counters are only updated once every
    statistic for this file has been collected, so a failing file never
    leaves them partially incremented.

    Args:
        name: File name of the CSV; used for the skip list lookup, in
            messages, and as the output file name under ``./processed/``.
        path: Location passed to ``pd.read_csv``.
        hub: Forwarded unchanged to ``Analyzer``.

    Returns:
        None.
    """
    # Function-scope import: this block cannot see the file's import section.
    import os

    global tot_data, tot_duplicates, tot_left, tot_nan, tot_owner
    global tot_before_equal_after, tot_comm_to_comm, tot_no_comment
    global tot_no_marked, tot_no_method_after, tot_no_method_before
    global tot_no_valid_ref, tot_triplets

    if name in useless_csv:
        return

    # Stage 1: load the file. Only a failure here means "unreadable".
    try:
        df = pd.read_csv(filepath_or_buffer=path)
    except Exception as e:
        # Message text (including the "CVS" spelling) is kept as-is in case
        # existing logs are searched for it.
        print("----- CVS unreadable: ", name)
        print(e)
        return

    # Stage 2: run the analysis and collect every statistic into locals, so
    # the global totals are committed all together or not at all.
    try:
        analyzer = Analyzer(df, hub)
        analyzer.remove_duplicates()
        analyzer.remove_owner_comments()
        analyzer.remove_nan_data()
        dfr = analyzer.analyze_data()

        # len(df) is taken after the analysis, exactly as before.
        data_count = len(df)
        triplet_count = len(dfr)
        duplicates = analyzer.duplicates
        left_side_cases = analyzer.left_side_cases
        nan_data = analyzer.nan_data
        owner_comments = analyzer.owner_comments
        before_equal_after = analyzer.before_equal_after
        comm_to_comm = analyzer.comm_to_comm
        no_comment = analyzer.no_comment
        no_marked = analyzer.no_marked
        no_method_after = analyzer.no_method_after
        no_method_before = analyzer.no_method_before
        no_valid_ref = analyzer.no_valid_ref
    except Exception as e:
        print("----- CSV analysis failed: ", name)
        print(e)
        return

    tot_data += data_count
    tot_duplicates += duplicates
    tot_left += left_side_cases
    tot_nan += nan_data
    tot_owner += owner_comments
    tot_before_equal_after += before_equal_after
    tot_comm_to_comm += comm_to_comm
    tot_no_comment += no_comment
    tot_no_marked += no_marked
    tot_no_method_after += no_method_after
    tot_no_method_before += no_method_before
    tot_no_valid_ref += no_valid_ref
    tot_triplets += triplet_count

    if triplet_count == 0:
        print('+++++ USELESS CSV: ', name)
        return

    # Stage 3: write the result. The totals above are already counted, which
    # matches the previous ordering (counters were updated before the write).
    out_path = "./processed/" + name
    try:
        # Create the target directory on demand so a fresh checkout without
        # ./processed/ does not make every write fail.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        dfr.to_csv(out_path)
    except Exception as e:
        print("----- CSV not written: ", name)
        print(e)