Esempio n. 1
0
def processCSV(name, path, hub):
    """Analyze one CSV file and add its counters to the module-level totals.

    Files whose name appears in ``useless_csv`` are skipped. Otherwise the
    CSV at ``path`` is loaded with pandas and run through ``Analyzer``
    (``remove_duplicates``, ``remove_owner_comments``, ``remove_nan_data``,
    then ``analyze_data``). The analyzer's counters are added to the
    ``tot_*`` globals. A non-empty result is written to
    ``./processed/<name>``; an empty result is only reported on stdout.

    Args:
        name: File name of the CSV; used for the skip check, in the log
            messages and as the output file name.
        path: Location handed to ``pandas.read_csv``.
        hub: Passed through unchanged to ``Analyzer``.

    Returns:
        None. Errors raised while loading, analyzing or saving are printed
        to stdout rather than raised.
    """
    if name in useless_csv:
        return

    global tot_data, tot_duplicates, tot_left, tot_nan, tot_owner
    global tot_before_equal_after, tot_comm_to_comm, tot_no_comment
    global tot_no_marked, tot_no_method_after, tot_no_method_before
    global tot_no_valid_ref, tot_triplets

    try:
        df = pd.read_csv(filepath_or_buffer=path)
        analyzer = Analyzer(df, hub)
        analyzer.remove_duplicates()
        analyzer.remove_owner_comments()
        analyzer.remove_nan_data()
        dfr = analyzer.analyze_data()

        # NOTE(review): len(df) is taken after the analyzer ran; if Analyzer
        # mutates the frame in place this is not the raw row count - confirm.
        tot_data += len(df)
        tot_duplicates += analyzer.duplicates
        tot_left += analyzer.left_side_cases
        tot_nan += analyzer.nan_data
        tot_owner += analyzer.owner_comments
        tot_before_equal_after += analyzer.before_equal_after
        tot_comm_to_comm += analyzer.comm_to_comm
        tot_no_comment += analyzer.no_comment
        tot_no_marked += analyzer.no_marked
        tot_no_method_after += analyzer.no_method_after
        tot_no_method_before += analyzer.no_method_before
        tot_no_valid_ref += analyzer.no_valid_ref
        tot_triplets += len(dfr)

        if len(dfr) > 0:
            try:
                dfr.to_csv("./processed/" + name)
            except OSError as e:
                # A failed write (missing ./processed directory, permissions,
                # full disk) is an output problem; report it as such instead
                # of letting the outer handler blame the input file.
                print("----- CSV not written: ", name)
                print(e)
        else:
            print('+++++ USELESS CSV: ', name)

    except Exception as e:
        # Deliberate best-effort: one bad file must not abort the whole run.
        print("----- CVS unreadable: ", name)
        print(e)