Ejemplo n.º 1
0
def compare_datasets(doc1: pd.DataFrame, doc2: pd.DataFrame, keys: list):
    """Compare two datasets with datacompy and collect every cell-level difference.

    Parameters
    ----------
    doc1, doc2 : pd.DataFrame
        The two datasets to compare.
    keys : list
        Join columns; when empty or None, the comparison is done on the index.

    Returns
    -------
    tuple
        ``(diff, comparison)`` where ``diff`` is a DataFrame with one row per
        mismatched value (columns: line, column, file1, file2) and
        ``comparison`` is the underlying ``datacompy.Compare`` object.
    """
    if keys:
        comparison = datacompy.Compare(doc1, doc2, join_columns=keys,
                                       df1_name="File #1", df2_name="File #2")
    else:
        comparison = datacompy.Compare(doc1, doc2, on_index=True,
                                       df1_name="File #1", df2_name="File #2")

    # Columns whose values differ between the two files.
    unequal_columns = [column for column in comparison.column_stats if column["unequal_cnt"] > 0]

    # Accumulator for the formatted differences.
    diff = pd.DataFrame(columns=["file1", "file2", "column", "line"])

    for column in unequal_columns:
        # All mismatching rows for this column (huge sample_count => get them all).
        samples = comparison.sample_mismatch(column["column"], for_display=True, sample_count=9999999)

        # Drop the leading join-key columns. Guarded against keys being None:
        # the original raised TypeError on len(None) in the on_index branch.
        for _ in range(len(keys) if keys else 0):
            samples = samples.drop(samples.columns[0], axis=1)
        # Column name without the " (File #1)" / " (File #2)" suffix datacompy appends.
        samples['column'] = samples.columns[0].split(" (")[0]
        samples['line'] = samples.index
        samples.columns = ["file1", "file2", "column", "line"]

        diff = pd.concat([samples, diff], ignore_index=True)

    return diff[["line", "column", "file1", "file2"]], comparison
Ejemplo n.º 2
0
def test_dupes_from_real_data():
    """Real-world rows with duplicate join keys must still compare cleanly.

    Joining on a non-unique key ("acct_id") and on the full unique compound
    key must both report a match for identical frames, and both reports must
    render without raising.
    """
    import io  # stdlib replacement for the legacy six.StringIO py2/3 shim

    data = """acct_id,acct_sfx_num,trxn_post_dt,trxn_post_seq_num,trxn_amt,trxn_dt,debit_cr_cd,cash_adv_trxn_comn_cntry_cd,mrch_catg_cd,mrch_pstl_cd,visa_mail_phn_cd,visa_rqstd_pmt_svc_cd,mc_pmt_facilitator_idn_num
100,0,2017-06-17,1537019,30.64,2017-06-15,D,CAN,5812,M2N5P5,,,0.0
200,0,2017-06-24,1022477,485.32,2017-06-22,D,USA,4511,7114,7.0,1,
100,0,2017-06-17,1537039,2.73,2017-06-16,D,CAN,5812,M4J 1M9,,,0.0
200,0,2017-06-29,1049223,22.41,2017-06-28,D,USA,4789,21211,,A,
100,0,2017-06-17,1537029,34.05,2017-06-16,D,CAN,5812,M4E 2C7,,,0.0
200,0,2017-06-29,1049213,9.12,2017-06-28,D,CAN,5814,0,,,
100,0,2017-06-19,1646426,165.21,2017-06-17,D,CAN,5411,M4M 3H9,,,0.0
200,0,2017-06-30,1233082,28.54,2017-06-29,D,USA,4121,94105,7.0,G,
100,0,2017-06-19,1646436,17.87,2017-06-18,D,CAN,5812,M4J 1M9,,,0.0
200,0,2017-06-30,1233092,24.39,2017-06-29,D,USA,4121,94105,7.0,G,
100,0,2017-06-19,1646446,5.27,2017-06-17,D,CAN,5200,M4M 3G6,,,0.0
200,0,2017-06-30,1233102,61.8,2017-06-30,D,CAN,4121,0,,,
100,0,2017-06-20,1607573,41.99,2017-06-19,D,CAN,5661,M4C1M9,,,0.0
200,0,2017-07-01,1009403,2.31,2017-06-29,D,USA,5814,22102,,F,
100,0,2017-06-20,1607553,86.88,2017-06-19,D,CAN,4812,H2R3A8,,,0.0
200,0,2017-07-01,1009423,5.5,2017-06-29,D,USA,5812,2903,,F,
100,0,2017-06-20,1607563,25.17,2017-06-19,D,CAN,5641,M4C 1M9,,,0.0
200,0,2017-07-01,1009433,214.12,2017-06-29,D,USA,3640,20170,,A,
100,0,2017-06-20,1607593,1.67,2017-06-19,D,CAN,5814,M2N 6L7,,,0.0
200,0,2017-07-01,1009393,2.01,2017-06-29,D,USA,5814,22102,,F,"""
    df1 = pd.read_csv(io.StringIO(data), sep=",")
    df2 = df1.copy()
    # Non-unique join key: duplicate groups are matched by order.
    compare_acct = datacompy.Compare(df1, df2, join_columns=["acct_id"])
    assert compare_acct.matches()
    # Fully unique compound key.
    compare_unq = datacompy.Compare(
        df1, df2, join_columns=["acct_id", "acct_sfx_num", "trxn_post_dt", "trxn_post_seq_num"]
    )
    assert compare_unq.matches()
    # Just render the reports to make sure they render (results unused).
    compare_acct.report()
    compare_unq.report()
Ejemplo n.º 3
0
def test_lower():
    """Exercise the toggle that lower-cases column names before comparing."""
    # Non-join column differs only in case: matches with default folding.
    left = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
    right = pd.DataFrame({"a": [1, 2, 3], "B": [0, 1, 2]})
    assert datacompy.Compare(left, right, join_columns=["a"]).matches()
    # Same frames with folding disabled: "b" and "B" no longer line up.
    left = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
    right = pd.DataFrame({"a": [1, 2, 3], "B": [0, 1, 2]})
    assert not datacompy.Compare(
        left, right, join_columns=["a"], cast_column_names_lower=False
    ).matches()

    # Case folding also applies to the join column itself.
    left = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
    right = pd.DataFrame({"A": [1, 2, 3], "B": [0, 1, 2]})
    assert datacompy.Compare(left, right, join_columns=["a"]).matches()
    # Without folding, "a" is absent from the second frame and must raise.
    left = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
    right = pd.DataFrame({"A": [1, 2, 3], "B": [0, 1, 2]})
    with raises(ValueError, match="df2 must have all columns from join_columns"):
        datacompy.Compare(
            left, right, join_columns=["a"], cast_column_names_lower=False
        )
Ejemplo n.º 4
0
def test_compare_df_setter_bad():
    """Constructor must reject non-DataFrame inputs, missing join columns and
    duplicate column names, while accepting duplicate *rows*."""
    df = pd.DataFrame([{"a": 1, "A": 2}, {"a": 2, "A": 2}])
    # Fix: pytest removed the `message=` kwarg in pytest 4.0; `match=`
    # additionally verifies the exception text.
    with raises(TypeError, match="df1 must be a pandas DataFrame"):
        datacompy.Compare("a", "a", ["a"])
    # Loose regex: datacompy versions word this as "fields" or "columns".
    with raises(ValueError, match="df1 must have all .* from join_columns"):
        datacompy.Compare(df, df.copy(), ["b"])
    with raises(ValueError, match="df1 must have unique column names"):
        datacompy.Compare(df, df.copy(), ["a"])
    df_dupe = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 3}])
    assert datacompy.Compare(df_dupe, df_dupe.copy(), ["a", "b"]).df1.equals(df_dupe)
Ejemplo n.º 5
0
def test_compare_df_setter_good():
    """Valid frames are stored as-is and join columns are lower-cased."""
    left = pd.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}])
    right = pd.DataFrame([{"A": 1, "B": 2}, {"A": 2, "B": 3}])
    cmp_simple = datacompy.Compare(left, right, ["a"])
    assert cmp_simple.df1.equals(left)
    assert cmp_simple.df2.equals(right)
    assert cmp_simple.join_columns == ["a"]
    # Mixed-case join columns are normalised to lower case.
    cmp_mixed = datacompy.Compare(left, right, ["A", "b"])
    assert cmp_mixed.df1.equals(left)
    assert cmp_mixed.df2.equals(right)
    assert cmp_mixed.join_columns == ["a", "b"]
Ejemplo n.º 6
0
def test_compare_df_setter_good():
    """Happy-path constructor: frames kept verbatim, join columns folded to lower case."""
    first = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 2, 'b': 2}])
    second = pd.DataFrame([{'A': 1, 'B': 2}, {'A': 2, 'B': 3}])
    # Same checks for a plain and a mixed-case join-column spec.
    for joins, expected in ((['a'], ['a']), (['A', 'b'], ['a', 'b'])):
        result = datacompy.Compare(first, second, joins)
        assert result.df1.equals(first)
        assert result.df2.equals(second)
        assert result.join_columns == expected
Ejemplo n.º 7
0
def test_compare_df_setter_bad():
    """Constructor must reject bad inputs; duplicate rows remain acceptable."""
    df = pd.DataFrame([{'a': 1, 'A': 2}, {'a': 2, 'A': 2}])
    # Fix: pytest removed the `message=` kwarg in pytest 4.0; `match=`
    # additionally verifies the exception text.
    with raises(TypeError, match='df1 must be a pandas DataFrame'):
        datacompy.Compare('a', 'a', ['a'])
    # Loose regex: datacompy versions word this as "fields" or "columns".
    with raises(ValueError, match='df1 must have all .* from join_columns'):
        datacompy.Compare(df, df.copy(), ['b'])
    with raises(ValueError, match='df1 must have unique column names'):
        datacompy.Compare(df, df.copy(), ['a'])
    df_dupe = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 1, 'b': 3}])
    assert datacompy.Compare(df_dupe, df_dupe.copy(),
                             ['a', 'b']).df1.equals(df_dupe)
Ejemplo n.º 8
0
def test_index_with_joins_with_ignore_spaces():
    """Leading/trailing blanks break equality unless ignore_spaces is set."""
    left = pd.DataFrame([{'a': 1, 'b': ' A'}, {'a': 2, 'b': 'A'}])
    right = pd.DataFrame([{'a': 1, 'b': 'A'}, {'a': 2, 'b': 'A '}])

    strict = datacompy.Compare(left, right, on_index=True, ignore_spaces=False)
    assert not strict.matches()
    assert strict.all_columns_match()
    assert strict.all_rows_overlap()
    assert not strict.intersect_rows_match()

    relaxed = datacompy.Compare(left, right, 'a', ignore_spaces=True)
    assert relaxed.matches()
    assert relaxed.all_columns_match()
    assert relaxed.all_rows_overlap()
    assert relaxed.intersect_rows_match()
Ejemplo n.º 9
0
def test_decimal_with_joins_with_ignore_spaces():
    """Column-joined frames only match blank-padded strings with ignore_spaces."""
    left = pd.DataFrame([{"a": 1, "b": " A"}, {"a": 2, "b": "A"}])
    right = pd.DataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "A "}])

    strict = datacompy.Compare(left, right, "a", ignore_spaces=False)
    assert not strict.matches()
    assert strict.all_columns_match()
    assert strict.all_rows_overlap()
    assert not strict.intersect_rows_match()

    relaxed = datacompy.Compare(left, right, "a", ignore_spaces=True)
    assert relaxed.matches()
    assert relaxed.all_columns_match()
    assert relaxed.all_rows_overlap()
    assert relaxed.intersect_rows_match()
Ejemplo n.º 10
0
def test_index_with_joins_with_ignore_case():
    """Case differences break equality unless ignore_case is set."""
    left = pd.DataFrame([{"a": 1, "b": "a"}, {"a": 2, "b": "A"}])
    right = pd.DataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "a"}])

    strict = datacompy.Compare(left, right, on_index=True, ignore_case=False)
    assert not strict.matches()
    assert strict.all_columns_match()
    assert strict.all_rows_overlap()
    assert not strict.intersect_rows_match()

    relaxed = datacompy.Compare(left, right, "a", ignore_case=True)
    assert relaxed.matches()
    assert relaxed.all_columns_match()
    assert relaxed.all_rows_overlap()
    assert relaxed.intersect_rows_match()
def fit_data_dum(dataframe, org_list, org_df, model):
    """One-hot encode 'app', pad missing training columns with zeros,
    reorder to the training layout and run ``model.predict``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Raw rows including 'time', 'src', 'dst', 'app' columns.
    org_list : list
        Column order the model was trained on.
    org_df : pd.DataFrame
        Reference frame holding the full training column set.
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    numpy array of class predictions.
    """
    # One-hot encode "app". (Removed: an unused `dataframe_orgi` intermediate.)
    dataframe_dum_app = dataframe.join(pd.get_dummies(dataframe.app))
    print("shape of dataframe after dummy:{}".format(dataframe_dum_app.shape))

    # Drop the raw categorical/identifier columns.
    dataframe_dum_app = dataframe_dum_app.drop(
        columns=['time', 'src', 'dst', 'app'])

    print("Start to check lost app feature...")
    compare = datacompy.Compare(dataframe_dum_app, org_df, on_index=True)
    print("compare Result:", compare.report())
    print("//----------------------------------------------//")
    print("缺少的為 :{}".format(compare.df2_unq_columns()))
    lost_list = compare.df2_unq_columns()

    # Zero-filled frame for every training column absent from this batch.
    lost_zero = np.zeros([len(dataframe_dum_app), len(lost_list)])
    concact_lost_zero_df = pd.DataFrame(lost_zero, columns=lost_list)
    print("shape of concact_lost_zero_df:{}".format(
        concact_lost_zero_df.shape))
    print("//----------------------------------------------//")
    dataframe_66col = pd.concat([dataframe_dum_app, concact_lost_zero_df],
                                axis=1)
    print("shape of dataframe_66col:{}".format(dataframe_66col.shape))
    print("//----------------------------------------------//")

    print("Check if there's any missing...")
    dataframe_66col_reload = dataframe_66col[org_list]
    compare = datacompy.Compare(dataframe_66col,
                                dataframe_66col_reload,
                                on_index=True)
    print("compare Result:", compare.report())
    print("缺少的為 :{}".format(compare.df2_unq_columns()))
    # Bug fix: the original compared the column set to the *string* "set()",
    # which is always False; an emptiness test is what was intended.
    if not compare.df2_unq_columns():
        print("There;s just completely the same.")
    else:
        print("{} is missing".format(compare.df2_unq_columns()))
    print("//----------------------------------------------//")

    print("Start to PREDICT!...")
    model_pred = model.predict(dataframe_66col_reload)
    print("predict result : ", model_pred)
    print("--------------------------------")
    print("shape of xgbc_pred data : ", model_pred.shape)
    unique, counts = np.unique(model_pred, return_counts=True)
    print("pred data contains:{} ".format(dict(zip(unique, counts))))

    return model_pred
Ejemplo n.º 12
0
def test_strings_with_ignore_spaces_and_join_columns():
    """Spaces inside the join column itself break the join unless ignore_spaces is set."""
    left = pd.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}])
    right = pd.DataFrame([{"a": " hi ", "b": "A"}, {"a": " bye ", "b": "A"}])

    strict = datacompy.Compare(left, right, "a", ignore_spaces=False)
    assert not strict.matches()
    assert strict.all_columns_match()
    assert not strict.all_rows_overlap()
    assert strict.count_matching_rows() == 0

    relaxed = datacompy.Compare(left, right, "a", ignore_spaces=True)
    assert relaxed.matches()
    assert relaxed.all_columns_match()
    assert relaxed.all_rows_overlap()
    assert relaxed.intersect_rows_match()
    assert relaxed.count_matching_rows() == 2
Ejemplo n.º 13
0
def fit_data_dum(dataframe, org_list, org_df, model):
    """One-hot encode 'app', pad missing training columns with zeros,
    reorder to the training layout and run ``model.predict_proba``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Raw rows including 'time', 'src', 'dst', 'app' columns.
    org_list : list
        Column order the model was trained on.
    org_df : pd.DataFrame
        Reference frame holding the full training column set.
    model : fitted estimator exposing ``predict_proba``.

    Returns
    -------
    numpy array of per-class probabilities.
    """
    # One-hot encode "app". (Removed: an unused `dataframe_orgi` intermediate.)
    dataframe_dum_app = dataframe.join(pd.get_dummies(dataframe.app))
    print("shape of dataframe after dummy:{}".format(dataframe_dum_app.shape))

    # Drop the raw categorical/identifier columns.
    dataframe_dum_app = dataframe_dum_app.drop(
        columns=['time', 'src', 'dst', 'app'])

    print("Start to check lost app feature...")
    compare = datacompy.Compare(dataframe_dum_app, org_df, on_index=True)
    print("compare Result --> lost :", compare.df2_unq_columns())
    print("//----------------------------------------------//")
    lost_list = compare.df2_unq_columns()

    # Zero-filled frame for every training column absent from this batch;
    # the intermediates are dropped eagerly to keep peak memory down.
    lost_zero = np.zeros([len(dataframe_dum_app), len(lost_list)])
    concact_lost_zero_df = pd.DataFrame(lost_zero, columns=lost_list)
    del lost_zero
    del lost_list

    print("shape of concact_lost_zero_df:{}".format(
        concact_lost_zero_df.shape))
    print("//----------------------------------------------//")
    dataframe_66col = pd.concat([dataframe_dum_app, concact_lost_zero_df],
                                axis=1)
    print("shape of dataframe_66col:{}".format(dataframe_66col.shape))
    print("//----------------------------------------------//")

    print("Check if there's any missing...")
    dataframe_66col_reload = dataframe_66col[org_list]
    compare = datacompy.Compare(dataframe_66col,
                                dataframe_66col_reload,
                                on_index=True)

    del dataframe_66col

    print("compare Result:", compare.report())
    print("//----------------------------------------------//")

    print("Start to PREDICT!...")
    model_pred_prob = model.predict_proba(dataframe_66col_reload)
    print("predict result : ", model_pred_prob)
    print("--------------------------------")
    print("shape of predict data : ", model_pred_prob.shape)
    del dataframe_66col_reload

    return model_pred_prob
Ejemplo n.º 14
0
def create_upload_data():
    """Rebuild zzjg_xsxx_upload from changed student records.

    Rows absent from the CW snapshot are inserted with sjcz='1' (new);
    rows whose values differ from the snapshot are inserted with sjcz='2'.
    Returns a success message.
    """
    search_value = request.json.get("search_value") if request.json.get(
        "search_value") else ''
    db.session.execute('truncate  table zzjg_xsxx_upload')
    db.session.commit()
    # SECURITY: search_value is interpolated straight into the SQL text and
    # is injectable — these queries should use bound parameters instead.
    df1 = pd.read_sql(
        "select xh, xm, xb, csrq, csd, jg, mzm, gjdq, sfzjlxm, sfzjlxmc, sfzjh, xjzt, xslbm, xslbmc, szbh, sznj, yxsh, zyh, xz, frxnd, flxnd, xkml, xsdqzt, xqdm, fzsbs, flsh, bdtime, bz, qyzt from zzjg_xsxx where is_changed ='1' and flxnd like '%{}%'"
        .format(search_value), db.engine)
    # Bug fix: the IN (...) subquery was missing its closing parenthesis.
    df2 = pd.read_sql(
        "select xh, xm, xb, csrq, csd, jg, mzm, gjdq, sfzjlxm, sfzjlxmc, sfzjh, xjzt, xslbm, xslbmc, szbh, sznj, yxsh, zyh, xz, frxnd, flxnd, xkml, xsdqzt, xqdm, fzsbs, flsh, bdtime, bz, qyzt from zzjg_xsxx_CW where flsh in (select flsh from zzjg_xsxx where is_changed = '1' and flxnd like '%{}%')"
        .format(search_value), db.engine)

    compare = datacompy.Compare(df1, df2, join_columns=['flsh'])
    df_new = pd.read_sql(
        "select xh, xm, xb, csrq, csd, jg, mzm, gjdq, sfzjlxm, sfzjlxmc, sfzjh, xjzt, xslbm, xslbmc, szbh, sznj, yxsh, zyh, xz, frxnd, flxnd, xkml, xsdqzt, xqdm, fzsbs, flsh, bdtime, bz, qyzt from zzjg_xsxx where flsh not in (select flsh from zzjg_xsxx_cw) and flxnd like '%{}%'"
        .format(search_value), db.engine)
    # New rows get operation status '1'.
    df_new['sjcz'] = '1'
    all_new = [
        xsxxUploadModel(**stu) for stu in df_new.to_dict(orient='records')
    ]
    db.session.add_all(all_new)
    db.session.commit()
    # df_new.to_sql('zzjg_xsxx_upload', db.engine, index=False, if_exists='append')
    df_change = compare.all_mismatch()
    # Renamed from `list`, which shadowed the builtin.
    changed_keys = df_change['flsh'].tolist()
    data_change = xsxxModel.query.filter(xsxxModel.flsh.in_(changed_keys)).all()
    all_change = [xsxxUploadModel(**stu.to_dict()) for stu in data_change]
    # Changed rows get operation status '2'.
    for a in all_change:
        a.sjcz = '2'
    # Bug fix: add_all/commit/return were inside the loop above, so only the
    # first changed row was ever persisted (cf. create_upload_data3).
    db.session.add_all(all_change)
    db.session.commit()
    return "生成成功"
Ejemplo n.º 15
0
def create_upload_data3():
    """Rebuild zzjg_bjxx_upload from changed class records.

    Rows absent from the CW snapshot are inserted with sjcz='1' (new);
    rows whose values differ from the snapshot are inserted with sjcz='2'.
    Returns a success message.
    """
    db.session.execute('truncate  table zzjg_bjxx_upload')
    db.session.commit()
    df1 = pd.read_sql(
        "select nj, bh, bjmc, zyh, xqdm, bjrs, fdydh, fdysfzjh, xz, frxnd, flxnd, yxsh from zzjg_bjxx where is_changed ='1'",
        db.engine)
    df2 = pd.read_sql(
        "select nj, bh, bjmc, zyh, xqdm, bjrs, fdydh, fdysfzjh, xz, frxnd, flxnd, yxsh from zzjg_bjxx_CW where bh in (select bh from zzjg_bjxx where is_changed = '1')",
        db.engine)

    compare = datacompy.Compare(df1, df2, join_columns=['bh'])
    df_new = pd.read_sql(
        "select nj, bh, bjmc, zyh, xqdm, bjrs, fdydh, fdysfzjh, xz, frxnd, flxnd, yxsh from zzjg_bjxx where bh not in (select bh from zzjg_bjxx_cw)",
        db.engine)
    # New rows get operation status '1'.
    df_new['sjcz'] = '1'
    all_new = [
        bjxxUploadModel(**stu) for stu in df_new.to_dict(orient='records')
    ]
    db.session.add_all(all_new)
    db.session.commit()
    # df_new.to_sql('zzjg_xsxx_upload', db.engine, index=False, if_exists='append')
    df_change = compare.all_mismatch()
    # Renamed from `list`, which shadowed the builtin.
    changed_keys = df_change['bh'].tolist()
    data_change = bjxxModel.query.filter(bjxxModel.bh.in_(changed_keys)).all()
    all_change = [bjxxUploadModel(**stu.to_dict()) for stu in data_change]
    # Changed rows get operation status '2'.
    for a in all_change:
        a.sjcz = '2'
    db.session.add_all(all_change)
    db.session.commit()
    return "生成成功"
Ejemplo n.º 16
0
def CompareCsv(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP-triggered comparison of two CSV blobs; responds with the datacompy report."""
    print("Start of Script:-", datetime.datetime.now())
    # Local-file variant kept for reference:
    # src_df = pd.read_csv(r'C:\AzureFunctionPOC\TestHttpTrigger\src.csv')
    # dest_df = pd.read_csv(r'C:\AzureFunctionPOC\TestHttpTrigger\dest.csv')

    # 40-record sample blobs (SAS-token URLs).
    src = 'https://comparefilesfuncapp.blob.core.windows.net/csvblob/src.csv?sp=r&st=2020-11-02T05:35:44Z&se=2020-11-06T18:29:44Z&spr=https&sv=2019-12-12&sr=b&sig=ZZfxIjTWTZ0AbHWnUFL99KBcb9eZak82XoJk1nonY08%3D'
    dest = 'https://comparefilesfuncapp.blob.core.windows.net/csvblob/dest.csv?sp=r&st=2020-11-02T05:36:43Z&se=2020-11-06T18:29:43Z&spr=https&sv=2019-12-12&sr=b&sig=TnzlvKculK%2FWc4T3wEo3NvpJp4ytPiCvg9fAI0nHbGs%3D'
    # 5M-record variants kept for load testing:
    # src = 'https://comparefilesfuncapp.blob.core.windows.net/csvblob/csvoutput5MSrc.csv?sp=r&st=2020-11-03T06:59:59Z&se=2020-11-06T17:59:59Z&spr=https&sv=2019-12-12&sr=b&sig=BYhS66f063Lg6HIgYQbqUUOt4SuwYXEXocFBXQ%2BscbU%3D'
    # dest = 'https://comparefilesfuncapp.blob.core.windows.net/csvblob/csvoutput5MDest.csv?sp=r&st=2020-11-03T07:00:31Z&se=2020-11-06T18:00:31Z&spr=https&sv=2019-12-12&sr=b&sig=JcvWMFHFh%2BSVgM8AyIHviZCeGyOMekL22ZvzhwrhVLI%3D'

    source_frame = pd.read_csv(src)
    target_frame = pd.read_csv(dest)

    comparison = datacompy.Compare(
        source_frame,
        target_frame,
        on_index=True,
        abs_tol=0,  # exact numeric match required
        rel_tol=0,
        df1_name='Original',
        df2_name='New',
    )

    print("End of Script:-", datetime.datetime.now())
    return func.HttpResponse(comparison.report())
Ejemplo n.º 17
0
def test_integer_column_names():
    """Integer (non-string) column labels must be usable as join columns."""
    left = pd.DataFrame({1: [1, 2, 3], 2: [0, 1, 2]})
    right = pd.DataFrame({1: [1, 2, 3], 2: [0, 1, 2]})
    assert datacompy.Compare(left, right, join_columns=[1]).matches()
Ejemplo n.º 18
0
def test_simple_dupes_one_field_two_vals():
    """Duplicate join-key values with distinct payloads still match identical frames."""
    left = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}])
    right = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}])
    result = datacompy.Compare(left, right, join_columns=["a"])
    assert result.matches()
    # Render the report purely to check that it does not raise.
    result.report()
Ejemplo n.º 19
0
 def _compare(self, file_path: Path):
     """Compare one exported CSV file against the same instrument's qlib data.

     Returns one of the class result flags: NOT_IN_FEATURES when the symbol
     is not in self.qlib_symbols, COMPARE_TRUE/COMPARE_FALSE for the
     comparison outcome, or COMPARE_ERROR when datacompy raises.
     """
     # NOTE(review): str.strip removes a *character set*, not a literal
     # suffix, so symbols whose edges share characters with file_suffix may
     # be over-stripped — confirm this is intended.
     symbol = file_path.name.strip(self.file_suffix)
     if symbol.lower() not in self.qlib_symbols:
         return self.NOT_IN_FEATURES
     # qlib data: strip the "$" qlib prepends to field names so the columns
     # line up with the CSV's.
     qlib_df = D.features([symbol], self.qlib_fields, freq=self.freq)
     qlib_df.rename(columns={_c: _c.strip("$")
                             for _c in qlib_df.columns},
                    inplace=True)
     # csv data: parse the date column, ensure the symbol column exists,
     # then index by (symbol, date) to mirror the qlib frame's index.
     origin_df = pd.read_csv(file_path)
     origin_df[self.date_field_name] = pd.to_datetime(
         origin_df[self.date_field_name])
     if self.symbol_field_name not in origin_df.columns:
         origin_df[self.symbol_field_name] = symbol
     origin_df.set_index([self.symbol_field_name, self.date_field_name],
                         inplace=True)
     origin_df.index.names = qlib_df.index.names
     try:
         compare = datacompy.Compare(
             origin_df,
             qlib_df,
             on_index=True,
             abs_tol=1e-08,  # Optional, defaults to 0
             rel_tol=1e-05,  # Optional, defaults to 0
             df1_name="Original",  # Optional, defaults to 'df1'
             df2_name="New",  # Optional, defaults to 'df2'
         )
         # Extra CSV-only columns are tolerated; only shared columns must match.
         _r = compare.matches(ignore_extra_columns=True)
         return self.COMPARE_TRUE if _r else self.COMPARE_FALSE
     except Exception as e:
         logger.warning(f"{symbol} compare error: {e}")
         return self.COMPARE_ERROR
Ejemplo n.º 20
0
def test_columns_no_overlap():
    """Columns unique to each side are reported separately from the shared ones."""
    left = pd.DataFrame([{"a": 1, "b": 2, "c": "hi"}, {"a": 2, "b": 2, "c": "yo"}])
    right = pd.DataFrame([{"a": 1, "b": 2, "d": "oh"}, {"a": 2, "b": 3, "d": "ya"}])
    result = datacompy.Compare(left, right, ["a"])
    assert result.df1_unq_columns() == {"c"}
    assert result.df2_unq_columns() == {"d"}
    assert result.intersect_columns() == {"a", "b"}
Ejemplo n.º 21
0
def test_all_mismatch():
    """all_mismatch() returns exactly the intersecting rows with any unequal field."""
    data1 = """acct_id,dollar_amt,name,float_fld,date_fld
    10000001234,123.45,George Maharis,14530.1555,2017-01-01
    10000001235,0.45,Michael Bluth,1,2017-01-01
    10000001236,1345,George Bluth,,2017-01-01
    10000001237,123456,Bob Loblaw,345.12,2017-01-01
    10000001239,1.05,Lucille Bluth,,2017-01-01
    10000001240,123.45,George Maharis,14530.1555,2017-01-02
    """

    data2 = """acct_id,dollar_amt,name,float_fld,date_fld
    10000001234,123.4,George Michael Bluth,14530.155,
    10000001235,0.45,Michael Bluth,,
    10000001236,1345,George Bluth,1,
    10000001237,123456,Robert Loblaw,345.12,
    10000001238,1.05,Loose Seal Bluth,111,
    10000001240,123.45,George Maharis,14530.1555,2017-01-02
    """
    left = pd.read_csv(io.StringIO(data1), sep=",")
    right = pd.read_csv(io.StringIO(data2), sep=",")
    mismatch = datacompy.Compare(left, right, "acct_id").all_mismatch()

    # Four overlapping rows carry at least one differing field.
    assert mismatch.shape[0] == 4

    # name differs on 2 of those rows; equal on the other 2.
    name_diff = (mismatch.name_df1 != mismatch.name_df2).values
    assert name_diff.sum() == 2
    assert (~name_diff).sum() == 2

    # dollar_amt differs on 1 row.
    amt_diff = (mismatch.dollar_amt_df1 != mismatch.dollar_amt_df2).values
    assert amt_diff.sum() == 1
    assert (~amt_diff).sum() == 3

    # float_fld differs on 3 rows.
    float_diff = (mismatch.float_fld_df1 != mismatch.float_fld_df2).values
    assert float_diff.sum() == 3
    assert (~float_diff).sum() == 1
Ejemplo n.º 22
0
def test_compare_on_index_and_join_columns():
    """Passing both on_index and join_columns must raise."""
    df = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 2, 'b': 2}])
    # Fix: pytest removed the `message=` kwarg in pytest 4.0; `match=`
    # additionally verifies the exception text.
    with raises(Exception, match='Only provide on_index or join_columns'):
        datacompy.Compare(df,
                          df.copy(),
                          on_index=True,
                          join_columns=['a'])
Ejemplo n.º 23
0
def test_columns_overlap():
    """Fully shared columns: no uniques on either side, both intersect."""
    left = pd.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}])
    right = pd.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 3}])
    result = datacompy.Compare(left, right, ["a"])
    assert result.df1_unq_columns() == set()
    assert result.df2_unq_columns() == set()
    assert result.intersect_columns() == {"a", "b"}
Ejemplo n.º 24
0
def test_columns_overlap():
    """Identical column sets intersect completely, with no side-unique columns."""
    first = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 2, 'b': 2}])
    second = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 2, 'b': 3}])
    outcome = datacompy.Compare(first, second, ['a'])
    assert outcome.df1_unq_columns() == set()
    assert outcome.df2_unq_columns() == set()
    assert outcome.intersect_columns() == {'a', 'b'}
Ejemplo n.º 25
0
def test_compare_on_index_and_join_columns():
    """Supplying on_index together with join_columns is an error."""
    df = pd.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}])
    # Fix: pytest removed the `message=` kwarg in pytest 4.0; `match=`
    # additionally verifies the exception text.
    with raises(Exception, match="Only provide on_index or join_columns"):
        datacompy.Compare(df,
                          df.copy(),
                          on_index=True,
                          join_columns=["a"])
Ejemplo n.º 26
0
def test_index_joining_strings_i_guess():
    """String-valued indexes can drive an on_index comparison."""
    left = pd.DataFrame([{'a': 'hi', 'b': 2}, {'a': 'bye', 'b': 2}])
    right = pd.DataFrame([{'a': 'hi', 'b': 2}, {'a': 'bye', 'b': 2}])
    left.index = left['a']
    right.index = right['a']
    assert datacompy.Compare(left, right, on_index=True).matches()
Ejemplo n.º 27
0
def create_upload_data1():
    """Rebuild zzjg_yxxx_upload from changed department records.

    Rows absent from the CW snapshot are inserted with sjcz='1' (new);
    rows whose values differ from the snapshot are inserted with sjcz='2'.
    Returns a success message.
    """
    db.session.execute('truncate  table zzjg_yxxx_upload')
    db.session.commit()
    df1 = pd.read_sql(
        "select yxsdm, yxsmc from zzjg_yxxx where is_changed ='1'", db.engine)
    df2 = pd.read_sql(
        "select yxsdm, yxsmc from zzjg_yxxx_CW where yxsdm in (select yxsdm from zzjg_yxxx where is_changed = '1')",
        db.engine)

    compare = datacompy.Compare(df1, df2, join_columns=['yxsdm'])
    df_new = pd.read_sql(
        "select yxsdm, yxsmc from zzjg_yxxx where yxsdm not in (select yxsdm from zzjg_yxxx_cw)",
        db.engine)
    # New rows get operation status '1'.
    df_new['sjcz'] = '1'
    all_new = [
        yxxxUploadModel(**stu) for stu in df_new.to_dict(orient='records')
    ]
    db.session.add_all(all_new)
    db.session.commit()
    df_change = compare.all_mismatch()
    # Renamed from `list`, which shadowed the builtin.
    changed_keys = df_change['yxsdm'].tolist()
    data_change = yxxxModel.query.filter(yxxxModel.yxsdm.in_(changed_keys)).all()
    all_change = [yxxxUploadModel(**stu.to_dict()) for stu in data_change]
    # Changed rows get operation status '2'.
    for a in all_change:
        a.sjcz = '2'
    db.session.add_all(all_change)
    db.session.commit()
    return "生成成功"
Ejemplo n.º 28
0
def create_upload_data2():
    """Rebuild zzjg_zyxx_upload from changed major records.

    Rows absent from the CW snapshot are inserted with sjcz='1' (new);
    rows whose values differ from the snapshot are inserted with sjcz='2'.
    Returns a success message.
    """
    db.session.execute('truncate  table zzjg_zyxx_upload')
    db.session.commit()
    df1 = pd.read_sql(
        "select zyh, zwmc, ywmc, zyfxh, bzkzym, yjszym, ssxkdl, ssxk, xz, pycc, zylxdm, yxsh, xqdm from zzjg_zyxx where is_changed ='1'",
        db.engine)
    df2 = pd.read_sql(
        "select zyh, zwmc, ywmc, zyfxh, bzkzym, yjszym, ssxkdl, ssxk, xz, pycc, zylxdm, yxsh, xqdm from zzjg_zyxx_CW where zyh in (select zyh from zzjg_zyxx where is_changed = '1')",
        db.engine)

    compare = datacompy.Compare(df1, df2, join_columns=['zyh'])
    df_new = pd.read_sql(
        "select zyh, zwmc, ywmc, zyfxh, bzkzym, yjszym, ssxkdl, ssxk, xz, pycc, zylxdm, yxsh, xqdm from zzjg_zyxx where zyh not in (select zyh from zzjg_zyxx_cw)",
        db.engine)
    # New rows get operation status '1'.
    df_new['sjcz'] = '1'
    all_new = [
        zyxxUploadModel(**stu) for stu in df_new.to_dict(orient='records')
    ]
    db.session.add_all(all_new)
    db.session.commit()
    # df_new.to_sql('zzjg_xsxx_upload', db.engine, index=False, if_exists='append')
    df_change = compare.all_mismatch()
    # Renamed from `list`, which shadowed the builtin.
    changed_keys = df_change['zyh'].tolist()
    data_change = zyxxModel.query.filter(zyxxModel.zyh.in_(changed_keys)).all()
    all_change = [zyxxUploadModel(**stu.to_dict()) for stu in data_change]
    # Changed rows get operation status '2'.
    for a in all_change:
        a.sjcz = '2'
    db.session.add_all(all_change)
    db.session.commit()
    return "生成成功"
Ejemplo n.º 29
0
def test_simple_dupes_two_fields():
    """Duplicate rows on a two-column join key compare as matching."""
    left = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 1, 'b': 2, 'c': 2}])
    right = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 1, 'b': 2, 'c': 2}])
    result = datacompy.Compare(left, right, join_columns=['a', 'b'])
    assert result.matches()
    # Render the report just to prove it does not raise.
    result.report()
Ejemplo n.º 30
0
def test_index_joining_strings_i_guess():
    """String indexes still join on index after their names are cleared."""
    left = pd.DataFrame([{"a": "hi", "b": 2}, {"a": "bye", "b": 2}])
    right = pd.DataFrame([{"a": "hi", "b": 2}, {"a": "bye", "b": 2}])
    left.index = left["a"]
    right.index = right["a"]
    left.index.name = right.index.name = None
    assert datacompy.Compare(left, right, on_index=True).matches()