def test_extractFeatures_keyerror():
    """Check extractFeatures when some ground truth is missing locally.

    The ground truth is loaded together with the case versions read from
    the ``key_error`` fixture; the extracted features must then have a
    first dimension equal to the number of ground-truth predictions.
    """
    caseversions = finddup.loadLocalCaseversions(
        'tests/data/small_274_0_key_error.json'
    )
    truth = finddup.loadGroundTruth(
        'tests/data/groundtruth-274.csv',
        caseversions['objects'],
    )
    extracted = finddup.extractFeatures(caseversions, truth['ids'])

    # 'perdictions' (sic) is the key under which the labels are looked up.
    row_count = extracted.shape[0]
    assert row_count == len(truth['perdictions'])
def test_loadGroundTruth():
    """Loading the ground-truth CSV without a filter returns all pairs.

    The result is compared against three labels and the three matching
    ``lhs_id``/``rhs_id`` pairs, in order.
    """
    # Every expected pair shares the same left-hand id.
    paired_ids = [
        {"lhs_id": "210201", "rhs_id": rhs}
        for rhs in ("210202", "210521", "211079")
    ]
    # 'perdictions' (sic) must match the key used by the loader's result.
    wanted = {
        "perdictions": ["merge", "dup", "none"],
        "ids": paired_ids,
    }

    loaded = finddup.loadGroundTruth('tests/data/groundtruth-274.csv')

    assert loaded == wanted
def test_loadGroundTruth_empty():
    """Loading the ``-empty`` ground-truth fixture gives one 'none' entry.

    The expected result holds a single "none" label for the single pair
    210201 / 210202.
    """
    only_pair = {"lhs_id": "210201", "rhs_id": "210202"}
    # 'perdictions' (sic) must match the key used by the loader's result.
    wanted = {"perdictions": ["none"], "ids": [only_pair]}

    loaded = finddup.loadGroundTruth('tests/data/groundtruth-274-empty.csv')

    assert loaded == wanted
def test_loadGroundTruth_filter_by_csv():
    """Loading ground truth with case-version objects drops one pair.

    When the ``objects`` from the ``key_error`` case-version fixture are
    passed as the second argument, only two pairs (and their labels) are
    expected back.
    """
    caseversions = finddup.loadLocalCaseversions(
        'tests/data/small_274_0_key_error.json'
    )
    loaded = finddup.loadGroundTruth(
        'tests/data/groundtruth-274.csv',
        caseversions['objects'],
    )

    # The 210201/210521 pair is expected to be absent. The original note
    # said 210521 "does not exist in csv" -- presumably meaning the loaded
    # case versions rather than the CSV file; TODO confirm.
    surviving_pairs = [
        {"lhs_id": "210201", "rhs_id": rhs}
        for rhs in ("210202", "211079")
    ]
    # 'perdictions' (sic) must match the key used by the loader's result.
    wanted = {
        "perdictions": ["merge", "none"],
        "ids": surviving_pairs,
    }

    assert loaded == wanted
def test_extractFeatures_select():
    """Check extractFeatures against the unfiltered ground truth.

    Features are extracted for the id pairs of the ground truth; their
    first dimension must equal the number of ground-truth predictions.
    """
    caseversions = finddup.loadLocalCaseversions(
        'tests/data/small_274_0.json'
    )
    truth = finddup.loadGroundTruth('tests/data/groundtruth-274.csv')
    extracted = finddup.extractFeatures(caseversions, truth['ids'])

    # 'perdictions' (sic) is the key under which the labels are looked up.
    row_count = extracted.shape[0]
    assert row_count == len(truth['perdictions'])