def test_dedup_production_config(test_data_dir):
    """Exercise ``dedup`` with the production settings on the test chunk.

    A dedup callable is built from the production configuration and applied
    to the items yielded by ``get_test_chunk(test_data_dir)``; iteration
    stops right after the item at index 21 has been processed.  Every item
    for which the callable returns a falsy value is counted, and exactly
    three such items are expected.
    """
    production_config = dict(
        ## which part of si.body to operate on
        content_form='clean_visible',

        ## switch this to False for N^2 comparison within each chunk
        require_same_doc_id=True,

        use_nilsimsa=False,

        ## must be greater than or equal to this
        exactness_nilsimsa_threshold=128,

        ## docs shorter than this are not rejected, even when they score
        ## above exactness_nilsimsa_threshold against another doc
        min_clean_length=500,

        ## docs sharing a doc_id that pass exactness_nilsimsa_threshold
        ## are KEPT when their lengths differ by more than this fraction
        ## (relative to the longer of the two, measured in thousandths):
        min_len_sim_thousandths_clean=850,
        min_len_sim_thousandths_raw=850,
    )

    ## the same context dict is handed to every call
    shared_context = {}
    dedup_filter = dedup(production_config)

    ## index of the last item that gets processed (22 items at most)
    last_index = 21
    duplicates_seen = 0
    for index, item in enumerate(get_test_chunk(test_data_dir)):
        if not dedup_filter(item, shared_context):
            duplicates_seen += 1
        if index >= last_index:
            break

    message = 'removed %d near-exact duplicates' % duplicates_seen
    logger.debug(message)
    assert duplicates_seen == 3
## Example #2
## 0
def test_dedup_production_config(test_data_dir):
    """Check the production dedup configuration against the test chunk.

    Each item from ``get_test_chunk(test_data_dir)`` is passed, together
    with one shared context dict, to the callable returned by ``dedup``.
    Processing stops once the item at index 21 has been handled.  The test
    expects exactly three of the processed items to produce a falsy result.
    """
    settings = {
        ## part of si.body that is operated on
        'content_form': 'clean_visible',

        ## make this false to get N^2 comparison within each chunk
        'require_same_doc_id': True,
        'use_nilsimsa': False,

        ## must be greater than or equal to this
        'exactness_nilsimsa_threshold': 128,

        ## docs shorter than this are not rejected even if they score
        ## higher than exactness_nilsimsa_threshold with another doc
        'min_clean_length': 500,

        ## same-doc_id docs passing exactness_nilsimsa_threshold are
        ## KEPT if their lengths differ by more than this fraction
        ## (relative to the longer of the two, in thousandths):
        'min_len_sim_thousandths_clean': 850,
        'min_len_sim_thousandths_raw': 850,
    }
    call_context = {}
    run_dedup = dedup(settings)

    ## record the truthiness of every result, then count the falsy ones
    outcomes = []
    for position, item in enumerate(get_test_chunk(test_data_dir)):
        outcomes.append(bool(run_dedup(item, call_context)))
        if position >= 21:
            break
    duplicate_total = outcomes.count(False)

    logger.debug('removed %d near-exact duplicates' % duplicate_total)
    assert duplicate_total == 3
def test_dedup_debugging_config(tmpdir):
    """Exercise ``dedup`` with the debugging configuration.

    The configuration turns nilsimsa on, disables the same-doc_id
    requirement, and sets ``log_dir_path`` / ``log_nilsimsa_threshold``.
    Items from ``get_test_chunk()`` are fed to the resulting callable until
    the item at index 11 has been processed; the items for which the
    callable returns a falsy value are counted and exactly seven are
    expected.

    :param tmpdir: object whose ``dirname`` attribute is used as
        ``log_dir_path`` (presumably the pytest ``tmpdir`` fixture --
        confirm against the test setup).
    """
    ## first test the debugging config
    config = dict(
        ## operate on which part of si.body
        content_form='clean_visible',

        ## set this to false for N^2 comparison within each chunk
        require_same_doc_id=False,

        use_nilsimsa=True,

        ## must be greater than or equal to this
        exactness_nilsimsa_threshold=128,

        ## docs shorter than this are not rejected even if they have
        ## higher than exactness_nilsimsa_threshold with another doc
        min_clean_length=500,

        ## docs with same doc_id that pass exactness_nilsimsa_threshold
        ## are KEPT if the two docs have lengths that differ by more than
        ## this fraction (relative to the longer of the two, measured in
        ## thousandths):
        min_len_sim_thousandths_clean=850,
        min_len_sim_thousandths_raw=850,

        log_dir_path=tmpdir.dirname,
        log_nilsimsa_threshold=100,
    )

    ## one context dict is shared by every call to the dedup callable
    context = {}
    d1 = dedup(config)

    num_dups = 0

    ## NOTE(review): get_test_chunk is called without arguments here --
    ## confirm that this matches its current signature.
    for num, si in enumerate(get_test_chunk()):

        ## a falsy result is counted as a removed duplicate
        if not d1(si, context):
            num_dups += 1

        ## stop after the item at index 11 (12 items at most)
        if num > 10:
            break

    ## parenthesized single-argument form emits the same text whether
    ## print is a statement (Python 2) or a function (Python 3)
    print('removed %d near-exact duplicates' % num_dups)
    assert num_dups == 7