import logging

from copy import deepcopy

from lxml import etree, html

import trafilatura.filters

# NB: assumed import paths for the trafilatura-internal helpers used below;
# the remaining extraction helpers called in extract() (load_html,
# extract_content, etc.) are defined in other trafilatura modules.
from trafilatura.filters import duplicate_test, put_in_cache
from trafilatura.lru import LRUCache

LOGGER = logging.getLogger(__name__)


def test_lrucache():
    '''test basic duplicate detection'''
    lru_test = LRUCache(maxsize=2)
    trafilatura.filters.LRU_TEST = lru_test
    my_body = etree.Element('body')
    ### element too short
    #my_element = html.fromstring('<p>AAAA BBBB</p>')
    #my_body.append(my_element)
    #put_in_cache(my_body)
    #assert duplicate_test(my_element) is False
    ### cached element
    my_element = html.fromstring('<p>AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB</p>')
    my_body.append(my_element)
    assert duplicate_test(my_element) is False
    assert duplicate_test(my_element) is False
    assert duplicate_test(my_body) is False
    assert duplicate_test(my_element) is True
    other_body = etree.Element('body')
    other_element = html.fromstring('<p>CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD</p>')
    other_body.append(other_element)
    assert duplicate_test(other_body) is False
    assert duplicate_test(other_element) is False
    assert duplicate_test(other_body) is False
    assert duplicate_test(other_element) is True
    yet_another_body = etree.Element('body')
    yet_another_element = html.fromstring('<p>EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF</p>')
    yet_another_body.append(yet_another_element)
    assert duplicate_test(yet_another_body) is False
    assert duplicate_test(yet_another_body) is False
    assert duplicate_test(yet_another_body) is False
    # only 2 elements fit in the cache, so the original element has been evicted
    # print(LRU_TEST.maxsize, LRU_TEST.full)
    assert duplicate_test(other_element) is True
    assert duplicate_test(yet_another_element) is True
    assert duplicate_test(my_element) is False
    # clear the cache
    lru_test.clear()
    assert duplicate_test(other_element) is False
    # an unknown key returns the default value -1
    assert lru_test.get('tralala') == -1
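
# A minimal demonstration of the deduplication contract exercised by the test
# above, a sketch relying only on the API visible there: the n-th occurrence
# of the same long segment is flagged as a duplicate, and clearing the cache
# resets the counts. The function name and the GGGG/HHHH filler text are
# made up for illustration.
def demo_duplicate_detection():
    cache = LRUCache(maxsize=2)
    trafilatura.filters.LRU_TEST = cache
    # same length as the segments used in the test, long enough to be checked
    segment = html.fromstring('<p>' + 'GGGG HHHH ' * 13 + '</p>')
    results = [duplicate_test(segment) for _ in range(4)]
    # mirrors the assertions above: three passes, then flagged as duplicate
    print(results)  # [False, False, False, True]
    cache.clear()
    print(duplicate_test(segment))  # False again after clearing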
def extract(filecontent, url=None, record_id='0001', no_fallback=False,
            include_comments=False, csv_output=False, xml_output=False,
            tei_output=False, tei_validation=False, target_language=None,
            include_tables=True, include_formatting=False):
    '''Main process for text extraction'''
    # init
    tree = load_html(filecontent)
    if tree is None:
        return None
    # metadata is only needed for the structured output formats
    if csv_output is True or xml_output is True or tei_output is True:
        docmeta = extract_metadata(tree, default_url=url)
    else:
        docmeta = None
    # keep a backup copy for the fallback extractors (or not)
    if no_fallback is False:
        backup_tree = deepcopy(tree)
    else:
        backup_tree = None
    # clean
    cleaned_tree = manual_cleaning(tree, include_tables)
    # prune to save space and processing time
    cleaned_tree = prune_html(cleaned_tree)
    # use the LXML cleaner
    cleaned_tree = HTML_CLEANER.clean_html(cleaned_tree)
    # tree_cache[cleaned_tree] = list(cleaned_tree.iter())
    # convert tags, the rest does not work without conversion
    cleaned_tree = convert_tags(cleaned_tree)
    # remove hi-elements to avoid the tail bug
    if (xml_output is False and tei_output is False) or include_formatting is False:
        etree.strip_tags(cleaned_tree, 'hi')
    # extract comments first, then remove them from the tree
    if include_comments is True:
        commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree)
    else:
        commentsbody, temp_comments, len_comments = None, '', 0
    # extract the main content
    postbody, temp_text, len_text, sure_thing = extract_content(cleaned_tree, include_tables)
    # compare with the backup tree if necessary
    if no_fallback is False:  # and sure_thing is False:
        postbody, temp_text, len_text = compare_extraction(backup_tree, url, postbody, temp_text, len_text)
        # try with justext
        if len_text < MIN_EXTRACTED_SIZE:
            LOGGER.error('not enough text %s %s', record_id, url)
            postbody, len_text, temp_text = justext_rescue(tree, url, target_language, postbody, len_text, temp_text)
            LOGGER.error('justext length %s', len_text)
        # second backup
        # if len_text < MIN_EXTRACTED_SIZE:
        #     postbody, len_text, temp_text = baseline(filecontent)
    else:
        # rescue: try to use the original/dirty tree
        if sure_thing is False and len_text < MIN_EXTRACTED_SIZE:
            postbody, len_text, temp_text = baseline(filecontent)
            #tree = load_html(filecontent)
            #tree = convert_tags(tree)
            #postbody, temp_text, len_text, sure_thing = extract_content(tree)
            LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)
    if len_comments < MIN_EXTRACTED_COMM_SIZE:
        LOGGER.info('not enough comments %s %s', record_id, url)
    if len_text < MIN_OUTPUT_SIZE and len_comments < MIN_OUTPUT_COMM_SIZE:
        LOGGER.info('text and comments not long enough: %s %s', len_text, len_comments)
        return None
    # sanity check on language
    if language_filter(temp_text, temp_comments, target_language, record_id, url) is True:
        return None
    # check for duplicates at body level
    if duplicate_test(postbody) is True:
        return None
    # cache the elements for future duplicate checks
    put_in_cache(postbody)
    if commentsbody is not None:
        put_in_cache(commentsbody)
    # XML (TEI) steps
    if xml_output is True or tei_output is True:
        if xml_output is True:
            output = build_xml_output(postbody, commentsbody)
            output = add_xml_meta(output, docmeta)
        elif tei_output is True:
            output = build_tei_output(postbody, commentsbody, docmeta)
        # sanitize and re-parse the serialized tree (can be improved)
        control_string = etree.tostring(output, encoding='unicode')
        control_string = sanitize(control_string)
        # remove_blank_text is necessary for cleaning
        control_parser = etree.XMLParser(remove_blank_text=True)
        output_tree = etree.fromstring(control_string, control_parser)
        # validate
        if tei_output is True and tei_validation is True:
            result = validate_tei(output_tree)
            LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url)
        returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip()
    # CSV + TXT output
    else:
        if csv_output is True:
            posttext = xmltotxt(postbody)
            commentstext = xmltotxt(commentsbody)
            returnstring = txttocsv(posttext, commentstext, docmeta)
        else:
            output = build_xml_output(postbody, commentsbody)
            returnstring = xmltotxt(output)
    return returnstring
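
# Hypothetical usage sketch for extract(); 'page.html' and the URL are
# placeholder assumptions, and the keyword arguments simply mirror the
# signature above.
if __name__ == '__main__':
    with open('page.html', 'r', encoding='utf-8') as inputfile:
        htmlstring = inputfile.read()
    # plain text by default; set xml_output/tei_output/csv_output for
    # structured output including metadata
    result = extract(htmlstring, url='https://www.example.org/page.html',
                     include_comments=True)
    # None signals a failed extraction or a filtered-out document
    if result is not None:
        print(result)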