def test_title():
    """Exercise the ``title`` stage on a short and an over-long <title>.

    Each case builds a fresh stream item, assigns ``body.clean_html``,
    runs it through ``clean_visible`` and then ``title``, and checks the
    text stored at ``other_content['title'].clean_visible``.
    """
    extract_title = title({})
    to_visible = clean_visible({})

    def title_text(html):
        # Push one HTML snippet through both stages and return the
        # title text that the title stage recorded.
        item = make_stream_item(0, '')
        item.body.clean_html = html
        item = to_visible(item, {})
        item = extract_title(item)
        return item.other_content['title'].clean_visible

    # A title split over several lines is expected to come back as a
    # single space-separated string.
    short_html = '''Then there
was a
<tag>   ...  <title>TITLE
HERE
  </title>
'''
    assert title_text(short_html) == 'TITLE HERE'

    # With 80 asterisks appended, the expected title keeps only the
    # first 50 of them and ends with '...'.
    long_html = '''Then there
was a
  that went <tag>   ...  <title>TITLE
HERE%s
  </title>
''' % ('*' * 80)
    assert title_text(long_html) == 'TITLE HERE' + '*' * 50 + '...'
def test_title():
    """Verify title extraction for a normal and an over-long <title>.

    The stream item's ``body.clean_html`` is set by hand, passed through
    the ``clean_visible`` stage and then the ``title`` stage, and the
    result is read from ``other_content['title'].clean_visible``.
    """
    title_stage = title({})
    visible_stage = clean_visible({})

    # Case 1: a short title broken across lines.
    doc = make_stream_item(0, '')
    doc.body.clean_html = '''Then there
was a
<tag>   ...  <title>TITLE
HERE
  </title>
'''
    doc = title_stage(visible_stage(doc, {}))
    extracted = doc.other_content['title'].clean_visible
    assert extracted == 'TITLE HERE'

    # Case 2: the title carries 80 trailing asterisks; the expected
    # value keeps 50 of them followed by an ellipsis marker.
    padding = '*' * 80
    doc = make_stream_item(0, '')
    doc.body.clean_html = '''Then there
was a
  that went <tag>   ...  <title>TITLE
HERE%s
  </title>
''' % padding
    doc = title_stage(visible_stage(doc, {}))
    extracted = doc.other_content['title'].clean_visible
    expected = 'TITLE HERE' + '*' * 50 + '...'
    assert extracted == expected
def ids_and_clean_visible_from_streamcorpus_chunk_path(corpus_path):
    '''Load a streamcorpus.Chunk file and build the structure that the
    search engine passes to find_soft_selectors.

    Stream items that lack ``body.clean_visible`` are first run through
    the ``clean_html`` and ``clean_visible`` stages (each built from its
    ``default_config``).  Items with no raw content, or for which
    either stage fails, are logged at critical level and left out.

    :param corpus_path: path handed to ``streamcorpus.Chunk(path=...)``
    :returns: list of ``(stream_id, clean_visible, {})`` tuples, where
      ``clean_visible`` has been decoded from UTF-8

    '''
    to_clean_html = clean_html(clean_html.default_config)
    to_clean_visible = clean_visible(clean_visible.default_config)

    def _with_clean_visible(item):
        # Return an item that has clean_visible text, or None (after
        # logging the reason) when it cannot be produced.
        if item.body.clean_visible:
            return item
        if not item.body.raw:
            logger.critical('no raw content, so skipping: %r', item.abs_url)
            return None
        # Keep the URL for logging, since each stage call rebinds
        # ``item`` and may hand back a falsy value.
        url = item.abs_url
        item = to_clean_html(item, {})
        if not item:
            logger.critical(
                'failed to make clean_html, so skipping: %r', url)
            return None
        item = to_clean_visible(item, {})
        if item and item.body.clean_visible:
            return item
        logger.critical(
            'failed to make clean_visible, so skipping: %r', url)
        return None

    records = []
    for stream_item in streamcorpus.Chunk(path=corpus_path):
        usable = _with_clean_visible(stream_item)
        if usable is None:
            continue
        text = usable.body.clean_visible.decode('utf8')
        records.append((usable.stream_id, text, {}))
    return records
def make_hyperlink_labeled_test_stream_item(test_data_dir):
    """Build a test stream item run through hyperlink labeling and
    clean_visible.

    The item comes from ``make_test_stream_item(test_data_dir)``.  Both
    transforms are called for their effect on the item (their return
    values are ignored) and share one context dict.  Sanity asserts
    require more than 200 units of ``clean_html`` before, and of
    ``clean_visible`` after, the transforms.

    :param test_data_dir: forwarded to ``make_test_stream_item``
    :returns: the stream item
    """
    stream_item = make_test_stream_item(test_data_dir)
    assert len(stream_item.body.clean_html) > 200

    shared_context = {}
    labeler_config = {
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': ['BYTES'],
    }
    # Construct each transform and apply it immediately, labeling
    # first and generating clean_visible second.
    hyperlink_labels(config=labeler_config)(stream_item, shared_context)
    clean_visible(config={})(stream_item, shared_context)

    assert len(stream_item.body.clean_visible) > 200
    return stream_item
Example #5
0
def make_hyperlink_labeled_test_stream_item(test_data_dir):
    """Return a test stream item after hyperlink labeling and
    clean_visible generation.

    Starts from ``make_test_stream_item(test_data_dir)``, applies a
    ``hyperlink_labels`` transform (absolute URLs required, all domains,
    BYTES offsets) and then a ``clean_visible`` transform, passing the
    same context dict to both.  The item is used as-is after each call;
    the transforms' return values are discarded.

    :param test_data_dir: forwarded to ``make_test_stream_item``
    :returns: the stream item
    """
    ctx = {}
    item = make_test_stream_item(test_data_dir)
    # Guard against a near-empty fixture before doing any work.
    assert len(item.body.clean_html) > 200

    label_links = hyperlink_labels(config=dict(
        require_abs_url=True,
        all_domains=True,
        offset_types=['BYTES'],
    ))
    label_links(item, ctx)

    make_visible = clean_visible(config={})
    make_visible(item, ctx)

    # The visible text must also be non-trivial after the transforms.
    assert len(item.body.clean_visible) > 200
    return item