Example #1
def test_parser_and_checksum(filepath):
    response = _read_response(_data_dir, filepath)
    raw_parser = RawFeedParser()
    raw_result = raw_parser.parse(response)
    assert raw_result.feed
    assert raw_result.storys
    parser = FeedParser()
    result = parser.parse(raw_result)
    assert result.feed
    assert result.storys
    assert result.checksum.size() == len(result.storys)
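
These tests call a _read_response helper that the listing never shows. A minimal sketch of what it might look like, assuming a FeedResponseBuilder exported by rssant_feedlib with url/content setters and a build() method; the builder name and call shape are inferred from the _create_builder(...).build() calls in Examples #3 and #8, not confirmed against the library:

from pathlib import Path

from rssant_feedlib import FeedResponseBuilder  # assumed import path

_data_dir = Path(__file__).parent / 'testdata'  # hypothetical fixture directory


def _create_builder(content: bytes):
    # hypothetical: the real builder may take more fields
    # (status, headers, encoding); only url and content are inferred
    builder = FeedResponseBuilder()
    builder.url('https://example.com/feed')
    builder.content(content)
    return builder


def _read_response(data_dir, filename):
    # read a saved feed body from disk and wrap it in a FeedResponse
    content = (Path(data_dir) / filename).read_bytes()
    return _create_builder(content).build()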
Example #2
def test_parse_story_no_id_no_summary_no_url():
    # the file contains 3 storys in total
    # the story with no id is skipped
    # story#0: no content, no summary, no url
    # story#1: has content, no summary, no url, but its id is a valid url
    filename = 'well/v2ex-no-id-no-summary-no-url.xml'
    response = _read_response(_data_dir, filename)

    raw_result = RawFeedParser().parse(response)
    assert raw_result.storys
    # assert the story with no id was skipped
    assert len(raw_result.storys) == 2
    # assert no summary
    assert not raw_result.storys[0]['summary']
    assert not raw_result.storys[1]['summary']
    # assert content
    assert not raw_result.storys[0]['content']
    assert raw_result.storys[1]['content']
    # assert the id is picked as url and the invalid one discarded
    assert not raw_result.storys[0]['url']
    assert raw_result.storys[1]['url']

    result = FeedParser().parse(raw_result)
    assert result.storys
    assert len(raw_result.storys) == len(result.storys)
    # assert content
    assert not result.storys[0]['content']
    assert result.storys[1]['content']
    # assert summary is extracted from content
    assert not result.storys[0]['summary']
    assert result.storys[1]['summary']
    # assert the id is still picked as url and the invalid one discarded
    assert not result.storys[0]['url']
    assert result.storys[1]['url']
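
Note the division of labor the assertions capture: RawFeedParser only preserves what the feed provides (story#1 has content but no summary), while FeedParser derives a summary from content in the second stage, so story#1 gains a summary after FeedParser.parse and story#0, which has no content to extract from, still has none.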
Example #3
def test_parse_too_many_storys():
    items = []
    num_storys = 2000
    base = datetime.datetime.now()
    for i in range(num_storys):
        if i < num_storys // 2:
            date_published = None
        else:
            date_published = (base + datetime.timedelta(seconds=i)).isoformat()
        items.append({
            "id": f"{i}",
            "content_html": f"content_{i}",
            "summary": f"summary_{i}",
            "url": f"https://example.org/post/{i}",
            "date_published": date_published,
        })
    feed = {
        "version": "https://jsonfeed.org/version/1",
        "title": "Too many storys",
        "home_page_url": "https://example.org/",
        "feed_url": "https://example.org/feed.json",
        "items": items
    }
    data = json.dumps(feed).encode('utf-8')
    response = _create_builder(data).build()
    raw_result = RawFeedParser().parse(response)
    assert len(raw_result.storys) == num_storys
    result = FeedParser().parse(raw_result)
    assert len(result.storys) == _MAX_STORYS
    expected = set(range(num_storys - _MAX_STORYS, num_storys))
    story_ids = {int(x['ident']) for x in result.storys}
    assert story_ids == expected
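
The expected set implies that FeedParser caps its output at _MAX_STORYS and keeps the items with the largest indices, i.e. the most recently published ones; the undated first half of the items is the first to be dropped.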
Example #4
def _parse_found(found):
    response: FeedResponse
    raw_result: RawFeedResult
    response, raw_result = found
    feed = AttrDict()

    # feed response
    feed.use_proxy = response.use_proxy
    feed.url = response.url
    feed.content_length = len(response.content)
    feed.content_hash_base64 = compute_hash_base64(response.content)
    feed.etag = response.etag
    feed.last_modified = response.last_modified
    feed.encoding = response.encoding
    del found, response  # release memory in advance

    # parse feed and storys
    result = FeedParser().parse(raw_result)
    del raw_result  # release memory in advance

    feed.title = result.feed['title']
    feed.link = result.feed['home_url']
    feed.author = result.feed['author_name']
    feed.icon = result.feed['icon_url']
    feed.description = result.feed['description']
    feed.dt_updated = result.feed['dt_updated']
    feed.version = result.feed['version']
    feed.storys = _get_storys(result.storys)
    del result  # release memory in advance

    return validate_feed(feed)
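
This is a leaner variant of the _parse_found in Example #7 below, which additionally records response_status, restores a persisted FeedChecksum (unless is_refresh is set), dumps the updated checksum, and joins raw_result.warnings into the validated feed.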
Example #5
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified)
    with FeedReader() as reader:
        status_code, response = reader.read(url, **params)
    LOG.info(f'read feed#{feed_id} url={unquote(url)} status_code={status_code}')
    if status_code != 200 or not response:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!')
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    parsed = FeedParser.parse_response(response)
    if parsed.bozo:
        LOG.warning(f'failed parse feed#{feed_id} url={unquote(url)}: {parsed.bozo_exception}')
        return
    try:
        feed = _parse_found(parsed)
    except Invalid as ex:
        LOG.warning(f'invalid feed#{feed_id} url={unquote(url)}: {ex}', exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))
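
Unlike Examples #4 and #7, this actor goes through FeedParser.parse_response(response) and checks bozo/bozo_exception, a single-step parse API in the style of feedparser, rather than the two-stage RawFeedParser -> FeedParser pipeline used elsewhere in this listing; the two variants presumably come from different versions of the codebase.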
Example #6
def _parse_well_feed(filename) -> FeedResult:
    response = _read_response(_data_dir / 'well', filename)
    raw_result = RawFeedParser().parse(response)
    assert raw_result.feed
    assert raw_result.storys
    assert not raw_result.warnings
    result = FeedParser().parse(raw_result)
    assert len(result.storys) == len(raw_result.storys)
    return result
Example #7
def _parse_found(found, checksum_data=None, is_refresh=False):
    response: FeedResponse
    raw_result: RawFeedResult
    response, raw_result = found
    feed = AttrDict()

    # feed response
    feed.use_proxy = response.use_proxy
    feed.url = response.url
    feed.content_length = len(response.content)
    feed.content_hash_base64 = compute_hash_base64(response.content)
    feed.etag = response.etag
    feed.last_modified = response.last_modified
    feed.encoding = response.encoding
    feed.response_status = response.status
    del found, response  # release memory in advance

    # parse feed and storys
    checksum = None
    if checksum_data and (not is_refresh):
        checksum = FeedChecksum.load(checksum_data)
    result = FeedParser(checksum=checksum).parse(raw_result)
    checksum_data = result.checksum.dump(limit=300)
    num_raw_storys = len(raw_result.storys)
    warnings = None
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
    del raw_result  # release memory in advance
    msg = "feed url=%r storys=%s changed_storys=%s"
    LOG.info(msg, feed.url, num_raw_storys, len(result.storys))

    feed.title = result.feed['title']
    feed.link = result.feed['home_url']
    feed.author = result.feed['author_name']
    feed.icon = result.feed['icon_url']
    feed.description = result.feed['description']
    feed.dt_updated = result.feed['dt_updated']
    feed.version = result.feed['version']
    feed.storys = _get_storys(result.storys)
    feed.checksum_data = checksum_data
    feed.warnings = warnings
    del result  # release memory in advance

    return validate_feed(feed)
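
The checksum handling above implies a two-pass sync flow: dump the checksum after a parse, persist it, and load it before the next parse so unchanged storys are skipped. A minimal sketch of the round trip, using only calls that appear in the code above and assuming a RawFeedResult can safely be parsed twice:

# response is a FeedResponse, e.g. from the _read_response helper above
raw_result = RawFeedParser().parse(response)

# first sync: full parse, then persist the checksum alongside the feed
first = FeedParser().parse(raw_result)
checksum_data = first.checksum.dump(limit=300)

# later sync: restore the checksum so unchanged storys can be skipped
checksum = FeedChecksum.load(checksum_data)
second = FeedParser(checksum=checksum).parse(raw_result)
# per the changed_storys log line above, second.storys should now
# hold only the storys whose checksum changed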
Example #8
def test_parse_large_content(template_name, content_length, summary_length):
    content_snip = "<span>12345678</span>"
    summary_snip = '<span>123</span>'
    content_repeat = (content_length // len(content_snip)) + 1
    content = content_snip * content_repeat
    summary_repeat = (summary_length // len(summary_snip)) + 1
    summary = summary_snip * summary_repeat
    template = large_feed_templates[template_name]
    # use replace() instead of str.format() to avoid KeyError
    # from the braces in the JSON template
    data = (
        template
        .replace('${content}', content)
        .replace('${summary}', summary)
        .encode('utf-8')
    )
    response = _create_builder(content=data).build()
    raw_result = RawFeedParser().parse(response)
    assert raw_result and len(raw_result.storys) == 1
    assert len(raw_result.storys[0]['content']) <= _RAW_MAX_CONTENT_LENGTH
    assert len(raw_result.storys[0]['summary']) <= _RAW_MAX_SUMMARY_LENGTH
    result = FeedParser().parse(raw_result)
    assert result and len(result.storys) == 1
    assert len(result.storys[0]['content']) <= _MAX_CONTENT_LENGTH
    assert len(result.storys[0]['summary']) <= _MAX_SUMMARY_LENGTH
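
The assertions encode two tiers of limits: RawFeedParser truncates content and summary at _RAW_MAX_CONTENT_LENGTH/_RAW_MAX_SUMMARY_LENGTH while parsing, and FeedParser enforces its own _MAX_CONTENT_LENGTH/_MAX_SUMMARY_LENGTH after processing, so oversized input is truncated at both stages rather than rejected.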