Example #1
import requests

# config, PageRequest (from serapis.extract), and parse_date are provided
# by the surrounding serapis module.
def search_diffbot_cache(term):
    """Query the Diffbot Global Index for cached pages mentioning the term."""
    response = requests.get('http://api.diffbot.com/v3/search',
                            params={
                                'token': config.credentials.diffbot,
                                # Wrap the term in quotes for an exact-phrase search
                                'query':
                                requests.utils.quote('"{}"'.format(term)),
                                'col': 'GLOBAL-INDEX'
                            }).json()
    if not response.get('objects'):
        if response.get('error'):
            print("Response Error '{}' (code: {})".format(
                response['error'], response['errorCode']))
        else:
            print("NO RESULTS")
    results = []
    for obj in response.get('objects', []):  # 'obj' avoids shadowing the builtin 'object'
        if obj.get('text'):
            # run=False: don't re-fetch the page, the cached text is enough
            pr = PageRequest(obj.get('pageUrl'), term, run=False)
            pr.extract_sentences(obj.get('text'))
            result = {
                "title": obj.get('title'),
                "url": obj.get('pageUrl'),
                "search_provider": 'diffbot',
                "author": obj.get('author'),
                "date": parse_date(obj.get('date', '')).isoformat(),
                "doc": obj.get('text'),
                "sentences": pr.sentences,
                "variants": list(pr.variants)
            }
            results.append(result)
    return results
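A minimal call sketch, assuming config.credentials.diffbot holds a valid Diffbot token and the serapis helpers above are importable:

results = search_diffbot_cache('defenestration')
for r in results:
    print(r['url'], len(r['sentences']))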
Example #2
def test_extract_html_features():
    from serapis.extract import PageRequest

    test_request = PageRequest("http://thescene.whro.org/hear-cool-stuff", 'defenestration', run=False)
    # The term is wrapped in <em>/<strong>, so the 'highlighted' feature should be set.
    test_html = "<div><p><em><strong>de-fen-es-tra-tion</strong></em> (dee-fen-uh-STRAY-shun) |&nbsp;n. the act of throwing someone or something out of a window</p></div><div>"
    test_request.get_html_features(test_html)
    assert test_request.features['highlighted']
Example #3
# PageRequest, log, and merge_dict come from the surrounding serapis module.
def extract_wrapper(url_object, term):
    """Fetch and structure a page, folding the extracted fields into url_object."""
    try:
        result = PageRequest(url_object['url'], term).structured
    except Exception:
        import traceback
        log.error("Failed to get page {} -- {}".format(url_object['url'],
                                                       traceback.format_exc()))
        return url_object  # leave the record unchanged on failure
    return merge_dict(url_object, result)
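merge_dict itself is a project helper not shown on this page; a minimal sketch of the presumed behavior (a right-biased dictionary merge), for illustration only:

def merge_dict(base, update):
    # Hypothetical stand-in: combine both dicts, letting values from
    # `update` win on key collisions, without mutating either input.
    merged = dict(base)
    merged.update(update)
    return merged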
Example #4
def test_page_structure():
    from serapis.extract import PageRequest
    p = PageRequest(
        'http://nytimes.com/2015/10/04/technology/scouring-the-web-to-make-new-words-lookupable.html',
        "lookupable")

    # test_response is a fixture defined elsewhere in the test module.
    assert p.structured['title'] == test_response['title']
    assert p.structured['url'] == test_response['url']
    assert p.structured['author'] == test_response['author']
    assert len(p.structured['doc']) > 0
Example #5
import requests

# config, log, and PageRequest (from serapis.extract) are provided by the
# surrounding serapis module.
def search_duckduckgo(term):
    """Query the DuckDuckGo Instant Answer API for abstracts and definitions."""
    result = []
    try:
        # Let requests encode the query string instead of formatting it by hand.
        req = requests.get('http://api.duckduckgo.com/',
                           params={'q': term, 'format': 'json'}).json()
    except Exception:  # network error or an unparsable response
        return result
    if req.get('AbstractSource') not in config.duckduckgo_sources:
        return result
    if req.get('Abstract'):
        # run=False: don't fetch the page, we only need sentence extraction
        pr = PageRequest(req['AbstractURL'], term, run=False)
        pr.extract_sentences(req['Abstract'])
        result.append({
            'title': req['Heading'],
            'url': req['AbstractURL'],
            'search_provider': 'duckduckgo',
            'author': None,
            'date': None,
            'source': req['AbstractSource'],
            'doc': req['Abstract'],
            'sentences': pr.sentences,
            'variants': list(pr.variants)
        })
    if req.get('Definition'):
        pr = PageRequest(req['DefinitionURL'], term, run=False)
        pr.extract_sentences(req['Definition'])
        result.append({
            'title': req['Heading'],
            'url': req['DefinitionURL'],
            'source': req['DefinitionSource'],
            'search_provider': 'duckduckgo',
            'author': None,
            'date': None,
            'doc': req['Definition'],
            'sentences': pr.sentences,
            'variants': list(pr.variants)
        })
    log.info("Searching DuckDuckGo for '{}' returned {} results".format(
        term, len(result)))
    return result
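A minimal call sketch, assuming config.duckduckgo_sources whitelists the answer source (e.g. 'Wikipedia'):

for hit in search_duckduckgo('defenestration'):
    print(hit['search_provider'], hit['url'])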
Example #6
def test_page_request():
    from serapis.extract import PageRequest
    # With run left at its default, the constructor fetches the page,
    # so the response should be populated.
    p = PageRequest(
        'http://nytimes.com/2015/10/04/technology/scouring-the-web-to-make-new-words-lookupable.html',
        "lookupable")
    assert p.response