Example #1
def response(resp):
    if resp.status_code == 404:
        return []
    raise_for_httperror(resp)

    results = []
    api_result = loads(resp.text)

    # skip disambiguation pages
    if api_result.get('type') != 'standard':
        return []

    title = api_result['title']
    wikipedia_link = api_result['content_urls']['desktop']['page']

    results.append({'url': wikipedia_link, 'title': title})

    results.append({
        'infobox': title,
        'id': wikipedia_link,
        'content': api_result.get('extract', ''),
        'img_src': api_result.get('thumbnail', {}).get('source'),
        'urls': [{
            'title': 'Wikipedia',
            'url': wikipedia_link
        }]
    })

    return results
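Like the other snippets on this page, this one omits its imports. A minimal sketch of what it needs, assuming the classic searx layout (the module providing raise_for_httperror has moved between searx versions, so treat the second line as an assumption):

from json import loads
from searx.raise_for_httperror import raise_for_httperror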
Example #2
def response(resp):
    results = []

    raise_for_httperror(resp)
    dom = fromstring(resp.text)
    word = extract_text(dom.xpath(word_xpath))

    definitions = []

    for dict_src in dict_xpath:
        for src in dom.xpath(dict_src):
            src_text = extract_text(
                src.xpath(
                    './/span[@class="entry-head-title"]/text()')).strip()

            src_defs = []
            for def_item in src.xpath(
                    './/div[contains(@class, "ribbon-element")]'):
                if def_item.xpath('./div[@class="znacz"]'):
                    sub_defs = []
                    for def_sub_item in def_item.xpath(
                            './div[@class="znacz"]'):
                        def_sub_text = extract_text(def_sub_item).lstrip(
                            '0123456789. ')
                        sub_defs.append(def_sub_text)
                    src_defs.append((word, sub_defs))
                else:
                    def_text = extract_text(def_item).strip()
                    def_link = def_item.xpath('./span/a/@href')
                    if 'doroszewski' in def_link[0]:
                        def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
                    src_defs.append((def_text, ''))

            definitions.append((src_text, src_defs))

    if not definitions:
        return results

    infobox = ''
    for src_text, src_defs in definitions:
        infobox += f"<div><small>{src_text}</small>"
        infobox += "<ul>"
        for def_text, sub_def in src_defs:
            infobox += f"<li>{def_text}</li>"
            if sub_def:
                infobox += "<ol>"
                for sub_def_text in sub_def:
                    infobox += f"<li>{sub_def_text}</li>"
                infobox += "</ol>"
        infobox += "</ul></div>"

    results.append({
        'infobox': word,
        'content': infobox,
    })

    return results
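This function references module-level XPath configuration, word_xpath and dict_xpath, that the excerpt does not show. A hedged sketch of the shape they take in the engine (the exact expressions below are assumptions for illustration, not the engine's real values):

word_xpath = '//div[@class="query"]'
dict_xpath = ['//div[contains(@class, "sjp-so-wyniki")]',
              '//div[contains(@class, "sjp-wyniki")]',
              '//div[contains(@class, "sjp-doroszewski-wyniki")]']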
Example #3
def request(method, url, **kwargs):
    """same as requests/requests/api.py request(...)"""
    time_before_request = time()

    # session start
    session = SessionSinglePool()

    # proxies
    if not kwargs.get('proxies'):
        kwargs['proxies'] = get_global_proxies()

    # timeout
    if 'timeout' in kwargs:
        timeout = kwargs['timeout']
    else:
        timeout = getattr(threadLocal, 'timeout', None)
        if timeout is not None:
            kwargs['timeout'] = timeout

    # raise_for_httperror: pop our private kwarg before it reaches requests
    check_for_httperror = kwargs.pop('raise_for_httperror', True)

    # do request
    response = session.request(method=method, url=url, **kwargs)

    time_after_request = time()

    # is there a timeout for this engine?
    if timeout is not None:
        timeout_overhead = 0.2  # seconds
        # start_time = when the user request started
        start_time = getattr(threadLocal, 'start_time', time_before_request)
        search_duration = time_after_request - start_time
        if search_duration > timeout + timeout_overhead:
            raise requests.exceptions.Timeout(response=response)

    # session end
    session.close()

    if hasattr(threadLocal, 'total_time'):
        threadLocal.total_time += time_after_request - time_before_request

    # raise an exception
    if check_for_httperror:
        raise_for_httperror(response)

    return response
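A hypothetical usage sketch (not part of the original module): the wrapper pops the private raise_for_httperror keyword before delegating to the session, so a caller that wants to inspect error responses itself can disable the automatic check. The URL and timeout below are made-up values for illustration:

resp = request('GET', 'https://example.org/api',  # hypothetical endpoint
               timeout=3.0, raise_for_httperror=False)
if resp.status_code == 404:
    results = []  # handle the missing resource; no exception was raised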
Example #4
def response(resp):
    results = []

    raise_for_httperror(resp)
    dom = fromstring(resp.text)
    word = extract_text(dom.xpath('//*[@id="headword"]/text()'))

    definitions = []
    for src in dom.xpath('//*[@id="define"]//h3[@class="source"]'):
        src_text = extract_text(src).strip()
        if src_text.startswith('from '):
            src_text = src_text[5:]

        src_defs = []
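        # the definitions for a source sit in the <ul> right after its heading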
        for def_item in src.xpath('following-sibling::ul[1]/li'):
            def_abbr = extract_text(def_item.xpath('.//abbr')).strip()
            def_text = extract_text(def_item).strip()
            if def_abbr:
                def_text = def_text[len(def_abbr):].strip()
            src_defs.append((def_abbr, def_text))

        definitions.append((src_text, src_defs))

    if not definitions:
        return results

    infobox = ''
    for src_text, src_defs in definitions:
        infobox += f"<small>{src_text}</small>"
        infobox += "<ul>"
        for def_abbr, def_text in src_defs:
            if def_abbr:
                def_abbr += ": "
            infobox += f"<li><i>{def_abbr}</i> {def_text}</li>"
        infobox += "</ul>"

    results.append({
        'infobox': word,
        'content': infobox,
    })

    return results
Example #5
def response(resp):
    if resp.status_code == 404:
        return []

    if resp.status_code == 400:
        # a 400 caused by invalid characters in the title is treated as
        # "no results"; any other 400 falls through to raise_for_httperror
        try:
            api_result = loads(resp.text)
        except ValueError:
            pass
        else:
            if api_result.get('type') == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request' \
               and api_result.get('detail') == 'title-invalid-characters':
                return []

    raise_for_httperror(resp)

    results = []
    api_result = loads(resp.text)

    # skip disambiguation pages
    if api_result.get('type') != 'standard':
        return []

    title = api_result['title']
    wikipedia_link = api_result['content_urls']['desktop']['page']

    results.append({'url': wikipedia_link, 'title': title})

    results.append({
        'infobox': title,
        'id': wikipedia_link,
        'content': api_result.get('extract', ''),
        'img_src': api_result.get('thumbnail', {}).get('source'),
        'urls': [{
            'title': 'Wikipedia',
            'url': wikipedia_link
        }]
    })

    return results
Example #6
def response(resp):
    results = []

    # According to https://www.qwant.com/js/app.js
    if resp.status_code == 429:
        raise SearxEngineCaptchaException()

    # raise for other errors
    raise_for_httperror(resp)

    # load JSON result
    search_results = loads(resp.text)

    # check for an API error
    if search_results.get('status') != 'success':
        raise SearxEngineAPIException('API error ' +
                                      str(search_results.get('error', '')))

    # return empty array if there are no results
    if 'data' not in search_results:
        return []

    data = search_results.get('data', {})

    res = data.get('result', {})

    # the engine's configured category decides how each item is rendered;
    # look the keyword up once instead of on every iteration
    keyword = category_to_keyword.get(categories[0], '')

    # parse results
    for result in res.get('items', []):

        title = html_to_text(result['title'])
        res_url = result['url']
        content = html_to_text(result['desc'])

        if keyword == 'web':
            results.append({
                'title': title,
                'content': content,
                'url': res_url
            })

        elif keyword == 'images':
            thumbnail_src = result['thumbnail']
            img_src = result['media']
            results.append({
                'template': 'images.html',
                'url': res_url,
                'title': title,
                'content': '',
                'thumbnail_src': thumbnail_src,
                'img_src': img_src
            })

        elif keyword == 'news':
            published_date = datetime.fromtimestamp(result['date'], None)
            media = result.get('media', [])
            if media:
                img_src = media[0].get('pict', {}).get('url', None)
            else:
                img_src = None
            results.append({
                'url': res_url,
                'title': title,
                'publishedDate': published_date,
                'content': content,
                'img_src': img_src
            })

    return results
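This last snippet depends on module-level engine configuration, categories and category_to_keyword, that the excerpt omits. A hedged sketch of plausible values, inferred from the 'web', 'images', and 'news' branches above (the exact mapping is an assumption):

categories = ['general']
category_to_keyword = {'general': 'web',
                       'images': 'images',
                       'news': 'news'}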