Example 1
def response(resp):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []

    dom = html.fromstring(resp.content.decode())
    for result_element in eval_xpath_list(dom, '//div[@id="searchpage-root"]//div[@data-dot="results"]/div'):
        dot_data = eval_xpath_getindex(result_element, './div/div[@data-dot-data]/@data-dot-data', 0, default=None)
        if dot_data is None:
            title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
            results.append({
                'url': title_element.get('href'),
                'title': extract_text(title_element),
                'content': extract_text(eval_xpath_getindex(title_element, '../../div[2]', 0)),
            })
        elif dot_data == '{"reporter_name":"hint/related/relates"}':
            suggestions_element = eval_xpath_getindex(result_element,
                                                      './div/div[@data-dot="main-box"]', 0, default=None)
            if suggestions_element is not None:
                for suggestion in eval_xpath_list(suggestions_element, './/ul/li'):
                    results.append({'suggestion': extract_text(suggestion)})

    return results
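
Nearly every snippet on this page relies on searx's XPath helpers (eval_xpath, eval_xpath_list, eval_xpath_getindex) and on extract_text. The following is only a rough stand-in sketch of how such helpers could behave, so the examples can be read or run outside searx; the signatures and behavior here are assumptions, not the project's actual searx.utils implementations.

# Hypothetical stand-ins for the searx helpers used throughout these examples.
# They are simplified assumptions, not the real searx.utils implementations.
from lxml import html


def eval_xpath(element, xpath):
    # evaluate an XPath expression and return the raw result
    return element.xpath(xpath)


def eval_xpath_list(element, xpath):
    # like eval_xpath, but guarantees the result is a list
    result = element.xpath(xpath)
    if not isinstance(result, list):
        raise ValueError('the result is not a list: %s' % xpath)
    return result


def eval_xpath_getindex(element, xpath, index, default=Ellipsis):
    # return one item of the XPath result, or a default if it is missing
    result = eval_xpath_list(element, xpath)
    if -len(result) <= index < len(result):
        return result[index]
    if default is Ellipsis:
        raise IndexError('index %d not found for: %s' % (index, xpath))
    return default


def extract_text(node):
    # flatten an element, a string, or a list of them into plain text
    if node is None:
        return ''
    if isinstance(node, list):
        return ' '.join(extract_text(n) for n in node).strip()
    if isinstance(node, str):
        return node.strip()
    return html.tostring(node, method='text', encoding='unicode').strip()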
Example 2
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(
            dom, '//table[contains(@class, "table-list")]/tbody//tr'):
        href = urljoin(
            url,
            eval_xpath_getindex(result,
                                './td[contains(@class, "name")]/a[2]/@href',
                                0))
        title = extract_text(
            eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
        seed = extract_text(
            eval_xpath(result, './/td[contains(@class, "seeds")]'))
        leech = extract_text(
            eval_xpath(result, './/td[contains(@class, "leeches")]'))
        filesize_info = extract_text(
            eval_xpath(result, './/td[contains(@class, "size")]/text()'))
        filesize, filesize_multiplier = filesize_info.split()
        filesize = get_torrent_size(filesize, filesize_multiplier)

        results.append({
            'url': href,
            'title': title,
            'seed': seed,
            'leech': leech,
            'filesize': filesize,
            'template': 'torrent.html'
        })

    return results
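
Several of the torrent parsers (this one, and Examples 3, 20 and 26) convert a human-readable size such as "700 MB" into bytes with get_torrent_size. A rough sketch of what such a conversion might look like follows; the multiplier table and error handling are assumptions, not the actual searx helper.

# Hypothetical sketch of a filesize conversion similar to get_torrent_size();
# the multiplier table is an assumption, not searx's actual implementation.
def get_torrent_size(filesize, filesize_multiplier):
    multipliers = {
        'KB': 1024,
        'MB': 1024 ** 2,
        'GB': 1024 ** 3,
        'TB': 1024 ** 4,
        'KIB': 1024,
        'MIB': 1024 ** 2,
        'GIB': 1024 ** 3,
        'TIB': 1024 ** 4,
    }
    try:
        return int(float(filesize) * multipliers.get(filesize_multiplier.upper(), 1))
    except ValueError:
        return None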
Example 3
def response(resp):
    dom = html.fromstring(resp.text)
    search_res = dom.xpath('.//td[@class="x-item"]')

    if not search_res:
        return list()

    results = list()
    for result in search_res:
        url = urljoin(URL, result.xpath('.//a[@title]/@href')[0])
        title = extract_text(result.xpath('.//a[@title]'))
        content = extract_text(result.xpath('.//div[@class="files"]'))
        files_data = extract_text(
            result.xpath('.//div[@class="tail"]')).split()
        filesize = get_torrent_size(files_data[FILESIZE],
                                    files_data[FILESIZE_MULTIPLIER])
        magnetlink = result.xpath(
            './/div[@class="tail"]//a[@class="title"]/@href')[0]

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'filesize': filesize,
            'magnetlink': magnetlink,
            'seed': 'N/A',
            'leech': 'N/A',
            'template': 'torrent.html'
        })

    return results
Example 4
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'):
        url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None)
        if url is None:
            continue
        url = parse_url(url)

        title = eval_xpath_getindex(result, './/h3/a', 0, default=None)
        if title is None:
            continue
        offset = len(extract_text(title.xpath('span')))
        title = extract_text(title)[offset:]

        content = eval_xpath_getindex(
            result, './/div[contains(@class, "compText")]', 0, default=''
        )
        if content:
            content = extract_text(content)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    return results
Example 5
def response(resp):
    if resp.url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []

    dom = html.fromstring(resp.content.decode())
    for result_element in eval_xpath_list(dom,
                                          '//div[@data-dot="results"]/div'):
        result_data = eval_xpath_getindex(result_element,
                                          './/div[contains(@class, "bec586")]',
                                          0,
                                          default=None)
        if result_data is None:
            continue
        title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
        results.append({
            'url':
            title_element.get('href'),
            'title':
            extract_text(title_element),
            'content':
            extract_text(eval_xpath(result_data, './/div[@class="_3eded7"]')),
        })

    return results
Example 6
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    results_list = eval_xpath(dom,
                              '//section[contains(@class, "search-results")]')

    for result in results_list:

        titles = eval_xpath(result, '//article//header//h2')
        contents = eval_xpath(result, '//article//p')
        urls = eval_xpath(result, '//header/a/@href')
        published_dates = eval_xpath(result,
                                     '//article/div/div/time/@datetime')

        for (title, content, url,
             published_date) in zip(titles, contents, urls, published_dates):
            results.append({
                'url':
                url,
                'publishedDate':
                datetime.strptime(published_date, '%Y-%m-%dT%H:%M:%SZ'),
                'title':
                extract_text(title),
                'content':
                extract_text(content),
            })

    return results
Example 7
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(base_url, link.attrib.get('href'))
        # there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this...
        title = escape(extract_text(link))
        thumbnail_tags = result.xpath(thumbnail_xpath)
        thumbnail = None
        if len(thumbnail_tags) > 0:
            thumbnail = extract_text(thumbnail_tags[0])
            if thumbnail[0] == '/':
                thumbnail = base_url + thumbnail
        content = escape(extract_text(result.xpath(content_xpath)))

        # append result
        results.append({
            'url': href,
            'title': title,
            'img_src': thumbnail,
            'content': content
        })

    # return results
    return results
Example 8
def response(resp):
    '''post-response callback
    resp: requests response object
    '''
    results = []
    tree = html.fromstring(resp.text)
    search_results = tree.xpath('//li[contains(@class, "searchresult")]')
    for result in search_results:
        link = result.xpath('.//div[@class="itemurl"]/a')[0]
        result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0]
        title = result.xpath('.//div[@class="heading"]/a/text()')
        date = dateparse(result.xpath('//div[@class="released"]/text()')[0].replace("released ", ""))
        content = result.xpath('.//div[@class="subhead"]/text()')
        new_result = {
            "url": extract_text(link),
            "title": extract_text(title),
            "content": extract_text(content),
            "publishedDate": date,
        }
        thumbnail = result.xpath('.//div[@class="art"]/img/@src')
        if thumbnail:
            new_result['thumbnail'] = thumbnail[0]
        if "album" in result.classes:
            new_result["embedded"] = embedded_url.format(type='album', result_id=result_id)
        elif "track" in result.classes:
            new_result["embedded"] = embedded_url.format(type='track', result_id=result_id)
        results.append(new_result)
    return results
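
The result_id extraction in this example leans on urlparse and parse_qs from the standard library. A tiny standalone illustration, with an invented URL:

# Standalone illustration of pulling a query parameter out of a link;
# the URL here is invented for the example.
from urllib.parse import parse_qs, urlparse

href = 'https://example.org/search_item?search_item_id=12345'
result_id = parse_qs(urlparse(href).query)["search_item_id"][0]
print(result_id)  # -> '12345'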
Example 9
def response(resp):
    results = []

    doc = fromstring(resp.text)

    # parse results
    for i, r in enumerate(eval_xpath(doc, result_xpath)):
        if i >= 30:
            break
        try:
            res_url = eval_xpath(r, url_xpath)[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, title_xpath))
        content = extract_text(eval_xpath(r, content_xpath))

        # append result
        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    # parse correction
    for correction in eval_xpath(doc, correction_xpath):
        # append correction
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Example 10
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@class="g"]'):

        title = extract_text(result.xpath('.//h3'))
        url = result.xpath('.//div[@class="r"]/a/@href')[0]
        content = extract_text(result.xpath('.//span[@class="st"]'))

        # get thumbnails; initialize thumbnail first so the result dict below
        # never references an undefined (or stale) value when no image id is found
        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
        ids = result.xpath('.//div[@class="s"]//img/@id')
        thumbnail = ''
        if len(ids) > 0:
            thumbnails_data = \
                re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
                           script)
            tmp = []
            if len(thumbnails_data) != 0:
                tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
            if len(tmp) != 0:
                thumbnail = tmp[-1]

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail,
                        'template': 'videos.html'})

    return results
Example 11
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories

    if results_xpath:
        for result in eval_xpath(dom, results_xpath):
            url = extract_url(eval_xpath(result, url_xpath), search_url)
            title = extract_text(eval_xpath(result, title_xpath))
            content = extract_text(eval_xpath(result, content_xpath))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(
                        thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = cached_url + extract_text(
                    result.xpath(cached_xpath))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)
    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                (extract_url(x, search_url) for x in dom.xpath(url_xpath)),
                    map(extract_text, dom.xpath(title_xpath)),
                    map(extract_text, dom.xpath(content_xpath)),
                    map(extract_text, dom.xpath(cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                (extract_url(x, search_url) for x in dom.xpath(url_xpath)),
                    map(extract_text, dom.xpath(title_xpath)),
                    map(extract_text, dom.xpath(content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if not suggestion_xpath:
        return results
    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})
    return results
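
This parser is the generic XPath engine: everything it reads comes from module-level settings (results_xpath, url_xpath, title_xpath, content_xpath, suggestion_xpath, thumbnail_xpath, cached_xpath, cached_url, categories, search_url). A purely hypothetical configuration sketch, only to show the kind of values those names are expected to hold; all XPath strings and URLs below are invented:

# Hypothetical module-level configuration for the generic XPath parser above.
# All XPath expressions and URLs here are invented placeholders.
search_url = 'https://example.org/search?q={query}'
results_xpath = '//div[@class="result"]'
url_xpath = './/a[@class="result-link"]/@href'
title_xpath = './/a[@class="result-link"]'
content_xpath = './/p[@class="result-snippet"]'
suggestion_xpath = ''
thumbnail_xpath = ''
cached_xpath = ''
cached_url = ''
categories = ['general']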
Example 12
def response(resp):
    results = []

    raise_for_httperror(resp)
    dom = fromstring(resp.text)
    word = extract_text(dom.xpath(word_xpath))

    definitions = []

    for dict_src in dict_xpath:
        for src in dom.xpath(dict_src):
            src_text = extract_text(
                src.xpath(
                    './/span[@class="entry-head-title"]/text()')).strip()

            src_defs = []
            for def_item in src.xpath(
                    './/div[contains(@class, "ribbon-element")]'):
                if def_item.xpath('./div[@class="znacz"]'):
                    sub_defs = []
                    for def_sub_item in def_item.xpath(
                            './div[@class="znacz"]'):
                        def_sub_text = extract_text(def_sub_item).lstrip(
                            '0123456789. ')
                        sub_defs.append(def_sub_text)
                    src_defs.append((word, sub_defs))
                else:
                    def_text = extract_text(def_item).strip()
                    def_link = def_item.xpath('./span/a/@href')
                    if 'doroszewski' in def_link[0]:
                        def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
                    src_defs.append((def_text, ''))

            definitions.append((src_text, src_defs))

    if not definitions:
        return results

    infobox = ''
    for src in definitions:
        infobox += f"<div><small>{src[0]}</small>"
        infobox += "<ul>"
        for (def_text, sub_def) in src[1]:
            infobox += f"<li>{def_text}</li>"
            if sub_def:
                infobox += "<ol>"
                for sub_def_text in sub_def:
                    infobox += f"<li>{sub_def_text}</li>"
                infobox += "</ol>"
        infobox += "</ul></div>"

    results.append({
        'infobox': word,
        'content': infobox,
    })

    return results
Example 13
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        urls = result.xpath(url_xpath)
        if len(urls) != 1:
            continue
        url = sanitize_url(parse_url(extract_url(urls, search_url)))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        # still useful ?
        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(
                minutes=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ days? ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(
                days=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$",
                      publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            try:
                publishedDate = parser.parse(publishedDate)
            except:
                publishedDate = datetime.now()

        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        # append result
        results.append({
            'url': url,
            'title': title,
            'content': content,
            'publishedDate': publishedDate
        })

    # return results
    return results
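
The publishedDate handling above turns phrases like "5 minutes ago" into absolute datetimes before falling back to dateutil's parser. A standalone illustration of the same idea, with an invented input value:

# Standalone illustration of the relative-date conversion used above;
# the input string is invented for the example.
import re
from datetime import datetime, timedelta

published = '5 minutes ago'
match = re.match(r'^([0-9]+) minutes? ago$', published)
if match:
    published = datetime.now() - timedelta(minutes=int(match.group(1)))
print(published)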
Example 14
def add_url(urls,
            result,
            id_cache,
            property_id=None,
            default_label=None,
            url_prefix=None,
            results=None,
            link_type=None,
            only_first=True):
    links = []

    # wiki links don't have property in wikidata page
    if link_type and 'wiki' in link_type:
        links.append(get_wikilink(result, link_type))
    else:
        dom_element = id_cache.get(property_id, None)
        if dom_element is not None:
            if not default_label:
                label = extract_text(eval_xpath(dom_element, label_xpath))
                label = label[0].upper() + label[1:]

            if link_type == 'geo':
                links.append(get_geolink(dom_element))

            elif link_type == 'imdb':
                links.append(get_imdblink(dom_element, url_prefix))

            else:
                url_results = eval_xpath(dom_element, url_xpath)
                for link in url_results:
                    if link is not None:
                        if url_prefix:
                            link = url_prefix + extract_text(link)
                        else:
                            link = extract_text(link)
                        links.append(link)

    # append urls
    for url in links:
        if url is not None:
            u = {'title': default_label or label, 'url': url}
            if property_id == 'P856':
                u['official'] = True
                u['domain'] = url.split('/')[2]
            urls.append(u)
            if results is not None:
                results.append(u)
            if only_first:
                break
Example 15
def response(resp):
    results = []

    doc = fromstring(resp.text)

    # parse results
    # Quickhits
    for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
        try:
            res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))

        # append result
        results.append({
            'title': title,
            'content': "",
            'url': base_url + res_url
        })

    # Search results
    for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
        try:
            if r.tag == "dt":
                res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
                title = extract_text(
                    eval_xpath(r, './/a[@class="wikilink1"]/@title'))
            elif r.tag == "dd":
                content = extract_text(eval_xpath(r, '.'))

                # append result
                results.append({
                    'title': title,
                    'content': content,
                    'url': base_url + res_url
                })
        except:
            continue

        if not res_url:
            continue

    # return results
    return results
Example 16
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in eval_xpath(dom,
                             '//table[@class="result"]//td[@class="record"]'):
        url = eval_xpath(result, './a/@href')[0]
        title = extract_text(eval_xpath(result, './a//text()'))
        content = extract_text(
            eval_xpath(result, './/div[@class="text"]//text()'))

        results.append({'url': url, 'title': title, 'content': content})

    return results
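
Because these callbacks only need an object exposing a text (or content/url) attribute, they can be exercised in isolation with a hand-built stand-in for the HTTP response. A minimal sketch against the parser above; the HTML snippet and the SimpleNamespace stand-in are invented for illustration:

# Minimal sketch of driving the parser above with a fake response object;
# the HTML snippet is invented for illustration.
from types import SimpleNamespace

fake_html = '''
<table class="result"><tr><td class="record">
  <a href="https://example.org/entry/1">First entry</a>
  <div class="text">Short description of the entry.</div>
</td></tr></table>
'''
print(response(SimpleNamespace(text=fake_html)))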
Example 17
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    for res in dom.xpath('//div[@class="List-item MainListing"]'):
        # processed start and end of link
        link = res.xpath('//a')[0]

        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(link)

        thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src'])
        # TODO: get image with higher resolution
        img_src = thumbnail_src

        # append result
        results.append({
            'url': url,
            'title': title,
            'img_src': img_src,
            'content': '',
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    # return results
    return results
Example 18
def response(resp):
    '''post-response callback
    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    try:
        number_of_results_string =\
            re.sub('[^0-9]', '',
                   eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0])

        results.append({'number_of_results': int(number_of_results_string)})

    except:
        logger.debug("Couldn't read number of results.")
        pass

    for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'):
        try:
            url = eval_xpath(result, './/h2/a')[0].get('href')
            url = urljoin(base_url, url)
            title = eval_xpath(result, 'string(.//h2/a)').strip()
            content = extract_text(eval_xpath(result, './/p'))
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    return results
Example 19
def response(resp):
    '''post-response callback
    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    number_of_results_element =\
        eval_xpath_getindex(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()',
                            0, default=None)
    if number_of_results_element is not None:
        number_of_results_string = re.sub('[^0-9]', '',
                                          number_of_results_element)
        results.append({'number_of_results': int(number_of_results_string)})

    for result in eval_xpath_list(dom,
                                  '//section[not(contains(@class, "essay"))]'):
        url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
        url = urljoin(base_url, url)
        title = eval_xpath(result, 'string(.//h2/a)').strip()
        content = extract_text(eval_xpath(result, './/p'))
        # append result
        results.append({'url': url, 'title': title, 'content': content})

    return results
Example 20
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath('//div[@class="results"]/dl'):
        name_cell = result.xpath('./dt')[0]
        title = extract_text(name_cell)

        # skip rows that do not contain a link to a torrent
        links = name_cell.xpath('./a')
        if len(links) != 1:
            continue

        # extract url and remove a slash in the beginning
        link = links[0].attrib.get('href').lstrip('/')

        seed = 0
        leech = 0
        try:
            seed = int(result.xpath('./dd/span[4]/text()')[0].replace(',', ''))
            leech = int(
                result.xpath('./dd/span[5]/text()')[0].replace(',', ''))
        except:
            pass

        params = {
            'url': base_url + link,
            'title': title,
            'seed': seed,
            'leech': leech,
            'template': 'torrent.html'
        }

        # let's try to calculate the torrent size
        try:
            filesize_info = result.xpath('./dd/span[3]/text()')[0]
            filesize, filesize_multiplier = filesize_info.split()
            filesize = get_torrent_size(filesize, filesize_multiplier)

            params['filesize'] = filesize
        except:
            pass

        # does our link contain a valid SHA1 sum?
        if re.compile('[0-9a-fA-F]{40}').match(link):
            # add a magnet link to the result
            params['magnetlink'] = 'magnet:?xt=urn:btih:' + link

        # extract and convert creation date
        try:
            date_ts = result.xpath('./dd/span[2]')[0].attrib.get('title')
            date = datetime.fromtimestamp(float(date_ts))
            params['publishedDate'] = date
        except:
            pass

        results.append(params)

    return results
Example 21
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': href, 'title': title, 'content': content})

    # return results
    return results
Example 22
def add_attribute(attributes,
                  id_cache,
                  property_id,
                  default_label=None,
                  date=False,
                  trim=False):
    attribute = id_cache.get(property_id, None)
    if attribute is not None:

        if default_label:
            label = default_label
        else:
            label = extract_text(eval_xpath(attribute, label_xpath))
            label = label[0].upper() + label[1:]

        if date:
            trim = True
            # remove calendar name
            calendar_name = eval_xpath(attribute, calendar_name_xpath)
            for calendar in calendar_name:
                calendar.getparent().remove(calendar)

        concat_values = ""
        values = []
        first_value = None
        for row in eval_xpath(attribute, property_row_xpath):
            if not first_value or not trim or eval_xpath(
                    row, preferred_rank_xpath):
                value = eval_xpath(row, value_xpath)
                if not value:
                    continue
                value = extract_text(value)

                # save first value in case no ranked row is found
                if trim and not first_value:
                    first_value = value
                else:
                    # to avoid duplicate values
                    if value not in values:
                        concat_values += value + ", "
                        values.append(value)

        if trim and not values:
            attributes.append({'label': label, 'value': first_value})
        else:
            attributes.append({'label': label, 'value': concat_values[:-2]})
Example 23
def result_to_text(url, text, htmlResult):
    # TODO : remove result ending with "Meaning" or "Category"
    dom = html.fromstring(htmlResult)
    a = dom.xpath('//a')
    if len(a) >= 1:
        return extract_text(a[0])
    else:
        return text
Example 24
def response(resp):
    results = []
    result_len = 0

    dom = html.fromstring(resp.text)
    # parse results
    for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
        link = eval_xpath(result, './/h3/a')[0]
        url = link.attrib.get('href')
        title = extract_text(link)
        content = extract_text(eval_xpath(result, './/p'))

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # also parse results from the alternative page layout
    for result in eval_xpath(dom, '//li[@class="b_algo"]'):
        link = eval_xpath(result, './/h2/a')[0]
        url = link.attrib.get('href')
        title = extract_text(link)
        content = extract_text(eval_xpath(result, './/p'))

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    try:
        result_len_container = "".join(
            eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:
            # Remove the part "from-to" for paginated request ...
            result_len_container = result_len_container[
                result_len_container.find("-") * 2 + 2:]

        result_len_container = re.sub('[^0-9]', '', result_len_container)
        if len(result_len_container) > 0:
            result_len = int(result_len_container)
    except Exception as e:
        logger.debug('result error :\n%s', e)
        pass

    if result_len and _get_offset_from_pageno(
            resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results
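
The pagination guard at the end relies on a _get_offset_from_pageno helper that is not shown here. A hypothetical definition assuming 10 results per page, just to make the check readable; the engine's actual paging math may differ:

# Hypothetical helper behind the _get_offset_from_pageno() call above;
# assumes 10 results per page, which may not match the real engine.
def _get_offset_from_pageno(pageno):
    return (pageno - 1) * 10 + 1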
Example 25
def _fetch_supported_languages(resp):
    # startpage's language selector is a mess
    # each option has a displayed name and a value, either of which may represent the language name
    # in the native script, the language name in English, an English transliteration of the native name,
    # the English name of the writing script used by the language, or occasionally something else entirely.

    # these cases are so special they need to be hardcoded; a couple of them are misspellings
    language_names = {
        'english_uk': 'en-GB',
        'fantizhengwen': ['zh-TW', 'zh-HK'],
        'hangul': 'ko',
        'malayam': 'ml',
        'norsk': 'nb',
        'sinhalese': 'si',
        'sudanese': 'su'
    }

    # get the English name of every language known by babel
    language_names.update({
        name.lower(): lang_code
        for lang_code, name in Locale('en')._data['languages'].items()
    })

    # get the native name of every language known by babel
    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1,
                            locale_identifiers()):
        native_name = Locale(lang_code).get_language_name().lower()
        # add native name exactly as it is
        language_names[native_name] = lang_code

        # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(
            filter(lambda c: not combining(c), normalize('NFKD', native_name)))
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if result is ascii (otherwise "normalization" didn't work)
            language_names[unaccented_name] = lang_code

    dom = html.fromstring(resp.text)
    sp_lang_names = []
    for option in dom.xpath(
            '//form[@id="settings-form"]//select[@name="language"]/option'):
        sp_lang_names.append(
            (option.get('value'), extract_text(option).lower()))

    supported_languages = {}
    for sp_option_value, sp_option_text in sp_lang_names:
        lang_code = language_names.get(sp_option_value) or language_names.get(
            sp_option_text)
        if isinstance(lang_code, str):
            supported_languages[lang_code] = {'alias': sp_option_value}
        elif isinstance(lang_code, list):
            for lc in lang_code:
                supported_languages[lc] = {'alias': sp_option_value}
        else:
            print('Unknown language option in Startpage: {} ({})'.format(
                sp_option_value, sp_option_text))

    return supported_languages
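
The accent-stripping step above (NFKD normalization plus dropping combining marks) is worth seeing in isolation; a quick standard-library illustration:

# Quick illustration of the accent-stripping used above.
from unicodedata import combining, normalize

native_name = 'français'
unaccented = ''.join(c for c in normalize('NFKD', native_name) if not combining(c))
print(unaccented)  # -> 'francais'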
Example 26
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//div[@class="one_result"]')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res:
        link = result.xpath('.//div[@class="torrent_name"]//a')[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)

        excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0]
        content = html.tostring(excerpt,
                                encoding='unicode',
                                method='text',
                                with_tail=False)
        # it is better to emit <br/> instead of |, but html tags are verboten
        content = content.strip().replace('\n', ' | ')
        content = ' '.join(content.split())

        filesize = result.xpath(
            './/span[@class="torrent_size"]/text()')[0].split()[0]
        filesize_multiplier = result.xpath(
            './/span[@class="torrent_size"]/text()')[0].split()[1]
        files = (result.xpath('.//span[@class="torrent_files"]/text()')
                 or ['1'])[0]

        # convert filesize to byte if possible
        filesize = get_torrent_size(filesize, filesize_multiplier)

        # convert files to int if possible
        try:
            files = int(files)
        except:
            files = None

        magnetlink = result.xpath(
            './/div[@class="torrent_magnet"]//a')[0].attrib['href']

        # append result
        results.append({
            'url': href,
            'title': title,
            'content': content,
            'filesize': filesize,
            'files': files,
            'magnetlink': magnetlink,
            'template': 'torrent.html'
        })

    # return results sorted by seeder
    return results
Example 27
def response(resp):
    results = []

    doc = fromstring(resp.text)

    # parse results

    for i, r in enumerate(eval_xpath(doc, result_xpath)):
        if i >= 30:
            break
        try:
            res_url = eval_xpath(r, url_xpath)[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, title_xpath))
        content = extract_text(eval_xpath(r, content_xpath))

        # append result
        results.append({"title": title, "content": content, "url": res_url})

    if eval_xpath(doc, answer_title_xpath) != []:
        answer_title = eval_xpath(doc, answer_title_xpath)
        answer_link = eval_xpath(doc, answer_link_xpath)
        answer_content = eval_xpath(doc, answer_content_xpath)
        results.append({
            "answer": "stackoverflow",
            "data": {
                "title": answer_title,
                "link": answer_link,
                "content": answer_content,
            },
            "url": answer_link,
        })

    # parse correction
    for correction in eval_xpath(doc, correction_xpath):
        # append correction
        results.append({"correction": extract_text(correction)})

    # return results
    return results
Example 28
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(
            dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):

        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
        if url is None:
            continue
        url = parse_url(url)
        title = extract_text(result.xpath('.//h4/a'))
        content = extract_text(result.xpath('.//p'))
        img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)

        item = {
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src
        }

        pub_date = extract_text(
            result.xpath('.//span[contains(@class,"s-time")]'))
        ago = AGO_RE.search(pub_date)
        if ago:
            number = int(ago.group(1))
            delta = AGO_TIMEDELTA[ago.group(2)]
            pub_date = datetime.now() - delta * number
        else:
            try:
                pub_date = parser.parse(pub_date)
            except parser.ParserError:
                pub_date = None

        if pub_date is not None:
            item['publishedDate'] = pub_date
        results.append(item)

    # collect suggestions once, outside the per-result loop
    for suggestion in eval_xpath_list(
            dom, '//div[contains(@class,"AlsoTry")]//td'):
        results.append({'suggestion': extract_text(suggestion)})

    return results
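
This parser uses an AGO_RE pattern and an AGO_TIMEDELTA lookup table that are defined elsewhere in the engine module. A hypothetical pair of definitions consistent with how they are used above; the real pattern and table may differ:

# Hypothetical definitions for the AGO_RE / AGO_TIMEDELTA names used above;
# the actual engine's pattern and table may differ.
import re
from datetime import timedelta

AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|hour|minute)')
AGO_TIMEDELTA = {
    'minute': timedelta(minutes=1),
    'hour': timedelta(hours=1),
    'day': timedelta(days=1),
    'week': timedelta(days=7),
    'month': timedelta(days=30),
    'year': timedelta(days=365),
}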
Example 29
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for app in dom.xpath('//a[@class="package-header"]'):
        app_url = app.xpath('./@href')[0]
        app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()'))
        app_content = extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip() \
            + ' - ' + extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip()
        app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0]

        results.append({'url': app_url,
                        'title': app_title,
                        'content': app_content,
                        'img_src': app_img_src})

    return results
Example 30
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    vidthumb_imgdata = scrap_out_thumbs(dom)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
        url = eval_xpath_getindex(result, href_xpath, 0)
        c_node = eval_xpath_getindex(result, content_xpath, 0)

        # <img id="vidthumb1" ...>
        img_id = eval_xpath_getindex(c_node,
                                     './div[1]//a/g-img/img/@id',
                                     0,
                                     default=None)
        if img_id is None:
            continue
        img_src = vidthumb_imgdata.get(img_id, None)
        if not img_src:
            logger.error("no vidthumb imgdata for: %s" % img_id)
            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src',
                                          0)

        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'length': length,
            'author': pub_info,
            'thumbnail': img_src,
            'template': 'videos.html',
        })

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    return results