def _merge_infobox(self, infobox):
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            for existingIndex in self.infoboxes:
                if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
                    merge_two_infoboxes(existingIndex, infobox)
                    add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)
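Both _merge_infobox variants here, and the result/infobox merging below, rely on a compare_urls helper that is not part of these snippets. A minimal sketch, assuming it receives two already-parsed URLs and comparing only host and path (ignoring the scheme and a trailing slash), could be:

from urllib.parse import urlparse

def compare_urls(url_a, url_b):
    # hypothetical sketch, not the searx implementation: treat two parsed URLs
    # as equal when host and path match, regardless of scheme or a trailing slash
    if url_a.netloc != url_b.netloc:
        return False
    return url_a.path.rstrip('/') == url_b.path.rstrip('/')

assert compare_urls(urlparse('http://example.org/a/'), urlparse('https://example.org/a'))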
Example #2
    def _merge_result(self, result, position):
        if 'showurl' in result:
            result['parsed_url'] = urlparse(result['showurl'])
        else:
            result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        result['engines'] = set([result['engine']])

        # strip multiple spaces and carriage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

        # check for duplicates
        duplicated = False
        for merged_result in self._merged_results:
            if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
               and result.get('template') == merged_result.get('template'):
                duplicated = merged_result
                break

        # merge duplicates together
        if duplicated:
            # using content with more text
            if result_content_len(result.get('content', '')) >\
                    result_content_len(duplicated.get('content', '')):
                duplicated['content'] = result['content']

            # merge all result's parameters not found in duplicate
            for key in result.keys():
                if not duplicated.get(key):
                    duplicated[key] = result.get(key)

            # add the new position
            duplicated['positions'].append(position)

            # add engine to list of result-engines
            duplicated['engines'].add(result['engine'])

            # using https if possible
            if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
                duplicated['url'] = result['parsed_url'].geturl()
                duplicated['parsed_url'] = result['parsed_url']

        # if there is no duplicate found, append result
        else:
            result['positions'] = [position]
            with RLock():
                self._merged_results.append(result)
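_merge_result assumes a few module-level names that the snippet does not define. Plausible stand-ins, given as assumptions for illustration rather than the exact searx definitions:

import re
from threading import RLock
from urllib.parse import urlparse

# collapse runs of spaces, tabs and newlines into a single space
WHITESPACE_REGEX = re.compile(r'\s+')

def result_content_len(content):
    # length of a result's content, treating missing or non-string content as empty
    return len(content) if isinstance(content, str) else 0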
Example #3
    def _merge_infobox(self, infobox):
        add_infobox = True
        infobox_id = infobox.get('id', None)
        infobox['engines'] = set([infobox['engine']])
        if infobox_id is not None:
            parsed_url_infobox_id = urlparse(infobox_id)
            for existingIndex in self.infoboxes:
                if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
                    merge_two_infoboxes(existingIndex, infobox)
                    add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)
Example #4
def response(resp):
    dom = html.fromstring(resp.text)

    results = []
    for element in dom.xpath('//div[@id="search"] //td'):
        link = element.xpath('./a')[0]

        google_url = urlparse(link.xpath('.//@href')[0])
        query = parse_qs(google_url.query)
        source_url = next(iter(query.get('q', [])), None)

        title_parts = element.xpath('./cite//following-sibling::*/text()')
        title_parts.extend(element.xpath('./cite//following-sibling::text()')[:-1])

        result = {
            'title': ''.join(title_parts),
            'content': '',
            'template': 'images.html',
            'url': source_url,
            'img_src': source_url,
            'thumbnail_src': next(iter(link.xpath('.//img //@src')), None)
        }

        if not source_url or not result['thumbnail_src']:
            continue

        results.append(result)
    return results
Example #5
def response(resp):
    dom = html.fromstring(resp.text)

    results = []
    for element in dom.xpath('//div[@id="search"] //td'):
        link = element.xpath('./a')[0]

        google_url = urlparse(link.xpath('.//@href')[0])
        query = parse_qs(google_url.query)
        source_url = next(iter(query.get('q', [])), None)

        title_parts = element.xpath('./cite//following-sibling::*/text()')
        title_parts.extend(
            element.xpath('./cite//following-sibling::text()')[:-1])

        result = {
            'title': ''.join(title_parts),
            'content': '',
            'template': 'images.html',
            'url': source_url,
            'img_src': source_url,
            'thumbnail_src': next(iter(link.xpath('.//img //@src')), None)
        }

        if not source_url or not result['thumbnail_src']:
            continue

        results.append(result)
    return results
Example #6
def clean_url(url):
    parsed = urlparse(url)
    query = [(k, v) for (k, v) in parse_qsl(parsed.query)
             if k not in ['ixid', 's']]

    return urlunparse(
        (parsed.scheme, parsed.netloc, parsed.path, parsed.params,
         urlencode(query), parsed.fragment))
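A quick usage check of clean_url, assuming ixid and s are the tracking parameters to drop (the URL itself is made up):

print(clean_url('https://images.example.com/photo-123?w=640&ixid=abc&s=secret'))
# -> https://images.example.com/photo-123?w=640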
Example #7
def on_result(request, search, result):
    doi = extract_doi(result['parsed_url'])
    if doi and len(doi) < 50:
        for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'):
            if doi.endswith(suffix):
                doi = doi[:-len(suffix)]
        result['url'] = get_doi_resolver(request.args, request.preferences.get_value('doi_resolver')) + doi
        result['parsed_url'] = urlparse(result['url'])
    return True
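The DOI rewrite plugins assume an extract_doi helper (and a get_doi_resolver lookup) that are not shown. One hypothetical way to pull a DOI out of a parsed URL, for illustration only, is a simple pattern match:

import re

DOI_PATTERN = re.compile(r'10\.\d{4,9}/[^\s&#?]+')  # rough DOI shape, an assumption

def extract_doi(parsed_url):
    # search the path and query of an already-parsed URL for a DOI-like token
    for part in (parsed_url.path, parsed_url.query):
        match = DOI_PATTERN.search(part)
        if match:
            return match.group(0)
    return None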
Example #8
def on_result(request, search, result):
    doi = extract_doi(result['parsed_url'])
    if doi and len(doi) < 50:
        for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'):
            if doi.endswith(suffix):
                doi = doi[:-len(suffix)]
        result['url'] = 'http://doai.io/' + doi
        result['parsed_url'] = urlparse(result['url'])
    return True
Example #9
def on_result(request, search, result):
    doi = extract_doi(result['parsed_url'])
    if doi and len(doi) < 50:
        for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'):
            if doi.endswith(suffix):
                doi = doi[:-len(suffix)]
        result['url'] = get_doi_resolver(request.args, request.preferences.get_value('doi_resolver')) + doi
        result['parsed_url'] = urlparse(result['url'])
    return True
Example #10
def clean_url(url):
    parsed = urlparse(url)
    query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']]

    return urlunparse((parsed.scheme,
                       parsed.netloc,
                       parsed.path,
                       parsed.params,
                       urlencode(query),
                       parsed.fragment))
Example #11
def parse_url(url_string, google_hostname):
    # sanity check
    if url_string is None:
        return url_string

    # normal case
    parsed_url = urlparse(url_string)
    if (parsed_url.netloc in [google_hostname, '']
            and parsed_url.path == redirect_path):
        query = dict(parse_qsl(parsed_url.query))
        return query['q']
    else:
        return url_string
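parse_url unwraps Google's redirect links; assuming the module-level redirect_path is '/url', calls behave like this:

redirect_path = '/url'  # assumed value of the module-level constant

print(parse_url('https://www.google.com/url?q=https://example.org/page&sa=U', 'www.google.com'))
# -> https://example.org/page
print(parse_url('https://example.org/direct', 'www.google.com'))
# -> https://example.org/direct (left untouched)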
Example #12
def parse_url(url_string, google_hostname):
    # sanity check
    if url_string is None:
        return url_string

    # normal case
    parsed_url = urlparse(url_string)
    if (parsed_url.netloc in [google_hostname, '']
            and parsed_url.path == redirect_path):
        query = dict(parse_qsl(parsed_url.query))
        return query['q']
    else:
        return url_string
Example #13
def https_url_rewrite(result):
    skip_https_rewrite = False
    # check if HTTPS rewrite is possible
    for target, rules, exclusions in https_rules:

        # check if target regex match with url
        if target.match(result['parsed_url'].netloc):
            # process exclusions
            for exclusion in exclusions:
                # check if exclusion match with url
                if exclusion.match(result['url']):
                    skip_https_rewrite = True
                    break

            # skip https rewrite if required
            if skip_https_rewrite:
                break

            # process rules
            for rule in rules:
                try:
                    new_result_url = rule[0].sub(rule[1], result['url'])
                except:
                    break

                # parse new url
                new_parsed_url = urlparse(new_result_url)

                # continue if nothing was rewritten
                if result['url'] == new_result_url:
                    continue

                # get domainname from result
                # TODO: only works correctly with simple TLDs like
                #  asdf.com, not for asdf.com.de
                # TODO: use publicsuffix instead of this rewrite rule
                old_result_domainname = '.'.join(
                    result['parsed_url'].hostname.split('.')[-2:])
                new_result_domainname = '.'.join(
                    new_parsed_url.hostname.split('.')[-2:])

                # check if rewritten hostname is the same,
                # to protect against wrong or malicious rewrite rules
                if old_result_domainname == new_result_domainname:
                    # set new url
                    result['url'] = new_result_url

            # target has matched, do not search over the other rules
            break
    return result
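https_url_rewrite walks https_rules, a list of (target, rules, exclusions) tuples of compiled regular expressions. A single hand-written entry, purely for illustration (real entries come from HTTPS Everywhere rulesets), could look like:

import re

https_rules = [
    (
        re.compile(r'^(www\.)?example\.com$'),            # target, matched against the netloc
        [(re.compile(r'^http://(www\.)?example\.com/'),   # rules: (pattern, replacement) pairs
          r'https://www.example.com/')],
        [re.compile(r'^http://legacy\.example\.com/')],   # exclusions: URLs that stay untouched
    ),
]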
Example #14
def https_url_rewrite(result):
    skip_https_rewrite = False
    # check if HTTPS rewrite is possible
    for target, rules, exclusions in https_rules:

        # check if target regex match with url
        if target.match(result['parsed_url'].netloc):
            # process exclusions
            for exclusion in exclusions:
                # check if exclusion match with url
                if exclusion.match(result['url']):
                    skip_https_rewrite = True
                    break

            # skip https rewrite if required
            if skip_https_rewrite:
                break

            # process rules
            for rule in rules:
                try:
                    new_result_url = rule[0].sub(rule[1], result['url'])
                except:
                    break

                # parse new url
                new_parsed_url = urlparse(new_result_url)

                # continue if nothing was rewritten
                if result['url'] == new_result_url:
                    continue

                # get domainname from result
                # TODO: only works correctly with simple TLDs like
                #  asdf.com, not for asdf.com.de
                # TODO: use publicsuffix instead of this rewrite rule
                old_result_domainname = '.'.join(
                    result['parsed_url'].hostname.split('.')[-2:])
                new_result_domainname = '.'.join(
                    new_parsed_url.hostname.split('.')[-2:])

                # check if rewritten hostname is the same,
                # to protect against wrong or malicious rewrite rules
                if old_result_domainname == new_result_domainname:
                    # set new url
                    result['url'] = new_result_url

            # target has matched, do not search over the other rules
            break
    return result
Example #15
def extract_url(xpath_results, search_url):
    if xpath_results == []:
        raise Exception('Empty url resultset')
    url = extract_text(xpath_results)

    if url.startswith('//'):
        # add http or https to this kind of url //example.com/
        parsed_search_url = urlparse(search_url)
        url = u'{0}:{1}'.format(parsed_search_url.scheme, url)
    elif url.startswith('/'):
        # fix relative url to the search engine
        url = urljoin(search_url, url)

    # normalize url
    url = normalize_url(url)

    return url
Example #16
def extract_url(xpath_results, search_url):
    if xpath_results == []:
        raise Exception('Empty url resultset')
    url = extract_text(xpath_results)

    if url.startswith('//'):
        # add http or https to this kind of url //example.com/
        parsed_search_url = urlparse(search_url)
        url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
    elif url.startswith('/'):
        # fix relative url to the search engine
        url = urljoin(search_url, url)

    # normalize url
    url = normalize_url(url)

    return url
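A rough usage sketch of extract_url, assuming extract_text simply returns the given href string for these inputs:

print(extract_url(['//example.org/page'], 'https://searx.example/search'))
# -> https://example.org/page (scheme copied from the search URL)
print(extract_url(['/about'], 'https://searx.example/search'))
# -> https://searx.example/about (relative path joined onto the engine URL)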
Example #17
def normalize_url(url):
    parsed_url = urlparse(url)

    # add a / at the end of the url if there is no path
    if not parsed_url.netloc:
        raise Exception('Cannot parse url')
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com'\
       and parsed_url.path.startswith('/r'):
        p = parsed_url.path
        mark = p.find('/**')
        if mark != -1:
            return unquote(p[mark + 3:]).decode('utf-8')

    return url
Example #18
def normalize_url(url):
    parsed_url = urlparse(url)

    # add a / at the end of the url if there is no path
    if not parsed_url.netloc:
        raise Exception('Cannot parse url')
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com'\
       and parsed_url.path.startswith('/r'):
        p = parsed_url.path
        mark = p.find('/**')
        if mark != -1:
            return unquote(p[mark + 3:]).decode('utf-8')

    return url
Example #19
def fetch_firefox_versions():
    resp = requests.get(URL, timeout=2.0)
    if resp.status_code != 200:
        raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code)
    else:
        dom = html.fromstring(resp.text)
        versions = []

        for link in dom.xpath('//a/@href'):
            url = urlparse(urljoin(URL, link))
            path = url.path
            if path.startswith(RELEASE_PATH):
                version = path[len(RELEASE_PATH):-1]
                if NORMAL_REGEX.match(version):
                    versions.append(LooseVersion(version))

        versions.sort(reverse=True)
        return versions
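The fetch_firefox_versions snippets rely on a few module-level constants and imports; plausible values, assumed here to match Mozilla's public release listing, would be:

import re
import requests
from distutils.version import LooseVersion
from lxml import html
from urllib.parse import urljoin, urlparse

URL = 'https://ftp.mozilla.org/pub/firefox/releases/'
RELEASE_PATH = '/pub/firefox/releases/'
NORMAL_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?$')  # plain x.y(.z) releases only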
Example #20
def fetch_firefox_versions():
    resp = requests.get(URL, timeout=2.0)
    if resp.status_code != 200:
        raise Exception("Error fetching firefox versions, HTTP code " +
                        resp.status_code)
    else:
        dom = html.fromstring(resp.text)
        versions = []

        for link in dom.xpath('//a/@href'):
            url = urlparse(urljoin(URL, link))
            path = url.path
            if path.startswith(RELEASE_PATH):
                version = path[len(RELEASE_PATH):-1]
                if NORMAL_REGEX.match(version):
                    versions.append(LooseVersion(version))

        versions.sort(reverse=True)
        return versions
Example #21
def response(resp):
    img_results = []
    text_results = []

    search_results = json.loads(resp.text)

    # return empty array if there are no results
    if 'data' not in search_results:
        return []

    posts = search_results.get('data', {}).get('children', [])

    # process results
    for post in posts:
        data = post['data']

        # extract post information
        params = {
            'url': urljoin(base_url, data['permalink']),
            'title': data['title']
        }

        # if thumbnail field contains a valid URL, we need to change template
        thumbnail = data['thumbnail']
        url_info = urlparse(thumbnail)
        # netloc & path
        if url_info[1] != '' and url_info[2] != '':
            params['img_src'] = data['url']
            params['thumbnail_src'] = thumbnail
            params['template'] = 'images.html'
            img_results.append(params)
        else:
            created = datetime.fromtimestamp(data['created_utc'])
            content = data['selftext']
            if len(content) > 500:
                content = content[:500] + '...'
            params['content'] = content
            params['publishedDate'] = created
            text_results.append(params)

    # show images first and text results second
    return img_results + text_results
Example #22
def url_proxy():
    """get real url for baidu sogou and 360sousuo"""

    url = request.args.get('proxyurl')
    token = request.args.get('token')
    if token != new_hmac(settings['result_proxy']['key'], url.encode('utf-8')):
        return render('404.html'), 404

    if "www.baidu.com/link?url" in url:
        try:
            resp = requests.head(url, timeout=1)
        except requests.exceptions.Timeout:
            return redirect(url)

        if resp.status_code == 200:
            realurl = resp.url
        else:
            realurl = url
        return redirect(realurl)
    else:
        try:
            resp = requests.get(url, timeout=1)
        except requests.exceptions.Timeout:
            return redirect(url)

        if resp.status_code == 200:
            if "http:" not in resp.text and "https:" not in resp.text:
                # try to fix the response by inserting the host into window.location.replace()
                resp_content = resp.text.strip()
                count = resp_content.index("window.location.replace(")
                str_content = list(resp_content)
                # 25 is len("window.location.replace(")+1
                str_content.insert(count + 25, "https://" + urlparse(url)[1])
                resp_content = "".join(str_content)
                return resp_content
            else:
                # to get url from html response
                return resp.content
        else:
            return redirect(url)
Example #23
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for img in dom.xpath('//a'):
        r = {
            'title': u' '.join(img.xpath('.//div[@class="rg_ilmbg"]//text()')),
            'content': '',
            'template': 'images.html',
        }
        url = urlparse(img.xpath('.//@href')[0])
        query = parse_qs(url.query)
        r['url'] = query['imgrefurl'][0]
        r['img_src'] = query['imgurl'][0]
        r['thumbnail_src'] = r['img_src']
        # append result
        results.append(r)

    # return results
    return results
Example #24
    def __merge_url_result(self, result, position):
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        result['engines'] = set([result['engine']])

        # strip multiple spaces and carriage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

        duplicated = self.__find_duplicated_http_result(result)
        if duplicated:
            self.__merge_duplicated_http_result(duplicated, result, position)
            return

        # if there is no duplicate found, append result
        result['positions'] = [position]
        with RLock():
            self._merged_results.append(result)
Example #25
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the original PNG, JPG or whatever is given
    #     (we do not extract that link here, but it could still be implemented)
    # first link per image-div contains an <img> with the data-iid for base64 encoded image data::
    #      <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #      <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #      <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #      <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, '@alt')[0]

            img_base64_id = eval_xpath(img_node, '@data-iid')
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, '@src')
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, '@data-src')
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ''

            link_node = eval_xpath(img_node, '../../../a[2]')[0]
            url = eval_xpath(link_node, '@href')[0]

            pub_nodes = eval_xpath(link_node, './div/div')
            pub_descr = img_alt
            pub_source = ''
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            results.append({
                'url': url,
                'title': img_alt,
                'content': pub_descr,
                'source': pub_source,
                'img_src': url,
                # 'img_format': img_format,
                'thumbnail_src': thumbnail_src,
                'template': 'images.html'
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results
Example #26
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[contains(@id, "ires")]//div[contains(@class, "hp-xpdbox")]')
    if instant_answer:
        answer_re = r'(?P<prefix><a\s+href=")\/url\?q=(?P<url>[^"]+?)\&amp\;[^"]*(?P<suffix>"\s*>)'
        answer_subst = "\\g<prefix>\\g<url>\\g<suffix>"
        answer_html = ['<br>']
        for element in instant_answer:
            answer_html.append(etree.tostring(element, method="html"))
        answer_str = u' '.join(answer_html)
        answer_fixed = re.sub(answer_re, answer_subst, answer_str, 0, re.MULTILINE)
        results.append({'answer': answer_fixed})

    try:
        results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
                          .split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content
                                })
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Example #27
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not found 'answer'")

    # results --> number_of_results
    try:
        _txt = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0]
        _digit = ''.join([n for n in _txt if n.isdigit()])
        number_of_results = int(_digit)
        results.append({'number_of_results': number_of_results})

    except Exception as e:  # pylint: disable=broad-except
        logger.debug("did not 'number_of_results'")
        logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title = extract_text(eval_xpath(result, title_xpath)[0])
            url = eval_xpath(result, href_xpath)[0]
            content = extract_text_from_dom(result, content_xpath)
            results.append({'url': url, 'title': title, 'content': content})
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Example #28
def image_url_cleanup(url_string):
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
        query = dict(parse_qsl(parsed_url.query))
        return "https://www.bing.com/th?id=" + query.get('id')
    return url_string
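image_url_cleanup normalizes Bing thumbnail URLs onto www.bing.com; for example (with a made-up id value):

print(image_url_cleanup('https://tse1.mm.bing4.com/th?id=OIP.abc123&pid=Api&w=300'))
# -> https://www.bing.com/th?id=OIP.abc123
print(image_url_cleanup('https://example.org/image.png'))
# -> https://example.org/image.png (unrelated URLs pass through)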
Example #29
def url_cleanup(url_string):
    parsed_url = urlparse(url_string)
    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
        query = dict(parse_qsl(parsed_url.query))
        return query.get('url', None)
    return url_string
Example #30
def extract_domain(url):
    return urlparse(url)[1]
Example #31
def merge_two_infoboxes(infobox1, infobox2):
    # get engines weights
    if hasattr(engines[infobox1['engine']], 'weight'):
        weight1 = engines[infobox1['engine']].weight
    else:
        weight1 = 1
    if hasattr(engines[infobox2['engine']], 'weight'):
        weight2 = engines[infobox2['engine']].weight
    else:
        weight2 = 1

    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            unique_url = True
            for url1 in infobox1.get('urls', []):
                if compare_urls(urlparse(url1.get('url', '')),
                                urlparse(url2.get('url', ''))):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2
        elif weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
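A small usage sketch for merge_two_infoboxes, assuming 'wikidata' and 'wikipedia' are registered in the engines dict:

box_a = {'engine': 'wikidata', 'content': 'short text',
         'urls': [{'url': 'https://example.org/'}]}
box_b = {'engine': 'wikipedia', 'content': 'a noticeably longer description text',
         'urls': [{'url': 'https://example.org/'}, {'url': 'https://example.net/'}]}

merge_two_infoboxes(box_a, box_b)
# box_a now keeps the longer content and gains only the URL it did not already have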
Example #32
def url_cleanup(url_string):
    parsed_url = urlparse(url_string)
    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
        query = dict(parse_qsl(parsed_url.query))
        return query.get('url', None)
    return url_string
Example #33
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
                          .split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content
                                })
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Example #34
def image_url_cleanup(url_string):
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
        query = dict(parse_qsl(parsed_url.query))
        return "https://www.bing.com/th?id=" + query.get('id')
    return url_string
Example #35
def extract_domain(url):
    return urlparse(url)[1]
Example #36
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(
            dom.xpath('//div[@id="resultStats"]//text()')[0].split()
            [1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url),
                            google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result,
                                                     content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            logger.debug('result parse error in:\n%s',
                         etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results