Exemple #1
0
def pipeline(response, spider) :
    """
    Index a page.
    """
    # skip rss or atom urls
    if not response.css("html").extract_first() :
        return

    # get domain
    domain = url.domain(response.url)

    # extract title
    title = response.css('title::text').extract_first()
    title = title.strip() if title else ""

    # extract description
    description = response.css("meta[name=description]::attr(content)").extract_first()
    description = description.strip() if description else ""

    # get main language of page, and main content of page
    lang = url.detect_language(response.body)
    if lang not in languages :
        raise InvalidUsage('Language not supported')
    body, boilerplate = url.extract_content(response.body, languages.get(lang))

    # weight of page
    weight = 3
    if not title and not description :
        weight = 0
    elif not title :
        weight = 1
    elif not description :
        weight = 2
    if body.count(" ") < boilerplate.count(" ") or not url.create_description(body) :
        # probably bad content quality
        weight -= 1

    res = spider.es_client.index(index="web-%s"%lang, id=response.url, body={
        "url":response.url,
        "domain":domain,
        "title":title,
        "description":description,
        "body":body,
        "weight":weight
    })


    if response.status in spider.handle_httpstatus_list and 'Location' in response.headers:
        newurl = response.headers['Location']
        meta = {'dont_redirect': True, "handle_httpstatus_list" : spider.handle_httpstatus_list}
        meta.update(response.request.meta)
        return Request(url = newurl.decode("utf8"), meta = meta, callback=spider.parse)
def index():
    """
    URL : /index
    Index a new URL in search engine.
    Method : POST
    Form data :
        - url : the url to index [string, required]
    Return a success message.
    """
    # get POST data
    data = dict((key, request.form.get(key)) for key in request.form.keys())
    if "url" not in data:
        raise InvalidUsage('No url specified in POST data')

    # crawl url
    url_data = url.crawl(data["url"])
    if not url_data:
        raise InvalidUsage("URL is invalid or has no text inside")

    # get main language of page
    lang = url.detect_language(url_data.text)
    if lang not in languages:
        raise InvalidUsage('Language not supported')

    # extract title of url
    title = url.extract_title(url_data.text)

    # extract description of url
    description = url.extract_description(url_data.text)

    # extract main content of url
    body = url.extract_content(url_data.text, languages.get(lang))

    # index url and data
    res = client.index(index="web-%s" % lang,
                       doc_type='page',
                       id=data["url"],
                       body={
                           "title": title,
                           "description": description,
                           "body": body,
                           "url": data["url"]
                       })

    return "Success"
Exemple #3
0
def pipeline(response, spider) :
    """
    Index a page.
    """
    # skip rss or atom urls
    if not response.css("html").extract_first() :
        return

    # get domain
    domain = url.domain(response.url)

    # extract title
    title = response.css('title::text').extract_first()
    title = title.strip() if title else ""

    # extract description
    description = response.css("meta[name=description]::attr(content)").extract_first()
    description = description.strip() if description else ""

    # get main language of page, and main content of page
    lang = url.detect_language(response.body)
    if lang not in languages :
        raise InvalidUsage('Language not supported')
    body, boilerplate = url.extract_content(response.body, languages.get(lang))

    # weight of page
    weight = 3
    if not title and not description :
        weight = 0
    elif not title :
        weight = 1
    elif not description :
        weight = 2
    if body.count(" ") < boilerplate.count(" ") or not url.create_description(body) :
        # probably bad content quality
        weight -= 1

    # -- TEST -- #
    """keywords = Counter()
    text_for_keywords = "%s\n%s\n%s"%(title, description, bestbody)
    r = requests.post('http://localhost:5001/keywords_from_text', data = {'text':text_for_keywords})
    data = r.json()
    #print(hit.url, data)
    for k in data["keywords"] :
        keywords[k] += 1
    keywords = " ".join(["%s "%(kw)*score for kw, score in keywords.most_common(100)])"""

    # index url and data
    res = spider.es_client.index(index="web-%s"%lang, doc_type='page', id=response.url, body={
        "url":response.url,
        "domain":domain,
        "title":title,
        "description":description,
        "body":body,
        "weight":weight
    })

    # try to create thumbnail from page
    img_link = response.css("meta[property='og:image']::attr(content)").extract_first()
    if not img_link :
        img_link = response.css("meta[name='twitter:image']::attr(content)").extract_first()
    if img_link :
        q = Queue(connection=spider.redis_conn)
        q.enqueue(create_thumbnail, response.url, lang, img_link)

    # check for redirect url
    if response.status in spider.handle_httpstatus_list and 'Location' in response.headers:
        newurl = response.headers['Location']
        meta = {'dont_redirect': True, "handle_httpstatus_list" : spider.handle_httpstatus_list}
        meta.update(response.request.meta)
        return Request(url = newurl.decode("utf8"), meta = meta, callback=spider.parse)