Example #1
# assumed imports: `url`, `languages`, and `InvalidUsage` are project-local helpers
from scrapy import Request

def pipeline(response, spider):
    """
    Index a page.
    """
    # skip rss or atom urls
    if not response.css("html").extract_first():
        return

    # get domain
    domain = url.domain(response.url)

    # extract title
    title = response.css('title::text').extract_first()
    title = title.strip() if title else ""

    # extract description
    description = response.css("meta[name=description]::attr(content)").extract_first()
    description = description.strip() if description else ""

    # get main language of page, and main content of page
    lang = url.detect_language(response.body)
    if lang not in languages:
        raise InvalidUsage('Language not supported')
    body, boilerplate = url.extract_content(response.body, languages.get(lang))

    # weight of page
    weight = 3
    if not title and not description:
        weight = 0
    elif not title:
        weight = 1
    elif not description:
        weight = 2
    if body.count(" ") < boilerplate.count(" ") or not url.create_description(body):
        # probably bad content quality
        weight -= 1

    # index url and data
    res = spider.es_client.index(index="web-%s" % lang, id=response.url, body={
        "url": response.url,
        "domain": domain,
        "title": title,
        "description": description,
        "body": body,
        "weight": weight
    })


    # check for redirect url
    if response.status in spider.handle_httpstatus_list and 'Location' in response.headers:
        newurl = response.headers['Location']
        meta = {'dont_redirect': True, "handle_httpstatus_list": spider.handle_httpstatus_list}
        meta.update(response.request.meta)
        return Request(url=newurl.decode("utf8"), meta=meta, callback=spider.parse)
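
pipeline() receives the spider itself, so the crawl callback presumably hands every response to it and yields whatever follow-up Request it returns. A minimal sketch of that wiring, assuming the shape of crawler.Crawler (the class body below is an illustration, not the source's actual spider):

import scrapy

class Crawler(scrapy.Spider):
    # hypothetical spider: the real crawler.Crawler is not shown in the source
    name = "crawler"
    # assumed: redirect statuses the pipeline checks against
    handle_httpstatus_list = [301, 302]

    def __init__(self, es_client=None, redis_conn=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.es_client = es_client
        self.redis_conn = redis_conn

    def parse(self, response):
        # pipeline() returns a Request only when a redirect must be followed
        request = pipeline(response, self)
        if request is not None:
            yield request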
Example #2
# assumed imports: `url`, `crawler`, `client`, and `redis_conn` are project-local
from datetime import datetime
from urllib.parse import urlparse
from scrapy.crawler import CrawlerProcess

def explore_job(link):
    """
    Explore a website and index all urls (redis-rq process).
    """
    print("explore website at: %s" % link)

    # get final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web",
                       doc_type='domain',
                       id=domain,
                       body={
                           "homepage": link,
                           "domain": domain,
                           "last_crawl": datetime.now()
                       })

    # start crawler
    process = CrawlerProcess({
        'USER_AGENT':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'DOWNLOAD_DELAY': 0.25,
        'ROBOTSTXT_OBEY': True,
        'HTTPCACHE_ENABLED': False,
        'REDIRECT_ENABLED': False,
        # note: the robots.txt and HTTP-cache middlewares belong to
        # DOWNLOADER_MIDDLEWARES and CloseSpider is an extension, so each is
        # declared under its proper setting key (values are Scrapy's default orders)
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
        },
        'EXTENSIONS': {
            'scrapy.extensions.closespider.CloseSpider': 0,
        },
        'CLOSESPIDER_PAGECOUNT': 500  # only for debug
    })
    process.crawl(crawler.Crawler,
                  allowed_domains=[urlparse(link).netloc],
                  start_urls=[
                      link,
                  ],
                  es_client=client,
                  redis_conn=redis_conn)
    process.start()

    return 1
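
Since explore_job is meant to run as a redis-rq task (per its docstring), some producer must enqueue it. A minimal sketch of that producer, assuming a local Redis instance and the default queue; the URL is a placeholder:

from redis import Redis
from rq import Queue

# hypothetical producer: a separate `rq worker` process picks the job up
# and runs explore_job(link)
q = Queue(connection=Redis())
q.enqueue(explore_job, "https://example.com")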
Example #3
def reference_job(link, email):
    """
    Request the referencing of a website.
    """
    print("referencing page %s with email %s" % (link, email))

    # get final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage": link,
        "domain": domain,
        "email": email
    })

    return 1
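
Note that client.index with an explicit id overwrites any existing document for that domain, so repeated referencing stays idempotent instead of piling up duplicates. A quick way to inspect what was stored, using the same elasticsearch-py 6.x style API as above (the domain value is a placeholder):

# fetch the domain document written by reference_job (its id is the domain)
doc = client.get(index="web", doc_type="domain", id="example.com")
print(doc["_source"])  # {"homepage": ..., "domain": ..., "email": ...}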
Example #4
# assumed imports: `url`, `languages`, `InvalidUsage`, and `create_thumbnail`
# are project-local helpers
from rq import Queue
from scrapy import Request

def pipeline(response, spider):
    """
    Index a page.
    """
    # skip rss or atom urls
    if not response.css("html").extract_first():
        return

    # get domain
    domain = url.domain(response.url)

    # extract title
    title = response.css('title::text').extract_first()
    title = title.strip() if title else ""

    # extract description
    description = response.css("meta[name=description]::attr(content)").extract_first()
    description = description.strip() if description else ""

    # get main language of page, and main content of page
    lang = url.detect_language(response.body)
    if lang not in languages:
        raise InvalidUsage('Language not supported')
    body, boilerplate = url.extract_content(response.body, languages.get(lang))

    # weight of page
    weight = 3
    if not title and not description:
        weight = 0
    elif not title:
        weight = 1
    elif not description:
        weight = 2
    if body.count(" ") < boilerplate.count(" ") or not url.create_description(body):
        # probably bad content quality
        weight -= 1

    # -- TEST -- #
    # keywords = Counter()
    # text_for_keywords = "%s\n%s\n%s" % (title, description, bestbody)
    # r = requests.post('http://localhost:5001/keywords_from_text', data={'text': text_for_keywords})
    # data = r.json()
    # print(hit.url, data)
    # for k in data["keywords"]:
    #     keywords[k] += 1
    # keywords = " ".join(["%s " % kw * score for kw, score in keywords.most_common(100)])

    # index url and data
    res = spider.es_client.index(index="web-%s" % lang, doc_type='page', id=response.url, body={
        "url": response.url,
        "domain": domain,
        "title": title,
        "description": description,
        "body": body,
        "weight": weight
    })

    # try to create thumbnail from page
    img_link = response.css("meta[property='og:image']::attr(content)").extract_first()
    if not img_link:
        img_link = response.css("meta[name='twitter:image']::attr(content)").extract_first()
    if img_link:
        q = Queue(connection=spider.redis_conn)
        q.enqueue(create_thumbnail, response.url, lang, img_link)

    # check for redirect url
    if response.status in spider.handle_httpstatus_list and 'Location' in response.headers:
        newurl = response.headers['Location']
        meta = {'dont_redirect': True, "handle_httpstatus_list": spider.handle_httpstatus_list}
        meta.update(response.request.meta)
        return Request(url=newurl.decode("utf8"), meta=meta, callback=spider.parse)
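
The thumbnail job enqueued above runs out of process, so a worker has to consume the queue. A minimal sketch of that worker bootstrap, assuming create_thumbnail is importable on the worker side and jobs land on the default queue:

from redis import Redis
from rq import Queue, Worker

# hypothetical worker: consumes the create_thumbnail jobs enqueued by pipeline()
redis_conn = Redis()
worker = Worker([Queue(connection=redis_conn)], connection=redis_conn)
worker.work()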