Example #1
0
async def recrawl():
    """Gets the latest matches and inserts them into the database."""
    print("getting recent matches")
    api = crawler.Crawler()

    # TODO: insert API version (force update if changed)
    # TODO: create database indices
    # get or put when the last crawl was executed

    # crawl and upsert
    for region in ["na", "eu"]:
        try:
            last_match_update = (await db.select(
                """
                SELECT data->'attributes'->>'createdAt' AS created
                FROM match
                WHERE data->'attributes'->>'shardId'='""" + region + """'
                ORDER BY data->'attributes'->>'createdAt' DESC LIMIT 1
                """)
            )[0]["created"]
        except IndexError:
            # no matches stored yet for this region; fall back to an early date
            last_match_update = "2017-02-05T01:01:01Z"

        matches = await api.matches_since(last_match_update, region=region)
        if len(matches) > 0:
            print(region + " got a lot new data items: " + str(len(matches)))
        else:
            print(region + " got no new matches.")
        await db.upsert(matches, True)

    asyncio.ensure_future(recrawl_soon())
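Note: the example above builds its SQL by concatenating the region into the query string. That is harmless here because region comes from a fixed list, but a parameterized query avoids the concern entirely. The sketch below is illustrative only and rests on an assumption: the db helper's placeholder syntax is not shown, so it uses a raw asyncpg connection (conn.fetchrow with $1-style parameters) instead.

# Illustrative sketch, assuming an asyncpg connection rather than the db helper
# above, because the helper's parameter-binding syntax is not shown in the example.
async def last_created_at(conn, region, default="2017-02-05T01:01:01Z"):
    row = await conn.fetchrow(
        """
        SELECT data->'attributes'->>'createdAt' AS created
        FROM match
        WHERE data->'attributes'->>'shardId' = $1
        ORDER BY data->'attributes'->>'createdAt' DESC
        LIMIT 1
        """,
        region)
    return row["created"] if row is not None else default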
Example #2
0
 def downloadFile(self):
     crawl = crawler.Crawler(self.fileNameUrls)
     lista = crawl.crawlFile()
     for video in lista:
         aux = crawler.Crawler.downloadItem(video)
         if aux is not None:
             lsedatasetBuild.buildPoseFile(aux)
Example #3
0
def dispatcher_q():
    _crawler = None
    try:
        q = Queue()
        _crawler = crawler.Crawler(q, callback=emit_flight_info,
                                   driver_path=config["driver_path"],
                                   driver_type=config["driver_type"],
                                   page_wait_interval=int(config["page_wait_interval"]))
        _crawler.daemon = True
        _crawler.start()
        while True:
            if len(flight_qs) == 0:
                time.sleep(1)
                continue

            cnt = 0
            ids = list(flight_qs.keys())
            for i in ids:
                f = flight_qs[i]
                if f["in_progress"] is False \
                        and f["deleted"] is False \
                        and f["updated_at"] + int(config["refresh_interval"]) <= int(time.time() * 1000):
                    f["in_progress"] = True
                    q.put(f)
                    cnt += 1

            deleted = [f["id"] for f in flight_qs.values() if f["deleted"]]
            for d in deleted:
                flight_qs.pop(d, None)

            if cnt == 0:
                time.sleep(1)
    finally:
        if _crawler is not None:
            _crawler.stop()
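Note: the dispatcher above relies on three things from crawler.Crawler: it can run as a daemon thread, it consumes flight records from the shared Queue and reports them through the callback, and it exposes stop(). The worker below is a hypothetical minimal implementation of that contract, for illustration only; it is not the project's actual Crawler.

import queue
import threading

class QueueWorker(threading.Thread):
    """Hypothetical stand-in that satisfies the dispatcher's expectations."""

    def __init__(self, q, callback):
        super().__init__()
        self._q = q
        self._callback = callback
        self._running = True

    def run(self):
        while self._running:
            try:
                flight = self._q.get(timeout=1)
            except queue.Empty:
                continue
            self._callback(flight)         # e.g. emit_flight_info above
            flight["in_progress"] = False  # allow the dispatcher to re-enqueue later

    def stop(self):
        self._running = False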
Example #4
0
 def test_exclude(self):
     crawler = C.Crawler(['http://example.com'],
                         exclude=r'.*pattern',
                         loop=self.loop)
     self.addCleanup(crawler.close)
     self.assertTrue(crawler.url_allowed("http://example.com"))
     self.assertFalse(crawler.url_allowed("http://example.com/pattern"))
Example #5
0
 def test_lenient_host_checking(self):
     crawler = C.Crawler(['http://example.com'],
                         strict=False,
                         loop=self.loop)
     self.addCleanup(crawler.close)
     self.assertTrue(crawler.url_allowed("http://www.example.com"))
     self.assertTrue(crawler.url_allowed("http://foo.example.com"))
Example #6
0
async def crawl_region(region):
    """Gets some matches from a region and inserts them
       until the DB is up to date."""
    api = crawler.Crawler()

    while True:
        try:
            last_match_update = (await db.select("""
                SELECT data->'attributes'->>'createdAt' AS created
                FROM match
                WHERE data->'attributes'->>'shardId'='""" + region + """'
                ORDER BY data->'attributes'->>'createdAt' DESC LIMIT 1
                """))[0]["created"]
        except IndexError:
            # no matches stored yet for this region; fall back to an early date
            last_match_update = "2017-02-05T01:01:01Z"

        print(region + " fetching matches after " + last_match_update)

        # wait for http requests
        matches = await api.matches_since(last_match_update,
                                          region=region,
                                          params={"page[limit]": 50})
        if len(matches) > 0:
            print(region + " got new data items: " + str(len(matches)))
        else:
            print(region + " got no new matches.")
            return
        # insert asynchronously in the background
        await db.upsert(matches, True)
Example #7
0
def GetFeedInfo(url):
    c = crawler.Crawler('')
    rss = c.download(url)
    ret = []
    if len(rss) < 20:
        return ret
    try:
        dom = xml.dom.minidom.parseString(str.strip(rss))
        items = dom.getElementsByTagName('item')
        title = ''
        link = ''
        pub_date = ''
        for item in items:
            title_node = item.getElementsByTagName('title')
            if len(title_node) > 0:
                title = title_node[0].firstChild.data
            link_node = item.getElementsByTagName('link')
            if len(link_node) > 0:
                link = link_node[0].firstChild.data
            date_node = item.getElementsByTagName('pubDate')
            if len(date_node) > 0:
                pub_date = date_node[0].firstChild.data
            pdate = GetDate(pub_date)
            itemxml = item.toxml()
            if pdate > 0:
                ret.append([title, link, pdate, itemxml])
        return ret
    except xml.parsers.expat.ExpatError:
        return ret
Example #8
0
def search_leboncoin(rabbit_channel):
    leboncoin = crawler.Crawler()

    for offer in leboncoin.offers():
        logging.info('Found offer "%s" -- %s', offer['title'], offer['identifier'])
        rabbit_channel.basic_publish(exchange='', routing_key=OFFERS_QUEUE, body=json.dumps(offer), properties=pika.BasicProperties(
            delivery_mode = 2, # make message persistent
        ))
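Note: delivery_mode=2 marks each message as persistent, which only survives a broker restart if the target queue is durable as well. Below is a sketch of the channel setup this publisher assumes, using pika's blocking connection API; the connection parameters are placeholders, not taken from the project.

import pika

# Placeholder connection details; OFFERS_QUEUE and search_leboncoin are the
# names used in the example above.
connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
rabbit_channel = connection.channel()
rabbit_channel.queue_declare(queue=OFFERS_QUEUE, durable=True)

search_leboncoin(rabbit_channel)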
Example #9
0
def batch_test():
    import crawler
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG)
    cc = crawler.Crawler()
    cc.add_headers({
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'})
    hc, url, header, page = cc.get_page('http://my.tv.sohu.com/us/241816781/80597444.shtml')
    logging.info('%s %s %s' % (hc, url, header))
Example #10
0
def get_listings():
    keywords = request.args.get('keywords')
    category = request.args.get('category')
    days = request.args.get('days')
    crawl = c.Crawler(category, keywords, days)
    listings = vars(crawl)['complete_list']
    listings = json.dumps(listings)
    return listings
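Note: get_listings reads its inputs from the query string, so a client calls it with keywords, category and days as GET parameters. The route itself is not shown above, so the path and values below are placeholders; only the parameter names come from the handler.

import requests

resp = requests.get(
    "http://localhost:5000/listings",   # placeholder route
    params={"keywords": "road bike", "category": "sporting goods", "days": "7"})
listings = resp.json()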
Example #11
0
 def crawl(self, urls=None, *args, **kwargs):
     if self.crawler:
         self.crawler.close()
     if urls is None:
         urls = [self.app_url]
     self.crawler = C.Crawler(urls, *args, loop=self.loop, **kwargs)
     self.addCleanup(self.crawler.close)
     self.loop.run_until_complete(self.crawler.crawl())
Example #12
0
 def test_roots(self):
     crawler = C.Crawler(['http://a', 'http://b', 'not-a-host'],
                         loop=self.loop)
     self.addCleanup(crawler.close)
     self.assertTrue(crawler.url_allowed("http://a/a"))
     self.assertTrue(crawler.url_allowed("http://b/b"))
     self.assertFalse(crawler.url_allowed("http://c/c"))
     self.assertFalse(crawler.url_allowed("http://127.0.0.1"))
Example #13
0
 def index(self, *args, **kwargs):
     data = {}
     if kwargs.get('search'):
         c = crawler.Crawler()
         r = c.fetch(kwargs['search'])
         data['recipes'] = r
         data['search'] = kwargs['search']
     mytemplate = Template(filename='search.html')
     return mytemplate.render(**data)
Example #14
0
 def __init__(self):
     self.taobao_crawler = crawler.Crawler()
     cur_path = os.path.split(os.path.realpath(__file__))[0]
     config_path = cur_path + os.path.sep + "config.json"
     json_file = open(config_path)
     self.taobao_config = json.load(json_file)['taobao']
     # Log in to Taobao. Weibo credentials are used here and must first be bound to the Taobao account on the website; delete your own username and password after use.
     weibo_username = ""  # default: username
     weibo_password = ""  # default: password
     self.taobao_crawler.login_taobao(weibo_username, weibo_password)
Example #15
0
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        domain = "https://www.epocacosmeticos.com.br"
        self.crawler = cw.Crawler(
            domain,
            req_limit=10,
            greedy=False,
            indentify_target=lambda page: page.valid_target)
        self.crawler.run(10)
Example #16
0
def do_crawl(crawler_params):
    """Starts and runs a crawler"""
    user_id = g.user.user_id
    def crawler_callback(recipe, app):
        with app.app_context():
            add_recipe_to_db(full_content=recipe, uploader_id=user_id, src_url=crawler_params["base_url"])
    crawler_params["recipe_callback"] = crawler_callback
    crawler_params["recipe_callback_args"] = (current_app._get_current_object(),)
    crawler_params["recipe_callback_kwargs"] = {}
    crawler.Crawler(**crawler_params)
Example #17
0
def test_crawler_recurses_into_discovered_links(getter):
    spider = crawler.Crawler("https://www.example.com", getter)

    spider.start(iterations=2)

    assert spider.visited_links == [
        "https://www.example.com",
        "https://www.touchsurgery.com/1",
        "https://www.touchsurgery.com/2",
    ]
Example #18
0
def test_crawler_handles_discovering_relative_urls(getter, request):
    spider = crawler.Crawler("https://www.example.com", getter)

    spider.start(iterations=2)

    assert spider.visited_links == [
        "https://www.example.com",
        "https://www.example.com/1",
        "https://www.example.com/1/2",
    ]
Example #19
0
def DoCrawler(message):
        print "DO CRAWLER MESSAGE : "+message
        import crawler
        Jconf = json.loads(message)
        RunnerID = Jconf["RunnerID"]
        RunnerList = Jconf["RunnerList"]
        JobID = Jconf["JobID"]
	JobOwner = Jconf["JobOwner"]
        client.JobDict[JobID] = Jconf
        Cclass = crawler.Crawler(JobID, RunnerID, RunnerList, JobOwner)
        Cclass.Run()
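Note: DoCrawler expects a JSON object carrying the four keys it reads before handing them to crawler.Crawler. Below is a hedged illustration of that message shape; the values are placeholders, only the key names come from the example.

import json

message = json.dumps({
    "RunnerID": "runner-1",                  # placeholder values
    "RunnerList": ["runner-1", "runner-2"],
    "JobID": "job-42",
    "JobOwner": "alice",
})
DoCrawler(message)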
Example #20
0
def hello_world():
    url = request.form['url']
    depth = request.form['depth']
    logger.info("Recieved request for url: " + url + " depth: " + depth)
    c = crawler.Crawler(url, int(depth))
    try:
        site_map = c.crawl()
    except Exception:
        logger.error("Error while crawling", exc_info=True)
        return json.dumps({"message": "Server error"})
    return jsonify(dict(site_map))
Example #21
0
def get_big_boy_from_session(intent, session):
	session_attributes = ses_att
	reprompt_text = ""
	speech_output = ""
	try:
		crawl = crawler.Crawler()
		results = crawl.getLaundryData(session_attributes['laundryData'], "")
		for result in results:
			speech_output = speech_output + result[0] + " has " + result[1] + " washers " + result[2] + " dryers available. "
	except Exception as e:
		speech_output = str(e)
	should_end_session = False
	return build_response(session_attributes, build_speechlet_response(intent['name'], speech_output, reprompt_text, should_end_session))
Example #22
0
def test_full_fld_crawl2():
    url = 'google.com'
    fld = utility.get_fld(url)
    setup_db = p.database.setup_database()
    spider = crawler.Crawler(fld, p)
    spider.extractor.robots.rules["Disallow"].append(r"\S+/Partier/\S+")
    spider.extractor.robots.rules["Disallow"].append(r"/\S+.html")
    spider.extractor.robots.rules["Disallow"].append(r"/javascript")
    spider.extractor.robots.rules["Disallow"].append(r"\S+.cbv")
    spider.extractor.robots.rules["Disallow"].append(r"\S+2014")
    spider.extractor.robots.rules["Disallow"].append(r"\S+beta")
    print('Starting crawling')
    spider.start_crawling()
Example #23
0
def compile_active_list(file):
    dict = {}
    rosie = crawler.Crawler()
    rosie.crawl_nodes_api()
    list_of_active_nodes = [x[0].split('/')[4] for x in rosie.node_url_tuples]
    dict['list_of_active_nodes'] = list_of_active_nodes
    rosie.crawl_users_api()
    list_of_active_users = [x.split('/')[4] for x in rosie.user_urls]
    dict['list_of_active_users'] = list_of_active_users
    rosie.crawl_registrations_api()
    list_of_active_registrations = [
        x[0].split('/')[3] for x in rosie.registration_url_tuples
    ]
    dict['list_of_active_registrations'] = list_of_active_registrations
    json.dump(dict, file, indent=4)
Example #24
0
def test_full_fld_crawl():
    """ This is just a short website, that my bot can crawl through it's entirety fast,
        so I can use it as test """

    url = 'vg.no'
    fld = utility.get_fld(url)
    setup_db = p.database.setup_database()
    spider = crawler.Crawler(fld, p)
    spider.extractor.robots.rules["Disallow"].append(r"\S+/Partier/\S+")
    spider.extractor.robots.rules["Disallow"].append(r"/\S+.html")
    spider.extractor.robots.rules["Disallow"].append(r"/javascript")
    spider.extractor.robots.rules["Disallow"].append(r"\S+.cbv")
    spider.extractor.robots.rules["Disallow"].append(r"\S+2014")
    spider.extractor.robots.rules["Disallow"].append(r"\S+beta")
    print('Starting crawling')
    spider.start_crawling()
Example #25
0
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    c = crawler.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(c.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporter.report(c)
        c.close()

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()

        loop.close()
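Note: the loop handling above is written for older asyncio versions. On Python 3.7+ the same run/cleanup sequence can usually be collapsed with asyncio.run(), assuming the Crawler can still be constructed before the loop starts. A sketch, dropping the IOCP/selector command-line switches:

import asyncio
import sys

def run_modern(c):
    # c, reporter.report and c.close are the same objects used in main() above.
    try:
        asyncio.run(c.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporter.report(c)
        c.close()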
Example #26
0
    def main(self):
        dict_arg = {
            'skipext': [],
            'num_workers': 1,
            'parserobots': False,
            'debug': False,
            'verbose': False,
            'exclude': [],
            'drop': [],
            'report': False,
            'images': False,
            'domain': self.domain,
            'output': self.output,
        }
        crawl = crawler.Crawler(**dict_arg)
        crawl.run()
Example #27
0
def rack_url(model_url,shop_No,model_name):  # collect every URL for the given machine model at a shop; returns (machine_URL, machine_No, machine_name)
	print "Starting machine data collection.....{0}".format(model_name.encode("utf-8"))

	scraping = crawler.Crawler()
	bs = scraping.scraping(model_url)

	root_url = sd.Shop_data(shop_No).root_url

	url_data = bs.table.find_all("a",class_="btn-base")

	all_url = []
	for i in url_data:
		No = i.string
		url =  root_url + i.get("href")
		all_url.append((url,No,model_name))

	return all_url
Example #28
0
 def lineReceived(self, line):
     self.sendLine('Echo: ' + line)
     if line == 'test':
         self.massTest()
     if line == 'check':
         self.massCheck()
     if line == 'index':
         cr = crawler.Crawler()
         freq = cr.grabFromPage('http://habrahabr.ru', 1)
         dg.shareIndex2(self, freq)
     if 'search' in line:
         q = line[len('search'):]
         dg.activate2(self, q)
     #self.getValue('123')
     #self.getValue('key')
     #self.getValue('123')
     self.transport.write('>>> ')
Example #29
0
def ch_update():
    update_file = 'log_file/update.txt'
    crawer = crawler.Crawler('http://www.wandoujia.com/category/app',update_file)
    crawer.crow()
    past = open('log_file/urls.txt','r')
    now = open('log_file/update.txt','r')
    past_list = past.readlines()
    now_list = now.readlines()
    if len(now_list)>len(past_list):
        past.close()
        now.close()
        print u'Update found'
        return True
    else:
        past.close()
        now.close()
        print u'No update'
        return False
Example #30
0
def get_welcome_response():
    """ If we wanted to initialize the session to have some attributes we could
    add those here
    """
    web_crawler = crawler.Crawler()
    cash_balances = web_crawler.checkCash()
    expBal, flexBal, swipesBal = cash_balances[0], cash_balances[1], cash_balances[2]

    session_attributes = {"expBal": expBal, "flexBal": flexBal, "swipesBal": swipesBal}
                          #Exp     #Flex    #Swipes
    card_title = "Welcome"
    speech_output = "Welcome to the Aflexa, your personal William and Mary services assistant. " \
                    "Would you like to check your flex or express balance, meal swipes, or laundry machines?" \
    # If the user either does not reply to the welcome message or says something
    # that is not understood, they will be prompted again with this text.
    reprompt_text = "Would you like to check your flex or express balance, meal swipes, or laundry machines?"
    should_end_session = False
    return build_response(session_attributes, build_speechlet_response(
        card_title, speech_output, reprompt_text, should_end_session))