Code Example #1
File: queen.py Project: Leon-Wulfgang/myCrawler
    def service_sis(self):
        process = CrawlerProcess({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })

        process.crawl(worker.Worker)
        process.start()  # the script will block here until the crawling is finished
Code Example #2
class CrawlerWorker(multiprocessing.Process):
 
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
 
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
 
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
    #__init__
    
    def _item_passed(self, item):
        self.items.append(item)
    # _item_passed
    
    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
    #run
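The CrawlerWorker above relies on APIs that no longer exist in current Scrapy (crawler.install(), crawler.configure(), scrapy.xlib.pydispatch and the item_passed signal). As a rough sketch only, the same item-collecting idea can be expressed with today's signal API; MySpider-style spider classes and the empty settings dict are placeholders, not part of the original project:

from scrapy import signals
from scrapy.crawler import CrawlerProcess

def collect_items(spider_cls, settings=None):
    """Run spider_cls in-process and return the scraped items as a list."""
    items = []

    def _item_scraped(item, response, spider):
        items.append(item)

    process = CrawlerProcess(settings or {})
    crawler = process.create_crawler(spider_cls)
    crawler.signals.connect(_item_scraped, signal=signals.item_scraped)
    process.crawl(crawler)
    process.start()  # blocks here until the crawl is finished
    return items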
Code Example #3
class BaseScraper(CrawlSpider):
    name = "base"
    rules = (
        Rule(LinkExtractor(allow=()), callback='parse_item'),
    )

    def __init__(self, index, start_urls, allowed_domains=[], *args, **kwargs):
        self.allowed_domains = allowed_domains
        self.start_urls = start_urls
        self.index = index
        super(BaseScraper, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        item = {}
        item["body"] = response.body
        yield item

    # Instantiates a CrawlerProcess, which spins up a Twisted Reactor.
    def connect(self):
        self.process = CrawlerProcess(get_project_settings())

    # Start the scraper. The crawl process must be instantiated with the same
    # attributes as the instance.
    def start(self):
        self.connect()
        self.process.crawl(
            self.name,
            self.index,
            start_urls = self.start_urls,
            allowed_domains = self.allowed_domains,
        )
        self.process.start()
Code Example #4
File: decc.py Project: CharlesNie/DECC
def main(argv):

	try:
		opts, args = getopt.getopt(argv, "ch:t:s:", ['title=', 'section='])
	except getopt.GetoptError:
		print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h':
			print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
			sys.exit()
		elif opt == '-c':
			# start crawling article here
			print "crawling"
			process = CrawlerProcess(get_project_settings())
			process.crawl(BBCArticleSpider)
			process.start()
		elif opt in  ('-t', '--title'):
			print "search by title"
			# start searching article by title
			results = BBCArticleItem.fetch_by_title(arg)
			for result in results:
				print result
		elif opt in ('-s', '--section'):
			print "search by section"
			# start searching article by section
			results = BBCArticleItem.fetch_by_section(arg)
			for result in results:
				print result
Code Example #5
File: cli.py Project: Lukas0907/feeds
def crawl(ctx, spiders, stats):
    """
    Crawl one, many, or all pages.

    What spider(s) to run is determined in the following order:

      1. Spider(s) given as argument(s)

      2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored. All available spiders will be used to
    crawl if no arguments are given and no spiders are configured.
    """
    settings = ctx.obj["settings"]
    if stats:
        settings.set("STATS_CLASS", "scrapy.statscollectors.MemoryStatsCollector")

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error("Please specify what spiders you want to run!")
    else:
        for spider in spiders:
            logger.info("Starting crawl of {} ...".format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool("HTTPCACHE_ENABLED"):
        run_cleanup_cache(settings)
Code Example #6
File: exporterfromlist.py Project: arapidhs/scry
	def __init__(self, titlesfile = None, platform = None, region = None):

		# set default encoding to utf8 for parsing and logging
		# utf-8 characters in console and files
		#
		reload(sys)
		sys.setdefaultencoding('utf8')
        
		configure_logging(install_root_handler=False)
		logging.basicConfig(
			filename='export.log',
			filemode = 'a',
			format='%(levelname)s: %(message)s',
			level=logging.INFO
		)
                				
		# identify platform
		#
		self.platform = platform
		if self.platform is None:
			logging.error('No platform found! Pass it as an argument.')
			return
		else:			
			platformId = platforms.getId(self.platform)
			if platformId is None:
				logging.error('Platform ' + self.platform + ' not supported.')
				return
						
		self.titlesfile = titlesfile
		self.region = region		
		if self.region is None:
			self.region = "Worldwide"
		
		if titlesfile:		
		
			titles = []
			urls = []
			
			with open( self.titlesfile ) as f:
				titles = f.read().splitlines()
				
			for title in titles:
				logging.debug('Submitting title:' + title )
				urls.append(
					'http://mobygames.com/search/quick' +
					'?q=' + title +
					'&p=' + platformId +
					'&search=Go'
					'&sFilter=1'
					'&sG=on'
					'&search_title=' + urllib.quote( title ) + 
					'&search_platform=' + urllib.quote(self.platform) +
					'&search_region=' + urllib.quote(self.region)
				)
				
			process = CrawlerProcess(get_project_settings())
			process.crawl(MobygamesSpider, start_urls=urls)
			process.start()									
		else:
			logging.warning('No file.')
Code Example #7
File: getProxy.py Project: lanxinxichen/ScrapyDemo
def run_spider():
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
    settings.set("ITEM_PIPELINES", {
        'pipelines.FilterProxyPipline': 1,
        'pipelines.SaveProxyPipeline': 2
    })
    settings.set("LOG_STDOUT ", True)

    # 配置日志记录规则设置
    # configure_logging({
    #     'filename': datetime.now().strftime('%Y_%m_%d_%H_proxy.log'),
    #     'format': '%(asctime)s %(levelname)-8s %(name)-15s %(message)s',
    #     'level': logging.INFO
    # })
    configure_logging(install_root_handler=False)
    # initialize the log path
    logpath = datetime.now().strftime(log_path)
    if not os.path.isdir(logpath):
        os.makedirs(logpath)
    logging.basicConfig(
        filename=datetime.now().strftime('%s/%s_proxy.log' % (logpath, log_file)),
        format=log_format,
        level=logging.INFO
    )
    process = CrawlerProcess(settings)
    process.crawl(GetProxySpider)
    process.start()
Code Example #8
File: spider_start.py Project: NLPScott/bdbk-kb
    def handle(self, *args, **options):
        setting = {
            'USER_AGENT': options['user_agent'],
            'DOWNLOAD_DELAY': options['download_delay'],
            'LOG_FILE': settings.SCRAPY_LOG_FILE,
            'LOG_LEVEL': settings.SCRAPY_LOG_LEVEL,
        }

        if options['proxy_list']:
            try:
                f = open(options['proxy_list'])
            except IOError as e:
                raise CommandError('cannot open proxy list file for read')

            # Retry many times since proxies often fail
            setting['RETRY_TIMES'] = 10
            # Retry on most error codes since proxies fail for different reasons
            setting['RETRY_HTTP_CODES'] = [500, 503, 504, 400, 403, 404, 408]
            setting['DOWNLOADER_MIDDLEWARES'] = {
                'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
                'spider.randomproxy.RandomProxy': 100,
                'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            }
            setting['PROXY_LIST'] = options['proxy_list']

        process = CrawlerProcess(setting)

        process.crawl(BaiduSpider)
        process.start()
Code Example #9
File: news_flash_crawl.py Project: hasadna/anyway
def news_flash_crawl(rss_link, site_name, maps_key):
    id_flash = get_latest_id_from_db() + 1
    latest_date = get_latest_date_from_db()
    d = feedparser.parse(rss_link)
    process = CrawlerProcess()
    for entry in d.entries[::-1]:
        entry_parsed_date = datetime.strptime(entry.published[:-6], '%a, %d %b %Y %H:%M:%S')
        entry_parsed_date = entry_parsed_date.replace(tzinfo=None)
        if (latest_date is not None and entry_parsed_date > latest_date) or latest_date is None:
            news_item = {'id_flash': id_flash, 'date_parsed': entry_parsed_date, 'title': entry.title,
                         'link': entry.links[0].href, 'date': entry.published, 'location': '', 'lat': 0, 'lon': 0}
            if (u'תאונ' in entry.title and u'תאונת עבודה' not in entry.title and u'תאונות עבודה' not in entry.title)\
                    or ((u'רכב' in entry.title or u'אוטובוס' in entry.title or u"ג'יפ" in entry.title
                         or u'משאית' in entry.title or u'קטנוע'
                         in entry.title or u'אופנוע' in entry.title or u'אופניים' in entry.title or u'קורקינט'
                         in entry.title or u'הולך רגל' in entry.title or u'הולכת רגל' in entry.title
                         or u'הולכי רגל' in entry.title) and
                        (u'נפגע' in entry.title or u'פגיע' in entry.title or
                         u'נפצע' in entry.title or u'פציע' in entry.title or u'התנגש' in entry.title or u'התהפך'
                         in entry.title or u'התהפכ' in entry.title)):
                news_item['accident'] = True
            else:
                news_item['accident'] = False
            if site_name == 'ynet':
                news_item['source'] = 'ynet'
                process.crawl(YnetFlashScrap, entry.links[0].href, news_item=news_item, maps_key=maps_key)
            id_flash = id_flash + 1
    process.start()
Code Example #10
File: main.py Project: dagrooms52/TabCrawler
def main(tabLink):
    if(tabLink.find("ultimate-guitar.com")):
        tabSpider = Spiders.Ultimate(tabLink)
    elif(tabLink.find("guitartabs.cc")):
        tabSpider = Spiders.TabCC(tabLink)
    else:
        print("Domain name not supported.")
        return

    # Make a process to instantiate a Ultimate spider with the given
    # arguments and make it crawl the link
    process = CrawlerProcess(get_project_settings())
    process.crawl(tabSpider, link=tabLink)
    process.start()

    # Link has been scraped, now process it
    tree = xmltree.parse(tabs.pipelines.filename)
    root = tree.getroot()
    value = root[0][0][0]
    rawTab = value.text

    if("\M" in rawTab):
        rawTab = parsefuncs.removeLineEndings(rawTab)

    cleanTab = parsefuncs.parseTab(rawTab)

    print("Clean tab is:")
    count = 0
    for line in cleanTab:
        count += 1
        print line
        if(count % 6 == 0):
            print(" ")
Code Example #11
    def run(self):
        settings = get_project_settings()
        process = CrawlerProcess(settings)

        process.crawl('stackoverflow')
        process.start()
Code Example #12
File: uiCompare.py Project: AugustLONG/mcubed
def ScrapeSite():
    db = 'crunchbase_startups'
    sitedomain = raw_input("Enter site domain: ") # get user input
    sitedomain = parse_base_url(sitedomain) # clean url
    
    sql = 'SELECT text FROM {} WHERE siteurl = %s'.format(db)
    
    cur.execute(sql, sitedomain)
    sitetext = cur.fetch()
    
    if sitetext != '': # what does an empty ping return?
        print 'Site already scraped.'
        return sitetext
    
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'pipelines.UserInputPipeline': 100},
        'DEPTH_LIMIT': 2,
        'DOWNLOAD_HANDLERS': {'s3': None,}
        ,'LOG_LEVEL': 'INFO'
    })
    
    process.crawl(SoloSpider, domain = sitedomain)
    process.start()
    
    # presumably finished here - pull newly loaded sitetext for domain
    
    cur.execute(sql, sitedomain)
    return cur.fetch()
Code Example #13
File: web_run.py Project: shanyue-video/video_scrapy
 def get(self):
     while True:
         process = CrawlerProcess(get_project_settings())
         process.crawl('iqiyi')
         process.start()
         time.sleep(3000)
     self.finish()
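Because process.start() blocks and the underlying Twisted reactor cannot be restarted, the while-loop above fails on its second pass. A hedged sketch of one way to repeat the crawl uses CrawlerRunner instead; the spider name 'iqiyi' and the 3000-second pause are taken from the excerpt, everything else is an assumption:

from twisted.internet import defer, reactor, task
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl_repeatedly():
    while True:
        yield runner.crawl('iqiyi')                         # run one full crawl
        yield task.deferLater(reactor, 3000, lambda: None)  # wait before the next run

crawl_repeatedly()
reactor.run()  # blocks; stop with reactor.stop() when done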
Code Example #14
File: cmdline.py Project: Root-nix/scrapy
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #15
    def handle(self, *args, **options):
        # It would be better to pass this in as a parameter to PayoutSpider
        global start_date
        start_date = datetime.datetime(2015, 1, 1, tzinfo=pytz.UTC)

        delete = options.get('delete')
        delete_all = options.get('delete_all')
        retrieve_all = options.get('retrieve_all')

        previous_payout = None
        previous_payouts = codementor_models.Payout.objects.all().order_by('-date')
        if delete_all or (delete and previous_payouts.count() == 0):
            codementor_models.Review.objects.all().delete()
            codementor_models.Session.objects.all().delete()
            codementor_models.Payout.objects.all().delete()
            codementor_models.Payment.objects.all().delete()
        elif delete:
            previous_payout = previous_payouts[0]
            codementor_models.Review.objects.filter(date__gt=start_date).delete()
            codementor_models.Session.objects.filter(started_at__gt=start_date).delete()
            previous_payout.delete()
            codementor_models.Payment.objects.filter(payout__isnull=True).delete()

        if not retrieve_all and previous_payout:
            start_date = previous_payout.date

        process = CrawlerProcess({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })

        process.crawl(PayoutSpider)
        process.start()
Code Example #16
File: scrape.py Project: benbp/showfinder
def run(urls, city):
    process = CrawlerProcess()
    spiders = [make_spider(artist, url, city) for artist, url in urls]
    for spider_cls in spiders:
        process.crawl(spider_cls)
    # the script will block here until the crawling is finished
    process.start()
Code Example #17
File: see_whats_going_on.py Project: georgi0u/mgrok
def get_scraped_sites_data():
    """Returns output for venues which need to be scraped."""
    class RefDict(dict):
        """A dictionary which returns a reference to itself when deepcopied."""
        def __deepcopy__(self, memo):
            return self

    # Hack: we pass a dictionary which can't be deep-copied into the settings
    # so as to _return_ the scraper output. As far as I can tell, this is the
    # only way to return the scraper output to the script itself.
    output = RefDict()

    settings = Settings({
        'LOG_ENABLED': False,
        'ITEM_PIPELINES': {
            'mgrok.pipelines.JsonWriterPipeline': 1
            },
        'PIPELINE_OUTPUT': output,
        'USER_AGENT': 'Chrome/41.0.2228.0'
        })

    crawler_process = CrawlerProcess(settings)
    for spider in SCRAPY_SPIDERS:
        crawler_process.crawl(spider)

    crawler_process.start()

    return output
Code Example #18
File: run.py Project: algoo/crawlers
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback):
    """
    Launch crawl job for JobSpider class
    :param debug: (bool) Activate or disable debug
    :param spider_error_callback: callback for spider errors (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    process = CrawlerProcess({
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False
    })

    for spider_class in spiders_classes:
        process.crawl(spider_class)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
Code Example #19
def spiderCrawl(bandname):
   createLink(bandname)
   settings = get_project_settings()
   settings.set('USER_AGENT','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
   process = CrawlerProcess(settings)
   process.crawl(MySpider)
   process.start()
Code Example #20
File: ScapyTest.py Project: ohansrud/StockUtils
    def Test_Scapy(self):
        spider = FtpSpider()

        process = CrawlerProcess({"USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"})

        process.crawl(spider)
        process.start()
Code Example #21
File: try.py Project: Rygbee/aminer-spider
class CrawlerScript():
	def __init__(self):
		self.crawler = CrawlerProcess(settings)
		#if not hasattr(project, 'crawler'):
			#self.crawler.install()
		#self.crawler.configure()
		self.items = []
		dispatcher.connect(self._item_passed, signals.item_passed)

	def _item_passed(self, item):
		self.items.append(item)

	def _crawl(self, queue, spider_name):
		spider = self.crawler.spiders.create(spider_name)
		if spider:
			self.crawler.queue.append_spider(spider)
			self.crawler.start()
			self.crawler.stop()
			queue.put(self.items)
	def crawl(self, spider):
		queue = Queue()
		p = Process(target=self._crawl, args=(queue, spider,))
		p.start()
		p.join()
		return queue.get(True)
Code Example #22
File: task.py Project: ohhdemgirls/od-database
    def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
        busy.value = 1
        if os.path.exists("data.json"):
            os.remove("data.json")
        print("Started crawling task")
        process = CrawlerProcess(get_project_settings())
        process.crawl("od_links", base_url=website.url)
        process.start()
        print("Done crawling")

        self.db.import_json("data.json", website)
        os.remove("data.json")
        print("Imported in SQLite3")

        if post_id:
            # Reply to post
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"": stats}, website.id)
            print(comment)
            if "total_size" in stats and stats["total_size"] > 10000000:
                post = self.reddit_bot.reddit.submission(post_id)
                self.reddit_bot.reply(post, comment)
                pass
            else:
                self.reddit_bot.log_crawl(post_id)

        elif comment_id:
            # Reply to comment
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
            print(comment)
            reddit_comment = self.reddit_bot.reddit.comment(comment_id)
            self.reddit_bot.reply(reddit_comment, comment)
        busy.value = 0
        print("Done crawling task")
Code Example #23
File: utils.py Project: chenhc/laravel
def get_fetch(log=False):
    settings = Settings()
    settings.set('LOG_ENABLED', log)

    crawler_process = CrawlerProcess(settings)
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()

    t = Thread(target=crawler_process.start_reactor)
    t.daemon = True
    t.start()

    shell = Shell(crawler)
    shell.code = 'adsf'

    import threading
    lock = threading.Lock()

    def fetch(url_or_request):
        lock.acquire()
        try:
            shell.fetch(url_or_request)
            response = shell.vars.get('response')
            return response
        finally:
            lock.release()

    return fetch
Code Example #24
File: scr.py Project: JINDALG/Roofpik_scrapy
def magic():

	process = CrawlerProcess(get_project_settings())

	# 'followall' is the name of one of the spiders of the project.
	process.crawl('magic')
	process.start() # the script will block here until the crawling is finished
Code Example #25
def scrape(spider):
    with transaction.atomic(), reversion.create_revision():
        process = CrawlerProcess(DEFAULT_CRAWLER_OPTIONS)
        process.crawl(spider)
        # the script will block here until the crawling is finished
        process.start()
    return
Code Example #26
File: t.py Project: szqh97/test
def main():
    """Setups item signal and run the spider"""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
 
    def catch_item(sender, item, **kwargs):
        print "Got:", item
 
    dispatcher.connect(catch_item, signal=signals.item_passed)
 
    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False
 
    # set up crawler
    from scrapy.crawler import CrawlerProcess
 
    crawler = CrawlerProcess(settings)
    crawler.start()
 
    # schedule spider
 
    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
Code Example #27
File: run.py Project: CkuT/crawlers
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback, scrapy_settings=None):
    """
    Launch crawl job for JobSpider class
    :param scrapy_settings: dict of settings merged with the CrawlerProcess default settings
    :param debug: (bool) Activate or disable debug
    :param spider_error_callback: callback for spider errors (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    settings = {
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False,
        'DOWNLOAD_DELAY': 1 if not debug else 0,
    }
    if scrapy_settings:
        settings.update(scrapy_settings)

    process = CrawlerProcess(settings)

    for spider_class in spiders_classes:
        process.crawl(spider_class, debug=debug)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
Code Example #28
File: ScrapyWebScraper2.py Project: ccampell/ATS
def main():
    settings = get_project_settings()
    # TODO: Initialize item pipelines
    # settings.set('ITEM_PIPELINES', {'Program.Scrapy.Items.HikerJournalWriterPipeline': 2})
    crawler = CrawlerProcess(settings=settings)
    spider = HikerScraper()
    crawler.crawl(spider, domain="http://www.trailjournals.com")
    crawler.start()
Code Example #29
File: tasks.py Project: richardcornish/timgorin
def scrape_task():
    """Celery task to scrape website with Scrapy.

    http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script
    """
    process = CrawlerProcess(get_project_settings())
    process.crawl('eslcafe', domain='eslcafe.com')
    process.start()
Code Example #30
def crawl_Info():
    """
    This function crawls free Shadowsocks accounts from http://www.ishadowsocks.com/.
    The results are stored in the Result.json file.
    """
    process = CrawlerProcess(get_project_settings())
    process.crawl('SSSpider')
    process.start()
Code Example #31
File: tasks.py Project: srmchcy/WeatherCrawler
def _crawl(path=None):
    crawl = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    crawl.crawl(ProvinceSpider)
    crawl.start()
    crawl.stop()
Code Example #32
        css_selector = '//img'
        for x in response.xpath(css_selector):
            newsel = '::attr(src)'
            links = x.css(newsel).extract_first()
            if links.endswith(".jpg"):
                yield {'Image Link': links}

    # check the next page for images and scrape them as well
        Page_selector = '.next a ::attr(href)'
        NextPage = response.css(Page_selector).extract_first()
        if NextPage:
            yield scrapy.Request(response.urljoin(NextPage),
                                 callback=self.parse)


process = CrawlerProcess({'FEED_FORMAT': 'json', 'FEED_URI': 'results.json'})

process.crawl(NewSpider)  # Selecting spider class
process.start()  # the script will block here until the crawling is finished

#to read the results.json file and display it when run
with open('results.json', 'rt') as filehandle:
    lines = filehandle.readlines()[1:15]
for line in lines:
    print(website + line.replace('{"Image Link": "', "").replace(
        '"}', "").replace(",", ""))

#To Display the Website
testurls = ['http://172.18.58.238/hr2/']
import webbrowser
for url in testurls:
Code Example #33
            img_path = 'images/' + str("_".join(data['name'].split())) + '.jpg'
            with open(img_path, 'wb') as handle:
                response = requests.get(img_url, stream=True)

                if not response.ok:
                    print(response)

                for block in response.iter_content(1024):
                    if not block:
                        break

                    handle.write(block)

            data['image_path'] = img_path
        else:
            data['image_path'] = None
        yield data


process = CrawlerProcess(settings={
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'celeb.csv'
})

process.crawl(CelebSpider)
process.start()

total = 100 + 799 + 925
df = pd.read_csv('celeb.csv')

print("percentage found = {}%".format(len(df) / total))
Code Example #34
File: hessen.py Project: AndreBruecke/TWM
            extractor = LinkExtractor(allow=("presse"),
                                      allow_domains='hessen.de')
            links = extractor.extract_links(response)
            extractor = LinkExtractor(
                deny_domains=('www.hessen.de', 'facebook.com', 'youtube.com',
                              'twitter.com', 'instagram.com',
                              'radroutenplaner.hessen.de'))
            linksext = extractor.extract_links(response)
            for link in linksext:
                yield {
                    'from': response.url,
                    'url': link.url,
                    'text': link.text.strip()
                }
            for link in links:
                absolute_next_page_url = response.urljoin(link.url)
                yield scrapy.Request(absolute_next_page_url)


c = CrawlerProcess({
    'USER_AGENT': 'HochschuleDarmstadt-TextWebMining',
    'FEED_FORMAT': 'csv',
    'FEED_URI': '/media/sf_Shared/Git/data/HessenPresse.csv',
    'DOWNLOAD_DELAY': 1,
    'ROBOTSTXT_OBEY': True,
    'HTTPCACHE_ENABLED': True
})

c.crawl(HessenSpider)
c.start()  # the script will block here until the crawling is finished
Code Example #35
def run_spider(spiders):
	process = CrawlerProcess(get_project_settings())
	for spider in spiders:
		process.crawl(spider)
	
	process.start()
Code Example #36
        # Narrow in on the course blocks
        course_blocks = response.css('div.course-block')
        # Direct to the course links
        course_links = course_blocks.xpath('./a/@href')
        # Extract the links (as a list of strings)
        links_to_follow = course_links.extract()
        # Follow the links to the next parser
        for url in links_to_follow:
            yield response.follow(url=url, callback=self.parse_pages)

    def parse_pages(self, response):
        """Code to parse course pages"""
        # Direct to the course title text
        crs_title = response.xpath(
            '//h1[contains(@class,"header-hero__title")]/text()')
        # Extract and clean the course title text
        crs_title_ext = crs_title.extract_first().strip()
        # Direct to the chapter titles text
        ch_titles = response.css('h4.chapter__title::text')
        # Extract and clean the chapter titles text
        ch_titles_ext = [t.strip() for t in ch_titles.extract()]
        # Store this in our dictionary
        dc_dict[crs_title_ext] = ch_titles_ext


dc_dict = dict()

process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()
Code Example #37
                           c,
                           flags=re.S)[0]
            s = json.loads(s)
        content = s['detail']['content']
        print(content)


# Fallback so this script can also crawl standalone; when started from the project, the code below is not used
if __name__ == '__main__':
    import os, time
    from scrapy.crawler import CrawlerProcess
    timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())  # YYYYMMDD_HHMMSS
    filename = 'v{}.json'.format(timestamp)  # output file name (takes effect if 'FEED_URI' below is uncommented)
    jobdir = 'JOBDIR/JKzECrMyDU'  # job state directory (takes effect if 'JOBDIR' below is uncommented)

    p = CrawlerProcess({
        'TELNETCONSOLE_ENABLED': False,  # rarely used feature; disabling it speeds up spider startup
        'MEDIA_ALLOW_REDIRECTS': True,  # allow image download URLs to redirect; recommended whenever images are downloaded
        'LOG_LEVEL': 'INFO',  # DEBUG, INFO, WARNING, ERROR, CRITICAL
        # 'JOBDIR':                   jobdir,     # uncomment to enable pause/resume crawling:
        # storage for the request queue, dedup fingerprints and job state (essentially just a folder)
        # 'FEED_URI':                 filename,   # write scraped data to a file
        # 'FEED_EXPORT_ENCODING':     'utf-8',    # roughly equivalent to ensure_ascii=False
        # 'FEED_FORMAT':              'json',     # output format; defaults to jsonlines if not set;
        # supported formats: json, jsonlines, csv, xml, pickle, marshal
        # 'DOWNLOAD_TIMEOUT':         8,          # global request timeout, default 180; per-request timeouts can be set via meta (download_timeout)
        # 'DOWNLOAD_DELAY':           1,          # global download delay; more intuitive than the other throttling settings
    })
    p.crawl(VSpider)
    p.start()
Code Example #38
                items["content"] = cons
                items["images"] = ""
                items["release_time"] = time.strftime("%Y-%m-%d")
                items["qa"] = ""
                items["source"] = urlparse(response.url).netloc
                items["author"] = ""
                items["url"] = response.url
                items["entity"] = ""
                items["label"] = []
                items["summary"] = []
                items["time_stamp"] = int(time.time())
                items["priority"] = 0
                items["nlp_state"] = 0
                items["static_page"] = 0
                s1 = {'hotword_id': hotword_id}
                s2 = {'$set': {'article_state': 1}}
                self.client.dailypops.hotword.update(s1, s2)
                yield items

    def md5_(self, str):
        md5 = hashlib.md5()
        data = str
        md5.update(data.encode('utf-8'))
        return md5.hexdigest()


if __name__ == '__main__':
    chinadaily = CrawlerProcess()
    chinadaily.crawl(Chinadaily)
    chinadaily.start()
Code Example #39
            #        'addressStreet': item['addressStreet'],
            #        'addressState': item['addressState'],
            #        'addressCity': item['addressCity'],
            #        'addressZipcode': item['addressZipcode'],
            #        # 'description': item['description'],
            #        'beds': item['beds'],
            #        'baths': item['baths'],
            #        'area': item['area'],
            #        'latitude': item['latLong']['latitude'],
            #        'longitude': item['latLong']['longitude'],
            #        # 'brokerName': item['brokerName'],
            #        # 'brokerPhone': item['brokerPhone'],
            #        'yearBuilt': item['hdpData']['homeInfo']['yearBuilt'],
            #        'lotSize': item['hdpData']['homeInfo']['lotSize'],
            #        'homeType': item['hdpData']['homeInfo']['homeType'],
            #        'homeStatus': item['hdpData']['homeInfo']['homeStatus'],
            #        'zestimate': item['hdpData']['homeInfo']['zestimate'],
            #        # 'rentZestimate': item['hdpData']['homeInfo']['rentZestimate'],
            #        'festimate': item['hdpData']['homeInfo']['festimate'],
            #        'hiResImageLink': item['hdpData']['homeInfo']['hiResImageLink'], }

# main driver
if __name__ == '__main__':
    # run spider
    process = CrawlerProcess()
    process.crawl(ZillowSpider)
    process.start()

    # debug data extraction logic
    # ZillowSpider.parse(ZillowSpider, '')
Code Example #40
File: runner.py Project: Artem3824/Scraping_Crawling
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from leroymerlin import settings
from leroymerlin.spiders.leroy import LeroySpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    process_object = CrawlerProcess(settings=crawler_settings)
    process_object.crawl(LeroySpider, category='молоток')
    process_object.start()
Code Example #41
File: runner.py Project: olightnn/geekbrains
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from gbparse import settings
#from gbparse.spiders.avito import AvitoSpider
#from gbparse.spiders.geekbrains import GeekbrainsSpider
#from gbparse.spiders.hhru import HhruSpider
from gbparse.spiders.vk import VkSpider

if __name__ == '__main__':
    scr_settings = Settings()
    scr_settings.setmodule(settings)
    process = CrawlerProcess(settings=scr_settings)
    #    process.crawl(AvitoSpider)
    #    process.crawl(GeekbrainsSpider)
    #    process.crawl(HhruSpider)
    process.crawl(VkSpider)
    process.start()
Code Example #42
File: main.py Project: LTNMinh/LightNovelCrawler
        r = r"url\('(.*)'\)"

        name = "Name :" + html_soup.select(NAME_SELECTOR)[0].text
        author = "Author :" + html_soup.select(AUTHOR_SELECTOR)[0].text
        image_url = re.findall(
            r,
            html_soup.select(IMAGE_SELECTOR)[0].attrs['style'])[0]

        with open(basename(image_url), "wb") as f:
            f.write(get(image_url).content)

        window["-NAME-"].update(name)
        window["-AUTHOR-"].update(author)
        im = Image.open(basename(image_url))
        im.save('temp.png')
        window["-IMAGE-"].update(filename='temp.png')

    elif event == "DOWNLOAD":
        process = CrawlerProcess()
        process.crawl(GetLightNovelSpider,
                      start_url=url,
                      author=author,
                      name=name,
                      html_soup=html_soup)
        process.start()
        process.stop()

    elif event == "UPDATE":
        spider = UpdateLightNovel(name[6:] + '.epub')
        spider.update()
Code Example #43
# import the spiders you want to run
from spiders.toscrape import ToScrapeSpider
from spiders.toscrape2 import ToScrapeSpiderTwo

# scrapy api imports
# from scrapy import signals, log
from scrapy import signals
import logging
from twisted.internet import reactor
# from scrapy.crawler import Crawler
from scrapy.crawler import CrawlerProcess
# from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings

process = CrawlerProcess()
process.crawl(ToScrapeSpider)
process.crawl(ToScrapeSpiderTwo)
process.start()  # the script will block here until all crawling jobs are finished
Code Example #44
File: scraper.py Project: weiks/flatfooted
 def _setup_process(self):
     self.now = datetime.now(self.settings.timezone).strftime(TS_FORMAT)
     self.process = CrawlerProcess(self._crawler_options())
     for site in self.sites:
         self.process.crawl(Spider, settings=site.settings, now=self.now)
Code Example #45
File: main.py Project: yonghuming/python-examples
#!/usr/bin/env python3

import scrapy


class MySpider(scrapy.Spider):

    name = 'myspider'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # get list or use empty list
        # (as default it would return `None` but `start_urls` has to be list)
        self.start_urls = kwargs.get('urls', [])

    def parse(self, response):
        print('url:', response.url)


# --- it runs without project and saves in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(MySpider, urls=['http://quotes.toscrape.com'])
c.start()
Code Example #46
File: scraper.py Project: weiks/flatfooted
class Scraper:
    def __init__(self, settings):
        self.settings = Settings(settings)
        self.sites = [Site(name, settings) for name in self.settings.names]
        self._setup_process()

    def start(self):
        self.process.start()
        self._json_to_csv()

    def _json_to_csv(self):
        for name in self.settings.names:
            data_exists = False
            errors_csv_name = self._file_with_name(name,
                                                   ext='csv',
                                                   appendix='_errors')
            csv_name = self._file_with_name(name, ext='csv')
            json_name = self._file_with_name(name)
            try:
                data = pandas.read_json(json_name)
                errors = pandas.read_csv(errors_csv_name)
                data_exists = True
            except ValueError:
                pass
            if data_exists:
                results = self._postprocess_dataframe(data, errors)
                results.to_csv(csv_name)

    def _postprocess_dataframe(self, data, errors):
        if 'url_item' in data.columns and 'url_search' in data.columns:
            searches = (data[data['url_item'].isnull()].set_index(
                'search_string').dropna(axis='columns', how='all'))
            items = (data[data['url_search'].isnull()].set_index(
                'search_string').dropna(axis='columns', how='all'))
            results = searches.join(items, how='outer', rsuffix='_delete')
            return results[[c for c in results.columns if '_delete' not in c]]
        return data

    def _setup_process(self):
        self.now = datetime.now(self.settings.timezone).strftime(TS_FORMAT)
        self.process = CrawlerProcess(self._crawler_options())
        for site in self.sites:
            self.process.crawl(Spider, settings=site.settings, now=self.now)

    def _crawler_options(self):
        """Return crrawlwer options

        `DOWNLOAD_DELAY` is in seconds, and is such that if
        `RANDOMIZE_DOWNLOAD_DELAY` is set to `True`, then the requests will
        happen between 0.5 * `DOWNLOAD_DELAY` and 1.5 * `DOWNLOAD_DELAY`.

        `RETRY_TIMES` and `RETRY_HTTP_CODES` must be much more flexible if
        proxies are being used because proxies can fail for a variety of
        reasons, and we need to be able to adapt to that.
        """
        options = {
            'RANDOMIZE_DOWNLOAD_DELAY': True,
            'AUTOTHROTTLE_TARGET_CONCURRENCY': 0.1,
            'AUTOTHROTTLE_ENABLED': True,
            'CONCURRENT_REQUESTS': 2,
            'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
            'FEED_FORMAT': 'json',
            'FEED_URI': self._file_name(),
            'COOKIES_ENABLED': False,
            'LOG_LEVEL': 'DEBUG',
            'RETRY_TIMES': 2,
            'DOWNLOAD_DELAY': 5,
            'DOWNLOAD_TIMEOUT': 120,
            'DOWNLOADER_MIDDLEWARES': {
                # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
                # 'scraper.middlewares.CustomRetriesMiddleware': 550,
                'scraper.middlewares.SeleniumMiddleware': 950
            },
            'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter'
        }
        m = 'DOWNLOADER_MIDDLEWARES'
        if self.settings.random_proxies:
            options[m]['scraper.middlewares.ProxiesMiddleware'] = 410
        if self.settings.random_user_agents:
            options[m]['scraper.middlewares.RandomUserAgentsMiddleware'] = 400
        if self.settings.mongo:
            options['ITEM_PIPELINES'] = {
                'scraper.pipelines.MongoWriterPipeline': 700
            }
        return options

    def _file_with_name(self, name, ext='json', appendix=''):
        return self._file_name(ext, appendix).replace("%(name)s", name)

    def _file_name(self, ext='json', appendix=''):
        return "outputs/%(name)s_{}{}.{}".format(self.now, appendix, ext)
Code Example #47
                "search":
                ""
            }

            yield response.follow(
                url=self.base_url,
                method='POST',
                dont_filter=True,
                headers=self.headers,
                body=json.dumps(para),
                meta={
                    # "dis_id": dis_id,
                    'filename': filename
                    # "seo_url": seo_url
                },
                callback=self.parse_page)

    def parse_page(self, response):
        data = json.loads(response.body)
        filename = response.meta["filename"]
        with open(filename, "a") as f:
            for item in data["articles"]:
                f.write(json.dumps(item) + '\n')


if __name__ == '__main__':
    # run scraper
    process = CrawlerProcess()
    process.crawl(chothueSpider)
    process.start()
Code Example #48
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from workparser import settings
from workparser.spiders.hhru import HhruSpider
from workparser.spiders.sjru import SjruSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(HhruSpider)
    #process.crawl(SjruSpider)
    process.start()
Code Example #49
            for tag in body.select('style'):
                tag.decompose()

            text = body.get_text(separator='\n')
            text = text.replace("\n", " ").replace("\t",
                                                   " ").replace("\r", " ")
            return text.lower()

        web_text = get_text_bs(web_text)

        exsit_list = checkActivity(act_list, web_text)
        activities = ', '.join(exsit_list)
        start_url = ', '.join(self.start_urls)

        item = {}
        item['start_url'] = start_url
        item['activities'] = activities
        return item


process = CrawlerProcess(settings={
    "FEEDS": {
        "data/items_9.json": {
            "format": "json"
        },
    },
})

process.crawl(ActivitySpider9)
process.start()
Code Example #50
File: start_parse.py Project: ZharovaNV/Data-Mining
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from blogparse import settings
# from blogparse.spiders.habr_blog import HabrBlogSpider
from blogparse.spiders.avito import AvitoSpider

if __name__ == '__main__':
    craw_settings = Settings()
    craw_settings.setmodule(settings)
    crawler_proc = CrawlerProcess(settings=craw_settings)
    # crawler_proc.crawl(HabrBlogSpider)
    crawler_proc.crawl(AvitoSpider)
    crawler_proc.start()
Code Example #51
    #start_urls = []

    tags = ['love', 'inspirational', 'life', 'humor', 'books', 'reading']
    pages = 3
    url_template = 'http://quotes.toscrape.com/tag/{}/page/{}'

    def start_requests(self):

        for tag in self.tags:
            for page in range(self.pages):
                url = self.url_template.format(tag, page)
                yield scrapy.Request(url)

    def parse(self, response):
        print('url:', response.url)


# --- run it without project ---

from scrapy.crawler import CrawlerProcess

#c = CrawlerProcess({
#    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
#    'FEED_FORMAT': 'csv',
#    'FEED_URI': 'data.json',
#}

c = CrawlerProcess()
c.crawl(MySpider)
c.start()
Code Example #52
from shixiseng.spiders.shixisengspider import ShixisengspiderSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the settings from the project's settings.py module
settings = get_project_settings()
process = CrawlerProcess(settings=settings)

# Multiple spiders can be added here
process.crawl(ShixisengspiderSpider)
# Start crawling; this blocks until the crawl is finished
process.start()
Code Example #53
File: controlador.py Project: GitNolas/tfgVidalAmor
import argparse
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

parser = argparse.ArgumentParser()
# Script arguments
parser.add_argument('-f', '--futbol', action='store_true')
parser.add_argument('-bm', '--balonman', action='store_true')
parser.add_argument('-cF', '--codigoFederacion', default='none')
parser.add_argument('-cC', '--codigoCompeticion', required=True)
parser.add_argument('-cG', '--codigoGrupo', default=None)
parser.add_argument('-cT', '--codigoTemporada', default='15')
parser.add_argument('-cX', '--codigoXornada', default=0)

args = parser.parse_args()
process = CrawlerProcess({'SPIDER_MODULES': 'tfgObtencionDatos.spiders'})
if args.futbol:  # in case football data is needed
    federacionsPNFG = ['gal', 'mad', 'ceu', 'rioj', 'clm', 'and', 'cant']
    federacionsPNFGBasic = ['arg', 'mur', 'ext']
    if args.codigoFederacion in federacionsPNFG:  #Scraper PNFG
        process.crawl('pnfg',
                      federacion=args.codigoFederacion,
                      grupo=args.codigoGrupo,
                      competicion=args.codigoCompeticion,
                      temporada=args.codigoTemporada,
                      xornada=args.codigoXornada)
        process.start()
    elif args.codigoFederacion in federacionsPNFGBasic:  #Scraper PNFG basic
        process.crawl('pnfgBasic',
                      federacion=args.codigoFederacion,
                      grupo=args.codigoGrupo,
Code Example #54
File: main.py Project: whitmans-max/python-examples
        def parse(self, response):
            data = json.loads(response.body)
            for item in data.get('data', []):
                yield {
                    'car_id': item.get('id'),
                    'car_name' : item.get('title'),
                    'price': item.get('price.value.currency.display'),
                    'user_id': item.get('user_id')
                 #   'user_name':
                }

            metadata = data.get('metadata')
            if metadata:
                url = metadata.get('next_page_url')
                if url:
                    yield scrapy.Request(url)
                
            
# --- it runs without project and saves in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file as CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv', # 
})
c.crawl(MySpider)
c.start()                
Code Example #55
File: index.py Project: zipme/docsearch-scraper
def run_config(config):
    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    if config.use_anchors:
        from . import scrapy_patch

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id, config.api_key, config.index_name,
        AlgoliaSettings.get(config, strategy.levels), config.query_rules,
        environ.get('REPLACE_DOMAIN', None))

    DOWNLOADER_MIDDLEWARES_PATH = 'scraper.src.custom_downloader_middleware.CustomDownloaderMiddleware'
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.src.scrapy_patch.CustomContextFactory'
    DUPEFILTER_CLASS_PATH = 'scraper.src.custom_dupefilter.CustomDupeFilter'

    if __name__ == '__main__':
        DOWNLOADER_MIDDLEWARES_PATH = 'src.custom_downloader_middleware.CustomDownloaderMiddleware'
        DOWNLOADER_CLIENTCONTEXTFACTORY = 'src.scrapy_patch.CustomContextFactory'
        DUPEFILTER_CLASS_PATH = 'src.custom_dupefilter.CustomDupeFilter'

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
        'DOWNLOADER_MIDDLEWARES': {
            DOWNLOADER_MIDDLEWARES_PATH: 900
        },
        # Need to be > 600 to be after the redirectMiddleware
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY,
        'DUPEFILTER_USE_ANCHORS': config.use_anchors,
        # Use our custom dupefilter in order to be scheme agnostic regarding link provided
        'DUPEFILTER_CLASS': DUPEFILTER_CLASS_PATH
    })

    process.crawl(DocumentationSpider,
                  config=config,
                  algolia_helper=algolia_helper,
                  strategy=strategy)

    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocumentationSpider.NB_INDEXED))
        # config.update_nb_hits(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()

    print("")
Code Example #56
def crawl_run():
    scope = 'all'
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(QuotesSpider, scope)
    process.start()
    process.join()
Code Example #57
                    fRelationName = tds[1].get_text()
                    fHouseName = tds[2].get_text()
                    fSerialNo = tds[3].get_text().strip()
                    fLACNo = tds[4].get_text().strip()
                    fPSNo = tds[5].get_text().strip()
                    fIdCardNo = tds[6].select('td > a')[0].get_text().strip()
                    fStatus = tds[7].get_text().strip()
                    fPrimaryIdCardNo = idCardNo
                    self.familyWriter.writerow([
                        fNameOfElector, fRelationName, fHouseName, fSerialNo,
                        fLACNo, fPSNo, fIdCardNo, fStatus, fPrimaryIdCardNo,
                        addStr
                    ])
        except Exception as e:
            print("[Family Write Error]", e)
            filename = response.url.split('=')
            filename = filename[len(filename) - 1] + '.html'
            with open(filename, 'wb') as f:
                f.write(response.body)
            self.log('Saved file %s' % filename)
            soup = BeautifulSoup(open(filename, encoding="utf-8"), 'lxml')
            if ('Invalid access to the page' in soup.text):
                os.remove(filename)


if __name__ == "__main__":

    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start()
Code Example #58
File: sample1.py Project: oujx28/Spider_study
# coding:utf-8

import scrapy
from scrapy.crawler import CrawlerProcess

class MySpider1(scrapy.Spider):
    # Your first spider definition
    pass

class MySpider2(scrapy.Spider):
    # Your second spider definition
    pass

process = CrawlerProcess()
process.crawl(MySpider1)
process.crawl(MySpider2)
process.start()
Code Example #59
File: main.py Project: mso13/BrazilianFinancialNews
        results_dict['full_text'] = news_full_text_ext
        results_dict['link'] = response.url
        results_dict['tags'] = news_tags_ext

        results_list.append(results_dict)


if __name__ == '__main__':

    THIS_DIR = os.path.dirname(os.path.abspath(__file__))

    filename = 'moneytimes'

    # List to save the data collected
    results_list = list()

    # Initiate a CrawlerProcess
    process = CrawlerProcess()

    # Tell the process which spider to use
    process.crawl(MoneyTimesSpider)

    # Start the crawling process
    process.start()

    # Save the list of dicts
    with open(os.path.join(THIS_DIR +
                           '/data/results-{}.json'.format(filename)),
              'w',
              encoding='utf8') as f:
        json.dump(results_list, f, ensure_ascii=False)
Code Example #60
        for data in datas:
            yield {
                'scrape_date': scrape_date,
                'types': types,
                'user_pic': user_pic,
                'date_update': date_update,
                'provinsi': provinsi,
                'kabkot': kabkot,
                'kecamatan': data["properties"]["name"],
                'kelurahan': '',
                'alamat': '',
                'total_odp': data["properties"]["odp_total"],
                'total_pdp': data["properties"]["pdp_total"],
                'total_positif': data["properties"]["positif_total"],
                'positif_sembuh': data["properties"]["positif_sembuh"],
                'positif_dirawat': data["properties"]["positif_dirawat"],
                'positif_isolasi': '',
                'positif_meninggal': data["properties"]["positif_meninggal"],
                'total_otg': '',
                'odr_total': '',
                'total_pp': '',
                'total_ppdt': '',
                'source_link': source_link,
            }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(PurbalinggaSpider)
    process.start()