def service_sis(self):
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(worker.Worker)
    process.start()  # the script will block here until the crawling is finished
class CrawlerWorker(multiprocessing.Process):

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
    # __init__

    def _item_passed(self, item):
        self.items.append(item)
    # _item_passed

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
    # run
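# Usage sketch for the CrawlerWorker above (an assumption, not part of the
# original snippet): run the crawl in a child process and hand the scraped
# items back through the multiprocessing.Queue. `MySpider` is a placeholder
# for whatever spider class the surrounding (legacy Scrapy) project provides.
import multiprocessing


def collect_items(spider):
    result_queue = multiprocessing.Queue()
    worker = CrawlerWorker(spider, result_queue)
    worker.start()               # executes CrawlerWorker.run() in a child process
    items = result_queue.get()   # blocks until the worker puts its item list
    worker.join()
    return items


# items = collect_items(MySpider())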
class BaseScraper(CrawlSpider):
    name = "base"
    rules = (
        Rule(LinkExtractor(allow=()), callback='parse_item'),
    )

    def __init__(self, index, start_urls, allowed_domains=[], *args, **kwargs):
        self.allowed_domains = allowed_domains
        self.start_urls = start_urls
        self.index = index
        super(BaseScraper, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        item = {}
        item["body"] = response.body
        yield item

    # Instantiates a CrawlerProcess, which spins up a Twisted reactor.
    def connect(self):
        self.process = CrawlerProcess(get_project_settings())

    # Start the scraper. The crawl process must be instantiated with the same
    # attributes as the instance.
    def start(self):
        self.connect()
        self.process.crawl(
            self.name,
            self.index,
            start_urls=self.start_urls,
            allowed_domains=self.allowed_domains,
        )
        self.process.start()
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "ch:t:s:", ['title=', 'section='])
    except getopt.GetoptError:
        print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
            sys.exit()
        elif opt == '-c':
            # start crawling articles here
            print "crawling"
            process = CrawlerProcess(get_project_settings())
            process.crawl(BBCArticleSpider)
            process.start()
        elif opt in ('-t', '--title'):
            print "search by title"
            # start searching articles by title
            results = BBCArticleItem.fetch_by_title(arg)
            for result in results:
                print result
        elif opt in ('-s', '--section'):
            print "search by section"
            # start searching articles by section
            results = BBCArticleItem.fetch_by_section(arg)
            for result in results:
                print result
def crawl(ctx, spiders, stats):
    """
    Crawl one, many, or all pages.

    What spider(s) to run is determined in the following order:

    1. Spider(s) given as argument(s)
    2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored. All available spiders will be used to
    crawl if no arguments are given and no spiders are configured.
    """
    settings = ctx.obj["settings"]
    if stats:
        settings.set("STATS_CLASS", "scrapy.statscollectors.MemoryStatsCollector")

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error("Please specify what spiders you want to run!")
    else:
        for spider in spiders:
            logger.info("Starting crawl of {} ...".format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool("HTTPCACHE_ENABLED"):
        run_cleanup_cache(settings)
def __init__(self, titlesfile=None, platform=None, region=None):

    # set default encoding to utf8 for parsing and logging
    # utf-8 characters in console and files
    # reload(sys)
    sys.setdefaultencoding('utf8')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='export.log',
        filemode='a',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    # identify platform
    self.platform = platform
    if self.platform is None:
        logging.error('No platform found! Pass it as an argument.')
        return
    else:
        platformId = platforms.getId(self.platform)
        if platformId is None:
            logging.error('Platform ' + self.platform + ' not supported.')
            return

    self.titlesfile = titlesfile
    self.region = region
    if self.region is None:
        self.region = "Worldwide"

    if titlesfile:
        titles = []
        urls = []
        with open(self.titlesfile) as f:
            titles = f.read().splitlines()
        for title in titles:
            logging.debug('Submitting title: ' + title)
            urls.append(
                'http://mobygames.com/search/quick'
                '?q=' + title +
                '&p=' + platformId +
                '&search=Go'
                '&sFilter=1'
                '&sG=on'
                '&search_title=' + urllib.quote(title) +
                '&search_platform=' + urllib.quote(self.platform) +
                '&search_region=' + urllib.quote(self.region)
            )

        process = CrawlerProcess(get_project_settings())
        process.crawl(MobygamesSpider, start_urls=urls)
        process.start()
    else:
        logging.warning('No file.')
def run_spider():
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
    settings.set("ITEM_PIPELINES", {
        'pipelines.FilterProxyPipline': 1,
        'pipelines.SaveProxyPipeline': 2
    })
    settings.set("LOG_STDOUT", True)

    # Configure logging rules
    # configure_logging({
    #     'filename': datetime.now().strftime('%Y_%m_%d_%H_proxy.log'),
    #     'format': '%(asctime)s %(levelname)-8s %(name)-15s %(message)s',
    #     'level': logging.INFO
    # })
    configure_logging(install_root_handler=False)

    # Initialize the log path
    logpath = datetime.now().strftime(log_path)
    if not os.path.isdir(logpath):
        os.makedirs(logpath)
    logging.basicConfig(
        filename=datetime.now().strftime('%s/%s_proxy.log' % (logpath, log_file)),
        format=log_format,
        level=logging.INFO
    )

    process = CrawlerProcess(settings)
    process.crawl(GetProxySpider)
    process.start()
def handle(self, *args, **options):
    setting = {
        'USER_AGENT': options['user_agent'],
        'DOWNLOAD_DELAY': options['download_delay'],
        'LOG_FILE': settings.SCRAPY_LOG_FILE,
        'LOG_LEVEL': settings.SCRAPY_LOG_LEVEL,
    }

    if options['proxy_list']:
        try:
            f = open(options['proxy_list'])
        except IOError as e:
            raise CommandError('cannot open proxy list file for read')

        # Retry many times since proxies often fail
        setting['RETRY_TIMES'] = 10
        # Retry on most error codes since proxies fail for different reasons
        setting['RETRY_HTTP_CODES'] = [500, 503, 504, 400, 403, 404, 408]
        setting['DOWNLOADER_MIDDLEWARES'] = {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
            'spider.randomproxy.RandomProxy': 100,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
        }
        setting['PROXY_LIST'] = options['proxy_list']

    process = CrawlerProcess(setting)
    process.crawl(BaiduSpider)
    process.start()
def news_flash_crawl(rss_link, site_name, maps_key):
    id_flash = get_latest_id_from_db() + 1
    latest_date = get_latest_date_from_db()
    d = feedparser.parse(rss_link)
    process = CrawlerProcess()
    for entry in d.entries[::-1]:
        entry_parsed_date = datetime.strptime(entry.published[:-6], '%a, %d %b %Y %H:%M:%S')
        entry_parsed_date = entry_parsed_date.replace(tzinfo=None)
        if (latest_date is not None and entry_parsed_date > latest_date) or latest_date is None:
            news_item = {'id_flash': id_flash,
                         'date_parsed': entry_parsed_date,
                         'title': entry.title,
                         'link': entry.links[0].href,
                         'date': entry.published,
                         'location': '',
                         'lat': 0,
                         'lon': 0}
            if ((u'תאונ' in entry.title and u'תאונת עבודה' not in entry.title
                 and u'תאונות עבודה' not in entry.title)
                    or ((u'רכב' in entry.title or u'אוטובוס' in entry.title
                         or u"ג'יפ" in entry.title or u'משאית' in entry.title
                         or u'קטנוע' in entry.title or u'אופנוע' in entry.title
                         or u'אופניים' in entry.title or u'קורקינט' in entry.title
                         or u'הולך רגל' in entry.title or u'הולכת רגל' in entry.title
                         or u'הולכי רגל' in entry.title)
                        and (u'נפגע' in entry.title or u'פגיע' in entry.title
                             or u'נפצע' in entry.title or u'פציע' in entry.title
                             or u'התנגש' in entry.title or u'התהפך' in entry.title
                             or u'התהפכ' in entry.title))):
                news_item['accident'] = True
            else:
                news_item['accident'] = False
            if site_name == 'ynet':
                news_item['source'] = 'ynet'
                process.crawl(YnetFlashScrap, entry.links[0].href, news_item=news_item, maps_key=maps_key)
            id_flash = id_flash + 1
    process.start()
def main(tabLink):
    if "ultimate-guitar.com" in tabLink:
        tabSpider = Spiders.Ultimate(tabLink)
    elif "guitartabs.cc" in tabLink:
        tabSpider = Spiders.TabCC(tabLink)
    else:
        print("Domain name not supported.")
        return

    # Make a process to instantiate a spider with the given
    # arguments and make it crawl the link
    process = CrawlerProcess(get_project_settings())
    process.crawl(tabSpider, link=tabLink)
    process.start()

    # The link has been scraped, now process it
    tree = xmltree.parse(tabs.pipelines.filename)
    root = tree.getroot()
    value = root[0][0][0]
    rawTab = value.text
    if r"\M" in rawTab:
        rawTab = parsefuncs.removeLineEndings(rawTab)
    cleanTab = parsefuncs.parseTab(rawTab)
    print("Clean tab is:")
    count = 0
    for line in cleanTab:
        count += 1
        print(line)
        if count % 6 == 0:
            print(" ")
def run(self):
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl('stackoverflow')
    process.start()
def ScrapeSite():
    db = 'crunchbase_startups'
    sitedomain = raw_input("Enter site domain: ")  # get user input
    sitedomain = parse_base_url(sitedomain)  # clean url

    sql = 'SELECT text FROM {} WHERE siteurl = %s'.format(db)
    cur.execute(sql, sitedomain)
    sitetext = cur.fetch()
    if sitetext != '':  # what does an empty ping return?
        print 'Site already scraped.'
        return sitetext

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'pipelines.UserInputPipeline': 100},
        'DEPTH_LIMIT': 2,
        'DOWNLOAD_HANDLERS': {'s3': None},
        'LOG_LEVEL': 'INFO'
    })
    process.crawl(SoloSpider, domain=sitedomain)
    process.start()

    # presumably finished here - pull newly loaded sitetext for domain
    cur.execute(sql, sitedomain)
    return cur.fetch()
def get(self):
    while True:
        process = CrawlerProcess(get_project_settings())
        process.crawl('iqiyi')
        process.start()
        time.sleep(3000)
    self.finish()
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def handle(self, *args, **options):
    # It would be better to pass this in as a parameter to PayoutSpider
    global start_date
    start_date = datetime.datetime(2015, 1, 1, tzinfo=pytz.UTC)

    delete = options.get('delete')
    delete_all = options.get('delete_all')
    retrieve_all = options.get('retrieve_all')

    previous_payout = None
    previous_payouts = codementor_models.Payout.objects.all().order_by('-date')
    if delete_all or (delete and previous_payouts.count() == 0):
        codementor_models.Review.objects.all().delete()
        codementor_models.Session.objects.all().delete()
        codementor_models.Payout.objects.all().delete()
        codementor_models.Payment.objects.all().delete()
    elif delete:
        previous_payout = previous_payouts[0]
        codementor_models.Review.objects.filter(date__gt=start_date).delete()
        codementor_models.Session.objects.filter(started_at__gt=start_date).delete()
        previous_payout.delete()
        codementor_models.Payment.objects.filter(payout__isnull=True).delete()

    if not retrieve_all and previous_payout:
        start_date = previous_payout.date

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(PayoutSpider)
    process.start()
def run(urls, city):
    process = CrawlerProcess()
    spiders = [make_spider(artist, url, city) for artist, url in urls]
    for spider_cls in spiders:
        process.crawl(spider_cls)
    # the script will block here until the crawling is finished
    process.start()
def get_scraped_sites_data():
    """Returns output for venues which need to be scraped."""
    class RefDict(dict):
        """A dictionary which returns a reference to itself when deepcopied."""
        def __deepcopy__(self, memo):
            return self

    # Hack: we pass a dictionary which can't be deep-copied into the settings
    # so as to _return_ the scraper output. As far as I can tell, this is the
    # only way to return the scraper output to the script itself.
    output = RefDict()

    settings = Settings({
        'LOG_ENABLED': False,
        'ITEM_PIPELINES': {
            'mgrok.pipelines.JsonWriterPipeline': 1
        },
        'PIPELINE_OUTPUT': output,
        'USER_AGENT': 'Chrome/41.0.2228.0'
    })

    crawler_process = CrawlerProcess(settings)
    for spider in SCRAPY_SPIDERS:
        crawler_process.crawl(spider)
    crawler_process.start()

    return output
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback):
    """
    Launch a crawl job for the given JobSpider classes.

    :param debug: (bool) activate or disable debug
    :param spider_error_callback: callback for spider errors
        (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    process = CrawlerProcess({
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False
    })

    for spider_class in spiders_classes:
        process.crawl(spider_class)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
def spiderCrawl(bandname):
    createLink(bandname)
    settings = get_project_settings()
    settings.set('USER_AGENT', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
    process = CrawlerProcess(settings)
    process.crawl(MySpider)
    process.start()
def Test_Scapy(self):
    spider = FtpSpider()
    process = CrawlerProcess({
        "USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
    })
    process.crawl(spider)
    process.start()
class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        # if not hasattr(project, 'crawler'):
        #     self.crawler.install()
        # self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider))
        p.start()
        p.join()
        return queue.get(True)
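# Usage sketch for CrawlerScript above (an assumption, not part of the original
# snippet): each call runs the named spider in a fresh child process, which is
# what lets one script trigger several crawls even though the Twisted reactor
# cannot be restarted. 'my_spider' is a placeholder spider name.
def run_named_spider(spider_name='my_spider'):
    script = CrawlerScript()
    return script.crawl(spider_name)  # returns the items collected in the child process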
def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
    busy.value = 1
    if os.path.exists("data.json"):
        os.remove("data.json")
    print("Started crawling task")
    process = CrawlerProcess(get_project_settings())
    process.crawl("od_links", base_url=website.url)
    process.start()
    print("Done crawling")

    self.db.import_json("data.json", website)
    os.remove("data.json")
    print("Imported in SQLite3")

    if post_id:
        # Reply to post
        stats = self.db.get_website_stats(website.id)
        comment = self.reddit_bot.get_comment({"": stats}, website.id)
        print(comment)
        if "total_size" in stats and stats["total_size"] > 10000000:
            post = self.reddit_bot.reddit.submission(post_id)
            self.reddit_bot.reply(post, comment)
        else:
            self.reddit_bot.log_crawl(post_id)
    elif comment_id:
        # Reply to comment
        stats = self.db.get_website_stats(website.id)
        comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
        print(comment)
        reddit_comment = self.reddit_bot.reddit.comment(comment_id)
        self.reddit_bot.reply(reddit_comment, comment)

    busy.value = 0
    print("Done crawling task")
def get_fetch(log=False):
    settings = Settings()
    settings.set('LOG_ENABLED', log)
    crawler_process = CrawlerProcess(settings)
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()

    t = Thread(target=crawler_process.start_reactor)
    t.daemon = True
    t.start()

    shell = Shell(crawler)
    shell.code = 'adsf'

    import threading
    lock = threading.Lock()

    def fetch(url_or_request):
        lock.acquire()
        try:
            shell.fetch(url_or_request)
            response = shell.vars.get('response')
            return response
        finally:
            lock.release()

    return fetch
def magic():
    process = CrawlerProcess(get_project_settings())
    # 'magic' is the name of one of the spiders of the project.
    process.crawl('magic')
    process.start()  # the script will block here until the crawling is finished
def scrape(spider):
    with transaction.atomic(), reversion.create_revision():
        process = CrawlerProcess(DEFAULT_CRAWLER_OPTIONS)
        process.crawl(spider)
        # the script will block here until the crawling is finished
        process.start()
    return
def main():
    """Set up the item signal and run the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.start()  # schedule spider

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def crawl(spiders_classes, connector, debug=False,
          spider_error_callback=stdout_error_callback, scrapy_settings=None):
    """
    Launch a crawl job for the given JobSpider classes.

    :param scrapy_settings: dict of settings merged with the CrawlerProcess
        default settings
    :param debug: (bool) activate or disable debug
    :param spider_error_callback: callback for spider errors
        (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    settings = {
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False,
        'DOWNLOAD_DELAY': 1 if not debug else 0,
    }
    if scrapy_settings:
        settings.update(scrapy_settings)

    process = CrawlerProcess(settings)

    for spider_class in spiders_classes:
        process.crawl(spider_class, debug=debug)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
def main():
    settings = get_project_settings()
    # TODO: Initialize item pipelines
    # settings.set('ITEM_PIPELINES', {'Program.Scrapy.Items.HikerJournalWriterPipeline': 2})
    crawler = CrawlerProcess(settings=settings)
    spider = HikerScraper()
    crawler.crawl(spider, domain="http://www.trailjournals.com")
    crawler.start()
def scrape_task():
    """Celery task to scrape a website with Scrapy.

    http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script
    """
    process = CrawlerProcess(get_project_settings())
    process.crawl('eslcafe', domain='eslcafe.com')
    process.start()
def crawl_Info():
    """
    Crawl free Shadowsocks accounts from http://www.ishadowsocks.com/
    and store the results in Result.json.
    """
    process = CrawlerProcess(get_project_settings())
    process.crawl('SSSpider')
    process.start()
def _crawl(path=None):
    crawl = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    crawl.crawl(ProvinceSpider)
    crawl.start()
    crawl.stop()
        css_selector = '//img'
        for x in response.xpath(css_selector):
            newsel = '::attr(src)'
            links = x.css(newsel).extract_first()
            if links.endswith(".jpg"):
                yield {'Image Link': links}

        # check the next page for images and scrape them too
        Page_selector = '.next a ::attr(href)'
        NextPage = response.css(Page_selector).extract_first()
        if NextPage:
            yield scrapy.Request(response.urljoin(NextPage), callback=self.parse)


process = CrawlerProcess({'FEED_FORMAT': 'json', 'FEED_URI': 'results.json'})
process.crawl(NewSpider)  # Selecting spider class
process.start()

# read the results.json file and display it when run
with open('results.json', 'rt') as filehandle:
    lines = filehandle.readlines()[1:15]
    for line in lines:
        print(website + line.replace('{"Image Link": "', "").replace('"}', "").replace(",", ""))

# To display the website
testurls = ['http://172.18.58.238/hr2/']
import webbrowser
for url in testurls:
            img_path = 'images/' + str("_".join(data['name'].split())) + '.jpg'
            with open(img_path, 'wb') as handle:
                response = requests.get(img_url, stream=True)
                if not response.ok:
                    print(response)
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
            data['image_path'] = img_path
        else:
            data['image_path'] = None
        yield data


process = CrawlerProcess(settings={
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'celeb.csv'
})
process.crawl(CelebSpider)
process.start()

total = 100 + 799 + 925
df = pd.read_csv('celeb.csv')
print("percentage found = {}%".format(len(df) / total))
        extractor = LinkExtractor(allow=("presse"), allow_domains='hessen.de')
        links = extractor.extract_links(response)

        extractor = LinkExtractor(
            deny_domains=('www.hessen.de', 'facebook.com', 'youtube.com',
                          'twitter.com', 'instagram.com',
                          'radroutenplaner.hessen.de'))
        linksext = extractor.extract_links(response)

        for link in linksext:
            yield {
                'from': response.url,
                'url': link.url,
                'text': link.text.strip()
            }

        for link in links:
            absolute_next_page_url = response.urljoin(link.url)
            yield scrapy.Request(absolute_next_page_url)


c = CrawlerProcess({
    'USER_AGENT': 'HochschuleDarmstadt-TextWebMining',
    'FEED_FORMAT': 'csv',
    'FEED_URI': '/media/sf_Shared/Git/data/HessenPresse.csv',
    'DOWNLOAD_DELAY': 1,
    'ROBOTSTXT_OBEY': True,
    'HTTPCACHE_ENABLED': True
})
c.crawl(HessenSpider)
c.start()  # the script will block here until the crawling is finished
def run_spider(spiders):
    process = CrawlerProcess(get_project_settings())
    for spider in spiders:
        process.crawl(spider)
    process.start()
        # Narrow in on the course blocks
        course_blocks = response.css('div.course-block')
        # Direct to the course links
        course_links = course_blocks.xpath('./a/@href')
        # Extract the links (as a list of strings)
        links_to_follow = course_links.extract()
        # Follow the links to the next parser
        for url in links_to_follow:
            yield response.follow(url=url, callback=self.parse_pages)

    def parse_pages(self, response):
        """Code to parse course pages"""
        # Direct to the course title text
        crs_title = response.xpath(
            '//h1[contains(@class,"header-hero__title")]/text()')
        # Extract and clean the course title text
        crs_title_ext = crs_title.extract_first().strip()
        # Direct to the chapter titles text
        ch_titles = response.css('h4.chapter__title::text')
        # Extract and clean the chapter titles text
        ch_titles_ext = [t.strip() for t in ch_titles.extract()]
        # Store this in our dictionary
        dc_dict[crs_title_ext] = ch_titles_ext


dc_dict = dict()

process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()
                        c, flags=re.S)[0]
        s = json.loads(s)
        content = s['detail']['content']
        print(content)


# Fallback configuration so the spider can also crawl when run as a standalone
# script; when started from the project, the code below is not used.
if __name__ == '__main__':
    import os, time
    from scrapy.crawler import CrawlerProcess

    timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())  # year-month-day_hour-minute-second
    filename = 'v{}.json'.format(timestamp)  # output file name (takes effect when 'FEED_URI' is uncommented)
    jobdir = 'JOBDIR/JKzECrMyDU'  # job queue directory (takes effect when 'JOBDIR' is uncommented)

    p = CrawlerProcess({
        'TELNETCONSOLE_ENABLED': False,  # hardly anyone uses this feature; disabling it speeds up spider start-up
        'MEDIA_ALLOW_REDIRECTS': True,  # allow redirects for image download URLs; use this whenever images are downloaded
        'LOG_LEVEL': 'INFO',  # DEBUG, INFO, WARNING, ERROR, CRITICAL
        # 'JOBDIR': jobdir,  # uncomment to enable resumable crawls
        #                    # (request queue, dedup fingerprints and crawl state - essentially a folder)
        # 'FEED_URI': filename,  # write the scraped data to a file
        # 'FEED_EXPORT_ENCODING': 'utf-8',  # roughly equivalent to ensure_ascii=False
        # 'FEED_FORMAT': 'json',  # output format; defaults to jsonlines when not set;
        #                         # supported formats: json, jsonlines, csv, xml, pickle, marshal
        # 'DOWNLOAD_TIMEOUT': 8,  # global request timeout, default 180; a per-request timeout can be set via meta ('download_timeout')
        # 'DOWNLOAD_DELAY': 1,  # global download delay; much more intuitive than the other throttling settings
    })
    p.crawl(VSpider)
    p.start()
        items["content"] = cons
        items["images"] = ""
        items["release_time"] = time.strftime("%Y-%m-%d")
        items["qa"] = ""
        items["source"] = urlparse(response.url).netloc
        items["author"] = ""
        items["url"] = response.url
        items["entity"] = ""
        items["label"] = []
        items["summary"] = []
        items["time_stamp"] = int(time.time())
        items["priority"] = 0
        items["nlp_state"] = 0
        items["static_page"] = 0
        s1 = {'hotword_id': hotword_id}
        s2 = {'$set': {'article_state': 1}}
        self.client.dailypops.hotword.update(s1, s2)
        yield items

    def md5_(self, str):
        md5 = hashlib.md5()
        data = str
        md5.update(data.encode('utf-8'))
        return md5.hexdigest()


if __name__ == '__main__':
    chinadaily = CrawlerProcess()
    chinadaily.crawl(Chinadaily)
    chinadaily.start()
            # 'addressStreet': item['addressStreet'],
            # 'addressState': item['addressState'],
            # 'addressCity': item['addressCity'],
            # 'addressZipcode': item['addressZipcode'],
            # # 'description': item['description'],
            # 'beds': item['beds'],
            # 'baths': item['baths'],
            # 'area': item['area'],
            # 'latitude': item['latLong']['latitude'],
            # 'longitude': item['latLong']['longitude'],
            # # 'brokerName': item['brokerName'],
            # # 'brokerPhone': item['brokerPhone'],
            # 'yearBuilt': item['hdpData']['homeInfo']['yearBuilt'],
            # 'lotSize': item['hdpData']['homeInfo']['lotSize'],
            # 'homeType': item['hdpData']['homeInfo']['homeType'],
            # 'homeStatus': item['hdpData']['homeInfo']['homeStatus'],
            # 'zestimate': item['hdpData']['homeInfo']['zestimate'],
            # # 'rentZestimate': item['hdpData']['homeInfo']['rentZestimate'],
            # 'festimate': item['hdpData']['homeInfo']['festimate'],
            # 'hiResImageLink': item['hdpData']['homeInfo']['hiResImageLink'],
        }


# main driver
if __name__ == '__main__':
    # run spider
    process = CrawlerProcess()
    process.crawl(ZillowSpider)
    process.start()

    # debug data extraction logic
    # ZillowSpider.parse(ZillowSpider, '')
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from leroymerlin import settings
from leroymerlin.spiders.leroy import LeroySpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    process_object = CrawlerProcess(settings=crawler_settings)
    process_object.crawl(LeroySpider, category='молоток')
    process_object.start()
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from gbparse import settings
# from gbparse.spiders.avito import AvitoSpider
# from gbparse.spiders.geekbrains import GeekbrainsSpider
# from gbparse.spiders.hhru import HhruSpider
from gbparse.spiders.vk import VkSpider

if __name__ == '__main__':
    scr_settings = Settings()
    scr_settings.setmodule(settings)
    process = CrawlerProcess(settings=scr_settings)
    # process.crawl(AvitoSpider)
    # process.crawl(GeekbrainsSpider)
    # process.crawl(HhruSpider)
    process.crawl(VkSpider)
    process.start()
        r = r"url\('(.*)'\)"
        name = "Name :" + html_soup.select(NAME_SELECTOR)[0].text
        author = "Author :" + html_soup.select(AUTHOR_SELECTOR)[0].text
        image_url = re.findall(
            r, html_soup.select(IMAGE_SELECTOR)[0].attrs['style'])[0]
        with open(basename(image_url), "wb") as f:
            f.write(get(image_url).content)
        window["-NAME-"].update(name)
        window["-AUTHOR-"].update(author)
        im = Image.open(basename(image_url))
        im.save('temp.png')
        window["-IMAGE-"].update(filename='temp.png')
    elif event == "DOWNLOAD":
        process = CrawlerProcess()
        process.crawl(GetLightNovelSpider, start_url=url, author=author,
                      name=name, html_soup=html_soup)
        process.start()
        process.stop()
    elif event == "UPDATE":
        spider = UpdateLightNovel(name[6:] + '.epub')
        spider.update()
# import the spiders you want to run
from spiders.toscrape import ToScrapeSpider
from spiders.toscrape2 import ToScrapeSpiderTwo

# scrapy api imports
# from scrapy import signals, log
from scrapy import signals
import logging
from twisted.internet import reactor
# from scrapy.crawler import Crawler
from scrapy.crawler import CrawlerProcess
# from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings

process = CrawlerProcess()
process.crawl(ToScrapeSpider)
process.crawl(ToScrapeSpiderTwo)
process.start()  # the script will block here until all crawling jobs are finished
def _setup_process(self):
    self.now = datetime.now(self.settings.timezone).strftime(TS_FORMAT)
    self.process = CrawlerProcess(self._crawler_options())
    for site in self.sites:
        self.process.crawl(Spider, settings=site.settings, now=self.now)
#!/usr/bin/env python3

import scrapy


class MySpider(scrapy.Spider):

    name = 'myspider'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # get the list or use an empty list
        # (by default it would return `None`, but `start_urls` has to be a list)
        self.start_urls = kwargs.get('urls', [])

    def parse(self, response):
        print('url:', response.url)


# --- it runs without a project ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(MySpider, urls=['http://quotes.toscrape.com'])
c.start()
class Scraper:

    def __init__(self, settings):
        self.settings = Settings(settings)
        self.sites = [Site(name, settings) for name in self.settings.names]
        self._setup_process()

    def start(self):
        self.process.start()
        self._json_to_csv()

    def _json_to_csv(self):
        for name in self.settings.names:
            data_exists = False
            errors_csv_name = self._file_with_name(name, ext='csv', appendix='_errors')
            csv_name = self._file_with_name(name, ext='csv')
            json_name = self._file_with_name(name)
            try:
                data = pandas.read_json(json_name)
                errors = pandas.read_csv(errors_csv_name)
                data_exists = True
            except ValueError:
                pass
            if data_exists:
                results = self._postprocess_dataframe(data, errors)
                results.to_csv(csv_name)

    def _postprocess_dataframe(self, data, errors):
        if 'url_item' in data.columns and 'url_search' in data.columns:
            searches = (data[data['url_item'].isnull()]
                        .set_index('search_string')
                        .dropna(axis='columns', how='all'))
            items = (data[data['url_search'].isnull()]
                     .set_index('search_string')
                     .dropna(axis='columns', how='all'))
            results = searches.join(items, how='outer', rsuffix='_delete')
            return results[[c for c in results.columns if '_delete' not in c]]
        return data

    def _setup_process(self):
        self.now = datetime.now(self.settings.timezone).strftime(TS_FORMAT)
        self.process = CrawlerProcess(self._crawler_options())
        for site in self.sites:
            self.process.crawl(Spider, settings=site.settings, now=self.now)

    def _crawler_options(self):
        """Return crawler options.

        `DOWNLOAD_DELAY` is in seconds; if `RANDOMIZE_DOWNLOAD_DELAY` is set
        to `True`, requests will be spaced between 0.5 * `DOWNLOAD_DELAY` and
        1.5 * `DOWNLOAD_DELAY`.

        `RETRY_TIMES` and `RETRY_HTTP_CODES` must be much more flexible if
        proxies are being used, because proxies can fail for a variety of
        reasons and we need to be able to adapt to that.
        """
        options = {
            'RANDOMIZE_DOWNLOAD_DELAY': True,
            'AUTOTHROTTLE_TARGET_CONCURRENCY': 0.1,
            'AUTOTHROTTLE_ENABLED': True,
            'CONCURRENT_REQUESTS': 2,
            'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
            'FEED_FORMAT': 'json',
            'FEED_URI': self._file_name(),
            'COOKIES_ENABLED': False,
            'LOG_LEVEL': 'DEBUG',
            'RETRY_TIMES': 2,
            'DOWNLOAD_DELAY': 5,
            'DOWNLOAD_TIMEOUT': 120,
            'DOWNLOADER_MIDDLEWARES': {
                # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
                # 'scraper.middlewares.CustomRetriesMiddleware': 550,
                'scraper.middlewares.SeleniumMiddleware': 950
            },
            'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter'
        }
        m = 'DOWNLOADER_MIDDLEWARES'
        if self.settings.random_proxies:
            options[m]['scraper.middlewares.ProxiesMiddleware'] = 410
        if self.settings.random_user_agents:
            options[m]['scraper.middlewares.RandomUserAgentsMiddleware'] = 400
        if self.settings.mongo:
            options['ITEM_PIPELINES'] = {
                'scraper.pipelines.MongoWriterPipeline': 700
            }
        return options

    def _file_with_name(self, name, ext='json', appendix=''):
        return self._file_name(ext, appendix).replace("%(name)s", name)

    def _file_name(self, ext='json', appendix=''):
        return "outputs/%(name)s_{}{}.{}".format(self.now, appendix, ext)


            "search": ""
        }

        yield response.follow(
            url=self.base_url,
            method='POST',
            dont_filter=True,
            headers=self.headers,
            body=json.dumps(para),
            meta={
                # "dis_id": dis_id,
                'filename': filename
                # "seo_url": seo_url
            },
            callback=self.parse_page)

    def parse_page(self, response):
        data = json.loads(response.body)
        filename = response.meta["filename"]
        with open(filename, "a") as f:
            for item in data["articles"]:
                f.write(json.dumps(item) + '\n')


if __name__ == '__main__':
    # run scraper
    process = CrawlerProcess()
    process.crawl(chothueSpider)
    process.start()
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from workparser import settings
from workparser.spiders.hhru import HhruSpider
from workparser.spiders.sjru import SjruSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(HhruSpider)
    # process.crawl(SjruSpider)
    process.start()
            for tag in body.select('style'):
                tag.decompose()
            text = body.get_text(separator='\n')
            text = text.replace("\n", " ").replace("\t", " ").replace("\r", " ")
            return text.lower()

        web_text = get_text_bs(web_text)
        exsit_list = checkActivity(act_list, web_text)
        activities = ', '.join(exsit_list)
        start_url = ', '.join(self.start_urls)

        item = {}
        item['start_url'] = start_url
        item['activities'] = activities
        return item


process = CrawlerProcess(settings={
    "FEEDS": {
        "data/items_9.json": {
            "format": "json"
        },
    },
})
process.crawl(ActivitySpider9)
process.start()
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from blogparse import settings
# from blogparse.spiders.habr_blog import HabrBlogSpider
from blogparse.spiders.avito import AvitoSpider

if __name__ == '__main__':
    craw_settings = Settings()
    craw_settings.setmodule(settings)
    crawler_proc = CrawlerProcess(settings=craw_settings)
    # crawler_proc.crawl(HabrBlogSpider)
    crawler_proc.crawl(AvitoSpider)
    crawler_proc.start()
    #start_urls = []

    tags = ['love', 'inspirational', 'life', 'humor', 'books', 'reading']
    pages = 3
    url_template = 'http://quotes.toscrape.com/tag/{}/page/{}'

    def start_requests(self):
        for tag in self.tags:
            for page in range(self.pages):
                url = self.url_template.format(tag, page)
                yield scrapy.Request(url)

    def parse(self, response):
        print('url:', response.url)


# --- run it without project ---

from scrapy.crawler import CrawlerProcess

#c = CrawlerProcess({
#    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
#    'FEED_FORMAT': 'csv',
#    'FEED_URI': 'data.json',
#})

c = CrawlerProcess()
c.crawl(MySpider)
c.start()
from shixiseng.spiders.shixisengspider import ShixisengspiderSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the settings from the settings.py module
settings = get_project_settings()
process = CrawlerProcess(settings=settings)
# Multiple spiders can be added here
process.crawl(ShixisengspiderSpider)
# Start the crawler; this blocks until the crawl finishes
process.start()
import argparse

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

parser = argparse.ArgumentParser()

# Script arguments
parser.add_argument('-f', '--futbol', action='store_true')
parser.add_argument('-bm', '--balonman', action='store_true')
parser.add_argument('-cF', '--codigoFederacion', default='none')
parser.add_argument('-cC', '--codigoCompeticion', required=True)
parser.add_argument('-cG', '--codigoGrupo', default=None)
parser.add_argument('-cT', '--codigoTemporada', default='15')
parser.add_argument('-cX', '--codigoXornada', default=0)

args = parser.parse_args()

process = CrawlerProcess({'SPIDER_MODULES': 'tfgObtencionDatos.spiders'})

if args.futbol:
    # Football data is needed
    federacionsPNFG = ['gal', 'mad', 'ceu', 'rioj', 'clm', 'and', 'cant']
    federacionsPNFGBasic = ['arg', 'mur', 'ext']
    if args.codigoFederacion in federacionsPNFG:
        # PNFG scraper
        process.crawl('pnfg',
                      federacion=args.codigoFederacion,
                      grupo=args.codigoGrupo,
                      competicion=args.codigoCompeticion,
                      temporada=args.codigoTemporada,
                      xornada=args.codigoXornada)
        process.start()
    elif args.codigoFederacion in federacionsPNFGBasic:
        # Basic PNFG scraper
        process.crawl('pnfgBasic',
                      federacion=args.codigoFederacion,
                      grupo=args.codigoGrupo,
    def parse(self, response):
        data = json.loads(response.body)

        for item in data.get('data', []):
            yield {
                'car_id': item.get('id'),
                'car_name': item.get('title'),
                'price': item.get('price.value.currency.display'),
                'user_id': item.get('user_id'),
                # 'user_name':
            }

        metadata = data.get('metadata')
        if metadata:
            url = metadata.get('next_page_url')
            if url:
                yield scrapy.Request(url)


# --- it runs without project and saves in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file as CSV, JSON or XML
    'FEED_FORMAT': 'csv',  # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
def run_config(config):
    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    if config.use_anchors:
        from . import scrapy_patch

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id,
        config.api_key,
        config.index_name,
        AlgoliaSettings.get(config, strategy.levels),
        config.query_rules,
        environ.get('REPLACE_DOMAIN', None))

    DOWNLOADER_MIDDLEWARES_PATH = 'scraper.src.custom_downloader_middleware.CustomDownloaderMiddleware'
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.src.scrapy_patch.CustomContextFactory'
    DUPEFILTER_CLASS_PATH = 'scraper.src.custom_dupefilter.CustomDupeFilter'

    if __name__ == '__main__':
        DOWNLOADER_MIDDLEWARES_PATH = 'src.custom_downloader_middleware.CustomDownloaderMiddleware'
        DOWNLOADER_CLIENTCONTEXTFACTORY = 'src.scrapy_patch.CustomContextFactory'
        DUPEFILTER_CLASS_PATH = 'src.custom_dupefilter.CustomDupeFilter'

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
        # Needs to be > 600 to run after the RedirectMiddleware
        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY,
        'DUPEFILTER_USE_ANCHORS': config.use_anchors,
        # Use our custom dupefilter in order to be scheme agnostic regarding links provided
        'DUPEFILTER_CLASS': DUPEFILTER_CLASS_PATH
    })

    process.crawl(
        DocumentationSpider,
        config=config,
        algolia_helper=algolia_helper,
        strategy=strategy)

    process.start()
    process.stop()

    # Kill the browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocumentationSpider.NB_INDEXED))
        # config.update_nb_hits(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()

    print("")
def crawl_run():
    scope = 'all'
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(QuotesSpider, scope)
    process.start()
    process.join()
                fRelationName = tds[1].get_text()
                fHouseName = tds[2].get_text()
                fSerialNo = tds[3].get_text().strip()
                fLACNo = tds[4].get_text().strip()
                fPSNo = tds[5].get_text().strip()
                fIdCardNo = tds[6].select('td > a')[0].get_text().strip()
                fStatus = tds[7].get_text().strip()
                fPrimaryIdCardNo = idCardNo
                self.familyWriter.writerow([
                    fNameOfElector, fRelationName, fHouseName, fSerialNo,
                    fLACNo, fPSNo, fIdCardNo, fStatus, fPrimaryIdCardNo,
                    addStr
                ])
        except Exception as e:
            print("[Family Write Error]", e)

        filename = response.url.split('=')
        filename = filename[len(filename) - 1] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

        soup = BeautifulSoup(open(filename, encoding="utf-8"), 'lxml')
        if 'Invalid access to the page' in soup.text:
            os.remove(filename)


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start()
# coding:utf-8
import scrapy
from scrapy.crawler import CrawlerProcess


class MySpider1(scrapy.Spider):
    # Your first spider definition
    pass


class MySpider2(scrapy.Spider):
    # Your second spider definition
    pass


process = CrawlerProcess()
process.crawl(MySpider1)
process.crawl(MySpider2)
process.start()
        results_dict['full_text'] = news_full_text_ext
        results_dict['link'] = response.url
        results_dict['tags'] = news_tags_ext

        results_list.append(results_dict)


if __name__ == '__main__':
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    filename = 'moneytimes'

    # List to save the data collected
    results_list = list()

    # Initiate a CrawlerProcess
    process = CrawlerProcess()

    # Tell the process which spider to use
    process.crawl(MoneyTimesSpider)

    # Start the crawling process
    process.start()

    # Save the list of dicts
    with open(os.path.join(THIS_DIR + '/data/results-{}.json'.format(filename)), 'w', encoding='utf8') as f:
        json.dump(results_list, f, ensure_ascii=False)
        for data in datas:
            yield {
                'scrape_date': scrape_date,
                'types': types,
                'user_pic': user_pic,
                'date_update': date_update,
                'provinsi': provinsi,
                'kabkot': kabkot,
                'kecamatan': data["properties"]["name"],
                'kelurahan': '',
                'alamat': '',
                'total_odp': data["properties"]["odp_total"],
                'total_pdp': data["properties"]["pdp_total"],
                'total_positif': data["properties"]["positif_total"],
                'positif_sembuh': data["properties"]["positif_sembuh"],
                'positif_dirawat': data["properties"]["positif_dirawat"],
                'positif_isolasi': '',
                'positif_meninggal': data["properties"]["positif_meninggal"],
                'total_otg': '',
                'odr_total': '',
                'total_pp': '',
                'total_ppdt': '',
                'source_link': source_link,
            }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(PurbalinggaSpider)
    process.start()