Example #1
def run_spider():
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
    settings.set("ITEM_PIPELINES", {
        'pipelines.FilterProxyPipline': 1,
        'pipelines.SaveProxyPipeline': 2
    })
    settings.set("LOG_STDOUT", True)

    # Configure the logging rules
    # configure_logging({
    #     'filename': datetime.now().strftime('%Y_%m_%d_%H_proxy.log'),
    #     'format': '%(asctime)s %(levelname)-8s %(name)-15s %(message)s',
    #     'level': logging.INFO
    # })
    configure_logging(install_root_handler=False)
    # Initialize the log path
    logpath = datetime.now().strftime(log_path)
    if not os.path.isdir(logpath):
        os.makedirs(logpath)
    logging.basicConfig(
        filename=datetime.now().strftime('%s/%s_proxy.log' % (logpath, log_file)),
        format=log_format,
        level=logging.INFO
    )
    process = CrawlerProcess(settings)
    process.crawl(GetProxySpider)
    process.start()
Example #2
	def __init__(self, titlesfile = None, platform = None, region = None):

		# set default encoding to utf8 for parsing and logging
		# utf-8 characters in console and files
		#
		reload(sys)
		sys.setdefaultencoding('utf8')
        
		configure_logging(install_root_handler=False)
		logging.basicConfig(
			filename='export.log',
			filemode = 'a',
			format='%(levelname)s: %(message)s',
			level=logging.INFO
		)
                				
		# identify platform
		#
		self.platform = platform
		if self.platform is None:
			logging.error('No platform found! Pass it as an argument.')
			return
		else:			
			platformId = platforms.getId(self.platform)
			if platformId is None:
				logging.error('Platform ' + self.platform + ' not supported.')
				return
						
		self.titlesfile = titlesfile
		self.region = region		
		if self.region is None:
			self.region = "Worldwide"
		
		if titlesfile:		
		
			titles = []
			urls = []
			
			with open( self.titlesfile ) as f:
				titles = f.read().splitlines()
				
			for title in titles:
				logging.debug('Submitting title:' + title )
				urls.append(
					'http://mobygames.com/search/quick' +
					'?q=' + title +
					'&p=' + platformId +
					'&search=Go'
					'&sFilter=1'
					'&sG=on'
					'&search_title=' + urllib.quote( title ) + 
					'&search_platform=' + urllib.quote(self.platform) +
					'&search_region=' + urllib.quote(self.region)
				)
				
			process = CrawlerProcess(get_project_settings())
			process.crawl(MobygamesSpider, start_urls=urls)
			process.start()									
		else:
			logging.warning('No file.')
Example #3
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
Example #4
    def test_spider_custom_settings_log_level(self):
        with tempfile.NamedTemporaryFile() as log_file:
            class MySpider(scrapy.Spider):
                name = 'spider'
                custom_settings = {
                    'LOG_LEVEL': 'INFO',
                    'LOG_FILE': log_file.name,
                }

            configure_logging()
            self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
            crawler = Crawler(MySpider, {})
            self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
            info_count = crawler.stats.get_value('log_count/INFO')
            logging.debug('debug message')
            logging.info('info message')
            logging.warning('warning message')
            logging.error('error message')
            logged = log_file.read().decode('utf8')
        self.assertNotIn('debug message', logged)
        self.assertIn('info message', logged)
        self.assertIn('warning message', logged)
        self.assertIn('error message', logged)
        self.assertEqual(crawler.stats.get_value('log_count/ERROR'), 1)
        self.assertEqual(crawler.stats.get_value('log_count/WARNING'), 1)
        self.assertEqual(
            crawler.stats.get_value('log_count/INFO') - info_count, 1)
        self.assertEqual(crawler.stats.get_value('log_count/DEBUG', 0), 0)
Example #5
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
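The docstring above describes how crawler_start selects tasks passed in from the command line; a minimal usage sketch follows, assuming it is called from a small driver script (the task strings 'common' and 'ajax' are illustrative assumptions based on the spider kinds named in the docstring).

if __name__ == '__main__':
    # run every default crawler spider (no tasks given)
    crawler_start('crawler', [])
    # or, hypothetically, run only selected tasks (a fresh process is needed
    # per call, since the Twisted reactor cannot be restarted):
    # crawler_start('crawler', ['common', 'ajax'])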
Example #6
 def handle_lj(self):
     configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
     runner = CrawlerRunner(crawler_setting)
     #d = runner.crawl(HouseSpider)
     d = runner.crawl(LianjiaHouseSpider)
     d.addBoth(lambda _: reactor.stop())
     reactor.run()
Example #7
def run_spider():
	options = {
	    'CONCURRENT_ITEMS': 250,
	    'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
	    'CONCURRENT_REQUESTS': 30,
	    'DOWNLOAD_DELAY': 0.5,
	    'COOKIES_ENABLED': False,
	    }

	settings = get_project_settings()
	configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
	settings.update(options);

	#BookToscrapeSpider basic version
	from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
	#runner = CrawlerRunner(settings)
	#runner.crawl(BookToscrapeSpider())

	#BookToscrapeSpider crawl version
	from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
	runner = CrawlerRunner(settings)
	runner.crawl(BookToscrapeSpider_crawl())

    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
	d= runner.join()
	d.addBoth(lambda _:reactor.stop())

	reactor.run()
Example #8
def main():
    """
    Scrapy pull request:
        configure_logging() should accept a config argument

    Note:
        scrapy.utils.log.TopLevelFormatter is cool
        need to access Scrapy logger's handler and replace the filter with a new
        TopLevelFormatter with more names: e.g. ['scrapy', 'scrapybox', 'aiohttp']
    """
    configure_logging(settings)
    logger.info('Scrapybox server starting')
    # formatter = logging.Formatter(fmt=settings.get('LOG_FORMAT'),
    #                               datefmt=settings.get('LOG_DATEFORMAT'))
    # handler = logging.StreamHandler()
    # handler.setFormatter(formatter)
    # handler.setLevel(settings.get('LOG_LEVEL'))
    # logging.root.addHandler(handler)

    twisted.internet.reactor.run()
    logger.info('Twisted reactor running')

    app = aiohttp.web.Application(
        loop=asyncio.get_event_loop(),
        # middlewares=[aiohttp_debugtoolbar.toolbar_middleware_factory]
    )
    # aiohttp_debugtoolbar.setup(app)  # http://127.0.0.1:8080/_debugtoolbar
    aiohttp_jinja2.setup(app, loader=jinja2.FileSystemLoader(user_path))
    app.on_shutdown.append(on_shutdown)
    app['static_path'] = user_path

    scrapybox.server.routes.add(app)

    logger.info('Aiohttp server starting')
    aiohttp.web.run_app(app)
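The docstring in this example suggests reusing scrapy.utils.log.TopLevelFormatter with more logger names; a minimal sketch of that idea follows, assuming a plain StreamHandler and an illustrative format string (the extra logger names come from the docstring).

import logging

from scrapy.utils.log import TopLevelFormatter

handler = logging.StreamHandler()
# collapse child loggers such as 'scrapy.core.engine' or 'aiohttp.access'
# to their top-level names in the emitted records
handler.addFilter(TopLevelFormatter(loggers=['scrapy', 'scrapybox', 'aiohttp']))
handler.setFormatter(logging.Formatter('%(levelname)s: %(name)s: %(message)s'))
logging.root.addHandler(handler)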
Example #9
    def test_spider_custom_settings_log_level(self):
        log_file = self.mktemp()
        class MySpider(scrapy.Spider):
            name = 'spider'
            custom_settings = {
                'LOG_LEVEL': 'INFO',
                'LOG_FILE': log_file,
                # disable telnet if not available to avoid an extra warning
                'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
            }

        configure_logging()
        self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
        crawler = Crawler(MySpider, {})
        self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
        info_count = crawler.stats.get_value('log_count/INFO')
        logging.debug('debug message')
        logging.info('info message')
        logging.warning('warning message')
        logging.error('error message')

        with open(log_file, 'rb') as fo:
            logged = fo.read().decode('utf8')

        self.assertNotIn('debug message', logged)
        self.assertIn('info message', logged)
        self.assertIn('warning message', logged)
        self.assertIn('error message', logged)
        self.assertEqual(crawler.stats.get_value('log_count/ERROR'), 1)
        self.assertEqual(crawler.stats.get_value('log_count/WARNING'), 1)
        self.assertEqual(
            crawler.stats.get_value('log_count/INFO') - info_count, 1)
        self.assertEqual(crawler.stats.get_value('log_count/DEBUG', 0), 0)
Example #10
 def handle_cap(self):
     configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
     runner = CrawlerRunner(crawler_setting)
     #d = runner.crawl(AnjukeCaptchaSpider)
     #d.addBoth(lambda _: reactor.stop())
     #reactor.run()
     print 'skip'
Example #11
 def runSpider(self, spider):
     configure_logging({'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s'})
     settings = Settings()
     settings.set('FEED_URI', 'output.json')
     settings.set('FEED_FORMAT', 'json')
     
     runner = CrawlerRunner(settings)
     dfd = runner.crawl(spider)
     dfd.addBoth(lambda _: reactor.stop())
Example #12
    def runProcess(self):
        configure_logging()
        dbHandler.check_watches()
        runner = CrawlerRunner()
        runner.crawl(spider.available_courses_spider)
        dbHandler.check_watches()
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())

        reactor.run()
Example #13
def main():
    locale.setlocale(locale.LC_TIME, 'es_ES')

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()

    d = runner.crawl(LotoSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    return None
Example #14
    def run(self, args, opts):
        configure_logging(install_root_handler=False)
        logger = logging.getLogger()
        databaseLogHandler = DatabaseLogHandler()
        logger.addHandler(databaseLogHandler)

        for spider_name in self.crawler_process.spider_loader.list():
            spider_class = self.crawler_process.spider_loader.load(spider_name)
            self.crawler_process.crawl(spider_class)

        self.crawler_process.start()
Example #15
 def __init__(self, seedUrl='http://www.amazon.co.uk/gp/bestsellers/electronics?ie=UTF8&ref_=sv_ce_0'):
     print "Starting gadgetzon spider"
     self.seedUrl = seedUrl
     configure_logging(install_root_handler=False)
     self.thisDir = os.path.abspath(os.path.dirname(__file__))
     logConfigFile = os.path.join(self.thisDir, 'gadget.log')
     logging.basicConfig(
         filename=logConfigFile,
         format='%(asctime)s - %(name)s - %(levelname)s - %(module)s : %(lineno)d - %(message)s',
         level=logging.DEBUG
     )
     self.price_parser = price_parser.PriceParser()
Example #16
    def run(self, args, opts):
        if len(args) < 1:
            raise UsageError()
        elif len(args) > 1:
            raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
        spider_name = args[0]

        configure_logging(install_root_handler=False)
        logger = logging.getLogger()
        databaseLogHandler = DatabaseLogHandler()
        logger.addHandler(databaseLogHandler)

        self.crawler_process.crawl(spider_name)
        self.crawler_process.start()
Example #17
def scrape_it(output_path):
    csv_filename= path.join(output_path, 'items.csv')
    # TODO fix cantrestartreactor problem
    settings = Settings(dict(FEED_FORMAT='csv',
                            FEED_URI=csv_filename,
                            ))


    configure_logging({'LOG_ENABLED':False})

    process = CrawlerProcess( settings)

    process.crawl(ListingSpider)
    process.start()#stop_after_crawl=False)
Example #18
def perform_scrape(pick_or_shake):
    '''Perform a scraping run for either a PickSpider or a ShakeSpider.
    
    Args:
        pick_or_shake (str): string 'pick' or 'shake' specifying which spider to run
    '''
    #get the settings and configure the logging level
    settings = scrapingtools.get_settings()
    settings.set('LOG_ENABLED', True)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    #get the job list for the spider
    if pick_or_shake == 'Pick':
        joblist = get_pick_joblist(settings.get('SOURCE_FILE_NAME'))
    elif pick_or_shake == 'Shake':
        joblist = get_pick_joblist(settings.get('SOURCE_FILE_NAME'))
Example #19
    def scrape(self):
        '''
        create a scrapy spider and scrape this users start_url
        '''
        start_urls = ['http://airbnb.com/s?host_id=%i'%(self.userid)]
        # TODO fix cantrestartreactor problem
        settings = Settings(dict(FEED_FORMAT='csv',
                                FEED_URI=self.scraped_csv,
                                ))


        
        process = CrawlerProcess( settings)
        process.crawl(ListingSpider, start_urls=start_urls )
        configure_logging({'LOG_ENABLED':False,'LOG_LEVEL':'CRITICAL'})
        process.start()#stop_after_crawl=False)
Example #20
def main():
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    

    # settings.set('FEED_FORMAT','json')
    # settings.set('FEED_URI', 'result.json')

    runner.crawl(PttBoard)
    runner.crawl(PTTArticle)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    result = reactor.run() # the script will block here until the crawling is finished

    print result
Example #21
def cleanup(ctx):
    """
    Cleanup old cache entries.

    By default, entries older than 90 days will be removed. This value can be
    overridden in the config file.
    """
    settings = ctx.obj["settings"]
    # Manually configure logging since we don't have a CrawlerProcess which
    # would take care of that.
    configure_logging(settings)

    if not settings.getbool("HTTPCACHE_ENABLED"):
        logger.error("Cache is disabled, will not clean up cache dir.")
        return 1

    run_cleanup_cache(settings)
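run_cleanup_cache is project-specific and not shown here; as a hypothetical sketch only, a cleanup routine matching the docstring could walk the default FilesystemCacheStorage layout (where each cache entry directory contains a 'pickled_meta' file) and drop entries older than the threshold. The helper name and the 90-day default mirror the docstring; everything else is an assumption.

import os
import shutil
import time


def cleanup_cache_dir(cache_dir, max_age_days=90):
    """Remove filesystem HTTP cache entries older than max_age_days."""
    cutoff = time.time() - max_age_days * 24 * 60 * 60
    for root, dirs, _files in os.walk(cache_dir):
        for name in list(dirs):
            entry = os.path.join(root, name)
            # a cache entry directory is recognised by its pickled metadata file
            if os.path.isfile(os.path.join(entry, 'pickled_meta')):
                if os.path.getmtime(entry) < cutoff:
                    shutil.rmtree(entry, ignore_errors=True)
                dirs.remove(name)  # do not descend into entry directories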
Example #22
def _run_feed_spider(url, feed):
    spid = str(uuid.uuid4())
    feed['_id'] = spid
    configure_logging(TEST_SETTINGS, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    save_feed(url)
    cls = SpiderFactory.create_spider(feed)
    runner = CrawlerRunner(TEST_SETTINGS)
    d = runner.crawl(cls)
    d.addBoth(lambda _: reactor.stop())
    reactor.run(installSignalHandlers=False)
    n = get_stats([spid])[spid]
    if n == 0:
        raise Exception(f'feed spider crawled 0 articles')
    if is_exists_spider(url):
        raise Exception(f'feed[{url}] existed')
    del feed['_id']
    save_spider_settings(feed)
Example #23
def perform_scrape():
    '''Perform a MunchSpider scrape using the current Scrapy Settings
    '''
    settings = scrapingtools.get_settings()
    publisher_database = get_publisher_database(settings, mongo=False)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    (doi_links, doi_sources) = get_joblist(settings.get('COLLECT_FILE_NAME'))
    domains = get_domains(publisher_database)
    runner = CrawlerRunner(settings)
    d = runner.crawl(Spiders.MunchSpider.MunchSpider,
                     start_urls=doi_links,
                     crossref_items=doi_sources,
                     allowed_domains=domains,
                     publisher_database=publisher_database,
                     )
    d2 = d.addBoth(lambda _: reactor.stop())
    d2.addCallback(lambda _: scrapingtools.finalise_file(settings.get('COMPLETE_FILE_NAME')))
    d2.addCallback(lambda _: scrapingtools.finalise_file(settings.get('ERROR_FILE_NAME')))
Example #24
def crawl_guardian(job_id, url):
    import scrapy
    from scrapy.crawler import CrawlerProcess, CrawlerRunner
    from scrapy.settings import Settings
    from scrapy.utils.project import get_project_settings
    from scrapy.utils.log import configure_logging
    from scraper.guardianukscraper.spiders.guardian_spider import GuardianSpider
    from scraper.guardianukscraper import settings
    import os

    os.environ['SCRAPY_SETTINGS_MODULE'] = 'scraper.guardianukscraper.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings = Settings()
    settings.setmodule(settings_module_path, priority='project')

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

    crawler = scrapy.crawler.Crawler(GuardianSpider,settings)
    return crawler.crawl(job_id=job_id, url=url)
Example #25
    def __setup(self):
        """
        Setup
        :return:
        """
        if not os.path.exists(self.__local_download_dir_warc):
            os.makedirs(self.__local_download_dir_warc)

        # make loggers quiet
        configure_logging({"LOG_LEVEL": "ERROR"})
        logging.getLogger('requests').setLevel(logging.CRITICAL)
        logging.getLogger('readability').setLevel(logging.CRITICAL)
        logging.getLogger('PIL').setLevel(logging.CRITICAL)
        logging.getLogger('newspaper').setLevel(logging.CRITICAL)
        logging.getLogger('newsplease').setLevel(logging.CRITICAL)
        logging.getLogger('urllib3').setLevel(logging.CRITICAL)

        # set own logger
        logging.basicConfig(level=self.__log_level)
        self.__logger = logging.getLogger(__name__)
        self.__logger.setLevel(self.__log_level)
Example #26
def run():
    # Logging settings
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        datefmt='%Y-%m-%d %H:%M:%S',
        filemode='w',
        filename='output/' + datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") + '.log',
        format='%(asctime)s %(levelname)s: %(message)s',
        level=logging.INFO
    )

    # Project settings
    settings = Settings()
    settings.setmodule('settings', priority='project')

    # Class to run parallel spiders
    process = CrawlerProcess(settings)
    process.crawl(spiders.LiquipediaSpider)

    # Block until crawling is complete
    process.start()
Example #27
def run_spiders():
    """
    Notes:
        If this caller is the outermost entry point of the program, the
        spider's project settings can be used here directly by putting the
        following code in the file:
        from scrapy.utils.project import get_project_settings
        # some code
        runner = CrawlerRunner(get_project_settings())

        If this call is only a wrapped helper function, the settings have to
        be built manually, as in the code below.
    """
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})  # define the log format
    # Configure the current spider; here we choose which pipeline to use
    settings = Settings()
    settings.set('ITEM_PIPELINES', {'spider.tutorial.pipelines.TutorialPipeline': 300,})
    # Load the prepared settings into the crawler
    runner = CrawlerRunner(settings)  # create the crawler runner

    d = runner.crawl(ChinazSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
Example #28
    def __init__(self, crawler):
        _settuings = get_project_settings()

        # date format used in the log file name
        dt_fmt = _settuings.get("LOG_NAME_DATE_FMT", "%Y-%m-%d")
        dtstr = datetime.utcnow().strftime(dt_fmt)

        # directory where the logs are stored
        log_dir = _settuings.get("LOG_DIR", "./")

        # if log_dir does not exist, create it recursively
        if not os.path.exists(log_dir):
            try:
                os.makedirs(log_dir)

                logger.debug(u"Log directory {} did not exist and has been created!".format(log_dir))

            except Exception as e:
                logger.exception(u"Custom log directory {} does not exist and could not be created! Resetting to the default!".format(log_dir))
                logger.exception(e)
                log_dir = "./"

        # normal-level log
        log_file_name = "{}_{}.log".format(crawler.spidercls.name, dtstr)
        log_setting = {
            "LOG_FILE": os.path.join(log_dir, log_file_name),
            "LOG_LEVEL": logging.DEBUG
        }
        configure_logging(log_setting)

        # error-level log
        log_file_name = "{}_error_{}.log".format(crawler.spidercls.name, dtstr)
        log_setting = {
            "LOG_FILE": os.path.join(log_dir, log_file_name),
            "LOG_LEVEL": logging.ERROR
        }
        configure_logging(log_setting)
Example #29
 def access0(self, runId, records):
     logger.info('Start accessing on Linkedin')
     # runner = CrawlerRunner({
     #     'USER_AGENT': CrawlerBrowser.get_useragent(CrawlerBrowser.FIREFOX),
     #     'DOWNLOAD_DELAY': 1
     # })
     # self.crawl(runner, runId, records)
     # # d = runner.crawl(Spiderman, runId=runId, pipeline=self, items=records, siteCfg=self.config)
     # # d.addBoth(lambda _: reactor.stop())
     # reactor.run()
     configure_logging(install_root_handler=False)
     process = CrawlerProcess({
         'USER_AGENT':
         CrawlerBrowser.get_useragent(CrawlerBrowser.EDGE),
         'DOWNLOAD_DELAY':
         1
     })
     process.crawl(Spiderman,
                   runId=runId,
                   pipeline=self,
                   items=records,
                   browser=CrawlerBrowser.EDGE,
                   siteCfg=self.config)
     process.start()
Example #30
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')
    guid = spargs.get('guid')
    product_id = spargs.get('product_id')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler=False)
    logging.basicConfig(filename='log/%s.log' % name,
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.ERROR)
    print "get_project_settings().attributes:", get_project_settings(
    ).attributes['SPIDER_MODULES']
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('Entering the spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception, e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e.message))
Example #31
 def f(q):
     try:
         s = get_project_settings()
         user_agent_list = data.getUserAgentList()
         user_agent = None
         if len(user_agent_list) > 0:
             user_agent = random.choice(user_agent_list)
         if user_agent:
             s.update({
                 "LOG_ENABLED": "True",
                 "TELNETCONSOLE_ENABLED": "False",
                 "USER_AGENT": user_agent
             })
         else:
             s.update({
                 "LOG_ENABLED": "True",
                 "TELNETCONSOLE_ENABLED": "False"
             })
         configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
         runner = crawler.CrawlerRunner(s)
         agent.state['fire'] = False
         agent.state['data'] = data
         deferred = runner.crawl(
             agent.class_element,
             cus_urls=data.getUrls(),
             cus_allowed_domains=data.getAllowedDomains(),
             agent=agent)
         deferred.addBoth(lambda _: reactor.stop())
         reactor.run()
         if not agent.state['fire']:
             q.put(self.case_without_rules_trigger())
         q.put(None)
     except Exception as e:
         q.put(e)
         self.log.error(str(e))
         raise SpiderException('[Warning, execute]: %s' % str(e))
Example #32
def __setup(local_download_dir_warc, log_level):
    """
    Setup
    :return:
    """
    os.makedirs(local_download_dir_warc, exist_ok=True)

    global __log_pathname_fully_extracted_warcs
    __log_pathname_fully_extracted_warcs = os.path.join(local_download_dir_warc, 'fullyextractedwarcs.list')

    # make loggers quiet
    configure_logging({"LOG_LEVEL": "ERROR"})
    logging.getLogger('requests').setLevel(logging.CRITICAL)
    logging.getLogger('readability').setLevel(logging.CRITICAL)
    logging.getLogger('PIL').setLevel(logging.CRITICAL)
    logging.getLogger('newspaper').setLevel(logging.CRITICAL)
    logging.getLogger('newsplease').setLevel(logging.CRITICAL)
    logging.getLogger('urllib3').setLevel(logging.CRITICAL)
    logging.getLogger('jieba').setLevel(logging.CRITICAL)

    # set own logger
    logging.basicConfig(level=log_level)
    __logger = logging.getLogger(__name__)
    __logger.setLevel(log_level)
Example #33
    def crawl(self,
              spidername,
              keyword,
              times,
              log=True,
              runner=None,
              settings=None):
        thesettings = copy.deepcopy(get_project_settings())

        if log and not settings:
            self.logfilename = LOG_DIR + getCurrentTimeReadable() \
                               + '-' + spidername + '.log'
            logfilename = self.logfilename
            thesettings['LOG_FILE'] = logfilename
        else:
            thesettings = settings

        # https://docs.scrapy.org/en/latest/topics
        # /api.html#scrapy.settings.Settings
        # process = CrawlerProcess(get_project_settings())

        if not runner:
            configure_logging(thesettings)
            therunner = CrawlerRunner(thesettings)
        else:
            therunner = runner

        d = therunner.crawl(spidername, q=keyword, t=times)

        if not runner:
            d.addBoth(lambda _: reactor.stop())

        if self.loop < 3:
            self.loop = self.loop + 1
            d.addBoth(lambda _: self.crawl(spidername, keyword, self.loop, log,
                                           runner, settings))
Example #34
    def __setup(self):
        """
        Setup
        :return:
        """
        os.makedirs(self.__local_download_dir_warc, exist_ok=True)

        # make loggers quiet
        configure_logging({"LOG_LEVEL": "ERROR"})
        logging.getLogger('requests').setLevel(logging.CRITICAL)
        logging.getLogger('readability').setLevel(logging.CRITICAL)
        logging.getLogger('PIL').setLevel(logging.CRITICAL)
        logging.getLogger('newspaper').setLevel(logging.CRITICAL)
        logging.getLogger('newsplease').setLevel(logging.CRITICAL)
        logging.getLogger('urllib3').setLevel(logging.CRITICAL)

        boto3.set_stream_logger('botocore', self.__log_level)
        boto3.set_stream_logger('boto3', self.__log_level)
        boto3.set_stream_logger('s3transfer', self.__log_level)

        # set own logger
        logging.basicConfig(level=self.__log_level)
        self.__logger = logging.getLogger(__name__)
        self.__logger.setLevel(self.__log_level)
Example #35
 def run_all(self):
     hasRunnedToday = os.path.isfile(checkfile)
     if not hasRunnedToday:  # the script has not run yet today
         with open(checkfile, 'w+') as f:  # create this file before running
             pass
     else:
         os.remove(checkfile)
     with open('Checkjiuyue/Checkjiuyue/domain.txt', 'r') as f1:
         moudle_list = []
         for r in f1.readlines():
             name = r.replace('.', '').replace('\n', '').strip()  # name is the domain with the dots removed
             domain = r.replace('\n', '').strip()
             with open(
                     'Checkjiuyue/Checkjiuyue/spiders/{}_spider.py'.format(
                         name), 'w') as f:
                 f.write(
                     'from Checkjiuyue.Checkjiuyue.spiders.base_baidu_spider import BaseBaiduSpider'
                     + '\n' + '\n')
                 f.write('class {}Spider(BaseBaiduSpider):'.format(name) +
                         '\n')
                 f.write('\t' + "name='{}Spider'".format(name) + '\n')
                 f.write('\t' + "domain='{}'".format(domain) + '\n')
             moudle = get(
                 'Checkjiuyue.Checkjiuyue.spiders.{0}_spider.{1}Spider'.
                 format(name, name))
             moudle_list.append(moudle)
     configure_logging()
     runner = CrawlerRunner(settings)
     for spider in moudle_list:
         runner.crawl(spider)
     d = runner.join()
     d.addBoth(lambda _: reactor.stop())
     reactor.run()
     self.send_email_zip_summary()
     time.sleep(self.getsleeptime())
Example #36
def run_scraper():
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    configure_logging()
    # runner = CrawlerRunner(settings=crawler_settings)
    # task = LoopingCall(lambda: runner.crawl(NewsOeOffshoreSpider))
    # task.start(6000 * 100)
    # reactor.run()
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(OilCrossSpider)
    process.crawl(LngConSpider)
    process.crawl(CnpcNewsSpider)
    process.crawl(PetroTradingSpider)

    process.crawl(EnergyExpressSpider)
    process.crawl(HaiBeiSpider)
    process.crawl(WeiXinOffshoreEnergySpider)

    process.crawl(HaiBoSpider)
    process.crawl(CRSLSpider)
    process.crawl(OilCubicSpider)
    process.crawl(OilLinkSpider)

    process.start()
Example #37
class MySpider(CrawlSpider):
    start_urls = ['https://www.gmail.com']
    name = 'link_checker'
    configure_logging(install_root_handler=True)
    custom_settings = {
        'DEPTH_LIMIT': '1'}
    rules = (Rule(LinkExtractor(), callback='parse_url', follow=True), )
    def parse_url(self, response):
        logging.info("\n")
        logging.info("The Response URL is: {}".format(response.url))
        logging.info("The Time of Parsing is: {}".format(now.strftime("%Y-%m-%d %H:%M")))
        #logging.info('\n\n\n')
        logging.info("Source Code is: \n {}".format(response.body))
        #logging.info('\n\n\n')
        #logging.info(process.memory_info().rss)
        #logging.info('\n\n\n')
Example #38
class emailSpider(scrapy.spiders.CrawlSpider):
    name = 'email'

    logging.getLogger('scrapy').propagate = False

    configure_logging(install_root_handler=True)
    logging.basicConfig(filename='log.txt',
                        format='%(levelname)s:%(message)s',
                        level=logging.INFO)

    rules = (Rule(LxmlLinkExtractor(allow=()),
                  callback='parse_obj',
                  follow=True), )
    """
        Description: Parse each response from the crawl and try to scrape emails from the HTML body.
        If an email is not already in the global "maillist" variable, add it to maillist.

        This way, no duplicate emails will be inserted into maillist.
    """
    def parse_obj(self, response):
        # Use set instead of list to remove all duplicate occurrences
        try:
            email = set(
                re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+",
                           response.text))

        except:
            email = set()

        # For emails found in the response html, see if the email already exists in the
        # current maillist, and if not, print it out and add it to the maillist.
        for item in email:
            if len(maillist) >= emailLimit:
                #print("Email number limit has been reached: ", emailLimit)
                raise CloseSpider('Email number limit has been reached')
            elif item == '':
                continue

            # TIL, if item in <dictionary> is actually using O(1) and is fast.
            elif item in maillist:
                continue

            maillist[item] = 1

            print(item)
Example #39
class ArtistSpider(scrapy.Spider):
    """Collect the Internal Soundcloud ID and then all necessary data from
	the /user endpoint on Soundcloud's API"""

    # $ scrapy crawl [name]
    name = 'artists'
    LOGGING = True

    def __init__(self, limit=None, *args, **kwargs):
        super(ArtistSpider, self).__init__(*args, **kwargs)
        self.api = Soundcloud(wait=0.25)
        self.limit = int(limit)
        self.start_urls = self.api.get_start_urls(spider=self.name,
                                                  url_limit=self.limit)

    # log all output
    if LOGGING:
        configure_logging(install_root_handler=False)
        logging.basicConfig(filename='log.txt',
                            format='%(levelname)s: %(message)s',
                            level=logging.INFO)

    custom_settings = {
        'ITEM_PIPELINES': {
            'sc_scraper.pipelines.ArtistPipeline': 0,
        }
    }

    def parse(self, response):

        # get internal ID from profile URL and call SC's /user endpoint
        internal_id = self.api.get_internal_sc_user_id(response.url)
        user = self.api.get_user(internal_id, db_data_only=True)

        # add to Artist Item
        artist = Artist()
        artist['item_type'] = 'artist'
        artist['dt_crawled'] = self.api.get_timestamp()
        artist['retrieved_tracks'] = False
        for k in user.keys():
            artist[k] = user[k]

        yield artist
Example #40
class IkeaCategoriesSpider(scrapy.Spider):
    configure_logging(install_root_handler=False)
    logging.basicConfig(filename='log.txt',
                        format='%(levelname)s: %(message)s',
                        level=logging.INFO)

    name = 'ikea_categories'

    def start_requests(self):
        scrape_urls = ['https://www.ikea.com/ca/en/cat/products-products/']

        for url in scrape_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # creating/overwriting the product list with the columns of both
        # the product and category files
        ikea_category_df_data = {'category_id': [], 'category_name': []}

        category_list = enumerate(
            response.css(
                '.vn-accordion__item > ul > li > a::attr(href)').getall())

        for index, category_url in category_list:
            # Getting the last array element in the stripped URL
            # .strip('/') removes the trailing / in the URL
            category_name_id = category_url.strip('/').split('/')[-1]
            category_name = ' '.join(category_name_id.split('-')[:-1]).title()
            category_id = category_name_id.split('-')[-1]

            ikea_category_df_data['category_id'] += [category_id]
            ikea_category_df_data['category_name'] += [category_name]

        ikea_category_df = pd.DataFrame(data=ikea_category_df_data, dtype=str)
        ikea_category_df.to_csv(categories_csv, encoding='utf-8', index=False)

        print('Categories completed. Starting product crawl.')

        product_spider = IkeaProductsSpider(products_csv)
        product_spider.crawl_products()

        if not product_spider.completed:
            print('Unable to complete product crawl.')
Example #41
class JobsSpider(scrapy.Spider):
    name = "linkedinJobCard"
    configure_logging(install_root_handler=False)

    def start_requests(self):
        query = 'Python'
        place = 'Germany'
        search_url = 'https://www.linkedin.com/jobs/search?keywords=' + query.lower() + '&location=' + place.lower()
        self.jobs_to_scrape = 20
        yield SeleniumRequest(url=search_url, callback=self.get_job_urls)
    
    def get_job_urls(self,response):
        for jj in range(1, self.jobs_to_scrape + 1):
            url=response.xpath('//*[@id="main-content"]/div/section/ul/li[' + str(jj) + ']/a/@href').get()
            yield SeleniumRequest(url=url, callback=self.parse)
            
            
    def parse(self, response):
        text = response.xpath('/html/body/main/section[1]/section[3]/div/section/div/text()').get()
        data = { 'url': response.url, 'text': text }
        yield data
Example #42
class TestSpider(scrapy.Spider):

    name = 'test_spider'
    start_urls = ['http://quotes.toscrape.com/']

    custom_settings = {
        'EXTENSIONS': {
            'scrapy.extensions.test_extension.CustomStats': 500
        }
    }

    configure_logging(install_root_handler=False)

    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    log_level = logging.INFO
    log_file = 'test_log.log'

    logging.basicConfig(format=log_format, level=log_level)

    rotating_file_log = TimedRotatingFileHandler(log_file,
                                                 when='S',
                                                 interval=1,
                                                 backupCount=5)
    rotating_file_log.setFormatter(logging.Formatter(log_format))

    root_logger = logging.getLogger()
    root_logger.addHandler(rotating_file_log)

    def parse(self, response):
        for row in response.xpath(
                "//div[@class='row']/div[@class='col-md-8']/div"):
            yield {
                'quote': row.xpath(".//span[@class='text']/text()").get(),
            }

        next_page = response.xpath("//li[@class='next']/a/@href").get()
        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_link, callback=self.parse)
Example #43
class ModSpider(scrapy.Spider):
    name = "spider"
    start_urls = ['http://www.google.com']

    # Configure logging and a crawler runner for the main driver code;
    # call ModSpider.crawl() from the driver to start the spider.
    configure_logging()
    runner = CrawlerRunner()

    @classmethod
    def crawl(cls):
        # schedule this spider and stop the reactor once the crawl finishes
        d = cls.runner.crawl(cls)
        d.addBoth(lambda _: reactor.stop())
        # the script will block here until the last crawl call is finished
        reactor.run()

    # SETTING_SELECTOR instructs spider which 'key word' to use when parsing the data
    # NAME_SELECTOR is the <HTML> tag to search within
    # The Object we are iterating over has its own CSS Method so we pass a selector element
    # to parse out child elements.
    def parse(self, response):
        SETTING_SELECTOR = '.set'
        for k in response.css(SETTING_SELECTOR):
            pass
            NAME_SELECTOR = 'h1:: text'
            yield {
                'name': k.css(NAME_SELECTOR).extract_first(),
            }
        # We define a selector for the next_page element (link), extract first match
        # and check to see if it exists.
        NEXT_PAGE_SELECT = '.next a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECT).extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)
Example #44
 def set_logger(self, name: str = "COMMAND", level: str = "DEBUG"):
     self.logger = logging.getLogger(name=name)
     self.logger.setLevel(level)
     configure_logging()
     logging.getLogger("pika").setLevel(
         self.project_settings.get("PIKA_LOG_LEVEL", "WARNING"))
Example #45
        for k, v in os.environ.items() if k.startswith('SCRAPY_')
    }
    if env_overrides:
        settings.setdict(env_overrides, priority='project')

    return settings


ENVVAR = 'SCRAPY_SETTINGS_MODULE'

s = get_project_settings()

if CUSTOM_LOGGING:
    logging.basicConfig(level=LOG_LEVEL)
else:
    configure_logging(settings=s, install_root_handler=False)

runner = CrawlerRunner(s)

base_string = 'https://www.olx.pl/nieruchomosci'
housing_types = ['mieszkania', 'stancje-pokoje']
business_types = ['sprzedaz', 'wynajem']
urls_flats_OLX = []
urls_rooms_OLX = []
cities_scope = [unidecode.unidecode(x) for x in cities_scope]

for type in housing_types:
    for city in cities_scope:
        if type == 'mieszkania':
            for purpose in business_types:
                urls_flats_OLX.append('/'.join(
Example #46
def run():
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl(SasaSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
Example #47
#Imports related to scrapy
from scrapy import Spider
from scrapy import Request

#Model import , saving data
from uniScrapers.items import UniversityInfoItem

#Generating module logs
import logging
from scrapy.utils.log import configure_logging

#Configuring logs for Main crawler Module.
#Log file name can be changed below
#Log Level can be modified to INFO , ERROR , WARNING , DEBUG etc
#Format specifies file format
configure_logging(install_root_handler=False)
logging.basicConfig(filename='log.txt',
                    format='%(levelname)s: %(message)s',
                    level=logging.ERROR)


#Main Crawler class
class UniversityCrawler(Spider):

    #Defining Name of Spider | It will be used for running spider
    #Use - scrapy crawl <spider-name>  [scrapeData in our case]
    name = 'UniversityCrawler'

    # Domains allowed during scrape session | Outer domains will be filtered
    # This section need not be altered in our case
    # So it can be left unaltered
Example #48
 def __init__(self, settings=None, install_root_handler=True):
     super(CrawlerProcess, self).__init__(settings)
     install_shutdown_handlers(self._signal_shutdown)
     configure_logging(self.settings, install_root_handler)
     log_scrapy_info(self.settings)
Example #49
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from tophub.spiders.github_spider import GitHubSpider
from tophub.spiders.juejin_spider import JueJinSpider
from tophub.spiders.douban_spider import DouBanSpiderFiction, \
    DouBanSpiderNonFiction
from tophub.spiders.reddit_spider import RedditSpider
from tophub.spiders.segmentfault_spider import SegmentFaultSpider
from tophub.spiders.hacker_news import HackerNewsSpider

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(get_project_settings())
runner.crawl(GitHubSpider)
runner.crawl(JueJinSpider)
runner.crawl(HackerNewsSpider)
runner.crawl(DouBanSpiderFiction)
runner.crawl(DouBanSpiderNonFiction)
# runner.crawl(RedditSpider)
runner.crawl(SegmentFaultSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()
Example #50
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

from spiders.course_spider import CourseSpider

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    yield runner.crawl(CourseSpider)
    reactor.stop()

crawl()
reactor.run() # the script will block here until the last crawl call is finished
Example #51
settings.set('ITEM_PIPELINES', {
    '__main__.JsonLinesExportPipeline': 100,
})
settings.set('USER_AGENT', 'Mozilla/5.0 (Windows NT 6.3; Win64; x64)')

# instantiate a spider
ff_spider = FFSpider()
ao_spider = AOSpider()

# instantiate a crawler passing in settings
# crawler = Crawler(settings)
ff_crawler = Crawler(ff_spider, settings)
ao_crawler = Crawler(ao_spider, settings)
# configure signals
ff_crawler.signals.connect(callback, signal=signals.spider_closed)
ao_crawler.signals.connect(callback, signal=signals.spider_closed)

# configure and start the crawler
# crawler.configure()
# crawler.crawl(spider)
ff_crawler.crawl()
# le_crawler.crawl()
# ao_crawler.crawl()

# start logging
# log.start()
log.configure_logging()

# start the reactor (blocks execution)
reactor.run()
Example #52
    def __init__(self, cfg_file_path, json_file_path,
                 site_index, shall_resume, daemonize, library_mode=False):
        # set up logging before it is configured via the config file;
        # this will be overwritten later, and all other levels will be
        # emitted as well if the level is changed.
        configure_logging({"LOG_LEVEL": "CRITICAL"})
        self.log = logging.getLogger(__name__)

        self.cfg_file_path = cfg_file_path
        self.json_file_path = json_file_path
        self.site_number = int(site_index)
        self.shall_resume = shall_resume \
            if isinstance(shall_resume, bool) else literal_eval(shall_resume)
        self.daemonize = daemonize \
            if isinstance(daemonize, bool) else literal_eval(daemonize)

        # set up the config file
        self.cfg = CrawlerConfig.get_instance()
        self.cfg.setup(self.cfg_file_path)
        self.log.debug("Config initialized - Further initialisation.")

        self.cfg_crawler = self.cfg.section("Crawler")

        # load the URL-input-json-file or - if in library mode - take the json_file_path as the site information (
        # kind of hacky..)
        if not library_mode:
            self.json = JsonConfig.get_instance()
            self.json.setup(self.json_file_path)
            sites = self.json.get_site_objects()
            site = sites[self.site_number]
        else:
            sites = [json_file_path]
            site = json_file_path

        if "ignore_regex" in site:
            ignore_regex = "(%s)" % site["ignore_regex"]
        else:
            ignore_regex = "(%s)" % \
                self.cfg.section('Crawler')['ignore_regex']

        # Get the default crawler. The crawler can be overwritten by fallbacks.
        if "additional_rss_daemon" in site and self.daemonize:
            self.crawler_name = "RssCrawler"
        elif "crawler" in site:
            self.crawler_name = site["crawler"]
        else:
            self.crawler_name = self.cfg.section("Crawler")["default"]
        # Get the real crawler-class (already "fallen back")
        crawler_class = self.get_crawler(self.crawler_name, site["url"])

        if not self.cfg.section('Files')['relative_to_start_processes_file']:
            relative_to_path = os.path.dirname(self.cfg_file_path)
        else:
            # absolute dir this script is in
            relative_to_path = os.path.dirname(__file__)

        self.helper = Helper(self.cfg.section('Heuristics'),
                             self.cfg.section("Files")["local_data_directory"],
                             relative_to_path,
                             self.cfg.section('Files')['format_relative_path'],
                             sites,
                             crawler_class,
                             self.cfg.get_working_path())

        self.__scrapy_options = self.cfg.get_scrapy_options()

        self.update_jobdir(site)

        # make sure the crawler does not resume crawling
        # if not stated otherwise in the arguments passed to this script
        self.remove_jobdir_if_not_resume()

        self.load_crawler(crawler_class,
                          site["url"],
                          ignore_regex)

        # start the job. if in library_mode, do not stop the reactor and so on after this job has finished
        # so that further jobs can be executed. it also needs to run in a thread since the reactor.run method seems
        # to not return. also, scrapy will attempt to start a new reactor, which fails with an exception, but
        # the code continues to run. we catch this exception in the function 'start_process'.
        if library_mode:
            start_new_thread(start_process, (self.process, False,))
        else:
            self.process.start()
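The comment above mentions a start_process helper that tolerates Scrapy's attempt to start a second reactor in library mode; its real implementation is not shown, but a hypothetical sketch under that assumption could look like this (the signature mirrors the (self.process, False) call above; the exception handling is an assumption).

from twisted.internet.error import ReactorAlreadyRunning


def start_process(process, stop_after_crawl=True):
    try:
        # CrawlerProcess.start() runs the Twisted reactor internally
        process.start(stop_after_crawl=stop_after_crawl)
    except ReactorAlreadyRunning:
        # expected in library mode: the reactor was already started elsewhere
        pass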
Example #53
def test_injection_failure(settings):
    configure_logging(settings)
    items, url, crawler = yield crawl_items(
        spider_for(UnressolvableProductPage), ProductHtml, settings)
    assert items == []
Example #54
 def __init__(self, settings=None, install_root_handler=True):
     super().__init__(settings)
     configure_logging(self.settings, install_root_handler)
     log_scrapy_info(self.settings)
     self._initialized_reactor = False
Example #55
 def __init__(self, id_list, crawl_list, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.id_list = id_list
     self.crawl_list = crawl_list
     configure_logging()
     self.runner = CrawlerRunner(get_project_settings())
Example #56
def newscrape(requested, ticker):
    # here begins the definition for the web scraper
    logging.getLogger('scrapy').propagate = False  # turn off logging
    targetdrop = requested['Percent'][len(requested) - 1] * 0.9
    #targetdrop = -5
    if targetdrop > 0:
        return 'Target percent was greater than 0- Bogged Again should only be used with dips.'

    #print(f'Target percent is {targetdrop}... gathering data')
    datelist = []  # placeholder for dates of interest
    percentlist = []
    finalpercentlist = []
    finalopenpercent = []
    for x in range(0, len(requested) - 3):
        if requested['Percent'][x] <= targetdrop:
            final = truncate(
                (requested["Close"][x + 3] - requested["Close"][x]) /
                requested["Close"][x])
            final_open = truncate(
                (requested["Open"][x + 3] - requested["Close"][x]) /
                requested["Close"][x])
            datelist.append(requested["Date"]
                            [x])  # saving items of interest to separate lists
            percentlist.append(requested["Percent"][x])
            finalpercentlist.append(final)
            finalopenpercent.append(final_open)
            #print(f'{ticker} suffered a {requested["Percent"][x]} percent drop on \
            #    {requested["Date"][x]}, 3 days later it went to {requested["Close"][x+3]} for a {final} percent change')
    datelist.append(requested["Date"][
        len(requested) - 1])  # we add the items for today as well to the lists
    percentlist.append(requested['Percent'][len(requested) - 1])
    finalpercentlist.append(
        0)  # place holder because we don't know the future percentage change
    finalopenpercent.append(0)
    datelist.reverse(
    )  # reverse all the lists to be in reverse chronological order
    percentlist.reverse()
    finalpercentlist.reverse()
    finalopenpercent.reverse()
    datesecs = []
    for date in datelist:
        datesecs.append(time.mktime(time.strptime(
            date, "%Y-%m-%d")))  # convert all the dates into seconds
    if len(datesecs) > 100:
        return "There are at least 100 similar drops in the past, which probably means it's not a big enough drop. "
    categorylist = ['Unknown'] * len(datesecs)
    finaloutput = pd.DataFrame({
        'Date': datelist,
        'Percent': percentlist,
        '3 Day Percent': finalpercentlist,
        '3 Day Open Percent': finalopenpercent
    })
    # launching the webspider here
    stuff = {'datesecs': datesecs, 'ticker': ticker}
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(get_project_settings())
    crawltime = abs(
        datesecs[0] - datesecs[-1]
    ) / (60 * 60 * 24)  # how many days are between the first and the last

    def run_spider(spidername, thing):
        #d = runner.crawl(spidername, thing)
        #return d
        #reactor.run()  # the script will block here until the crawling is finished
        #a = process.crawl(spidername, thing)  # the script will block here until the crawling is finished
        process = CrawlerProcess({
            'USER_AGENT':
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })
        process.crawl(
            spidername,
            thing)  # the script will block here until the crawling is finished
        #process.stop()

    #run_spider(QuotesSpider, stuff)
    # while the spider is still broken:
    filename = f'{lpath}stocks/headlines-{ticker}.csv'
    a = open(filename, 'w', newline='', encoding="utf-8")
    writer = csv.writer(a)
    writer.writerow(['Headline', 'URL'])  # adding a header for better pandas
    a.close()

    # since scrapy checks the news page by page, if the first and last date are far apart it may take some time for the
    # crawling to complete.
    timeout = 0
    while not os.path.exists(
            f'{lpath}stocks/headlines-{ticker}.csv') and timeout < 1500:
        time.sleep(
            5
        )  # assume it takes 1 second per page, 20 days of news on one page
        timeout += 5
    # if the spider ran successfully, it will have saved the headlines in a csv in the same directory
    if os.path.exists(f'{lpath}stocks/headlines-{ticker}.csv'):
        headlines = pd.read_csv(f'{lpath}stocks/headlines-{ticker}.csv',
                                encoding='unicode_escape')
        os.remove(f'{lpath}stocks/headlines-{ticker}.csv'
                  )  # deleting file once it's fulfilled its purpose
        newstext = []
        # make a dataframe out of all the stuff we care about, tack on the headlines to the side, and save it!
        finaloutput.assign(Headline=headlines.Headline)
        #finaloutput.assign(MatchPercent=headlines.MatchPercent)

        # unpickle pre-generated classifier to classify news
        f = open(f'{lpath}polls/boggedagain/my_classifier.pickle', 'rb')
        classifier = pickle.load(f)
        f.close()
        categorydecoder = {
            'AD': 'Analyst downgrade',
            'B': 'Bankruptcy',
            'CS': 'Company scandal',
            'LC': 'Leadership change',
            'LG': 'Lowered guidance',
            'LL': 'Lost lawsuit',
            'LS': 'Leadership scandal',
            'M': 'Merger',
            'NO': 'New options',
            'PO': 'Public offering',
            'R': 'Regulation',
            'RL': 'Restructuring/Layoff',
            'RM': 'Revenue miss',
            'SD': 'Sector dump',
            'SS': 'Stock split',
            'T': 'Trump',
            'TW': 'Trade war'
        }
        for x in range(0, len(headlines.Headline)):
            if headlines.Headline[x] != 'No match':
                filename = f'{ticker}-{time.strftime("%Y-%m-%d", time.gmtime(datesecs[x]))}'
                with open(f'{lpath}news/{filename}.txt',
                          encoding="utf-8") as f:
                    data = f.read()
                    f.close()
                    category = classifier.classify(cl.news_features(data))
                    categorylist[x] = categorydecoder[category]
                    # if this is not part of the training set yet, copy over as unlabeled so I can label it later.
                    if not os.path.exists(
                            f'{lpath}trainer/{category}/{filename}_labeled.txt'
                    ):
                        copyfile(f'{lpath}news/{filename}.txt',
                                 f'{lpath}trainer/{filename}.txt')
                        os.remove(f'{lpath}news/{filename}.txt')
        finaloutput.assign(Category=categorylist)
        finalfinal = finaloutput.join(headlines)
        finalfinal['Category'] = pd.Series(categorylist,
                                           index=finalfinal.index)
        return finalfinal
    else:
        return "QuoteSpider did not finish after 150 seconds. Either there are too many news items per page for the crawler " \
               "to parse through, or something went wrong with the spider and it hung. Please try again!"
Example #57
    def parse(self, response):
        pass
'''
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
#from scrapy.linkextractors.sgml import SgmlLinkExtractor
from itbooks.items import ItbooksItem
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor 
import logging
from scrapy.utils.log import configure_logging
from scrapy.shell import inspect_response


configure_logging(install_root_handler=False)
logging.basicConfig(
    filename='log.txt',
    format='%(levelname)s: %(message)s',
    level=logging.WARNING
)
class ItbooksSpider(CrawlSpider):
    name = "itbooks"
    allowed_domains = ["it-ebooks.info"]
    start_urls = ["http://it-ebooks.info/tag/programming/"]
 
    rules = (
#        Rule(SgmlLinkExtractor(allow=(r"page\/\d+\/$",), ), callback="parse_start_url", follow=True ),
#        Rule(LinkExtractor(allow=(r"page\/\d+\/$",), ), callback="parse_start_url_a", follow=False ),
        Rule(LinkExtractor(allow=(r"page\/\d+\/$",), ), callback="parse_start_url_a", follow=True ),
#         Rule(LinkExtractor(allow=(r"page\/\d+\/$",), ),),
Example #58
 def __configure_logging(self):
     configure_logging(install_root_handler=False)
     logging.basicConfig(filename=self.spider + '_log.txt',
                         format='%(levelname)s: %(message)s',
                         level=logging.INFO)
Example #59
from tests.mockserver import MockServer, MockDNSServer


class LocalhostSpider(Spider):
    name = "localhost_spider"

    def start_requests(self):
        yield Request(self.url)

    def parse(self, response):
        netloc = urlparse(response.url).netloc
        self.logger.info("Host: %s" % netloc.split(":")[0])
        self.logger.info("Type: %s" % type(response.ip_address))
        self.logger.info("IP address: %s" % response.ip_address)


if __name__ == "__main__":
    with MockServer() as mock_http_server, MockDNSServer() as mock_dns_server:
        port = urlparse(mock_http_server.http_address).port
        url = "http://not.a.real.domain:{port}/echo".format(port=port)

        servers = [(mock_dns_server.host, mock_dns_server.port)]
        reactor.installResolver(createResolver(servers=servers))

        configure_logging()
        runner = CrawlerRunner()
        d = runner.crawl(LocalhostSpider, url=url)
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
Example #60
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.spiders import Spider
from scrapy.selector import HtmlXPathSelector
from items import FaqscrapyItem
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.utils.project import get_project_settings
from spiders.FAQ_jingdong import JingdongSpider
from spiders.FAQ_suning import SuningSpider
import re

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)

    runner.crawl(JingdongSpider)
    runner.crawl(SuningSpider)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    # blocks process so always keep as the last statement
    reactor.run()