Example #1
def run():
    config.connect_to_client()

    print('Running event processor...')

    settings = get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    spiders = spider_loader.list()
    classes = [
        s
        for s in (spider_loader.load(name) for name in spiders
                  if config.spider_name is None or name == config.spider_name)
        if s.enabled
    ]

    crawlerProcess = CrawlerProcess(settings)

    for spider_class in classes:
        crawlerProcess.crawl(spider_class)

    crawlerProcess.start()
    crawlerProcess.join()

    print('Event processor completed')

    events = requests.get(config.get_events, params={})

    if len(events.json()) > 0:
        print('Data retrieved successfully')
    else:
        print('No data retrieved')
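A note on the pattern above: the snippet relies on project-specific helpers (config, and imports that are not shown), so it is not runnable on its own. A minimal self-contained sketch of the same "discover and crawl every spider in the project" idea, assuming it is executed from inside a Scrapy project directory, could look like this:

# Minimal sketch: load every spider registered in the project and crawl them all.
from scrapy import spiderloader
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def crawl_all_spiders():
    settings = get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)

    process = CrawlerProcess(settings)
    for name in spider_loader.list():
        process.crawl(spider_loader.load(name))

    process.start()  # blocks until every scheduled crawl has finished
    process.join()   # at this point the Deferred it returns has already fired


if __name__ == '__main__':
    crawl_all_spiders()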
Example #2
def main():
    settings = get_project_settings()

    process = CrawlerProcess(settings)
    process.crawl(IndiatimesSpider)
    process.start()
    process.join()
Example #3
 def run(self):  # run() is the fixed entry point; it is called automatically when the process starts
     print("Starting the foreground scraping task")
     settings = get_project_settings()
     settings.set(
         'USER_AGENT',
         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
         "Chrome/70.0.3538.77 Safari/537.36")
     settings.set('LOG_FILE', self.name + ".log")
     settings.set('ROBOTSTXT_OBEY', False)
     process = CrawlerProcess(settings)
     process.crawl(QuotesSpider, shop_name=self.name)
     process.start()
     process.join()
     print("前台抓取数据一轮完成")
     count = random.randint(10, 30)
     database = DataManager(self.name)
     attr = database.getAttr("EMPTY")
     if attr["minute"] != 0:
         count = attr["minute"] * 60
     minute = 0
     while minute <= count:
         database.handlerStatus()
         minute += 1
         time.sleep(1)
         attr = database.getAttr("EMPTY")
         count = attr["minute"] * 60
Example #4
def run():
    status, msg = config.connect_to_client()
    if not status:
        print(msg)
        sys.exit(1)

    # Look for one month of events for testing purposes
    start_date = datetime.now().strftime('%m-%d-%Y')
    end_date = (datetime.now() + relativedelta(months=+1)).strftime('%m-%d-%Y')

    print('Running event processor...')

    crawlerProcess = CrawlerProcess(get_project_settings())

    crawlerProcess.crawl(HistorySpider, start_date, end_date)
    crawlerProcess.crawl(WpbccSpider, start_date, end_date)
    crawlerProcess.crawl(LWVChicago, start_date, end_date)
    crawlerProcess.crawl(LibraryEvents, start_date, end_date)
    crawlerProcess.crawl(GreatLakesReader, start_date, end_date)

    crawlerProcess.start()
    crawlerProcess.join()

    print('Event processor completed')
 
    events = requests.get(config.db_get_events, params={
        'start_timestamp': 0,
        'end_timestamp': 10000000000
    })

    if len(events.json()) > 0:
        print('Data retrieved successfully')
    else:
        print('No data retrieved')
Example #5
def a():
    settings = get_project_settings()
    settings.set('ITEM_PIPELINES', {'pipl.MoviePipeline': 100})
    crawler = CrawlerProcess(settings)
    crawler.crawl(MeijuSpider)
    crawler.start()
    crawler.join()
Example #6
    def handleDatas(self, oriurl, orilable, urllists, delete_urllists):
        """
        When the download button is clicked, collect the added URLs and start the spider to download them.
        :return:
        """
        self.urllists = urllists
        if delete_urllists:
            # p_delete = Process(target=self.deleteOldDatas,args=(delete_urllists,))
            # p_delete.start()
            # p_delete.join()
            # p_delete = threading.Thread(target=self.deleteOldDatas, args=(delete_urllists,))
            # p_delete.start()
            # p_delete.join()
            self.deleteOldDatas(delete_urllists)

        aim_lables = []
        for item in urllists:
            aim_lables.append(orilable[oriurl.index(item)])

        # Run the insert statement here to add the new url+label data to the details table
        threading.Thread(target=self.insertUrlLableIntoSQL,
                         args=(urllists, aim_lables)).start()

        urllists = ','.join(urllists)
        # cmdline.execute(["scrapy", "crawl", "tmallMain","-a","url_lists="+urllists])
        process = CrawlerProcess(get_project_settings())
        # replace 'tmallMain' with your own spider name
        process.crawl('tmallMain', url_lists=urllists)
        process.start()  # the script will block here until the crawling is finished
        process.join()
Example #7
def run():
    settings = get_project_settings()
    crawler = CrawlerProcess(settings)
    crawler.crawl(RyanscomputersSpider)
    crawler.crawl(StartechSpider)
    crawler.start()
    crawler.join()
Example #8
def crawl_run():
    print('Starting crawl............')
    scope = 'all'
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(SentispiderSpider, scope)
    process.start()
    process.join()
    print('Crawl finished............')
Example #9
def crawl_policy_watch():
    """ Starts crawling process which fetches government policies from website covid19policywatch.org"""
    settings = Settings()
    process = CrawlerProcess(settings)

    process.crawl(PolicyWatchSpider)
    process.start()
    process.join()
Example #10
    def run(self):
        """
        Starting client and scrapping jobs. And then get results from
        scrapping (url list of images) and start processes in pool for
        downloading and storing non-duplicate images.
        :return: self
        """
        if self.hashes is None:
            logging.error('prepare() was not called before run()')
            return None
        # results = []
        queue = multiprocessing.Queue()
        pool = [
            multiprocessing.Process(target=self._queue_worker, args=(queue, ))
            for _ in range(self.num_processes)
        ]
        for process in pool:
            process.start()

        # pool = multiprocessing.Pool(self.num_processes, self._worker_main, (queue,))

        def crawler_results(signal, sender, item, response, spider):
            """
            Helper called for each scraped item; pushes the item's image URLs onto the queue.
            :param signal:
            :param sender:
            :param item:
            :param response:
            :param spider:
            :return:
            """
            # results.append(item)
            for x in item['urls']:
                queue.put(x)

        dispatcher.connect(crawler_results, signal=signals.item_passed)
        process = CrawlerProcess({
            'USER_AGENT':
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })
        process.crawl(WallpapersSpider,
                      start_time=self.start_time,
                      end_time=self.end_time,
                      resolution=self.resolution,
                      start_url=self.BASE_URL)
        logging.getLogger('scrapy').setLevel(logging.ERROR)
        process.start()
        for _ in range(self.num_processes):
            queue.put('STOP')
        # results = [x for res in results for x in res['urls']]

        # logging.info(f'ALL IMAGES URLS: {", ".join(results)}')

        # with multiprocessing.Pool(self.num_processes) as pool:
        #     pool.map(self._process_urls, results)
        for process in pool:
            process.join()
        return self
Example #11
def run_scrapy(spider):
    """
    利用scrapy爬虫框架启动weahter爬虫抓取从2011-2019年所有月份的天气数据信息,并存放到MangoDB数据库中
    """
    project_settings = get_project_settings()
    process = CrawlerProcess(project_settings)
    process.crawl(spider)
    process.start()
    process.join()  # inter-process synchronization
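Example #10 above wires a callback to Scrapy's item signal through pydispatch and a multiprocessing queue. A stripped-down, self-contained sketch of just the signal part (the spider, URL and settings below are illustrative assumptions, not taken from the example) might be:

# Minimal sketch: collect items in memory via the item_scraped signal.
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}


collected = []


def on_item_scraped(item, response, spider):
    # Called once for every item that has passed through the item pipelines.
    collected.append(item)


process = CrawlerProcess({'LOG_LEVEL': 'ERROR'})
crawler = process.create_crawler(QuotesSpider)
crawler.signals.connect(on_item_scraped, signal=signals.item_scraped)
process.crawl(crawler)
process.start()
print(f'collected {len(collected)} items')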
Example #12
def main(*, query_type, url):

    file_path = _get_file_path(query_type)
    process = CrawlerProcess({
        **get_project_settings(), "FEED_URI": file_path,
        "FEED_FORMAT": "csv"
    })
    process.crawl('immoscout', url=url)
    process.start()
    process.join()
    _post_process(query_type, suffix="csv")
Example #13
def ensure_msig_path() -> str:
    """Download the GSEA data and return the path."""
    if not os.path.exists(GMT_ENTREZ_PATH):
        process = CrawlerProcess({
            'USER_AGENT':
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        })
        process.crawl(GSEASpider)
        process.start()
        process.join()
    return GMT_ENTREZ_PATH
Example #14
def runSpider(uid):
    print(uid)
    dir_name = './result/' + str(uid)
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)
    process = CrawlerProcess(get_project_settings())
    print(get_project_settings())
    # name = ['weibocn']
    process.crawl('weibocn', uid)
    process.start()
    process.join()
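In Example #14 the extra argument to process.crawl() is forwarded to the spider's constructor. A minimal sketch of the more common keyword form of that mechanism (the spider class, URL and uid parameter here are hypothetical, not from the example):

# Minimal sketch: passing an argument to a spider through CrawlerProcess.crawl().
import scrapy
from scrapy.crawler import CrawlerProcess


class UserSpider(scrapy.Spider):
    name = 'user'

    def __init__(self, uid=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Arguments passed to crawl() arrive here as constructor arguments.
        self.start_urls = [f'https://example.com/users/{uid}']

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}


process = CrawlerProcess({'LOG_LEVEL': 'INFO'})
process.crawl(UserSpider, uid='12345')  # keyword arguments avoid clashing with Spider's own parameters
process.start()
process.join()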
Example #15
def crawl_lad(depth=lad_depth, urls=None, domain=lad_domain):
    """Starts crawling process which downloads pdfs from all prepared .gov websites"""
    if urls is None:
        urls = list(get_gov_websites(gov_sites_path))

    settings = scrapy_settings(depth, concurrent_requests)
    process = CrawlerProcess(settings)

    process.crawl(LadSpider, urls, domain)
    process.start()
    process.join()
Example #16
    def crawl(self):
        """ crawl through the database and either save the results to a database or text files. """
        # setup settings
        from scrapy.settings import Settings
        from scrapytest.spiders import GuardianNewsSpider

        settings = Settings()
        settings.set("USER_AGENT", config['crawler_user_agent'])
        settings.set("LOG_LEVEL", self._args['log_level'])
        settings.set('custom_guardian_config', self._custom_guardian_config)

        crawler = CrawlerProcess(settings=settings)
        crawler.crawl(GuardianNewsSpider)
        crawler.start()
        crawler.join()
Example #17
def main():
    if len(sys.argv) != 2:
        print('usage: run_spider.py file-config')
        sys.exit(1)
        
    file_config = sys.argv[1]
    
    if not os.path.exists(file_config):
        print(f'Not found: {file_config}')
        sys.exit(1)

    configure_logging()
    settings = get_project_settings()

    _murlok = Murlok(file_config)

    formats = ('json', 'xml', 'jsonlines', 'csv', 'pickle', 'marshal')
    settings.set('FEED_EXPORT_ENCODING', str(_murlok.encoding))

    if _murlok.format in formats:
        settings.set('FEED_FORMAT', str(_murlok.format))
        settings.set('FEED_URI', str(_murlok.spider) + '.' + str(_murlok.format))
    else:
        # Pipeline config with peewee - pip install -U peewee
        # formats: mysql, postgres, sqlite
        settings.set('ITEM_PIPELINES', {'murlok.pipelines.MurlokPipeline': 300})

    runner = CrawlerProcess(settings)
    runner.crawl(MurlokSpider, murlok=_murlok)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
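The join()/addBoth(reactor.stop)/reactor.run() sequence above (also used in Examples #19, #21 and #32) mirrors the pattern Scrapy documents for CrawlerRunner, where join() returns a Deferred that fires once all scheduled crawls are finished. A minimal sketch of that pattern, assuming it runs inside a project that defines a spider named 'example':

# Minimal sketch: schedule crawls with CrawlerRunner and drive the reactor manually.
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())

runner.crawl('example')              # schedule one or more crawls
d = runner.join()                    # Deferred that fires when all crawls finish
d.addBoth(lambda _: reactor.stop())  # stop the reactor afterwards

reactor.run()                        # blocks here until every crawl is done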
Example #18
def main(args):
    settings = Settings()
    settings.setmodule(iw_settings)

    process = CrawlerProcess(settings)
    # create_crawler() expects a spider class (or a Crawler), not a spider instance
    crawler = process.create_crawler(ThedySpider)
    crawler.signals.connect(item_scraped, signal=signals.item_scraped)
    process.crawl(crawler)
    process.start(stop_after_crawl=True)
    process.join()

    result["scraping_time"] = result["scraping_time"].isoformat()

    doc = {"doc": dict(result)}

    return doc
Example #19
def runAllCities(cityPairs, days):
	a = time.time()
	process = CrawlerProcess(get_project_settings())
	for pair in cityPairs:
		process.crawl(SWAFareSpider, fromCity = pair[0], days = days, toCity = pair[1])		
	d = process.join()
	d.addBoth(lambda _: reactor.stop())
	reactor.run() # the script will block here until all crawling jobs are finished
	print("crawl time: " + str(time.time() - a))
Example #20
def main(argv=None):
    """Entry point to the anime planet link scraper."""
    argv = argv or sys.argv[1:]
    parser = argparse.ArgumentParser("""Scrape anime character profiles.""")
    parser.add_argument("--manifest",
                        metavar="OUTPUT",
                        type=str,
                        default=None,
                        required=False)
    parser.add_argument("--pages-directory",
                        metavar="PAGES",
                        type=str,
                        required=True)
    result = parser.parse_args(argv)

    # maybe get the previous manifest entries (to write back out into the new
    # manifest).
    if result.manifest and os.path.exists(result.manifest):
        with open(result.manifest, "r") as fileobj:
            previous_manifest = json.load(fileobj)
    else:
        previous_manifest = []

    with (open(result.manifest, "w") if result.manifest else sys.stdout) as manifest_fileobj, \
             JSONListStream(manifest_fileobj) as json_stream:
        previously_scraped_urls = set()
        for item in previous_manifest:
            json_stream.write(item)
            previously_scraped_urls.add(item["url"])
        for entry in os.scandir(result.pages_directory):
            filename = os.path.basename(entry.path)
            b64_url, ext = filename.split(".")
            if ext != "html":
                continue
            previously_scraped_urls.add(base64_urldecode(b64_url))
        spider_cls = \
            make_anime_planet_spider_cls(previously_scraped_urls)
        process = CrawlerProcess({"COOKIES_ENABLED": False})
        process.crawl(spider_cls,
                      manifest_file=json_stream,
                      pages_directory=result.pages_directory,
                      previously_scraped_urls=previously_scraped_urls)
        process.start()
        process.join()
Example #21
def runUserFlights(userFlights):
	a = time.time()
	process = CrawlerProcess(get_project_settings())
	for flight in userFlights:
		if flight.date > datetime.now(): #check in timezone of flight..
			process.crawl(SWAFareSpider, fromCity = flight.origin, days = 1, toCity = flight.destination, startDate = flight.date)		
	d = process.join()
	d.addBoth(lambda _: reactor.stop())
	reactor.run() # the script will block here until all crawling jobs are finished
	print("crawl time: " + str(time.time() - a))
Example #22
def run_all_spiders():
    process_default = CrawlerProcess()
    # We assume that the default spider has to run first and finish before the other spiders run.
    process_default.crawl(DefaultQuotesSpider)
    process_default.start()
    time.sleep(120)
    # process_default.stop()

    # NOTE: calling start() on a second CrawlerProcess in the same Python
    # process raises twisted.internet.error.ReactorNotRestartable, because the
    # Twisted reactor cannot be restarted; see the CrawlerRunner sketch after
    # this example for a sequential alternative.
    process = CrawlerProcess()
    process.join()
    active_spiders = [
        TableQuotesSpider,
        JavascriptQuotesSpider,
        LoginQuotesSpider,
        InfiniteScrollQuotesSpider,
    ]
    for spider in active_spiders:
        process.crawl(spider)
    process.start()
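Because the Twisted reactor cannot be restarted, the second process.start() in Example #22 fails in practice. The sequential pattern from the Scrapy documentation, built on CrawlerRunner and inlineCallbacks, is one way around this; a sketch using the spider classes from the example (their import path is not shown in the original, so it is left as a hypothetical comment):

# Minimal sketch: run the default spider first, then the remaining spiders, in one process.
from twisted.internet import defer, reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

# from myproject.spiders import (DefaultQuotesSpider, TableQuotesSpider,
#     JavascriptQuotesSpider, LoginQuotesSpider, InfiniteScrollQuotesSpider)  # hypothetical path

configure_logging()
runner = CrawlerRunner()


@defer.inlineCallbacks
def crawl_sequentially():
    # The default spider must finish before the other spiders start.
    yield runner.crawl(DefaultQuotesSpider)
    for spider in (TableQuotesSpider, JavascriptQuotesSpider,
                   LoginQuotesSpider, InfiniteScrollQuotesSpider):
        yield runner.crawl(spider)
    reactor.stop()


crawl_sequentially()
reactor.run()  # blocks here until all crawls have finished sequentially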
Example #23
 def _crawl(self, spider, qis_running):
     write_in_a_file('CrawlerProcess.signal.error', {'signals': dir(signals)}, 't.txt')
     qis_running.put(spider)
     crawler = CrawlerProcess(get_project_settings())
     crawler.crawl(spider)
     # To prevent the infamous error: django.db.utils.InterfaceError: (0, '')
     db.connection.close()
     crawler.start()
     write_in_a_file('SpiderProcess.start: process started', {}, 'debug.txt')
     crawler.join()
     write_in_a_file('SpiderProcess.crawl: process joined', {}, 'task.txt')
     write_in_a_file('SpiderProcess.crawl: process joined', {}, 'tasks.txt')
     write_in_a_file('SpiderProcess.crawl: process joined', {}, 'spider.txt')
     write_in_a_file(f'Crawler Process - before: qis_running.qsize: {qis_running.qsize()}', {}, 'tasks.txt')
     try:
         qis_running.get()
     except Exception as e:
         write_in_a_file(f'Crawler Process - error in qis_running.get: {e}', {}, 'tasks.txt')
     write_in_a_file(f'Crawler Process - after: qis_running.qsize: {qis_running.qsize()}', {}, 'tasks.txt')
     write_in_a_file('===========================================================================================', {}, 'tasks.txt')
Example #24
def main(args):
    settings = Settings()
    settings.setmodule(iw_settings)

    process = CrawlerProcess(settings)
    # create_crawler() expects a spider class (or a Crawler), not a spider instance
    crawler = process.create_crawler(ThedySpider)
    crawler.signals.connect(item_scraped,
                            signal=signals.item_scraped)
    process.crawl(crawler)
    process.start(stop_after_crawl=True)
    process.join()

    result["scraping_time"] = result["scraping_time"].isoformat()

    doc = {
        "doc": dict(result)
    }

    return doc
Example #25
class RunCrawler(object):
    """
    RunCrawler class
    """
    def __init__(self):
        self.running = False
        self.process = None

    def start(self):
        """
        Start
        """
        if self.running:
            return
        self.running = True
        # get the shared project settings
        self.process = CrawlerProcess(get_project_settings())
        # start a spider for each enabled rule
        proxyRules = ProxyRules()
        for (k, v) in proxyRules.Rules.items():
            if isinstance(v, dict):
                if 'enable' in v and v['enable']:
                    logger.info('Start crawl name:%(name)s rule:%(rule)s', {
                        'name': v['name'],
                        'rule': k
                    })
                    self.process.crawl(CommonSpider, v)

        # the script will block here until the crawling is finished
        self.process.start()
        self.process.join()
        self.process.stop()
        self.running = False

    def stop(self):
        """
        Stop
        """
        if self.running:
            self.running = False
            self.process.stop()
Example #26
def main():
	'''
	Starts harvest script
	'''
	logger.info('launching main')
	# tz = pytz.timezone('America/Los_Angeles')
	tz = pytz.timezone('EST')
	start = datetime.datetime.now(tz=tz)

	if config['use_proxy']:
		update_proxies()

	# launching crawlers
	store = Store()
	process = CrawlerProcess(custom_settings)
	for spider in spiders:
		logger.info(f'starting {spider.name}')
		process.crawl(spider, store=store)
	process.start()
	process.join()

	end = datetime.datetime.now(tz=tz)
	logger.info(f"runtime: {end - start}")
Example #27
class CrawlerExecutor():

    count = 0

    def __init__(self, spider):
        def increment_count(item, response, spider):
            print('incrementing count')
            CrawlerExecutor.count += 1

        # weak=False keeps strong references so these local callbacks are not
        # garbage-collected before the signals fire
        dispatcher.connect(lambda spider: print('finished'),
                           signal=signals.spider_closed, weak=False)
        dispatcher.connect(increment_count, signal=signals.item_passed,
                           weak=False)
        settings = get_project_settings()
        self.process = CrawlerProcess(settings)

        self.spider = spider

    def start(self):
        self.process.crawl(self.spider)
        self.process.start()

    def join(self):
        self.process.join()
Example #28
def getDomainUrls(startUrl, allowedDomains, invalidStrings):
    pool = Urllib3PoolFactory.getPool()
    request = pool.request('GET', startUrl + '/robots.txt')

    disallowUris = []
    try:
        # request.data is bytes, so decode it before splitting into lines
        for line in request.data.decode('utf-8', errors='ignore').splitlines():
            if 'disallow' in line.lower():
                disallowUris.append(line.lower().split(':', 1)[1].strip())
    except Exception:
        pass

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_ENABLED': True,
    })

    ScrapyCrawler.configure([startUrl], allowedDomains, invalidStrings,
                            disallowUris)
    process.crawl(ScrapyCrawler)
    process.start()
    process.join()
    return ScrapyCrawler.domainUrls
Example #29
    os.chdir('data_aggregators')

    # Look for one month of events for testing purposes
    start_date = datetime.now().strftime('%m-%d-%Y')
    end_date = (datetime.now() + relativedelta(months=+1)).strftime('%m-%d-%Y')

    print('Running data engine...')

    crawlerProcess = CrawlerProcess(get_project_settings())

    crawlerProcess.crawl(HistorySpider, start_date, end_date)
    crawlerProcess.crawl(WpbccSpider, start_date, end_date)
    crawlerProcess.crawl(LWVchicago, start_date, end_date)
    crawlerProcess.crawl(LibraryEvents, start_date, end_date)
    crawlerProcess.crawl(GreatLakesReader, start_date, end_date)

    crawlerProcess.start()
    crawlerProcess.join()

    print('Data engine complete')

    events = requests.get(config.db_get_events,
                          params={
                              'start_timestamp': 0,
                              'end_timestamp': 10000000000
                          })

    if len(events.json()) > 0:
        print('Data retrieved successfully')
    else:
        print('No data retrieved')
'''
import logging

import gevent

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from tddc import Singleton, TaskManager, Task, TaskCacheManager, TaskConfigModel, DBSession

from .Scrapy import SingleSpider

settings = get_project_settings()
crawler_process = CrawlerProcess(settings)
crawler_process.join()

log = logging.getLogger(__name__)


class Crawler(object):
    '''
    Spider management and task dispatching
    '''
    __metaclass__ = Singleton

    def __init__(self):
        '''
        Constructor
        '''
        log.info('Spider Is Starting.')
# settings.set('LOG_LEVEL', 'ERROR')

# pipeline configuration
settings.setdict({
    'ITEM_PIPELINES': {
        # 'shopper.pipelines.CsvPipeline': 300,  # export crawl results to a csv file
        'shopper.pipelines.JsonPipeline': 300,  # export crawl results to a json file
        # 'shopper.pipelines.CrawlNewPipeline': 300, # DB import
    }
})
# Slack integration
settings.setdict(
    {'EXTENSIONS': {
        'shopper.middleware.slack_middleware.SlackStats': 100,
    }})
slack = SlackSum(settings.get("SLACK_TOKEN"), settings.get("SLACK_CHANNEL"),
                 settings.get("SLACK_BOT"))

process = CrawlerProcess(settings)

# process.crawl(ChanelSpider)
# process.crawl(YslSpider)
# process.crawl(LouisVuittonSpider)
# process.crawl(GucciSpider)
process.crawl(HermesSpider)

process.start()
process.join()

# send the crawl result data
# slack.total_finish()
Example #32
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging

from scrapy.utils.project import get_project_settings



configure_logging()
runner = CrawlerProcess(get_project_settings())
runner.crawl(AbcdinSpider)
runner.crawl(CasaximenaSpider)
runner.crawl(CoronaSpider)
runner.crawl(DafitiSpider)
runner.crawl(EasySpider)
runner.crawl(FalabellaSpider)
runner.crawl(HitesSpider)
runner.crawl(LapolarSpider)
runner.crawl(LinioSpider)
runner.crawl(ParisSpider)
runner.crawl(PcfactorySpider)
runner.crawl(RipleySpider)
runner.crawl(SodimacSpider)
runner.crawl(ZmartSpider)

d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run() # the script will block here until all crawling jobs are finished