Example #1
    def _retrieve_phrase_data_subprocess(self, queue: Queue) -> None:
        """Retrieve a list of matching phrases from an HTML document.

        The method is meant to run in a child process, so instead of
        returning it puts the result on the given queue: a list of strings
        containing the searched phrase, an exception if the crawl failed,
        or None if the parameters were invalid. (A caller sketch follows
        this example.)
        """
        # Set up a crawler process to use a spider
        runner = CrawlerRunner(self._crawler_meta)

        # Middleware between downloader and spider
        dispatcher.connect(self.crawler_results, signal=signals.item_passed)
        dispatcher.connect(reactor.stop, signal=signals.spider_closed)

        # Apply requests from spider - Add arguments to initialize the spider
        if self.check_parameters():
            try:
                deferred = runner.crawl(self._spider_bot, self._parameter_dict)

                deferred.addBoth(lambda _: reactor.stop())

                reactor.run()
                queue.put(self._crawler_results)
            except Exception as e:
                queue.put(e)
        else:
            queue.put(None)
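The method above does not return its phrase list directly; it is meant to run in a child process and hands everything back through the queue. A minimal caller-side sketch, assuming the method lives on a class called PhraseCrawler (a hypothetical name) that takes no constructor arguments:

import multiprocessing

crawler = PhraseCrawler()                     # hypothetical owner class of the method above
queue = multiprocessing.Queue()
worker = multiprocessing.Process(
    target=crawler._retrieve_phrase_data_subprocess, args=(queue,))
worker.start()
result = queue.get()                          # blocks until the child process puts its result
worker.join()
if isinstance(result, Exception):
    raise result                              # the child forwarded a crawl error
elif result is None:
    print('parameter check failed, nothing was crawled')
else:
    print(f'{len(result)} phrases collected')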
Example #2
def spider_results(spidername, keywords, pagenum, sorttype):
    spider_class = None
    if spidername == 'bing':
        spider_class = BingSpider
    elif spidername == 'weixin':
        spider_class = SogouWxSpider
    elif spidername == 'weibo':
        spider_class = WeiboSpider
    elif spidername == 'baidu':
        spider_class = BaiduSpider
    elif spidername == 'baidunews':
        spider_class = BaidunewsSpider
    elif spidername == "ss_360":
        spider_class = Ss360Spider
    elif spidername == "ss_360_zx":
        spider_class = Ss360ZZSpider
    elif spidername == "chinaso":
        spider_class = ChinaSoSpider
    elif spidername == "chinaso_news":
        spider_class = ChinaSoNewsSpider
    else:
        return []

    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(dict(item))

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_class, keywords=keywords,
                  pagenum=pagenum, sorttype=sorttype)
    process.start()  # the script will block here until the crawling is finished
    return json.dumps(results, ensure_ascii=False).encode('gbk', 'ignore').decode('gbk')
Example #3
def test_parser_true(start_time, end_time, resolution, start_url, correct_res):
    res = []

    def crawler_results(signal, sender, item, response, spider):
        """
        help function for getting result when one page scrapped
        :param signal:
        :param sender:
        :param item:
        :param response:
        :param spider:
        :return:
        """
        for x in item['urls']:
            res.append(x)

    dispatcher.connect(crawler_results, signal=signals.item_passed)
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(WallpapersSpider,
                  start_time=start_time,
                  end_time=end_time,
                  resolution=resolution,
                  start_url=start_url)
    process.start()

    assert sorted(correct_res) == sorted(res)
Example #4
    def scrape_with_crochet(self, domain):
        """
                signal fires when single item is processed and calls _crawler_result to save that item.

        Consider some synchronous do-one-thing-after-the-other application code that wants to use event-driven Twisted-using code.
        We have two threads at a minimum: the application thread(s) and the reactor thread. There are also multiple layers
        of code involved in this interaction

        Twisted code: Should only be called in reactor thread. This may be code from the Twisted package itself, or more
        likely code you have written that is built on top of Twisted.

        @wait_for/@run_in_reactor wrappers: The body of the functions runs in the reactor thread... but the caller
        should be in the application thread.

        The application code: Runs in the application thread(s), expects synchronous/blocking calls.
        dispatcher.connect will connect to the dispatcher that will kind of loop the code between these two functions.
        crawl_runner.crawl will connect to the our particular spider function based on the domain name,
        in our scrapy file and after each yield will pass to the crawler_result function.
        The setting.py is applied to the crawl runner.

        :param domain: the domain to crawl
        :return: a twisted.internet.defer.Deferred

        """
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        crawler_settings = Settings()
        crawler_settings.setmodule(sets)
        self.crawl_runner.settings = crawler_settings
        dispatcher.connect(self._crawler_result, signal=signals.item_scraped)

        for i in self.dict_of_spiders:
            if i in domain:
                eventual = self.crawl_runner.crawl(self.dict_of_spiders[i], category=domain)
                return eventual
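The docstring above describes crochet's threading model, but the @wait_for/@run_in_reactor wrappers themselves are not shown. A minimal, self-contained sketch of that pattern, assuming the crochet package is installed; the QuotesSpider class, the demo URL and the 30-second timeout are illustrative assumptions, not part of the original project:

import scrapy
from crochet import setup, wait_for
from scrapy.crawler import CrawlerRunner

setup()                                  # start the Twisted reactor in a background thread

runner = CrawlerRunner()


class QuotesSpider(scrapy.Spider):
    # Illustrative spider; any project spider could be used here.
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        for text in response.css('div.quote span.text::text').getall():
            yield {'quote': text}


@wait_for(timeout=30.0)
def crawl_blocking():
    # The body runs in the reactor thread; the caller blocks in the
    # application thread until the returned Deferred fires.
    return runner.crawl(QuotesSpider)


crawl_blocking()                         # ordinary synchronous call from application code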
Example #5
def start_crawler(start_url, max_parsed_pages, num_processes, db):
    queue = multiprocessing.Queue()
    pool = [
        multiprocessing.Process(target=queue_worker, args=(queue, db))
        for _ in range(num_processes)
    ]
    for process in pool:
        process.start()

    def crawler_results(signal, sender, item, response, spider):
        """
        help function for getting result when one page scrapped
        :param signal:
        :param sender:
        :param item:
        :param response:
        :param spider:
        :return:
        """
        queue.put(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(AdvertisementScrapper,
                  start_url=start_url,
                  max_parsed_pages=max_parsed_pages)
    process.start()
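queue_worker and db are used above but not defined in the snippet. A minimal sketch of what such a worker could look like, assuming a 'STOP' sentinel ends the loop and that db exposes a save() method; both names are assumptions for illustration:

def queue_worker(queue, db):
    """Consume scraped items from the queue and persist them until a sentinel arrives."""
    while True:
        item = queue.get()       # blocks until the crawler puts an item
        if item == 'STOP':       # assumed sentinel that shuts the worker down
            break
        db.save(dict(item))      # db.save() stands in for the real persistence call

Note that start_crawler as shown never sends such a sentinel or joins the pool; a complete version would do both after process.start() returns.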
Example #6
    def handle(self, *args, **options):
        if not options.get("period"):
            target_date = date.today() + relativedelta(months=-2)
            target_date = target_date.strftime("%m/%Y")
        else:
            target_date = options.get("period")

        dispatcher.connect(self.save, signal=signals.item_passed)
        os.environ["SCRAPY_SETTINGS_MODULE"] = "scraper.settings"
        settings = get_project_settings()
        settings["COOKIES_ENABLED"] = True

        if options.get("scrapy_args"):
            scrapy_args = json.loads(options.get("scrapy_args"))
            settings.update(scrapy_args)

        process = CrawlerProcess(settings=settings)

        args = {
            "unidade": options.get("unit"),
            "competencia": target_date,
            "cidade": "feira de santana",
            "periodicidade": options.get("period_type"),
        }
        self.warn(str(args))
        process.crawl(ConsultaPublicaSpider, **args)
        self.warn("Iniciando a coleta dos documentos do TCM-BA...")
        process.start()
        self.success("Pronto!")
Example #7
def spider_results():
    results = []
    settings = Settings()

    os.environ['SCRAPY_SETTINGS_MODULE'] = 'LyricsFinder.LyricsFinder.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')

    process = CrawlerProcess(settings)

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    song = '+'.join(args['song_name'].split())
    if args['singer']:
        singer = '+'.join(args['singer'].split())
        query = song + '+by+' + singer + '+lyrics' + '+-site:youtube.com'
    else:
        query = song + '+lyrics' + '+-site:youtube.com'
    # print(query)
    process.crawl(LyricsFinderSpider,
                  start_urls=["https://www.google.com/search?q=" + query])
    process.start()
    return results
Example #8
    def handle(self, *args, **options):
        if options.get("drop_all"):
            self.warn("Dropping existing records...")
            CityCouncilAgenda.objects.all().delete()
            CityCouncilAttendanceList.objects.all().delete()

            if os.getenv("FEATURE_FLAG__SAVE_GAZETTE", False):
                Gazette.objects.all().delete()
                GazetteEvent.objects.all().delete()

        dispatcher.connect(self.save, signal=signals.item_passed)
        os.environ["SCRAPY_SETTINGS_MODULE"] = "scraper.settings"
        process = CrawlerProcess(settings=get_project_settings())
        process.crawl(
            AgendaSpider,
            start_from_date=CityCouncilAgenda.last_collected_item_date(),
        )
        process.crawl(AttendanceListSpider)

        if os.getenv("FEATURE_FLAG__SAVE_GAZETTE", False):
            last_collected_gazette = Gazette.last_collected_item_date()
            if last_collected_gazette is None:
                process.crawl(LegacyGazetteSpider)
            process.crawl(
                ExecutiveAndLegislativeGazetteSpider,
                start_from_date=last_collected_gazette,
            )

        process.start()
        self.success("Done!")
Example #9
def scrape_with_crochet(baseURL):
    # Register _crawler_result with the dispatcher so it runs each time an item is scraped.
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)

    # Start ReviewspiderSpider from our Scrapy project; after each yield the item is passed to _crawler_result.
    eventual = crawl_runner.crawl(ReviewspiderSpider, category=baseURL)
    return eventual
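The snippet depends on module-level pieces that are not shown: a crochet setup call, the crawl_runner, and the _crawler_result handler. A minimal sketch of that surrounding module, assuming the items are simply collected in a list named output_data (an assumed name):

from crochet import setup
from scrapy.crawler import CrawlerRunner

setup()                                   # run the Twisted reactor in a background thread

crawl_runner = CrawlerRunner()            # project settings could be passed here
output_data = []                          # assumed container for the scraped items


def _crawler_result(signal, sender, item, response, spider):
    # Called once per scraped item via the item_scraped signal.
    output_data.append(dict(item))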
Example #10
def run_crawler2(q):
    print('run_crawler')
    def close():
        q.put('close')
        print('CLOSE')

    def scraped():
        q.put('scraped')
        print('SCRAPED')

    try:
        print('run_crawler')
        crawler_settings = get_project_settings()
        runner = CrawlerRunner(crawler_settings)
        dispatcher.connect(close, signal=signals.spider_closed)
        dispatcher.connect(scraped, signal=signals.item_scraped)
        deferred = runner.crawl(InfoempleoSpider)
        deferred.addBoth(lambda _: reactor.stop())
        print('reactor...')
        q.put('reactor...')
        reactor.run()
        print('run!!!!!')
        q.put('run')
    except Exception as e:
        print(e)
        q.put(e)
Example #11
def scrape_paytm_with_crochet(retailer_id, search_string, category_name):
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    eventual = crawl_runner.crawl(PaytmscraperSpider,
                                  retailer_id=retailer_id,
                                  search_string=search_string,
                                  category_name=category_name)
    return eventual
Example #12
def scrape_with_crochet(post_form, post_head):

    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    eventual = crawler_runner.crawl(OjkCFS_Spider,
                                    req_head=post_head,
                                    req_form=post_form)
    # dispatcher.connect(_crawler_stop, signals.engine_stopped)
    return eventual
Example #13
def scrape_croma_with_crochet(retailer_id, search_string, category_name):
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    print(f"Croma retailer ID {retailer_id}")
    eventual = crawl_runner.crawl(CromascraperSpider,
                                  retailer_id=retailer_id,
                                  search_string=search_string,
                                  category_name=category_name)
    return eventual
Example #14
    def run(self):
        """
        Starting client and scrapping jobs. And then get results from
        scrapping (url list of images) and start processes in pool for
        downloading and storing non-duplicate images.
        :return: self
        """
        if self.hashes is None:
            logging.error('prepare() was not called before run()')
            return None
        # results = []
        queue = multiprocessing.Queue()
        pool = [
            multiprocessing.Process(target=self._queue_worker, args=(queue, ))
            for _ in range(self.num_processes)
        ]
        for process in pool:
            process.start()

        # pool = multiprocessing.Pool(self.num_processes, self._worker_main, (queue,))

        def crawler_results(signal, sender, item, response, spider):
            """
            help function for getting result when one page scrapped
            :param signal:
            :param sender:
            :param item:
            :param response:
            :param spider:
            :return:
            """
            # results.append(item)
            for x in item['urls']:
                queue.put(x)

        dispatcher.connect(crawler_results, signal=signals.item_passed)
        process = CrawlerProcess({
            'USER_AGENT':
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })
        process.crawl(WallpapersSpider,
                      start_time=self.start_time,
                      end_time=self.end_time,
                      resolution=self.resolution,
                      start_url=self.BASE_URL)
        logging.getLogger('scrapy').setLevel(logging.ERROR)
        process.start()
        for _ in range(self.num_processes):
            queue.put('STOP')
        # results = [x for res in results for x in res['urls']]

        # logging.info(f'ALL IMAGES URLS: {", ".join(results)}')

        # with multiprocessing.Pool(self.num_processes) as pool:
        #     pool.map(self._process_urls, results)
        for process in pool:
            process.join()
        return self
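The _queue_worker method and the download/deduplication step are not shown. A rough sketch of what it could look like, assuming self.hashes is a set of content hashes, that images are written to an images/ directory, and that requests is used for downloading; all of these are illustrative assumptions:

import hashlib
import logging
import os

import requests


def _queue_worker(self, queue):
    """Download image URLs from the queue, skipping images whose hash was already seen."""
    while True:
        url = queue.get()
        if url == 'STOP':                          # sentinel pushed by run() for each worker
            break
        try:
            data = requests.get(url, timeout=30).content
        except requests.RequestException as exc:
            logging.warning('download failed for %s: %s', url, exc)
            continue
        digest = hashlib.md5(data).hexdigest()
        if digest in self.hashes:                  # duplicate image, skip it
            continue
        self.hashes.add(digest)
        with open(os.path.join('images', os.path.basename(url)), 'wb') as fh:
            fh.write(data)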
Example #15
    def __init__(self):
        self.logger.info('Lagou webdriver start')
        super(LagouSpider, self).__init__()
        chrome_opt = webdriver.ChromeOptions()
        pref = {"profile.managed_default_content_settings.images": 2}
        chrome_opt.add_experimental_option("prefs", pref)
        self.browser = webdriver.Chrome(executable_path=CHROME_PATH,
                                        chrome_options=chrome_opt)
        dispatcher.connect(self.spider_close, signals.spider_closed)
Example #16
            def f(return_list):
                def collect_items(signal, sender, item, response, spider):
                    return_list.append(item)

                dispatcher.connect(collect_items, signal=signals.item_passed)
                runner = crawler.CrawlerRunner()
                deferred = runner.crawl(PlantInfoSpider, url=returned_url)
                deferred.addBoth(lambda _: reactor.stop())
                reactor.run()
Example #17
def scrape_amazon_with_crochet(retailer_id, search_string, category_name):
    # Register _crawler_result with the dispatcher so it runs each time an item is scraped.
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    print(f"Amazon retailer ID {retailer_id}")
    # Start AmazonscraperSpider from our Scrapy project; after each yield the item is passed to _crawler_result.
    eventual = crawl_runner.crawl(AmazonscraperSpider,
                                  retailer_id=retailer_id,
                                  search_string=search_string,
                                  category_name=category_name)
    return eventual
Example #18
def scrape_with_crochet():
    """
    Deferete function who permit to call the crawler to fetch the articles.
    The crawler is launched in an asynchronous processus.
    """
    # signal fires when single item is processed
    # and calls _crawler_result to append that item
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    eventual = crawl_runner.crawl(ArticlesSpider)
    return eventual  # returns a twisted.internet.defer.Deferred
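Because the crawl is started on the Twisted reactor, a plain synchronous caller should not invoke this helper directly from an ordinary thread. A minimal caller sketch using crochet, assuming crochet is the bridge in use (its setup/run_in_reactor calls are not shown in the snippet) and that a 60-second timeout is acceptable; both are assumptions:

from crochet import setup, run_in_reactor

setup()                                   # must run once before any crawl is started

# Wrapping the helper with run_in_reactor lets blocking application code call it safely.
safe_scrape = run_in_reactor(scrape_with_crochet)

eventual = safe_scrape()                  # returns a crochet EventualResult immediately
eventual.wait(timeout=60.0)               # block until the underlying Deferred fires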
Example #19
    def handle(self, *args, **options):
        if options.get("drop_all"):
            self.warn("Dropping existing records...")
            Kid.objects.all().delete()

        dispatcher.connect(self.save, signal=signals.item_passed)
        process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
        process.crawl(ParanaSpider)
        process.start()
        self.success("Done!")
Example #20
def spider_results(site, project='renault', out_file='out.json'):
    """
    Wrapper for launching Scrapy.
        
    Parameters :
    
    site : str
        Name of the site we are scraping from.
    project : str
        Name of the project we are working on. The default is 'renault'
    out_file : str
        Name of the file where we want to save the result. The default is
        out.json
    Returns :
    
        List of items (dictionaries) processed by the scraper

    """

    if project == 'renault':
        from broad_crawl_spider import MySpider
    elif project == 'iterative':
        from iterative_spider import MySpider
    else:
        print('No spider for project:', project)
        return
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)
    # Scrapy default_settings are overridden by below rules
    settings = get_project_settings()
    settings['ROBOTSTXT_OBEY'] = True
    settings['LOG_LEVEL'] = 'CRITICAL'
    settings['FEED_FORMAT'] = 'json'
    settings['FEED_URI'] = 'file:../output/%s/store.json' % site
    settings['CLOSESPIDER_ITEMCOUNT'] = 2000
    # 250000 for production use
    # 1000 to 5000 for testing
    settings['HTTPERROR_ALLOWED_CODES'] = [301]
    '''
    If you get HTTP error 403 - change USER_AGENT

    To activate Selenium use below setting:
    DOWNLOADER_MIDDLEWARES = {
        'mobility.mobility.scraper.code.selenium_mid.SeleniumMiddleware': 500
    }
    '''
    process = CrawlerProcess(settings)
    process.crawl(MySpider)
    process.start()  # the script will block here until the crawling is finished
    return results
Example #21
def run_proc(name, q):
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_scraped)
    process = CrawlerProcess(get_project_settings())
    process.crawl(ArticleSpider, start_urls=[name])
    process.start()
    q.put(results[0])
Example #22
    def scrape_with_crochet(self, domain):
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        crawler_settings = Settings()
        crawler_settings.setmodule(sets)
        self.crawl_runner.settings = crawler_settings
        dispatcher.connect(self._crawler_result, signal=signals.item_scraped)

        for i in self.dict_of_spiders:
            if i in domain:
                eventual = self.crawl_runner.crawl(self.dict_of_spiders[i], category=domain)
                return eventual
Example #23
def spider_results():
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    process = CrawlerProcess(get_project_settings())
    process.crawl(CLSpider)
    process.start()
    return results
Example #24
def spider_results():
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    process = CrawlerProcess(get_project_settings())
    process.crawl(GIINSpider)
    process.start()  # the script will block here until the crawling is finished
    return results
Example #25
    def __init__(self, spider):
        def increment_count(cls):
            print('incrementing count')
            cls.count = cls.count + 1

        dispatcher.connect(lambda: print('Finished'),
                           signal=signals.spider_closed)
        dispatcher.connect(increment_count, signal=signals.item_passed)
        settings = get_project_settings()
        self.process = CrawlerProcess(settings)

        self.spider = spider
Example #26
def scrape(urls):
    mapping_ShareX = ["pixl.is", "putme.ga", "putmega.com"]
    mapping_Chibisafe = [
        "cyberdrop.me", "cyberdrop.cc", "cyberdrop.to", "bunkr.is", "bunkr.to"
    ]
    mapping_GoFile = ["gofile.io"]

    replacements = [('fs-...', ''), ('img-...', ''), (r'i\.', ''),
                    ('stream.', ''), ('www.', '')]

    ShareX_urls = []
    Chibisafe_urls = []
    GoFile_urls = []
    unsupported_urls = []

    cookies = []
    result_links = OrderedDict()

    for url in urls:
        base_domain = urlparse(url).netloc
        for old, new in replacements:
            base_domain = re.sub(old, new, base_domain)

        if base_domain in mapping_ShareX:
            ShareX_urls.append(url)
        elif base_domain in mapping_Chibisafe:
            Chibisafe_urls.append(url)
        elif base_domain in mapping_GoFile:
            GoFile_urls.append(url)
        else:
            unsupported_urls.append(url)

    def crawler_results(signal, sender, item, response, spider):
        domain = sanitize_key(item['netloc'])
        title = re.sub(r'[\\/*?:"<>|.]', "-", item['title'])
        referal = item['referal']
        url = item['url']
        cookies.extend(x for x in item['cookies'] if x not in cookies)
        result_links.setdefault(domain, OrderedDict()).setdefault(
            title, []).append([url, referal])

    dispatcher.connect(crawler_results, signal=signals.item_scraped)
    settings = get_project_settings()
    settings.set('LOG_LEVEL', logging.CRITICAL)
    process = CrawlerProcess(settings)

    if ShareX_urls: process.crawl(ShareX_Spider, myurls=ShareX_urls)
    if Chibisafe_urls: process.crawl(ChibisafeSpider, myurls=Chibisafe_urls)
    if GoFile_urls: process.crawl(GoFileSpider, myurls=GoFile_urls)
    process.start()

    return cookies, result_links
Example #27
def f(q):
    try:
        crawler_settings = get_project_settings()
        runner = CrawlerRunner(crawler_settings)
        dispatcher.connect(lambda: print('finish'), signal=signals.spider_closed)
        dispatcher.connect(lambda: print('item scraped'), signal=signals.item_scraped)
        deferred = runner.crawl(InfoempleoSpider)
        deferred.addBoth(lambda _: reactor.stop())
        print('reactor...')
        reactor.run()
        print('run!!!!!')
        q.put(None)
    except Exception as e:
        q.put(e)
Example #28
def get_data():
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})

    process.crawl(ElectronicsSpider)
    process.start()

    return results
Example #29
def spider_handler(latitude, longitude, max_number, q):
    link = get_link_for_tripadvisor(latitude, longitude)
    output = []
    _exporter = PythonItemExporter(binary=False)
    def get_crawler_output(signal, sender, item, response, spider):
        output.append(_exporter.export_item(item))

    dispatcher.connect(get_crawler_output, signal=signals.item_scraped)
    process = CrawlerProcess({
        "USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
    })
    process.crawl(RestaurantSpider, start_url=link, max_restaurants=max_number)
    process.start()
    q.put(output)
Example #30
    def index(self):
        try:
            self.debounce()
        except DebounceError as e:
            log.error("Debounced indexing task: %s", e)
            return

        docs_to_process = Queue()

        Spider = type(
            'Spider', (DocumentationSpiderBase,),
            {"url": self.url, "validators": self.site.validators, "allow": self.site.allow,
             "deny": self.site.deny})

        def enqueue_document(signal, sender, item: SearchDocument, response, spider):
            """Queue a SearchDocument for indexation."""
            docs_to_process.put(item)

        def index_documents():
            while True:
                doc: SearchDocument = docs_to_process.get()
                try:
                    self.index_document(doc)
                except Exception as e:
                    log.error("Unexpected error while indexing doc %s, error: %s", doc.doc_id, e)
                docs_to_process.task_done()

        def start_indexing():
            if docs_to_process.empty():
                return
            self.search_client.redis.set(
                keys.last_index(self.site.url), datetime.datetime.now().timestamp())
            docs_to_process.join()

        for _ in range(MAX_THREADS):
            Thread(target=index_documents, daemon=True).start()

        dispatcher.connect(enqueue_document, signal=signals.item_scraped)
        dispatcher.connect(start_indexing, signal=signals.engine_stopped)

        process = CrawlerProcess(settings={
            'CONCURRENT_ITEMS': 200,
            'CONCURRENT_REQUESTS': 100,
            'CONCURRENT_REQUESTS_PER_DOMAIN': 100,
            'HTTPCACHE_ENABLED': True,
            'REACTOR_THREADPOOL_MAXSIZE': 30,
            'LOG_LEVEL': 'ERROR'
        })
        process.crawl(Spider)
        process.start()