Example #1
def getLinks(start_urls, allowed_domains, filename):
    #Newest code: Scrapydo
    scrapydo.run_spider(LinkSpider,
                        start_urls=start_urls,
                        allowed_domains=allowed_domains)
    link_items = removeCycles(start_urls, LinkSpider.link_items)
    genCSV(filename, start_urls, link_items)
    return link_items
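
These examples all share the same pattern: scrapydo.setup() is called once up front (Examples #18, #27 and #31 do this explicitly), and run_spider() then blocks until the crawl finishes and, by default, returns the captured items. A minimal, self-contained sketch of that pattern, using a hypothetical QuotesSpider against quotes.toscrape.com (neither appears in the examples):

import scrapy
import scrapydo

scrapydo.setup()  # must be called once before the first run_spider() call


class QuotesSpider(scrapy.Spider):
    # Hypothetical spider used only for this sketch.
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').extract_first()}


# Blocks until the crawl completes, then returns the scraped items as a list.
items = scrapydo.run_spider(QuotesSpider)
print(len(items))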
Example #2
 def DPSSpiderCrawl(self):
     global DPS
     DPS = []
     scrapydo.run_spider(DPSSpider(),
                         settings={
                             'USER_AGENT': 'Mozilla/5.0',
                         })
     DPSChunk = SpiderQueen.divide_chunks(DPS)
     return DPSChunk
Example #3
 def TankSpiderCrawl(self):
     global Tank
     Tank = []
     scrapydo.run_spider(TankSpider(),
                         settings={
                             'USER_AGENT': 'Mozilla/5.0',
                         })
     TankChunk = SpiderQueen.divide_chunks(Tank)
     return TankChunk
Example #4
 def HealSpiderCrawl(self):
     global Heal
     Heal = []
     scrapydo.run_spider(HealSpider(),
                         settings={
                             'USER_AGENT': 'Mozilla/5.0',
                         })
     HealChunk = SpiderQueen.divide_chunks(Heal)
     return HealChunk
Example #5
def runcrawl():
    """
    Run a spider within Twisted. Once it completes,
    wait 5 seconds and run another spider.
    """
    try:
        scrapydo.run_spider(JubiNoticeSpider)
        scrapydo.run_spider(YunBiNoticeSpider)
    except Exception as e:
        print(e)
Example #6
    def get(self):
        scrapydo.run_spider(self.spider,
                            crawl_reason='check-vulnerability',
                            injector=self.injector,
                            settings=DVWA_CRAWLER_SETTINGS)

        storage_adapter = FileAdapter()

        parser = CheckVulnerabilityParser(self.crawled_site, storage_adapter)
        vulnerable_pages = parser.check()

        self.write({'vulnerable': json.dumps(vulnerable_pages)})
Example #7
    def get(self):
        scrapydo.run_spider(self.spider,
                            crawl_reason='get-db-users',
                            injector=self.injector,
                            settings=DVWA_CRAWLER_SETTINGS)

        storage_adapter = FileAdapter()

        parser = GetDbUsersParser(self.crawled_site, storage_adapter)
        db_usernames = parser.get_db_usernames()

        self.write(json.dumps(db_usernames))
Example #8
    def get(self):
        scrapydo.run_spider(self.spider,
                            crawl_reason='get-db-version',
                            injector=self.injector,
                            settings=DVWA_CRAWLER_SETTINGS)

        storage_adapter = FileAdapter()

        parser = GetDbVersionParser(self.crawled_site, storage_adapter)
        db_version = parser.get_db_version()

        self.write({'db_version': db_version})
Example #9
def run_crawl(spider):
    # Define a StreamHandler that prints log records at INFO level or above
    # to stderr, and attach it to the root logger.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
    console.setFormatter(formatter)
    requests_log = logging.getLogger("requests.packages.urllib3")
    requests_log.setLevel(logging.ERROR)
    logging.getLogger('').addHandler(console)

    scrapydo.run_spider(spider_cls=spider)
Example #10
def crawler_reddit(subreddits):

    message = ''
    max = 10
    count = 0

    data = scrapydo.run_spider(RedditSpider(),
                               settings=settings,
                               subreddits=subreddits)

    for item in data:
        count += 1

        if item["title"] == '':
            title = "_No Title_ :("
        else:
            title = item["title"]

        message += "*" + item["subreddit"] + "*, votes " + str(item["upvote"]) + " " + "[" + title + "](" + \
                   item["thread_link"] + ") \n"
        if count > max:
            break
    if len(data) == 0:
        message = "Desculpe, não há nenhum tópico quente nestes seus reddits :("
    return message
Example #11
def start():
    spiders = [
        UNDPjobSpider,
        # ESCAPjobsSpider,
        # UNESCOjobSpider,
        # ITERJobSpider,
        # CERNjobsSpider,
        #
        # UNIDOjobLink,
        # UNUjobSpider,
        # WHOjobSpider,
        # UNEPJobSpider,
        # OECDJobSpider,
        # WIPOjobSpider
    ]

    for spider in spiders:
        scrapydo.run_spider(spider_cls=spider)
Example #12
def crawl(spider: scrapy.Spider, settings: scrapy.settings.Settings, args: dict) -> scrapy.crawler.Crawler:
    crawler = scrapydo.run_spider(
        spider,
        settings=settings,
        return_crawler=True,
        capture_items=True,
        **args
    )

    return crawler
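
A hedged usage note on the wrapper above: with return_crawler=True, scrapydo hands back the Scrapy Crawler object instead of the captured item list (the return annotation says as much), so post-crawl inspection goes through the usual Crawler attributes such as crawler.stats. A short sketch, assuming a hypothetical ArticleSpider:

from scrapy.settings import Settings

# ArticleSpider is assumed to be some scrapy.Spider subclass; settings are left at defaults.
crawler = crawl(ArticleSpider, Settings(), {})
print(crawler.stats.get_value('item_scraped_count'))  # standard Scrapy stats key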
Example #13
 def test_run_spider(self):
     TestSpider = make_test_spider(self.url)
     items = scrapydo.run_spider(TestSpider)
     self.assertEqual(items, [
         {
             'path': '/text'
         },
         {
             'path': '/html'
         },
     ])
Example #14
    def run_scrapydoProcess(self, spider, index, pages=10):

        # Run with scrapydo -- setup spider args
        spider_args = {
            'symbol': index,
            'pages': pages,
            'capture_items': True,
            'timeout': 360,
            'settings': self.settings
        }
        return scrapydo.run_spider(spider, **spider_args)
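
In the helper above, spider_args mixes scrapydo's own run_spider options (capture_items, timeout, settings) with spider constructor arguments (symbol, pages); extra keyword arguments are forwarded to the spider, just as start_urls, allowed_domains and subreddits are passed in Examples #1, #10 and #17. A hypothetical call, where StockSpider and the runner object are assumptions:

items = runner.run_scrapydoProcess(StockSpider, index='^GSPC', pages=5)
print('captured %d items' % len(items))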
Example #15
    def start(self):
        '''Start the spiders with scrapydo.
            To start additional spiders, add them to the spiders list below.'''
        if self.type == 'job':
            logger.debug("进行岗位准备")
            spiders = [
                # UNDPjobSpider,
                # CERNjobsSpider,
                # ITERJobSpider,
                MOHRSSJobSpider,  # the only domestic (Chinese) site; proxy IPs test well against it
                # OECDJobSpider,
                # UNIDOjobLink,
                # UNUjobSpider,
                # WHOjobSpider,
                # WIPOjobSpider,
            ]
        else:
            spiders = []

        for spider in spiders:
            scrapydo.run_spider(spider_cls=spider)
Example #16
def start_scrape(event, context):
    jobs = []

    spider_map = {
        'stack_overflow': StackOverflowSpider,
        # 'dice': DiceSpider
    }

    spider_args = {'search_params': event['search_params']}

    spider = spider_map[event['spider']]

    scrapydo.run_spider(spider, **spider_args)

    print("Finished scraping...")

    with open("/tmp/jobs.json") as job_file:
        jobs = json.load(job_file)

    open("/tmp/jobs.json",
         'w').close()  # Clean the jobs file after we are done with it.

    return jobs
Example #17
def pull_content(start_urls):
    # results = []
    #
    # def crawler_results(signal, sender, item, response, spider):
    #     results.append(item)
    #
    # dispatcher.connect(crawler_results, signal=signals.item_passed)
    #
    # process = CrawlerProcess()
    # process.crawl(ContentSpider, start_urls=start_urls)
    # process.start()  # the script will block here until the crawling is finished
    #
    results = scrapydo.run_spider(ContentSpider, start_urls=start_urls)
    return results
Example #18
def main(argv):
    scrapydo.setup()

    settings = Settings()
    settings.set("USER_AGENT",
                 "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)")
    settings.set("FEED_FORMAT", "json")
    settings.set("FEED_URI", "result.json")

    try:
        opts, args = getopt.getopt(argv, "hs:", ["subreddit="])
    except getopt.GetoptError:
        print(
            'cli_crawler.py -s <comma-separated list of subreddits, e.g. programming;dogs;brazil>'
        )
        sys.exit(2)

    if len(opts) == 0:
        print(
            'cli_crawler.py -s <comma-separated list of subreddits, e.g. programming;dogs;brazil>'
        )

    for opt, arg in opts:
        if opt == '-s':
            subreddits = arg
            print("Iniciando crawler para buscar dados dos subreddits " +
                  subreddits + "...")
            data = scrapydo.run_spider(RedditSpider(),
                                       settings=settings,
                                       subreddits=subreddits)
            for item in data:

                if item["title"] == '':
                    title = "_No Title_ :("
                else:
                    title = item["title"]

                message = item["subreddit"] + ", votes " + str(item["upvote"]) + " " + "[" + title + "](" + \
                           item["thread_link"] + ") \n"

                print(message)

    sys.exit()
Example #19
def my_crawl():
  items = scrapydo.run_spider(RunnerSpider)

  # Dump the items' repr to a temp file, then rewrite the quotes below so
  # that the output can be parsed as JSON.
  text_file = open("tmp.json", "w")
  text_file.write("%s" % items)
  text_file.close()
  
  f1 = open('tmp.json', 'r')
  f2 = open('tmp_result.json', 'w')
  for line in f1:
    f2.write(line.replace("'", '"'))
  f1.close()
  f2.close()

  f3 = open('tmp_result.json', 'r')
  f4 = open('result.json', 'w')
  for line in f3:
    f4.write(line.replace('u"', '"'))
  f3.close()
  f4.close()
Example #20
    def Queen(self):

        global rankings
        rankings = {"DPS": [], "Heal": [], "TDPS": []}
        scrapydo.run_spider(DPSSpider(),
                            settings={
                                'USER_AGENT': 'Mozilla/5.0',
                            })
        scrapydo.run_spider(TankSpider(),
                            settings={
                                'USER_AGENT': 'Mozilla/5.0',
                            })
        scrapydo.run_spider(HealSpider(),
                            settings={
                                'USER_AGENT': 'Mozilla/5.0',
                            })
        return rankings
Example #21
    os.chdir(sys.path[0])

    reload(sys)
    sys.setdefaultencoding('utf-8')

    if not os.path.exists('log'):
        os.makedirs('log')

    logging.basicConfig(filename='log/proxy.log',
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.DEBUG)
    sql = SqlHelper()

    while True:
        utils.log('*******************run spider start...*******************')

        command = 'DELETE FROM {0} WHERE save_time < now() - {1}'.format(
            config.free_ipproxy_table, 1800)
        sql.execute(command)

        items = scrapydo.run_spider(XiCiDaiLiSpider)
        items = scrapydo.run_spider(SixSixIpSpider)
        items = scrapydo.run_spider(IpOneEightOneSpider)
        items = scrapydo.run_spider(KuaiDaiLiSpider)
        items = scrapydo.run_spider(GatherproxySpider)

        utils.log(
            '*******************run spider waiting...*******************')
        time.sleep(300)
Example #22
    logging.basicConfig(
        filename='log/crawl_proxy.log',
        format='%(levelname)s %(asctime)s: %(message)s',
        level=config.log_level
    )

    sql = SqlManager()

    spiders = [
        XiCiDaiLiSpider,
        SixSixIpSpider,
        IpOneEightOneSpider,
        # KuaiDaiLiSpider,  # fetched by a different method
        # GatherproxySpider,
        HidemySpider,
        ProxylistplusSpider,
        # FreeProxyListsSpider,
        # PeulandSpider,  # target site is no longer available
        # UsProxySpider,
        ProxyDBSpider,
        # ProxyRoxSpider,
    ]

    while True:
        utils.log('*******************run spider start...*******************')
        sql.delete_old(config.free_ipproxy_table, 0.5)
        for spider in spiders:
            scrapydo.run_spider(spider_cls=spider)
        utils.log('*******************run spider waiting...*******************')
        time.sleep(1200)
Example #23
    sql = SqlHelper()

    spiders = [
        XiCiDaiLiSpider,
        SixSixIpSpider,
        IpOneEightOneSpider,
        KuaiDaiLiSpider,  # now runs a JavaScript check before access (anti-scraping)
        GatherproxySpider,
        HidemySpider,
        ProxylistplusSpider,
        FreeProxyListsSpider,
        # PeulandSpider,  # target site is no longer available
        UsProxySpider,
        ProxyDBSpider,
        ProxyRoxSpider,
    ]

    while True:
        utils.log('*******************run spider start...*******************')

        command = "DELETE FROM {table} where save_time < SUBDATE(NOW(), INTERVAL 0.2 DAY)".format(
                table = config.free_ipproxy_table)
        sql.execute(command)

        for spider in spiders:
            scrapydo.run_spider(spider)

        utils.log('*******************run spider waiting...*******************')
        time.sleep(1200)
Example #24
def update_db():
    while (True):
        scrapydo.run_spider(FolhaSpider)
        scrapydo.run_spider(EconomiaSpider)
        sleep(120)
Example #25
    reload(sys)
    sys.setdefaultencoding('utf-8')

    if not os.path.exists('log'):
        os.makedirs('log')

    logging.basicConfig(filename='log/validator.log',
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.DEBUG)

    while True:
        utils.log(
            '----------------validator start time:%s...-----------------------'
            % datetime.datetime.now().strftime('%Y:%m:%d %H:%M:%S:%f'))

        items = scrapydo.run_spider(HttpBinSpider)

        utils.log(
            '----------------validator finish:%s time:%s-----------------------'
            % (HttpBinSpider.name,
               datetime.datetime.now().strftime('%Y:%m:%d %H:%M:%S:%f')))
        time.sleep(10)

        items = scrapydo.run_spider(DoubanSpider)
        utils.log(
            '----------------validator finish:%s time:%s-----------------------'
            % (DoubanSpider.name,
               datetime.datetime.now().strftime('%Y:%m:%d %H:%M:%S:%f')))

        # items = scrapydo.run_spider(GatherSpider)
        # items = scrapydo.run_spider(AssetStoreSpider)
Example #26
 def test_run_spider(self):
     TestSpider = make_test_spider(self.url)
     items = scrapydo.run_spider(TestSpider)
     self.assertEqual(items, [{"path": "/text"}, {"path": "/html"}])
Example #27
from ipproxytool.spiders.validator.assetstore import AssetStoreSpider
from ipproxytool.spiders.validator.gather import GatherSpider

scrapydo.setup()

if __name__ == '__main__':

    os.chdir(sys.path[0])

    reload(sys)
    sys.setdefaultencoding('utf-8')

    if not os.path.exists('log'):
        os.makedirs('log')

    logging.basicConfig(filename='log/validator.log',
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.DEBUG)

    utils.make_dir('log')

    while True:
        utils.log('----------------validator start...-----------------------')
        items = scrapydo.run_spider(DoubanSpider)
        # items = scrapydo.run_spider(GatherSpider)
        # items = scrapydo.run_spider(AssetStoreSpider)
        utils.log(
            '*************************validator waiting...*************************'
        )
        time.sleep(60)
Example #28
            "max": _max
        })
        print link["district"], link["location"], "均价:", _avg, "最低:", _min, "最高:", _max


if __name__ == "__main__":
    logging.basicConfig(
        filename='spider.log',
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    # while True:
    mongo = MongoDBPipeline()
    t = time.time()
    # 1. Crawl the listing links and update the district field
    scrapydo.run_spider(LinkSpider)

    # 2. Crawl the listing items
    while True:
        print "Crawling the listings....."
        scrapydo.run_spider(ItemSpider)
        if mongo.get_failed_urls().count() == 0:
            break
        print "开始再次爬取房源...."

    print "爬取结束, 耗时%d秒" % (time.time() - t)

    # 3. Summarize by location name
    print "Starting the summary..."
    summarize()
Example #29
 def test_run_spider(self):
     TestSpider = make_test_spider(self.url)
     items = scrapydo.run_spider(TestSpider, name='test-spider')
     self.assertEqual(set(it['path'] for it in items), {'/text', '/html'})
Example #30
    logging.basicConfig(
        level=logging.INFO,  # set the log level to INFO level
        format='%(levelname)s :%(asctime)s: %(message)s',
        filename='crawl-%s.log' % datetime.now().strftime('%Y%m%d'),
    )

    while True:
        if printer_status:
            print('the spider started at %s' %
                  datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        logging.info('the spider started')
        scrapydo.default_settings.update({
            'CONCURRENT_REQUESTS': 100,  # the num of concurrent_requests
            'CONCURRENT_ITEMS': 500,  # the num of CONCURRENT_ITEMS
        })
        scrapydo.run_spider(spider_cls=DoubanmovieSpider)
        if printer_status:
            print('the spider stopped at %s' %
                  datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        logging.info('the spider stopped normally')

        # disconnect the pptp vpn and wait for the change of ip
        VPN.disconnect()
        while VPN.getstatus() == 1:
            VPN.connect()
            sleep(3)

        if os.path.exists('status.txt'):
            with open('status.txt') as file:
                if file.readline().strip() == 'exit':
                    logging.info('the crawl exited gracefully')
Example #31
def jobhunt(event, context):
    scrapydo.setup()
    settings = get_project_settings()
    scrapydo.run_spider(spider_dictionary[event['name']], settings=settings)
Example #32
 async def get(self, *args, **kwargs):
     return_developing()
     scrapydo.run_spider(XicidailiSpider)
     self.do_success({'ok': 1}, 'todo')