Example #1
def ScrapydData(request):
    from scrapyd_api import ScrapydAPI
    scrapyd = ScrapydAPI('http://localhost:6800')
    settings = {'DOWNLOAD_DELAY': 2}
    # run a spider
    scrapyd.schedule('project_name', 'spider_name', settings=settings)
    print(scrapyd.list_projects())
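schedule() returns a Scrapyd job id that the example above discards; a minimal follow-up sketch (keeping the same hypothetical project and spider names and a Scrapyd daemon on localhost:6800) that stores the id and polls it until the crawl finishes:

from time import sleep
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://localhost:6800')
job_id = scrapyd.schedule('project_name', 'spider_name', settings={'DOWNLOAD_DELAY': 2})
# job_status() returns '', 'pending', 'running' or 'finished'
while scrapyd.job_status('project_name', job_id) != 'finished':
    sleep(5)
print('job %s finished' % job_id)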
Example #2
def url_check(request):
    url = request.POST.get('input', None).strip()

    try:
        product_id = ProductId.objects.get(url=url)

        return JsonResponse({
            'pid': product_id.pid,
            'status': True
        }, status=status.HTTP_200_OK)
    except ProductId.DoesNotExist:
        domain = urlparse(url).netloc
        unique_id = str(uuid4())

        settings = {
            'unique_id': unique_id,
            'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        }

        scrapyd = ScrapydAPI('http://localhost:6800')
        scrapyd.schedule('default',
                         'fab_crawler',
                         settings=settings,
                         url=url,
                         domain=domain,
                         uuid=unique_id)
    return JsonResponse({
        'status': False,
        'uuid': unique_id
    }, status=status.HTTP_200_OK)
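Extra keyword arguments passed to schedule() (url, domain and uuid here) are forwarded by Scrapyd as spider arguments, while the settings dict is applied as Scrapy settings; a hypothetical sketch of how the fab_crawler spider might receive them (not the project's actual spider):

import scrapy

class FabCrawlerSpider(scrapy.Spider):
    # hypothetical spider matching the arguments scheduled above
    name = 'fab_crawler'

    def __init__(self, url=None, domain=None, uuid=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [url]
        self.allowed_domains = [domain]
        self.unique_id = uuid

    def parse(self, response):
        # tag scraped items with the unique id so the Django side can match them to the request
        yield {'unique_id': self.unique_id, 'url': response.url}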
Example #3
def inspection_new(request: HttpRequest):
    # TODO: Use Django's form validation
    url, extraction = None, None
    form_url = "scraping_url"
    scrapyc = ScrapydAPI(conf_scrapyd["target"])

    if form_url not in request.GET:
        return HttpResponseRedirect(
            reverse("url_inspector:error_new_inspection"))
    else:
        url = request.GET[form_url]

        try:
            URLValidator()(url)
        except ValidationError:
            return HttpResponseRedirect(
                reverse("url_inspector:error_new_inspection"))

        extraction = Extraction(url=url)

        extraction.save()

        try:
            scrapyc.schedule(conf_scrapyd["project"],
                             conf_scrapyd["spider"],
                             urls=url,
                             django_urls=(url, ),
                             django_ids=(extraction.id, ))
        except Exception:
            return HttpResponseRedirect(
                reverse("url_inspector:error_new_inspection"))

    return HttpResponseRedirect(
        reverse("url_inspector:inspection", kwargs={"pk": extraction.id}))
Example #4
    def post(self, request):
        """
        :param request: request parameters for starting the spider
        :return: whether the spider was started successfully
        """
        data = request.data
        spider_name = data.get("spider_name")
        spider_type = data.get("spider_type")
        # print(spider_name)
        # print(spider_type)
        if spider_type == "start":
            try:
                scrapyd = ScrapydAPI('http://localhost:6800')  # connect to the deployed distributed crawler
                scrapyd.schedule('default', spider_name)  # start the spider
            except Exception:
                return Response("failed")
        else:
            try:
                scrapyd = ScrapydAPI('http://localhost:6800')  # connect to the deployed distributed crawler
                del_dict = scrapyd.list_jobs('default')  # list jobs so the matching ones can be cancelled
                # print(scrapyd.list_jobs('default'))
                del_jobs = []
                for k in ["pending", "running"]:
                    # print(del_dict[k])
                    for item in del_dict[k]:
                        if item.get("spider") == spider_name:
                            del_jobs.append(item.get("id"))
                for job_id in del_jobs:
                    scrapyd.cancel('default', job_id)
                # print(del_jobs)

            except Exception:
                return Response("failed")
        return Response("ok")
Example #5
def call_scrapyd_service():
    """Operate spiders through the API.
    Reference documentation: https://pypi.python.org/pypi/python-scrapyd-api#downloads
    """
    scrapyd = ScrapydAPI('http://localhost:6800')
    scrapyd.job_status('govbuyscrapy', '0c838fd4b9f111e6abcc14dda97ae760')  # check the status of a given crawl job
    scrapyd.list_jobs('govbuyscrapy')  # list the project's crawl jobs
    scrapyd.schedule('govbuyscrapy', 'govbuy_wan_shucheng')  # run the given spider in the given project
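The same client object can also enumerate spiders and stop crawls; a short sketch under the same hypothetical project name:

scrapyd = ScrapydAPI('http://localhost:6800')
print(scrapyd.list_spiders('govbuyscrapy'))  # spiders deployed in the project
job_id = scrapyd.schedule('govbuyscrapy', 'govbuy_wan_shucheng')
scrapyd.cancel('govbuyscrapy', job_id)  # cancel the job while it is pending or running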
Example #6
def execute(*args):
    args = list(args)
    assert len(args) >= 1, 'At least the spider name must be provided'

    spider_name = args.pop(0)
    scrapy_url = 'http://localhost:6800'
    if args:
        scrapy_url = args.pop(0)

    scrapyd = ScrapydAPI(scrapy_url)
    scrapyd.schedule(
        get_project_name(), spider_name)
Example #7
class ScrapydJob(object):

    def __init__(self, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="website_finder", screenshot_dir='/memex-pinterest/ui/static/images/screenshots'):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.screenshot_dir = screenshot_dir

    def schedule(self, seed):

        if not self.screenshot_dir:
            raise Exception("Please set the screenshot path in the config before scheduling")

        self.job_id = self.scrapi.schedule(self.project, self.spider, seed_urls=seed, screenshot_dir=self.screenshot_dir)

        return self.job_id

    def schedule_keywords(self, phrases, use_splash=True):
        """ Schedule a Scrapyd job """
        if not self.screenshot_dir:
            raise Exception("Please set the screenshot path in the config before scheduling")

        self.job_id = self.scrapi.schedule(self.project, self.spider,
            phrases=phrases,
            screenshot_dir=self.screenshot_dir,
            use_splash=int(use_splash)
        )
        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self, job_id):

        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Running"

            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Pending"

        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None

        return "Done"
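A usage sketch for this helper class (the seed URL is hypothetical; it assumes the website_finder spider accepts the seed_urls and screenshot_dir arguments scheduled above):

job = ScrapydJob(scrapyd_host='localhost', scrapyd_port='6800',
                 project='default', spider='website_finder')
job_id = job.schedule('http://example.com')  # hypothetical seed URL
print(job.get_state(job_id))  # 'Pending', 'Running', 'Done', or None if Scrapyd was unreachable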
Example #8
def commonSchedule(type, catagery, isChangeScheduleStatus):
    if type == 0:
        if catagery == 1:
            results = MovieCrawlState.objects.filter(task__exact=catagery)
        else:
            results = MovieCrawlState.objects.filter(manage__exact=0).filter(
                task__exact=catagery)
    elif type == 1:
        if catagery == 1:
            results = MusicCrawlState.objects.filter(task__exact=catagery)
        else:
            results = MusicCrawlState.objects.filter(manage__exact=0).filter(
                task__exact=catagery)

    results = results[:(len(settings.SCRAPYD_URLS) * scrapydBatchSize)]
    i = 0
    scheduleServer = None
    for item in results:
        try:
            dictParam = json.loads(item.json) if item.json else {}
        except BaseException:
            print("Invalid JSON data passed in!")
            dictParam = {}
        searchWord, searchTaskId, suffixWords, spiderList, extraParams = setDeParams(
            dictParam)
        extraParams = json.dumps(extraParams,
                                 ensure_ascii=False,
                                 separators=(',', ':'))
        if i % scrapydBatchSize == 0:
            scheduleServer = getRunServer()

        if scheduleServer:
            if isChangeScheduleStatus:
                item.manage = 1
            scrapyd = ScrapydAPI(scheduleServer, timeout=8)
            if len(searchWord):
                item.startNum = len(spiderList)
                for spider in spiderList:
                    print(spider.deployProject, spider.name, searchWord,
                          searchTaskId, suffixWords, extraParams)
                    project = spider.deployProject
                    scrapyd.schedule(project=project,
                                     spider=spider.name,
                                     keyword=searchWord,
                                     searchTaskId=searchTaskId,
                                     suffixWords=suffixWords,
                                     extraParams=extraParams)
            item.save()

        i += 1
Example #9
 def post(self, request):
     # works only with scrapyd server launched
     scrapyd = ScrapydAPI('http://localhost:6800')
     # launches the scrapers
     countries_task = scrapyd.schedule('default', 'countries')
     food_task = scrapyd.schedule('default', 'food')
     pop_task = scrapyd.schedule('default', 'population')
     poverty_task = scrapyd.schedule('default', 'poverty')
     # returns the unique id's of each scheduled task
     return JsonResponse({
         'countries_id': countries_task,
         'food_id': food_task,
         'population_id': pop_task,
         'poverty_id': poverty_task
     })
Example #10
def derivativeSearchWordTaskSchedule():
    derivativeSearchWordSpidersInfo = {
        '西瓜头条系列': 'derivativeSearchWord',
    }
    extraParams = {
        'proxytype': '1',
    }
    extraParams = json.dumps(extraParams,
                             ensure_ascii=False,
                             separators=(',', ':'))
    for k, v in derivativeSearchWordSpidersInfo.items():
        if k == '西瓜头条系列':
            records = derivativeSearchWordData.objects.all()
        else:
            records = []
        spider = Spider.objects.get(Q(name__exact=v), Q(status__exact=0))
        deployProject = spider.deployProject
        for record in records:
            scheduleServer = getRunServer(deployProject)

            if scheduleServer:
                scrapyd = ScrapydAPI(scheduleServer, timeout=8)
                status = scrapyd.schedule(project=deployProject,
                                          spider=spider.name,
                                          dbId=record.id,
                                          keyword=record.name,
                                          extraParams=extraParams)
                print(status)
Example #11
def add_domain_to_network(request, network_name):
    url = request.POST.get('url', None)
    if not url:
        logger.debug('check received wrong request method.')
        messages.error(request, 'URL is missing!')
        return redirect('start')

    domain_name = get_domain_from_url(url)

    obj = None
    if Domains.objects.filter(domain__icontains=domain_name).exists():
        obj = Domains.objects.filter(domain__icontains=domain_name).first()
    else:
        obj = Domains.objects.create(domain=domain_name, url=url)
        localhost = 'http://localhost:6800'
        scrapyd = ScrapydAPI(localhost)
        job_id = scrapyd.schedule('default', 'externalspider',
                                  started_by_domain=obj.domain,
                                  keywords=[])
        ExternalSpider.objects.create(domain=obj,
                                      job_id=job_id)
        obj.status = 'external_started'
        obj.save()

    nw = None
    if not Network.objects.filter(name=network_name).exists():
        msg = 'Network: {} not found!'.format(network_name)
        messages.warning(request, msg)
        return redirect('start')

    nw = Network.objects.filter(name=network_name).first()

    nw.domains.add(obj)
    return redirect('network', network_name=nw.name)
Example #12
def loadproxysite(request):
    siteurl = request.POST.get('siteurl', None)
    clicked_html = request.POST.get('clicked_html', None)
    if str(clicked_html) == "0":
        cc.reset()
        # deleting files on system
        click_dict.clear()
        click_dict[cc.counter] = [siteurl, FrameObject.filename]
    elif str(clicked_html) != "0":
        click_dict[cc.counter] = [siteurl, FrameObject.filename, clicked_html]
    cc.add()
    if siteurl is None:
        return HttpResponse("none")
    else:
        scrapyd = ScrapydAPI("http://localhost:6800")
        jid = scrapyd.schedule("feedbot", "sitesavespider", domain=siteurl)
        FrameObject.filename = siteurl.split("://")[1].replace("/", ".") + ".html"
        jsondata = {
            "filename": FrameObject.filename,
            "crawljob": jid,
            "siteurl": siteurl,
            "click_dict": click_dict
        }
        while True:
            if "finished" in scrapyd.job_status("feedbot", jid):
                jsondata["filename"] = adapt_javascript(jsondata)
                return JsonResponse(jsondata)

    return HttpResponse("hello")
Example #13
def scheduler_job():
    """
    Check scheduled tasks once a minute.
    :return:
    """
    models = Task.objects.all()
    for model in models:
        scheduler_at = model.scheduler_at
        updated_at = model.updated_at
        scheduler_at_time_stamp = scheduler_at * 60
        updated_at_time_stamp = time.mktime(updated_at.timetuple())
        if time.time() - updated_at_time_stamp > scheduler_at_time_stamp:
            client_id = model.client_id
            project_name = model.project_name
            spider_name = model.spider_name
            client = Client.objects.get(id=client_id)
            scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
            try:
                job = scrapyd.schedule(project_name, spider_name)
                model.job_id = job
                model.success = True
            except ConnectionError:
                model.success = False
            finally:
                model.save()
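scrapyd_url() is imported from elsewhere in this project; a plausible minimal implementation (an assumption, not the project's actual helper):

def scrapyd_url(ip, port):
    # build the base URL the Scrapyd daemon listens on, e.g. http://127.0.0.1:6800
    return 'http://{}:{}'.format(ip, port)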
Example #14
class ScrapyTasks(object):
    def __init__(self):
        self._scrapyd = ScrapydAPI(config.SCRAPYD_HOST)

    def launch_spider(self, project, spider):
        jobid = self._scrapyd.schedule(project, spider)
        return jobid
Example #15
def twitterscraping(request):
    context = {}
    form = TweetForm()
    context['topic'] = ""
    context['max_tweets'] = 15
    context['action'] = reverse('twitterscraping')
    context['firtsTimeLoad'] = 1
    if request.method == 'POST':
        topic = request.POST.get('topic')
        max_tweet = request.POST.get('max_tweets')
        context['topic'] = topic
        context['max_tweets'] = int(max_tweet)

        historyKey = CommonHelper.CommonHelper.RandomIdGenerator()
        _UserCrawlHistory = UserCrawlHistory(Historykey=historyKey,
                                             KeyWord=topic)
        _UserCrawlHistory.save(using="SentimentAppDB")

        global SpiderTwitterJOBID
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        SpiderTwitterJOBID = scrapyd.schedule(SCRAPYD_PROJECT_NAME,
                                              'SpiderTwitter',
                                              historyKey=historyKey,
                                              count=max_tweet)
        context['firtsTimeLoad'] = 0
        context['historyKey'] = historyKey
    return render(request, 'SentimentApp/twitterscraping.html', {
        'result': context,
        'form': form
    })
Example #16
class Scraper(models.Model):
    site = models.OneToOneField(Site, on_delete=models.CASCADE)
    description = models.TextField(null=True, blank=True)
    file = models.FileField(upload_to=scraper_path, null=True, validators=[validate_py_extension])
    task_id = models.CharField(null=True, blank=True, max_length=255)

    last_scraped = models.DateTimeField(null=True)

    class Meta:
        ordering = ['site__name']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.scrapyd = ScrapydAPI("http://localhost:6800")

    def start(self):
        spider_name = "{}_{}_{}".format(self.site.name, self.site.gender, self.site.type)
        self.task_id = self.scrapyd.schedule("default", spider_name)
        self.save()

    def stop(self):
        self.scrapyd.cancel("default", self.task_id)
        self.save()

    def spider_status(self):
        if self.task_id:
            job_status = self.scrapyd.job_status('default', self.task_id)
            return job_status
        else:
            return "-"
Example #17
def handle_request(request):
    # get keyword from input
    keyword = request.GET.get('keyword')

    # create an object of scrapyd API
    scrapyd = ScrapydAPI("http://localhost:6800")
    request_time = datetime.datetime.now()

    # create a job id
    job_id = scrapyd.schedule(project='quotes_scrape', spider='quotes_crawler', keyword=keyword,
                              request_time=request_time)
    qry = f"select * from quotes where job_id = '{job_id}'"

    job_status = "running"

    values = []

    # check for job status
    while job_status != "finished":
        job_status = scrapyd.job_status(project='quotes_scrape', job_id=job_id)
        if job_status == 'finished':

            # database connection
            con = pymysql.connect(host="localhost", user="******", passwd="", db="quotes_scrape",
                                  cursorclass=pymysql.cursors.DictCursor)
            cursor = con.cursor()
            # get records from database of particular database
            cursor.execute(qry)

            values = cursor.fetchall()
            print(values)
        else:
            sleep(1)

    return JsonResponse(data=values, safe=False)
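The query above is built with an f-string, which works here only because job_id comes straight from Scrapyd; a parameterized query is the safer pattern. A sketch of the same lookup with placeholders (credentials elided as in the original):

con = pymysql.connect(host="localhost", user="******", passwd="", db="quotes_scrape",
                      cursorclass=pymysql.cursors.DictCursor)
with con.cursor() as cursor:
    # placeholders let the driver escape the value instead of interpolating it into the SQL string
    cursor.execute("SELECT * FROM quotes WHERE job_id = %s", (job_id,))
    values = cursor.fetchall()
con.close()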
Example #18
class ProductChecker(models.Model):
    name = models.CharField(max_length=255, null=True, unique=True)
    description = models.TextField(null=True, blank=True)
    file = models.FileField(upload_to=scraper_path, null=True, validators=[validate_py_extension])
    task_id = models.CharField(null=True, blank=True, max_length=255)

    last_scraped = models.DateTimeField(null=True)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.scrapyd = ScrapydAPI("http://178.128.105.215:6800")

    def start(self):
        self.task_id = self.scrapyd.schedule("default", self.name)
        self.save()

    def stop(self):
        self.scrapyd.cancel("default", self.task_id)
        self.save()

    def spider_status(self):
        if self.task_id:
            job_status = self.scrapyd.job_status('default', self.task_id)
            return job_status
        else:
            return "-"
Example #19
class ScrapydLoginFinderJob(object):

    def __init__(self, seed_url, username, password, db_name, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="login_finder"):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.seed_url = seed_url
        self.username = username
        self.password = password
        self.db_name = db_name

    def schedule(self):

        self.job_id = self.scrapi.schedule(self.project, self.spider, seed_url=self.seed_url, username=self.username, password=self.password, db_name=self.db_name)

        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self):

        if not hasattr(self, "job_id"):
            raise Exception("You must schedule a job before getting the state!")

        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Running"

            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Pending"

        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None

        return "Done"
    
    def block_until_done(self, timeout=120):

        exec_time = 0
        while True:
            exec_time += 1
            if exec_time == timeout:
                raise Exception("Timeout time reached for login_finder spider execution")

            time.sleep(1)
            state = self.get_state()
            if state == "Done":
                break
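A usage sketch for this class (the URL, credentials and database name are all hypothetical):

job = ScrapydLoginFinderJob(seed_url='http://example.com/login',  # hypothetical target
                            username='demo', password='secret',   # hypothetical credentials
                            db_name='crawl_results')
job.schedule()
job.block_until_done(timeout=120)
print(job.get_state())  # 'Done' once the job has left Scrapyd's pending/running lists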
Example #20
 def task():
     node = Node.objects.get(id=node_id)
     scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
     try:
         job = scrapyd.schedule(project_name, spider_name)
         return JsonResponse({'job': job})
     except ConnectionError:
         return JsonResponse({'message': 'Connect Error'}, status=500)
Example #21
def _start_spider(domain, keywords=None):
    localhost = 'http://localhost:6800'
    scrapyd = ScrapydAPI(localhost)
    job_id = scrapyd.schedule('default', 'externalspider',
                              started_by_domain=domain,
                              keywords=keywords)

    return job_id
Example #22
def task(request, project_name, spider_names, client_id):
    client = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
    try:
        for spider_name in spider_names:
            task = scrapyd.schedule(project_name, spider_name)
            return JsonResponse(task)
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
Example #23
def debug():
    scrapyd = ScrapydAPI("http://localhost:6800")
    job = scrapyd.schedule(project='sougouHotList', spider='sougouHotListSpider')
    #job = scrapyd.schedule(project='baiduHotList', spider='baiduHotListSpider')
    time.sleep(120)
    print('shutting down the spider')
    scrapyd.cancel(project='sougouHotList', job=job)
    time.sleep(300)
    print('spider shut down')
Example #24
 def closed(self, reason):
     # join all the report summary strings together, 
     # with a new line '\n' in between
     stats = self.crawler.stats.get_stats()
     stats = pprint.pformat(stats)
     s = '\n'.join(self.report_summary)
     # log the summary report as stored in 's'
     self.logger.info(s)
 
     # Store the log to our mysql db if it's not a test run
     # Open the log file and retrieve its content
     log_path = self.settings.get('LOG_FILE')
     # use a context manager so the log file is always closed
     with open(log_path, 'r') as log_file:
         file_content = log_file.read()
     # Store the log to our mysql db
     # Start the db connection through the custom module.
     self.cursor.execute(
         "INSERT INTO `scrapy_logs` "
         "(`spider`, `test`,`log_date`, `log_file`, `stats`, `short_msg`, `long_msg`) "
         "VALUES (%s, %s, %s, %s, %s, %s, %s)",
         ('spider_new_products', self.test, time.time(), log_path, stats, s, file_content)
         )
     self.conn.commit()
     # Count the products still need to be done
     self.count_new_products()
     # Close the db connection when done.
     self.conn.close() 
     # Check if we need to rerun the script.
     
     self.logger.info(
         "We still have number of products to update left: %s", self.items_left)
     
     # Force the spider to stop when we cancel the job
     if reason == 'finished':
         if self.items_left >= 200 and int(self.limit) >= 51: #200
             if (self.test == 1) or (self.test == '1'):
                 scrapyd = ScrapydAPI('http://127.0.0.1:6800')
                 scrapyd.schedule('scraper1', 'ScrapeNewProduct', source=self.source, test='1', limit=self.limit)
             else:
                 self.logger.info("Don't Reschedule because it's a test run")
         self.logger.info("Don't Reschedule because limit is smaller than 50 or items is lower than 200")
Example #25
def task_spider_govbuy_list_by_spider_one_page(a=None, b=None):
    print('-' * 100)
    print('now is %s' % datetime.datetime.now())
    scrapyd = ScrapydAPI('http://localhost:6800')
    spiders = scrapyd.list_spiders('govbuyscrapy')
    print('spider has :', spiders)
    run_spider_id = scrapyd.schedule('govbuyscrapy',
                                     'govbuy_wan_timing_list')  # crawl the list pages
    print('spider runner id is :', run_spider_id)
    scrapyd.job_status('govbuyscrapy', run_spider_id)
Example #26
 def start_hot_spider(self, request):
     """
     Start the daily hot-topics spider; requires admin permission.
     :param request: user_type: the user's permission level
     :return:
     """
     data = request.data
     user_type = data.get("user_type")
     if user_type == "1":  # admin permission: start the spider
         # run_hot()
         scrapyd = ScrapydAPI('http://localhost:6800')  # connect to the deployed distributed crawler
         print(scrapyd.list_projects())  # list the crawler project names
         # print(scrapyd.list_spiders('default'))  # list the spiders in the project
         # print(scrapyd.list_jobs('default'))  # list the jobs running in the project
         # print(scrapyd.list_versions('default'))  # list the project's versions
         scrapyd.schedule('default', 'hotdaily')  # start the spider
         return Response("ok")
     else:
         return Response("failed")
Example #27
def task_spider_govbuy_content_spider(a=None, b=None):
    print('=' * 100)
    print('now is %s' % datetime.datetime.now())
    scrapyd = ScrapydAPI('http://localhost:6800')
    spiders = scrapyd.list_spiders('govbuyscrapy')
    print('spider has :', spiders)
    run_spider_id = scrapyd.schedule('govbuyscrapy',
                                     'govbuy_wan_timing_detail')  # crawl the detail pages
    print('spider runner id is :', run_spider_id)
    scrapyd.job_status('govbuyscrapy', run_spider_id)
Example #28
def schedule_job(project, spider, url=DEFAULT_URL, settings=None, **kwargs):
    """
    @param project: scrapy project name
    @param spider: spider name
    @param url: the url which target scrapyd daemon listens on
    @param settings: the settings dictionary

    To schedule a spider run:
        curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2
    """
    scrapyd = ScrapydAPI(url)
    return scrapyd.schedule(project, spider, settings, **kwargs)
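A call sketch for this wrapper (the project, spider and extra start_url argument are hypothetical; extra keyword arguments end up as spider arguments):

job_id = schedule_job('myproject', 'spider2',            # hypothetical project and spider
                      url='http://localhost:6800',
                      settings={'DOWNLOAD_DELAY': 2},
                      start_url='http://example.com')     # hypothetical spider argument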
Example #29
def webcrawling(request):
    context = {}
    form = WebCrawlForm()
    context['formUrlList'] = "http://www.newspapers71.com/\n\
http://www.ntvbd.com/\r\n\
http://www.prothom-alo.com/\r\n\
http://www.kalerkantho.com/\r\n\
http://www.bhorerkagoj.net/\r\n\
http://www.jaijaidinbd.com/\r\n\
http://www.amadershomoy.biz/beta/\r\n\
https://www.dailyinqilab.com/\r\n\
http://www.jugantor.com/\r\n\
http://www.dailynayadiganta.com/\r\n\
http://www.mzamin.com/"

    context['formKeyWordList'] = "এসিআই\r\n\
স্বপ্ন\r\n\
স্যাভলন"

    context['action'] = reverse('webcrawling')
    context['firtsTimeLoad'] = 1

    if request.method == 'POST':
        urlText = request.POST.get('url')
        context['formUrlList'] = urlText
        keyWordList = request.POST.get('keyWord')
        context['formKeyWordList'] = keyWordList
        keyWord = keyWordList.splitlines()
        depth = request.POST.get('depth')
        context['depth'] = int(depth)

        historyKey = CommonHelper.CommonHelper.RandomIdGenerator()

        for key in keyWord:
            _UserCrawlHistory = UserCrawlHistory(Historykey=historyKey,
                                                 KeyWord=key)
            _UserCrawlHistory.save(using='SentimentAppDB')

        global SpiderWebCrawlerJOBID
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        SpiderWebCrawlerJOBID = scrapyd.schedule(SCRAPYD_PROJECT_NAME,
                                                 'SpiderWebCrawler',
                                                 urls=urlText,
                                                 depth=depth,
                                                 historyKey=historyKey)
        context['historyKey'] = historyKey
        context['firtsTimeLoad'] = 0

    return render(request, 'SentimentApp/webcrawling.html', {
        'result': context,
        'form': form
    })
Example #30
def videoGetDetailsTaskSchedule():
    platformInfo = {
        '哔哩哔哩视频': 'bilibiliDetailInfo',
        '西瓜视频': 'xiguaDetailedInfo',  # the earlier spider took the better name; this one crawls fuller Xigua-series info
        '今日头条': 'xiguaDetailedInfo',
        '今日头条_点赞数': 'xiguaDetailInfo',  # only supplements the like counts
    }
    batchCheckNums = 64
    extraParams = {
        'proxytype': '1',
    }
    extraParams = json.dumps(extraParams,
                             ensure_ascii=False,
                             separators=(',', ':'))
    for k, v in platformInfo.items():
        if k == '哔哩哔哩视频' or k == '西瓜视频' or k == '今日头条':
            records = VideoDetailsData.objects.filter(
                platform__exact=k).filter(status__exact=2)
        elif k == '今日头条_点赞数':
            records = MovieOfflineData.objects.filter(
                platform__exact='今日头条').filter(ishz__exact=1).filter(
                    detailStatus__exact=0).filter(tag__in=['待处理', '未下线'])
        else:
            records = []
        spider = Spider.objects.get(Q(name__exact=v), Q(status__exact=0))
        deployProject = spider.deployProject
        i = 0
        scheduleServer = None
        j = m = 1
        paramList = []
        for record in records:
            if i % scrapydBatchSize == 0 and (j - 1) % batchCheckNums == 0:
                scheduleServer = getRunServer(deployProject)

            if scheduleServer:
                scrapyd = ScrapydAPI(scheduleServer, timeout=8)

                paramList.append({'id': record.id, 'targetUrl': record.url})
                if j % batchCheckNums == 0 or m == len(records):
                    params = json.dumps(paramList,
                                        ensure_ascii=False,
                                        separators=(',', ':'))
                    print(params)
                    status = scrapyd.schedule(project=deployProject,
                                              spider=spider.name,
                                              idTargetUrlList=params,
                                              extraParams=extraParams)
                    print(status)
                    paramList = []
                    i += 1
                j += 1
            m += 1
Example #31
def schedule_job(project,
                 spider,
                 url=DEFAULT_URL,
                 settings=None, **kwargs):
    """
    @param project: scrapy project name
    @param spider: spider name
    @param url: the url which target scrapyd daemon listens on
    @param settings: the settings dictionary

    To schedule a spider run:
        curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2
    """
    scrapyd = ScrapydAPI(url)
    return scrapyd.schedule(project, spider, settings, **kwargs)
Example #32
def run_spider(sender, instance, created, **kwargs):
    """
    runs spider and if success updates its job id
    :param sender:
    :param instance:
    :param created:
    :param kwargs:
    :return:
    """
    if created:
        scrapyd = ScrapydAPI('http://scrapyd:6800')
        job_id = scrapyd.schedule(BOT_NAME, PARSER_NAME)
        if job_id:
            instance.job_id = job_id
            instance.save(update_fields=['job_id'])
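run_spider() has the signature of a Django post_save receiver; a sketch of how it might be registered (the sender model name is an assumption):

from django.db.models.signals import post_save

# fire the crawl whenever a new row is created; CrawlRequest is a placeholder model name
post_save.connect(run_spider, sender=CrawlRequest)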
Example #33
class ParsingService(BaseService):
    def __init__(self) -> None:
        self.spider = 'category'
        self.project = 'twenty_first_century'
        self.categories = CategoryModel.objects.all()
        self.scrapyd_api = ScrapydAPI(settings.PARSER_URL)

    def start(self) -> None:
        for category in self.categories:
            job_id = self.scrapyd_api.schedule(
                spider=self.spider,
                project=self.project,
                category=category.link,
            )
            ScrapyModel.objects.create(category=category, job_id=job_id)
Example #34
 def handle(self, *args, **options):
     username = options['username']
     feedname = options['feedname']
     element_dict = {}
     for element in elements:
         element_dict[element] = get_element_info(element, username,
                                                  feedname)
     print(element_dict)
     scrapyd = ScrapydAPI("http://localhost:6800")
     jid = scrapyd.schedule("feedbot",
                            "rssspider",
                            element_dict=json.dumps(element_dict))
     print(jid)
     print(json.dumps(element_dict))
     if "finished" in scrapyd.job_status("feedbot", jid):
         return 2
Example #35
def spider_start(request, client_id, project_name, spider_name):
    """
    start a spider
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :param spider_name: spider name
    :return: json
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            job = scrapyd.schedule(project_name, spider_name)
            return JsonResponse({'job': job})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Example #36
def scheduler_job():
    """
    Check scheduled tasks once a minute.
    :return:
    """
    models = Task.objects.all()
    for model in models:
        scheduler_at = model.scheduler_at
        updated_at = model.updated_at
        scheduler_at_time_stamp = scheduler_at * 60
        updated_at_time_stamp = time.mktime(updated_at.timetuple())
        if time.time() - updated_at_time_stamp > scheduler_at_time_stamp:
            client_id = model.client_id
            project_name = model.project_name
            spider_name = model.spider_name
            client = Client.objects.get(id=client_id)
            scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
            try:
                job = scrapyd.schedule(project_name, spider_name)
                model.success = 1
            except ConnectionError:
                model.success = 0
            finally:
                model.save()
Example #37
class Overseer(object):
    """
    Overseer facilitates the deployment of local spiders to a remote scrapyd server

    Available methods:
        spawn_spiders           Create spider and deploy them to remote scrapyd server
        get_status              Report the current status of the remote scrapyd server

    """

    DEFAULT_TYPE = 'sell'
    DEFAULT_VENDOR = 'None'

    def __init__(self, name, spider_name, host, mongodb_credentials):
        self.server = ScrapydAPI(host)
        self.host_name = self._strip_host_name(host)
        self.birth_date = datetime.utcnow()
        self.name = name
        self.spider_name = spider_name
        self.alive = True
        client = pymongo.MongoClient(mongodb_credentials['server'],
                                     mongodb_credentials['port'],
                                     connectTimeoutMS=30000,
                                     socketTimeoutMS=None,
                                     socketKeepAlive=True)

        db = client[mongodb_credentials['database']]
        self.collection = db[mongodb_credentials['collection']]

    def kill(self):
        self.alive = False
        return self.host_name

    def heartbeat(self):
        return self.alive

    def spawn_spiders(self, num_spiders=5, items_per_spider=100, **kwargs):
        type = kwargs.get('type', self.DEFAULT_TYPE)
        vendor = kwargs.get('vendor', self.DEFAULT_VENDOR)

        count = 0
        while count < num_spiders:
            count += 1
            self._spawn(vendor, type, items_per_spider)
            time.sleep(3)

    def get_status(self):
        """
         Return:
             the number of running spiders
             the number of finished spiders
             the average time for one spider to finish
        """
        status = self.server.list_jobs(self.name)
        running = status['running']
        finished = status['finished']
        finished_times = [self._time_diff_in_minute(job['end_time'], job['start_time']) for job in finished]
        avg_time = np.average(finished_times)

        Notification('{} - [{}] \t Running Spiders = {}, Finished Spiders = {}, Average Runtime = {}'
                     .format(datetime.utcnow(),
                             self.host_name,
                             len(running),
                             len(finished),
                             avg_time
                             )
                     .expandtabs(3)
                     ).info()

        return len(running), len(finished), avg_time

    def _spawn(self, vendor, type, items_per_spider=100):
        # Get the tasks from the database
        tasks = self._get_tasks_from_database(vendor, type, items_per_spider)
        if not tasks:
            raise ValueError('There is no more task from the database!')

        links, property_ids = zip(*tasks)

        # Schedule the tasks with the remote scrapyd server
        job_id = self.server.schedule(self.name, self.spider_name, vendor=vendor, crawl_url=','.join(links), type=type)

        Notification('{} - [{}] \t Launch spider {}'
                     .format(datetime.utcnow(),
                             self.host_name,
                             job_id)
                     .expandtabs(3)
                     ).success()

        # Clear the tasks from the database
        self._clear_tasks_from_database(vendor, type, property_ids)

    def _get_tasks_from_database(self, vendor, type, items_per_spider):
        cursor = self.collection \
                     .find({"last_crawled_date": None, "type": type, "vendor": vendor}) \
                     .sort("created_date", pymongo.ASCENDING) \
                     .limit(items_per_spider)

        tasks = [(item['link'], item['property_id']) for item in cursor]

        return tasks

    def _clear_tasks_from_database(self, vendor, type, property_ids):
        self.collection.update({"vendor": vendor, "type": type, "property_id": {"$in": property_ids}},
                               {"$set": {"last_crawled_date": datetime.utcnow()}},
                               multi=True,
                               upsert=False)

    @staticmethod
    def _time_diff_in_minute(current, previous):
        return ((parser.parse(current) - parser.parse(previous)).seconds // 60) % 60

    @staticmethod
    def _strip_host_name(host):
        return host.replace('http://', '').replace('.compute.amazonaws.com:6800', '')
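A usage sketch for Overseer (the host and MongoDB credentials are placeholders):

mongodb_credentials = {
    'server': 'localhost',      # placeholder MongoDB host
    'port': 27017,
    'database': 'crawl',        # placeholder database and collection names
    'collection': 'tasks',
}
overseer = Overseer(name='realestate', spider_name='listing_spider',  # hypothetical project and spider
                    host='http://localhost:6800',
                    mongodb_credentials=mongodb_credentials)
overseer.spawn_spiders(num_spiders=2, items_per_spider=50, type='sell', vendor='None')
running, finished, avg_time = overseer.get_status()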