Example #1
def project_version(request, client_id, project_name):
    """
    get project deploy version
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :return: deploy version of project
    """
    if request.method == 'GET':
        # get client and project model
        client = Client.objects.get(id=client_id)
        project = Project.objects.get(name=project_name)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        # if deploy info exists in db, return it
        if Deploy.objects.filter(client=client, project=project):
            deploy = Deploy.objects.get(client=client, project=project)
        # if deploy info does not exist in db, create deploy info
        else:
            try:
                versions = scrapyd.list_versions(project_name)
            except ConnectionError:
                return JsonResponse({'message': 'Connect Error'}, status=500)
            if len(versions) > 0:
                version = versions[-1]
                deployed_at = timezone.datetime.fromtimestamp(int(version), tz=pytz.timezone(TIME_ZONE))
            else:
                deployed_at = None
            deploy, result = Deploy.objects.update_or_create(client=client, project=project, deployed_at=deployed_at)
        # return deploy json info
        return JsonResponse(model_to_dict(deploy))
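A hedged sketch of how a view like this might be wired into URL routing, assuming a standard Django urls.py; the URL pattern itself is hypothetical and only the captured argument names come from the view signature above:

# urls.py (hypothetical wiring)
from django.urls import path
from . import views

urlpatterns = [
    path('client/<int:client_id>/project/<str:project_name>/version/', views.project_version),
]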
Example #3
def project_deploy(request, client_id, project_name):
    """
    deploy project operation
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :return: json of deploy result
    """
    if request.method == 'POST':
        # get project folder
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project_name)
        # find egg file
        egg = find_egg(project_path)
        # read the egg data and close the file promptly
        with open(join(project_path, egg), 'rb') as egg_file:
            egg_data = egg_file.read()
        # get client and project model
        client = Client.objects.get(id=client_id)
        project = Project.objects.get(name=project_name)
        # execute deploy operation
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            scrapyd.add_version(project_name, int(time.time()), egg_data)
            # update deploy info
            deployed_at = timezone.now()
            Deploy.objects.filter(client=client, project=project).delete()
            deploy, result = Deploy.objects.update_or_create(client=client, project=project, deployed_at=deployed_at,
                                                             description=project.description)
            return JsonResponse(model_to_dict(deploy))
        except Exception:
            return JsonResponse({'message': get_traceback()}, status=500)
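find_egg and get_traceback are helpers from the surrounding project and are not shown in this example. A minimal sketch of what find_egg might look like, assuming it simply returns the filename of the first .egg file in the project directory:

import os

def find_egg(directory):
    # hypothetical helper: return the first .egg filename found in the directory, or None
    for name in sorted(os.listdir(directory)):
        if name.endswith('.egg'):
            return name
    return None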
Example #4
def twitterscraping(request):
    context = {}
    form = TweetForm()
    context['topic'] = ""
    context['max_tweets'] = 15
    context['action'] = reverse('twitterscraping')
    context['firtsTimeLoad'] = 1
    if request.method == 'POST':
        topic = request.POST.get('topic')
        max_tweet = request.POST.get('max_tweets')
        context['topic'] = topic
        context['max_tweets'] = int(max_tweet)

        historyKey = CommonHelper.CommonHelper.RandomIdGenerator()
        _UserCrawlHistory = UserCrawlHistory(Historykey=historyKey,
                                             KeyWord=topic)
        _UserCrawlHistory.save(using="SentimentAppDB")

        global SpiderTwitterJOBID
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        SpiderTwitterJOBID = scrapyd.schedule(SCRAPYD_PROJECT_NAME,
                                              'SpiderTwitter',
                                              historyKey=historyKey,
                                              count=max_tweet)
        context['firtsTimeLoad'] = 0
        context['historyKey'] = historyKey
    return render(request, 'SentimentApp/twitterscraping.html', {
        'result': context,
        'form': form
    })
Example #5
class ScrapydLoginFinderJob(object):

    def __init__(self, seed_url, username, password, db_name, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="login_finder"):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.seed_url = seed_url
        self.username = username
        self.password = password
        self.db_name = db_name

    def schedule(self):

        self.job_id = self.scrapi.schedule(self.project, self.spider, seed_url = self.seed_url, username = self.username, password = self.password, db_name = self.db_name)

        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self):

        try:
            self.job_id
        except AttributeError:
            raise Exception("You must schedule a job before getting the state!")

        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Running"

            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Pending"

        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None

        return "Done"
    
    def block_until_done(self, timeout = 120):
        
        exec_time = 0
        while 1:
            exec_time += 1
            if exec_time == timeout:
                raise Exception("Timeout time reached for login_finder spider execution")

            time.sleep(1)
            state = self.get_state()
            if state == "Done":
                break
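A hedged usage sketch of ScrapydLoginFinderJob; the seed URL, credentials and database name are hypothetical, and the scrapyd host, project and spider fall back to the constructor defaults:

job = ScrapydLoginFinderJob(
    seed_url='http://example.com/login',   # hypothetical seed page
    username='user',                       # hypothetical credentials
    password='secret',
    db_name='login_results',               # hypothetical result database
)
job.schedule()
job.block_until_done(timeout=120)
print(job.get_state())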
Example #6
def _start_spider(domain, keywords=None):
    localhost = 'http://localhost:6800'
    scrapyd = ScrapydAPI(localhost)
    job_id = scrapyd.schedule('default', 'externalspider',
                              started_by_domain=domain,
                              keywords=keywords)

    return job_id
Example #7
def remove_all_version(request, project, client_id):
    node = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
    try:
        result = scrapyd.delete_project(project)
        return JsonResponse(result, safe=False)
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
Example #8
 def __init__(self, server):
     self.spider_status_name_dict = {
         SpiderStatus.PENDING: 'pending',
         SpiderStatus.RUNNING: 'running',
         SpiderStatus.FINISHED: 'finished'
     }
     super(ScrapydProxy, self).__init__(server)  # super() calls the parent class's __init__
     self.scrapyd_api = ScrapydAPI(self._scrapyd_url())  # instantiate ScrapydAPI
Example #9
def get_scrapyd(client):
    url = 'http://{ip}:{port}'.format(ip=client.ip, port=client.port)
    try:
        scrapyd = ScrapydAPI(url)
        result = scrapyd.list_projects()
        return scrapyd
    except (ConnectionError, InvalidURL):
        return False
Example #10
def delete_version(project, client, version):
    url = 'http://{ip}:{port}'.format(ip=client.ip, port=client.port)
    try:
        scrapyd = ScrapydAPI(url)
        result = scrapyd.delete_version(project.name, version)
        return True if result else False
    except (ConnectionError, InvalidURL):
        return False
Example #11
 def task():
     node = Node.objects.get(id=node_id)
     scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
     try:
         job = scrapyd.schedule(project_name, spider_name)
         return JsonResponse({'job': job})
     except ConnectionError:
         return JsonResponse({'message': 'Connect Error'}, status=500)
Example #12
def remove_depody_spider(request, client_id, project, version_name):
    if request.method == 'POST':
        node = Node.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            result = scrapyd.delete_version(project, version_name)
            return JsonResponse(result, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Example #13
def get_spider_version(request, project, client_id):
    client = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
    try:
        spiders = scrapyd.list_spiders(project)
        spiders = [{'name': spider, 'id': index + 1} for index, spider in enumerate(spiders)]
        return JsonResponse(spiders, safe=False)
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
Example #14
def debug():
    scrapyd = ScrapydAPI("http://localhost:6800")
    job = scrapyd.schedule(project='sougouHotList', spider='sougouHotListSpider')
    # job = scrapyd.schedule(project='baiduHotList', spider='baiduHotListSpider')
    time.sleep(120)
    print('cancelling the spider')
    scrapyd.cancel(project='sougouHotList', job=job)
    time.sleep(300)
    print('spider cancelled')
Example #15
def task(request, project_name, spider_names, client_id):
    client = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
    try:
        jobs = []
        for spider_name in spider_names:
            jobs.append(scrapyd.schedule(project_name, spider_name))
        return JsonResponse(jobs, safe=False)
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)
Example #16
def client_status(request, id):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            scrapyd.list_projects()
            return HttpResponse('1')
        except:
            return HttpResponse('0')
Example #17
def get_scrapyd(client):
    """
    get a ScrapydAPI instance for a client
    :param client: client
    :return: ScrapydAPI instance
    """
    if not client.auth:
        return ScrapydAPI(scrapyd_url(client.ip, client.port))
    return ScrapydAPI(scrapyd_url(client.ip, client.port),
                      auth=(client.username, client.password))
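A hedged usage sketch, assuming a Client record whose auth, username and password fields are populated when the target scrapyd instance requires basic authentication:

client = Client.objects.get(id=1)  # hypothetical client id
scrapyd = get_scrapyd(client)
print(scrapyd.list_projects())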
Example #18
def version(client, project):
    url = 'http://{ip}:{port}'.format(ip=client.ip, port=client.port)
    try:
        scrapyd = ScrapydAPI(url)
        versions = scrapyd.list_versions(project)
        if len(versions) > 0:
            return versions[-1]
        return ''
    except (ConnectionError, InvalidURL, UnicodeError):
        return ''
Example #19
def spider_list(request, id, project):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        spiders = scrapyd.list_spiders(project)
        spiders = [{
            'name': spider,
            'id': index + 1
        } for index, spider in enumerate(spiders)]
        return HttpResponse(json.dumps(spiders))
Example #20
def get_project_version(request, project, client_id):
    if request.method == 'GET':
        client = Node.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            versions = scrapyd.list_versions(project)
            versions = [{'name': version, 'id': index + 1} for index, version in enumerate(versions)]
            return JsonResponse(versions, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Example #21
    def post(self, request):
        """
        :param request: 启动爬虫的请求参数
        :return: 爬虫启动是否成功
        """
        data = request.data
        spider_name = data.get("spider_name")
        spider_type = data.get("spider_type")
        # print(spider_name)
        # print(spider_type)
        if spider_type == "start":
            try:
                scrapyd = ScrapydAPI('http://localhost:6800')  # connect to the deployed scrapyd service
                scrapyd.schedule('default', spider_name)  # start the spider
            except:
                return Response("failed")
        else:
            try:
                scrapyd = ScrapydAPI('http://localhost:6800')  # connect to the deployed scrapyd service
                del_dict = scrapyd.list_jobs('default')  # list the current jobs
                # print(scrapyd.list_jobs('default'))
                del_jobs = []
                for k in ["pending", "running"]:
                    # print(del_dict[k])
                    for item in del_dict[k]:
                        if item.get("spider") == spider_name:
                            del_jobs.append(item.get("id"))
                for job_id in del_jobs:
                    scrapyd.cancel('default', job_id)
                # print(del_jobs)

            except:
                return Response("failed")
        return Response("ok")
Example #22
 def __init__(self):
     self._scrapyd = None
     try:
         self._scrapyd = ScrapydAPI('http://{}:{}'.format(
             config['Scrapyd']['host'], config['Scrapyd']['port']))
     except KeyError as e:
         logger.error("{}: No such key exists - {}".format(
             class_fullname(e), str(e)))
     except Exception as e:
         logger.error("{}: Failed to create a scrapyd object - {}".format(
             class_fullname(e), str(e)))
Example #23
def delete_project(project, url=DEFAULT_URL):
    """
    @param project: scrapy project name
    @param spider: spider name
    @param url: the url which target scrapyd daemon listens on
    @param settings: the settings dictionary

    To schedule a spider run:
        curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2
    """
    scrapyd = ScrapydAPI(url)
    return scrapyd.delete_project(project)
Example #24
def schedule_job(project, spider, url=DEFAULT_URL, settings={}, **kwargs):
    """
    @param project: scrapy project name
    @param spider: spider name
    @param url: the url which the target scrapyd daemon listens on
    @param settings: the settings dictionary

    To schedule a spider run:
        curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2
    """
    scrapyd = ScrapydAPI(url)
    return scrapyd.schedule(project, spider, settings, **kwargs)
Example #25
def execute(*args):
    args = list(args)
    assert len(args) >= 1, u'At least the spider name must be provided'

    spider_name = args.pop(0)
    scrapy_url = 'http://localhost:6800'
    if args:
        scrapy_url = args.pop(0)

    scrapyd = ScrapydAPI(scrapy_url)
    scrapyd.schedule(
        get_project_name(), spider_name)
Example #27
class ScrapydJob(object):

    def __init__(self, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="website_finder", screenshot_dir='/memex-pinterest/ui/static/images/screenshots'):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.screenshot_dir = screenshot_dir

    def schedule(self, seed):

        if not self.screenshot_dir:
            raise Exception("Please set the screenshot path in the config before scheduling")

        self.job_id = self.scrapi.schedule(self.project, self.spider, seed_urls=seed, screenshot_dir=self.screenshot_dir)

        return self.job_id

    def schedule_keywords(self, phrases, use_splash=True):
        """ Schedule a Scrapyd job """
        if not self.screenshot_dir:
            raise Exception("Please set the screenshot path in the config before scheduling")

        self.job_id = self.scrapi.schedule(self.project, self.spider,
            phrases=phrases,
            screenshot_dir=self.screenshot_dir,
            use_splash=int(use_splash)
        )
        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self, job_id):

        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Running"

            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Pending"

        except Exception:
            print "handled exception:"
            traceback.print_exc()
            return None

        return "Done"
Example #28
def job_list(request, id, project):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        result = scrapyd.list_jobs(project)
        jobs = []
        statuses = ['pending', 'running', 'finished']
        for status in statuses:
            for job in result.get(status):
                job['status'] = status
                jobs.append(job)
        return HttpResponse(json.dumps(jobs))
Example #29
def videoGetDetailsTaskSchedule():
    platformInfo = {
        '哔哩哔哩视频': 'bilibiliDetailInfo',
        '西瓜视频': 'xiguaDetailedInfo',  # the earlier spider already took the better name; this one crawls the more complete Xigua info
        '今日头条': 'xiguaDetailedInfo',
        '今日头条_点赞数': 'xiguaDetailInfo',  # only supplements the like count
    }
    batchCheckNums = 64
    extraParams = {
        'proxytype': '1',
    }
    extraParams = json.dumps(extraParams,
                             ensure_ascii=False,
                             separators=(',', ':'))
    for k, v in platformInfo.items():
        if k == '哔哩哔哩视频' or k == '西瓜视频' or k == '今日头条':
            records = VideoDetailsData.objects.filter(
                platform__exact=k).filter(status__exact=2)
        elif k == '今日头条_点赞数':
            records = MovieOfflineData.objects.filter(
                platform__exact='今日头条').filter(ishz__exact=1).filter(
                    detailStatus__exact=0).filter(tag__in=['待处理', '未下线'])
        else:
            records = []
        spider = Spider.objects.get(Q(name__exact=v), Q(status__exact=0))
        deployProject = spider.deployProject
        i = 0
        scheduleServer = None
        j = m = 1
        paramList = []
        for record in records:
            if i % scrapydBatchSize == 0 and (j - 1) % batchCheckNums == 0:
                scheduleServer = getRunServer(deployProject)

            if scheduleServer:
                scrapyd = ScrapydAPI(scheduleServer, timeout=8)

                paramList.append({'id': record.id, 'targetUrl': record.url})
                if j % batchCheckNums == 0 or m == len(records):
                    params = json.dumps(paramList,
                                        ensure_ascii=False,
                                        separators=(',', ':'))
                    print(params)
                    status = scrapyd.schedule(project=deployProject,
                                              spider=spider.name,
                                              idTargetUrlList=params,
                                              extraParams=extraParams)
                    print(status)
                    paramList = []
                    i += 1
                j += 1
            m += 1
Example #30
def webcrawling(request):
    context = {}
    form = WebCrawlForm()
    context['formUrlList'] = "http://www.newspapers71.com/\n\
http://www.ntvbd.com/\r\n\
http://www.prothom-alo.com/\r\n\
http://www.kalerkantho.com/\r\n\
http://www.bhorerkagoj.net/\r\n\
http://www.jaijaidinbd.com/\r\n\
http://www.amadershomoy.biz/beta/\r\n\
https://www.dailyinqilab.com/\r\n\
http://www.jugantor.com/\r\n\
http://www.dailynayadiganta.com/\r\n\
http://www.mzamin.com/"

    context['formKeyWordList'] = "এসিআই\r\n\
স্বপ্ন\r\n\
স্যাভলন"

    context['action'] = reverse('webcrawling')
    context['firtsTimeLoad'] = 1

    if request.method == 'POST':
        urlText = request.POST.get('url')
        context['formUrlList'] = urlText
        keyWordList = request.POST.get('keyWord')
        context['formKeyWordList'] = keyWordList
        keyWord = keyWordList.splitlines()
        depth = request.POST.get('depth')
        context['depth'] = int(depth)

        historyKey = CommonHelper.CommonHelper.RandomIdGenerator()

        for key in keyWord:
            _UserCrawlHistory = UserCrawlHistory(Historykey=historyKey,
                                                 KeyWord=key)
            _UserCrawlHistory.save(using='SentimentAppDB')

        global SpiderWebCrawlerJOBID
        scrapyd = ScrapydAPI('http://127.0.0.1:6800')
        SpiderWebCrawlerJOBID = scrapyd.schedule(SCRAPYD_PROJECT_NAME,
                                                 'SpiderWebCrawler',
                                                 urls=urlText,
                                                 depth=depth,
                                                 historyKey=historyKey)
        context['historyKey'] = historyKey
        context['firtsTimeLoad'] = 0

    return render(request, 'SentimentApp/webcrawling.html', {
        'result': context,
        'form': form
    })
Example #31
    def __init__(
            self,
            scrapyd_host="localhost",
            scrapyd_port="6800",
            project="default",
            spider="website_finder",
            screenshot_dir='/memex-pinterest/ui/static/images/screenshots'):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.screenshot_dir = screenshot_dir
Example #32
def check_parser_status():
    """
    task that checks current status (running, scheduled, finished
    or empty string if scraper with selected job id not founded)
    of all scrapes on the database
    :return:
    """
    spiders = Scraper.objects.all()
    scrapyd = ScrapydAPI('http://scrapyd:6800')
    for spider in spiders:
        status = scrapyd.job_status(BOT_NAME, spider.job_id)
        spider.status = status
        spider.save(update_fields=['status'])
Example #33
def AJX_BrowserCloseEvent(request):
    print("Browser close or reload event detected.")
    source = request.GET.get('source', None)
    global SpiderWebCrawlerJOBID
    global SpiderFacebookJOBID
    scrapyd = ScrapydAPI('http://127.0.0.1:6800')

    if source == "WebCrawl":
        try:
            print("Trying to stop web crawler scrapyd job : " +
                  str(SpiderWebCrawlerJOBID))
            scrapyd.cancel(SCRAPYD_PROJECT_NAME, SpiderWebCrawlerJOBID)
        except:
            print("Cant Find Web Crawler Active Job.")

    if source == "Facebook":
        try:
            print("Trying to stop facebook scrapyd job : " +
                  str(SpiderFacebookJOBID))
            scrapyd.cancel(SCRAPYD_PROJECT_NAME, SpiderFacebookJOBID)
        except:
            print("Cant Find Facebook Scraping Active Job.")

    if source == "Twitter":
        try:
            print("Trying to stop twitter scrapyd job : " +
                  str(SpiderTwitterJOBID))
            scrapyd.cancel(SCRAPYD_PROJECT_NAME, SpiderTwitterJOBID)
        except:
            print("Cant Find Twitter Scraping Active Job.")

    return JsonResponse(None, safe=False)
Example #34
def commonSchedule(type, catagery, isChangeScheduleStatus):
    if type == 0:
        if catagery == 1:
            results = MovieCrawlState.objects.filter(task__exact=catagery)
        else:
            results = MovieCrawlState.objects.filter(manage__exact=0).filter(
                task__exact=catagery)
    elif type == 1:
        if catagery == 1:
            results = MusicCrawlState.objects.filter(task__exact=catagery)
        else:
            results = MusicCrawlState.objects.filter(manage__exact=0).filter(
                task__exact=catagery)
    else:
        results = []

    results = results[:(len(settings.SCRAPYD_URLS) * scrapydBatchSize)]
    i = 0
    scheduleServer = None
    for item in results:
        try:
            dictParam = json.loads(item.json) if item.json else {}
        except BaseException as e:
            print("json传入非法数据!")
            dictParam = {}
        searchWord, searchTaskId, suffixWords, spiderList, extraParams = setDeParams(
            dictParam)
        extraParams = json.dumps(extraParams,
                                 ensure_ascii=False,
                                 separators=(',', ':'))
        if i % scrapydBatchSize == 0:
            scheduleServer = getRunServer()

        if scheduleServer:
            if isChangeScheduleStatus:
                item.manage = 1
            scrapyd = ScrapydAPI(scheduleServer, timeout=8)
            if len(searchWord):
                item.startNum = len(spiderList)
                for spider in spiderList:
                    print(spider.deployProject, spider.name, searchWord,
                          searchTaskId, suffixWords, extraParams)
                    project = spider.deployProject
                    scrapyd.schedule(project=project,
                                     spider=spider.name,
                                     keyword=searchWord,
                                     searchTaskId=searchTaskId,
                                     suffixWords=suffixWords,
                                     extraParams=extraParams)
            item.save()

        i += 1
Example #35
def call_scrapyd_service():
    """通过 api 操作爬虫
    参考文档地址:https://pypi.python.org/pypi/python-scrapyd-api#downloads
    """
    scrapyd = ScrapydAPI('http://localhost:6800')
    scrapyd.job_status('govbuyscrapy', '0c838fd4b9f111e6abcc14dda97ae760')  # 查看指定爬虫任务执行状态
    scrapyd.list_jobs('govbuyscrapy')  # 查看爬虫任务列表
    scrapyd.schedule('govbuyscrapy', 'govbuy_wan_shucheng')  # 指定项目执行指定爬虫
Example #36
def schedule_job(project,
                 spider,
                 url=DEFAULT_URL,
                 settings={}, **kwargs):
    """
    @param project: scrapy project name
    @param spider: spider name
    @param url: the url which the target scrapyd daemon listens on
    @param settings: the settings dictionary

    To schedule a spider run:
        curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2
    """
    scrapyd = ScrapydAPI(url)
    return scrapyd.schedule(project, spider, settings, **kwargs)
Example #37
def project_list(request, client_id):
    """
    list of projects deployed on one client
    :param request: request object
    :param client_id: client id
    :return: json
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            projects = scrapyd.list_projects()
            return JsonResponse(projects)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Example #38
    def __init__(self, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="website_finder", screenshot_dir='/memex-pinterest/ui/static/images/screenshots'):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.screenshot_dir = screenshot_dir
Example #39
    def __init__(self, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="website_finder", screenshot_dir=""):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.screenshot_dir = screenshot_dir        
Example #40
def spider_list(request, client_id, project_name):
    """
    get spider list from one client
    :param request: request Object
    :param client_id: client id
    :param project_name: project name
    :return: json
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            spiders = scrapyd.list_spiders(project_name)
            spiders = [{'name': spider, 'id': index + 1} for index, spider in enumerate(spiders)]
            return JsonResponse(spiders)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Example #41
def job_cancel(request, client_id, project_name, job_id):
    """
    cancel a job
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :param job_id: job id
    :return: json of cancel
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        try:
            scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
            result = scrapyd.cancel(project_name, job_id)
            return JsonResponse(result)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Example #42
def spider_start(request, client_id, project_name, spider_name):
    """
    start a spider
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :param spider_name: spider name
    :return: json
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            job = scrapyd.schedule(project_name, spider_name)
            return JsonResponse({'job': job})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Example #43
    def __init__(self, seed_url, username, password, db_name, scrapyd_host="localhost", scrapyd_port="6800", project="default", spider="login_finder"):

        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.seed_url = seed_url
        self.username = username
        self.password = password
        self.db_name = db_name
Example #44
def job_list(request, client_id, project_name):
    """
    get job list of project from one client
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :return: list of jobs
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            result = scrapyd.list_jobs(project_name)
            jobs = []
            statuses = ['pending', 'running', 'finished']
            for status in statuses:
                for job in result.get(status):
                    job['status'] = status
                    jobs.append(job)
            return JsonResponse(jobs, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
Example #45
def scheduler_job():
    """
    check scheduled tasks once per minute
    :return:
    """
    models = Task.objects.all()
    for model in models:
        scheduler_at = model.scheduler_at
        updated_at = model.updated_at
        scheduler_at_time_stamp = scheduler_at * 60
        updated_at_time_stamp = time.mktime(updated_at.timetuple())
        if time.time() - updated_at_time_stamp > scheduler_at_time_stamp:
            client_id = model.client_id
            project_name = model.project_name
            spider_name = model.spider_name
            client = Client.objects.get(id=client_id)
            scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
            try:
                job = scrapyd.schedule(project_name, spider_name)
                model.success = 1
            except ConnectionError:
                model.success = 0
            finally:
                model.save()
Example #46
    def __init__(self, name, spider_name, host, mongodb_credentials):
        self.server = ScrapydAPI(host)
        self.host_name = self._strip_host_name(host)
        self.birth_date = datetime.utcnow()
        self.name = name
        self.spider_name = spider_name
        self.alive = True
        client = pymongo.MongoClient(mongodb_credentials['server'],
                                     mongodb_credentials['port'],
                                     connectTimeoutMS=30000,
                                     socketTimeoutMS=None,
                                     socketKeepAlive=True)

        db = client[mongodb_credentials['database']]
        self.collection = db[mongodb_credentials['collection']]
Example #47
from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI('http://127.0.0.1:6800')
scrapyd.list_jobs('project_name')
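As several of the examples above show, the dictionary returned by list_jobs groups jobs under the 'pending', 'running' and 'finished' keys; a minimal sketch of iterating it, with the same placeholder project name:

result = scrapyd.list_jobs('project_name')
for status in ('pending', 'running', 'finished'):
    for job in result.get(status, []):
        print(status, job.get('id'), job.get('spider'))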
Example #48
class Overseer(object):
    """
    Overseer facilitates the deployment of local spiders to a remote scrapyd server

    Available methods:
        spawn_spiders           Create spiders and deploy them to the remote scrapyd server
        get_status              Report the current status of the remote scrapyd server

    """

    DEFAULT_TYPE = 'sell'
    DEFAULT_VENDOR = 'None'

    def __init__(self, name, spider_name, host, mongodb_credentials):
        self.server = ScrapydAPI(host)
        self.host_name = self._strip_host_name(host)
        self.birth_date = datetime.utcnow()
        self.name = name
        self.spider_name = spider_name
        self.alive = True
        client = pymongo.MongoClient(mongodb_credentials['server'],
                                     mongodb_credentials['port'],
                                     connectTimeoutMS=30000,
                                     socketTimeoutMS=None,
                                     socketKeepAlive=True)

        db = client[mongodb_credentials['database']]
        self.collection = db[mongodb_credentials['collection']]

    def kill(self):
        self.alive = False
        return self.host_name

    def heartbeat(self):
        return self.alive

    def spawn_spiders(self, num_spiders=5, items_per_spider=100, **kwargs):
        type = kwargs.get('type', self.DEFAULT_TYPE)
        vendor = kwargs.get('vendor', self.DEFAULT_VENDOR)

        count = 0
        while count < num_spiders:
            count += 1
            self._spawn(vendor, type, items_per_spider)
            time.sleep(3)

    def get_status(self):
        """
         Return:
             the number of running spiders
             the number of finished spiders
             the average time for one spider to finish
        """
        status = self.server.list_jobs(self.name)
        running = status['running']
        finished = status['finished']
        finished_times = [self._time_diff_in_minute(job['end_time'], job['start_time']) for job in finished]
        avg_time = np.average(finished_times)

        Notification('{} - [{}] \t Running Spiders = {}, Finished Spiders = {}, Average Runtime = {}'
                     .format(datetime.utcnow(),
                             self.host_name,
                             len(running),
                             len(finished),
                             avg_time
                             )
                     .expandtabs(3)
                     ).info()

        return len(running), len(finished), avg_time

    def _spawn(self, vendor, type, items_per_spider=100):
        # Get the tasks from the database
        tasks = self._get_tasks_from_database(vendor, type, items_per_spider)
        if not tasks:
            raise ValueError('There is no more task from the database!')

        links, property_ids = zip(*tasks)

        # Schedule the tasks with the remote scrapyd server
        job_id = self.server.schedule(self.name, self.spider_name, vendor=vendor, crawl_url=','.join(links), type=type)

        Notification('{} - [{}] \t Launch spider {}'
                     .format(datetime.utcnow(),
                             self.host_name,
                             job_id)
                     .expandtabs(3)
                     ).success()

        # Clear the tasks from the database
        self._clear_tasks_from_database(vendor, type, property_ids)

    def _get_tasks_from_database(self, vendor, type, items_per_spider):
        cursor = self.collection \
                     .find({"last_crawled_date": None, "type": type, "vendor": vendor}) \
                     .sort("created_date", pymongo.ASCENDING) \
                     .limit(items_per_spider)

        tasks = [(item['link'], item['property_id']) for item in cursor]

        return tasks

    def _clear_tasks_from_database(self, vendor, type, property_ids):
        self.collection.update({"vendor": vendor, "type": type, "property_id": {"$in": property_ids}},
                               {"$set": {"last_crawled_date": datetime.utcnow()}},
                               multi=True,
                               upsert=False)

    @staticmethod
    def _time_diff_in_minute(current, previous):
        return ((parser.parse(current) - parser.parse(previous)).seconds // 60) % 60

    @staticmethod
    def _strip_host_name(host):
        return host.replace('http://', '').replace('.compute.amazonaws.com:6800', '')
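A hedged usage sketch of Overseer; the scrapyd host, MongoDB credentials, vendor and project names below are hypothetical, and only the constructor and method signatures come from the class above:

mongodb_credentials = {
    'server': 'localhost',         # hypothetical MongoDB host
    'port': 27017,
    'database': 'crawl_tasks',     # hypothetical database and collection
    'collection': 'properties',
}
overseer = Overseer(name='myproject',
                    spider_name='property_spider',
                    host='http://ec2-0-0-0-0.compute.amazonaws.com:6800',  # hypothetical scrapyd host
                    mongodb_credentials=mongodb_credentials)
overseer.spawn_spiders(num_spiders=2, items_per_spider=50, vendor='acme', type='sell')
running, finished, avg_time = overseer.get_status()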
Example #49
def job_status(project, job, url=DEFAULT_URL):
    scrapyd = ScrapydAPI(url)
    return scrapyd.job_status(project, job)
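A hedged polling sketch built on the helper above, assuming job_status eventually reports 'finished' for a completed job and that DEFAULT_URL points at a reachable scrapyd instance; the project and job id are placeholders:

import time

while job_status('myproject', 'placeholder-job-id') != 'finished':
    time.sleep(5)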
Example #50
def delete_version(project, version, url=DEFAULT_URL):
    scrapyd = ScrapydAPI(url)
    return scrapyd.delete_version(project, version)