Exemple #1
0
class ItjuziCrawler(object):
    """Crawler for itjuzi.com: logs in and lists companies from the radar API."""

    # Default headers handed to the HTTP client for every request.
    ItjuziHeaders = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',  # noqa: E501
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Referer': 'https://www.itjuzi.com/user/login',
    }

    # Login endpoint.
    login_url = 'https://www.itjuzi.com/user/login?redirect=&flag=&radar_coupon='  # noqa: E501

    def __init__(self, username, password):
        """Store the credentials and create the HTTP client and UA pool.

        Args:
            username: Account identity sent as the ``identity`` form field.
            password: Account password sent as the ``password`` form field.
        """
        self.httpclient = HttpClient(default_headers=self.ItjuziHeaders)
        self.username = username
        self.password = password
        self.uapool = UAPool(DeviceType.PC)

    def login(self):
        """Log in to IT Juzi and print the cookies returned on success.

        Nothing is returned; on a non-OK or missing response the method is
        silent.
        """
        data = {
            'identity': self.username,
            'password': self.password,
            'submit': '',
            'page': '',
            'url': '',
        }
        # NOTE(review): verify=False is forwarded to the HTTP client; if the
        # client is requests-like this disables TLS certificate checks --
        # confirm this is intentional.
        resp = self.httpclient.post(self.login_url, data=data, verify=False)
        if resp is not None and resp.status_code == HttpStatus.StatusOk:
            print('SUCCEED! ===================================>')  # noqa
            for cookie in resp.cookies:
                print('%s=%s' % (cookie.name, cookie.value))
            print('END==============')

    def get_project_detail(self, _id):
        """Fetch project details (placeholder: not implemented, returns None)."""

    def run(self, max_pages=1):
        """Fetch company listing pages and print every row.

        Args:
            max_pages: Number of consecutive pages to request, starting at
                page 1. Defaults to 1, which matches the previous behaviour
                of fetching a single page. Values below 1 request nothing.

        A page whose response is ``None`` is skipped. A page whose JSON
        payload lacks ``data``/``rows`` ends the crawl: the missing key is
        printed and no further request is made (previously the same page was
        re-requested forever).
        """
        url_tpl = 'http://radar.itjuzi.com/company/infonew?page=%(page)d'
        for page in range(1, max_pages + 1):
            url = url_tpl % dict(page=page)
            # Use a fresh User-Agent from the pool for each request.
            resp = self.httpclient.get(
                url, headers={'User-Agent': self.uapool.pick()})  # noqa: E501
            if resp is None:
                continue
            try:
                rows = resp.json()['data']['rows']
            except KeyError as e:
                # Unexpected payload shape: report it and stop instead of
                # hammering the same URL in a tight loop.
                print(e)
                break
            for row in rows:
                print('>>>', row)
Exemple #2
0
class QianBiDaoCrawler(object):
    """Crawler for the Pencil News (qianbidao) paid-project API.

    Logs in to obtain a token, walks the project list until the last
    previously seen project id is reached, writes the new projects to an
    xlsx workbook and mails it.
    """

    # Default headers handed to the HTTP client for every request.
    QianBiDaoAppHeaders = {
        'User-Agent': 'PencilNews/1.3.0 (iPhone; iOS 11.2.5; Scale/2.00)',
        'Accept-Encoding': 'br, gzip, deflate',
        'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        'Authority': 'api.pencilnews.cn',
    }
    login_url = 'https://api.pencilnews.cn/user/login'

    # Stop paging after this many consecutive failed list requests, so a
    # persistent outage cannot keep the crawl looping forever.
    max_page_failures = 3

    def __init__(self, username, password):
        """Store the credentials and create the HTTP client.

        Args:
            username: Account name sent to the login endpoint.
            password: Account password sent to the login endpoint.
        """
        self.httpclient = HttpClient(default_headers=self.QianBiDaoAppHeaders)
        self.username = username
        self.password = password
        self.xlsx_name = ""
        self.token = ""

    def send_mail(self):
        """Mail the workbook named after ``self.xlsx_name`` to ``RECEIPTS``."""
        mail = dict()
        mail['to'] = RECEIPTS
        mail['subject'] = self.xlsx_name
        mail['attachment'] = [u'%s.xlsx' % self.xlsx_name]
        mail_multipart(mail)

    def login(self):
        """Log in to Pencil News and remember the session token.

        Returns:
            The token stored on ``self.token``; it is left unchanged (empty
            by default) when the login request does not succeed.
        """
        data = dict(username=self.username, password=self.password)
        # NOTE(review): verify=False is forwarded to the HTTP client; if the
        # client is requests-like this disables TLS certificate checks --
        # confirm this is intentional.
        resp = self.httpclient.post(self.login_url, data=data, verify=False)
        if resp and resp.status_code == HttpStatus.StatusOk:
            ret = resp.json()
            if ret and ret['message'] == 'SUCCESS':
                self.token = ret['data']['user']['token']
        return self.token

    def get_item_detail(self, item_id):
        """Fetch the detail payload of one project.

        Args:
            item_id: Project id taken from the list endpoint.

        Returns:
            The ``data`` member of the response when the API reports
            ``SUCCESS``; otherwise an empty dict.
        """
        url = 'https://api.pencilnews.cn/pay-project/detail?id=%s' % item_id
        print('>>>>>>', url)
        resp = self.httpclient.get(url, headers=dict(token=self.token))
        if resp and resp.status_code == HttpStatus.StatusOk:
            data = resp.json()
            if data['message'] == 'SUCCESS':
                return data['data']
        return {}

    @staticmethod
    def _build_project(item, detail):
        """Merge a list item and its detail content into one flat dict.

        Args:
            item: One entry of the list endpoint; must provide ``id``,
                ``name``, ``description`` and ``industry``.
            detail: The ``content`` member of the detail payload. It may be
                empty when the detail request failed; every field derived
                from it then falls back to its default.

        Returns:
            A dict with the keys ``id``, ``name``, ``description``,
            ``industry``, ``web``, ``region``, ``finance_round`` and
            ``finance``.
        """
        project = {
            'id': item['id'],
            'name': item['name'],
            'description': item['description'],
            'industry': item['industry'][0] if item['industry'] else '',
        }

        # ``projectFollow`` may be absent or null; treat both as "no data"
        # instead of calling .get() on None.
        follow = detail.get('projectFollow') or {}
        # Prefer the company website, fall back to the public account name.
        project['web'] = (follow.get('company_website', '')
                          or follow.get('project_public_num', ''))

        project['region'] = (detail.get('project') or {}).get(
            'region_name', '')

        rounds = detail.get('rounds') or []
        if rounds:
            # The first entry is used as the most recent round, as before.
            last_round = rounds[0]
            project['finance_round'] = '%s %s %s %s' % (
                last_round['annouced_time'],
                last_round['stage_name'],
                last_round['money_raised'], last_round['investor'])
            project['finance'] = last_round['stage_name']
        else:
            project['finance_round'] = u'获投状态不明确'
            project['finance'] = u'尚未获投'
        return project

    def _export_workbook(self, projects):
        """Write the projects to ``<date> 铅笔道项目.xlsx`` and set xlsx_name."""
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = u'铅笔道项目列表'
        headers = [
            u'项目名称', u'区域', u'细分领域', u'官网', u'项目简介', u'融资轮次', u'融资情况', u'项目来源'
        ]
        for idx, header in enumerate(headers, 1):
            sheet.cell(row=1, column=idx, value=header)
        # Data rows start below the header row.
        for idx, project in enumerate(projects, 2):
            sheet.cell(row=idx, column=1, value=project['name'])
            sheet.cell(row=idx, column=2, value=project['region'])
            sheet.cell(row=idx, column=3, value=project['industry'])
            sheet.cell(row=idx, column=4, value=project['web'])
            sheet.cell(row=idx, column=5, value=project['description'])
            sheet.cell(row=idx, column=6, value=project['finance'])
            sheet.cell(row=idx, column=7, value=project['finance_round'])
            sheet.cell(row=idx, column=8, value=u'铅笔道')
        self.xlsx_name = u'%s 铅笔道项目' % datetime.date.today()
        workbook.save(u'%s.xlsx' % self.xlsx_name)

    def run(self):
        """Crawl new projects, export them to xlsx and mail the workbook.

        Paging stops when the project id stored in
        ``cache.qianbidao.last_id`` is reached, when the API returns an
        empty page, or after ``max_page_failures`` consecutive failed list
        requests. The first collected project id is then stored as the new
        ``last_id``. The workbook is saved and mailed even when no new
        project was found.
        """
        url_tpl = 'https://api.pencilnews.cn/pay-project/list?page=%(page)d'
        projects = list()
        last_id = cache.qianbidao.last_id
        page = 1
        failures = 0
        done = False
        while not done:
            url = url_tpl % dict(page=page)
            print('>>>', url)
            resp = self.httpclient.get(url, headers=dict(token=self.token))
            if resp and resp.status_code == HttpStatus.StatusOk:
                failures = 0
                items = resp.json()['data']['items']
                if not items:
                    # Ran past the last page without meeting last_id.
                    break
                for item in items:
                    if item['id'] == last_id:
                        done = True
                        break
                    # Throttle detail requests.
                    time.sleep(1)
                    detail = self.get_item_detail(item['id'])
                    content = detail.get('content') or {}
                    projects.append(self._build_project(item, content))
            else:
                # A failed page is skipped, but only a bounded number of
                # times in a row.
                failures += 1
                if failures >= self.max_page_failures:
                    break
            time.sleep(0.5)
            page += 1
        if projects:
            cache.qianbidao.last_id = projects[0]['id']
        self._export_workbook(projects)
        self.send_mail()