Example #1
0
def strptime(string, format):
    """Parse ``string`` into a ``time.struct_time`` according to ``format``.

    Returns ``None`` when ``string`` is ``None`` or empty. Any exception
    raised while checking or parsing the input is reported through ``log``
    and the function then returns ``None`` as well.
    """
    try:
        has_text = string is not None and len(string) != 0
        if has_text:
            return time.strptime(string, format)
        return None
    except Exception as e:
        log("时间转换失败===> params: {},{}".format(string, format) + str(e))
        return None
Example #2
0
 def start_requests(self):
     """Yield one search ``Request`` per company returned by the paged query.

     The first page is fetched once to learn the page count and is then
     reused for its rows instead of being queried a second time. For every
     row the full name (``row[0]``) is preferred; the short name
     (``row[1]``) is used only when the full name is ``None``.

     Nothing is yielded when the query returns a falsy result or reports
     no pages.
     """
     current_page = 1
     result = self.query_company_page(current_page)
     # A falsy result (presumably a failed query -- confirm against
     # query_company_page) must not crash on ``.get``.
     pages = result.get("pages") if result else None
     logger.log("公司总页数:" + str(pages))
     if not pages:
         return
     while current_page <= pages:
         rows = result.get("rows") if result else None
         for row in rows or ():
             company = row[0] if row[0] is not None else row[1]
             yield Request(self.company_url.format(1, company),
                           dont_filter=True,
                           meta={
                               "company": company,
                               "current_page": 1
                           })
         current_page += 1
         # Only hit the database again when there is another page to read.
         if current_page <= pages:
             result = self.query_company_page(current_page)
Example #3
0
    def execute(self, sql, params):
        """Run a write statement on a fresh connection and commit it.

        A ``list`` of parameter rows is sent with ``cursor.executemany``;
        any other ``params`` value is sent with ``cursor.execute``. An empty
        list is a no-op and no connection is opened.

        Returns:
            The cursor's ``lastrowid`` on success, or ``None`` when nothing
            was executed or the statement failed. On failure the error is
            logged and the transaction is rolled back. The connection is
            closed in every case.
        """
        is_batch = isinstance(params, list)
        if is_batch and not params:
            return None

        connection = self.get_connection()
        try:
            with connection.cursor() as cursor:
                if is_batch:
                    cursor.executemany(sql, params)
                else:
                    cursor.execute(sql, params)
                # Read the auto-increment id while the cursor is still open.
                pk = cursor.lastrowid
                connection.commit()
            if pk:
                cy_logger.log("executes successfully! last_row_id==> " +
                              str(pk))
            else:
                cy_logger.log("executes successfully")
            return pk
        except Exception as e:
            cy_logger.error(str(e))
            # Undo the partial work when anything goes wrong.
            connection.rollback()
            return None
        finally:
            connection.close()
Example #4
0
 def process_item(self, item, spider):
     """Log the scraped item, insert it into ``spider_test`` and pass it on.

     The item is returned so that any pipeline component running after
     this one still receives it; without the return they would get
     ``None``.
     """
     logger.log("*" * 100)
     logger.log("===> " + str(item))
     logger.log("*" * 100)
     insert("INSERT INTO `xsbbiz`.`spider_test` (`name`, `company`) "
            "VALUES (%s, %s)",
            (item['name'], item['company']))
     return item
Example #5
0
    def select_rows_paper(self, sql, param=None, page_no=1, page_size=20):
        """Run ``sql`` as a paginated query.

        The statement is first wrapped in ``SELECT COUNT(1)`` to obtain the
        total row count, then re-run with a ``LIMIT offset, size`` clause
        for the requested page.

        Args:
            sql: The SELECT statement to paginate.
            param: Optional text appended verbatim to ``sql``.
            page_no: 1-based page number; values outside ``1..pages`` are
                clamped into that range when there is at least one row.
            page_size: Rows per page; must be at least 1.

        Returns:
            A dict with ``total``, ``page_no``, ``pages`` and ``rows``, or
            ``None`` when any error occurs (the error is logged). The
            connection is closed in every case.
        """
        connection = self.get_connection()
        try:
            # Coerce to int so only numbers are interpolated into the SQL.
            page_no = int(page_no)
            page_size = int(page_size)
            if page_size < 1:
                raise ValueError("page_size must be >= 1, got %s" % page_size)

            with connection.cursor() as cursor:
                if param:
                    # NOTE(review): ``param`` is concatenated into the SQL
                    # text, not bound as a parameter -- callers must only
                    # pass trusted text here.
                    sql = sql + '%s' % param

                cursor.execute("SELECT COUNT(1) FROM (%s) tmp" % sql)
                total = cursor.fetchall()[0][0]
                pages = 0
                rows = []
                if total > 0:
                    # Ceiling division in pure integer arithmetic.
                    pages = (total + page_size - 1) // page_size
                    # Keep the page inside 1..pages so the offset is never
                    # negative and never past the last page.
                    page_no = min(max(page_no, 1), pages)
                    offset = (page_no - 1) * page_size
                    sql = sql + ' LIMIT %s, %s' % (offset, page_size)
                    cy_logger.log(sql)
                    cursor.execute(sql)
                    rows = cursor.fetchall()

                return {
                    'total': total,
                    'page_no': page_no,
                    'pages': pages,
                    'rows': rows
                }
        except Exception as e:
            cy_logger.error(e)
            return None
        finally:
            connection.close()
Example #6
0
    def parse_page(self, response):
        """Parse one JSON listing page of companies.

        On the first page, requests for every remaining listing page
        (2..``page_total``) are yielded with this method as callback. For
        every company row an ``ItjuziCompanyItem`` is filled in and a
        detail-page request is yielded that carries the item in
        ``meta['meta1']`` to ``parse_detail``.

        Nothing is yielded when the response has no ``data`` node.
        """
        # Convert the JSON payload into Python objects.
        company_list = json.loads(response.body.decode('utf-8'))
        logger.log("列表地址===>" + response.url)

        # Evaluate the JSONPath once and reuse the node for every lookup.
        # A falsy result (no match) would otherwise fail on ``[0]``.
        matches = jsonpath.jsonpath(company_list, '$..data')
        if not matches:
            logger.log("no 'data' node in response: " + response.url)
            return
        data = matches[0]

        # Total number of listing pages.
        page_count = data['page_total']
        logger.log(page_count)

        # Page number of this response.
        now_page = data['page_num']
        logger.log(now_page)

        # Only the first page fans out to the others, so each listing page
        # is scheduled exactly once.
        if now_page == 1:
            for page in range(2, page_count + 1):
                yield Request(self.list_url.format(page),
                              cookies=self.cookies,
                              callback=self.parse_page,
                              headers=self.headers,
                              dont_filter=True)

        for company in data['rows']:
            item = ItjuziCompanyItem()

            item["company_name"] = company["com_name"]
            # Logo URL
            item["company_logo"] = company['com_logo_archive']

            # Company id
            item['company_id'] = company['com_id']
            # Company description
            item["company_des"] = company['com_des']
            # Industry
            item["company_fa"] = company['com_cat_name']
            # Sub-industry
            item["company_son"] = company['com_sub_cat_name']

            # The latest financing summary is assembled from four fields.
            invse_date = company['invse_date']
            invse_round_id = company['invse_round_id']
            invse_detail_money = company['invse_detail_money']
            invse_total_money = company['invse_total_money']
            item["company_recent"] = (invse_date + ' ' + invse_round_id +
                                      ' ' + invse_detail_money + ' ' +
                                      invse_total_money)

            # Total amount raised
            item["company_count"] = invse_total_money
            # Valuation
            item["company_money"] = company['guzhi']

            # Address
            item['company_addr'] = company['com_addr']
            # Slogan
            item['company_slogan'] = company['com_slogan']

            # Founding date
            item["company_btime"] = company['com_born_date']
            # Head count
            item["company_people"] = company['com_scale']
            # Operating status
            item["company_operate"] = company['com_status']

            # Request the company detail page.
            yield FormRequest("https://www.itjuzi.com/company/" +
                              item['company_id'],
                              cookies=self.cookies,
                              meta={'meta1': item},
                              callback=self.parse_detail,
                              headers=self.headers,
                              dont_filter=True)
Example #7
0
    def __init__(self, *a, **kw):
        """Start a headless Chrome and scrape the job list of every company.

        Companies are read page by page through ``query_company_page``;
        each company's job list is crawled with ``_crawl_company``. The
        browser is quit when the crawl ends, including when it ends with
        an exception.
        """
        super(BaiduZhaopinSpider, self).__init__(*a, **kw)
        self.chrome_options = Options()
        # Run without opening a browser window.
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.browser = webdriver.Chrome(
            executable_path=r'spider/file/chromedriver.exe',
            chrome_options=self.chrome_options)
        self.browser.maximize_window()
        # Implicit wait: at most 2 seconds per element lookup.
        self.browser.implicitly_wait(2)

        try:
            current_page = 1
            result = self.query_company_page(current_page)
            pages = result.get("pages")
            logger.log("公司总页数:" + str(pages))
            while current_page <= pages:
                result = self.query_company_page(current_page)
                for row in result.get("rows"):
                    self._crawl_company(row[1])
                current_page += 1
        finally:
            # Always release the browser process, even after a failure.
            self.browser.quit()

    def _crawl_company(self, name):
        """Open the job list for company ``name`` and save every job once."""
        job_list_xpath = "//div[@id='qz-list-box']/a[@id='cktarget']"
        self.browser.get(self.company_url.format(name))
        job_list = self.browser.find_elements_by_xpath(job_list_xpath)
        # Number of leading entries of ``job_list`` already saved; tracking
        # it ensures no job is saved twice and none is skipped, whatever
        # the size of each lazily loaded batch.
        saved = 0
        while len(job_list) > saved:
            for job in job_list[saved:]:
                self._scrape_job(job)
            saved = len(job_list)
            # Scroll to the bottom to trigger loading of more entries.
            self.browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight)")
            job_list = self.browser.find_elements_by_xpath(job_list_xpath)

    def _scrape_job(self, job):
        """Build a ``BaiduZhaopinItem`` from one list entry and save it."""
        item = BaiduZhaopinItem()
        item['company_name'] = job.find_element_by_xpath(
            "./div/div[@class='inlineblock percent33']/div[@class='company']/span[@class='inlineblock companyname']"
        ).text
        item['job_name'] = job.find_element_by_xpath(
            "./div/div[@class='inlineblock percent47']/div[@class='title']"
        ).text
        item['location'] = job.find_element_by_xpath(
            "./div/div[@class='inlineblock percent33']/div[@class='detail']/span[1]"
        ).text
        item['education'] = job.find_element_by_xpath(
            "./div/div[@class='inlineblock percent33']/div[@class='detail']/span[3]"
        ).text
        item['years'] = job.find_element_by_xpath(
            "./div/div[@class='inlineblock percent33']/div[@class='detail']/span[5]"
        ).text
        salary_num = job.find_element_by_xpath(
            "./div/div[@class='inlineblock percent47']/div[@class='salaryarea top16']/span[@class='inlineblock num']"
        ).text
        try:
            salary_unit = job.find_element_by_xpath(
                "./div/div[@class='inlineblock percent47']/div[@class='salaryarea top16']/span[@class='inlineblock unit']"
            ).text
        except Exception:
            # The unit element is optional.
            salary_unit = ""
        item['salary'] = salary_num + salary_unit

        # Click the entry to open its detail page, then switch to the
        # window that is not the list window.
        main_handle = self.browser.current_window_handle
        ActionChains(self.browser).move_to_element(job).click(job).perform()
        detail_opened = False
        for handle in self.browser.window_handles:
            if handle != main_handle:
                self.browser.switch_to_window(handle)
                detail_opened = True
                break
        try:
            item['release_time'] = self.browser.find_element_by_xpath(
                "//div[@class='job-desc-box inner home-inner clearfix']/div[@class='job-desc right-box']/div[@class='job-desc-item'][1]/div[@class='job-classfiy']/p[2]"
            ).text
            try:
                item['platform'] = self.browser.find_element_by_xpath(
                    "//div[@class='job-desc-box inner home-inner clearfix']/div[@class='job-desc right-box']/div[@class='job-desc-item'][3]/div[@class='media-item source']/div[@class='item-bd']/h4[@class='bd-tt']"
                ).text
            except Exception:
                # No source block on the page: fall back to the default.
                item['platform'] = "百度"
        finally:
            # Close only a window that was actually opened, so the list
            # window is never closed by mistake, then return to the list.
            if detail_opened:
                self.browser.close()
                self.browser.switch_to_window(main_handle)
        self.save_item(item)
Example #8
0
 def process_item(self, item, spider):
     """Log the scraped item and pass it on.

     The item is returned so that any pipeline component running after
     this one still receives it; without the return they would get
     ``None``.
     """
     logger.log("===> " + str(item))
     return item