Example #1
    def parse_list(self, response):
        logger.info("job list url {}".format(response.url))
        kw = response.meta["kw"]
        city = response.meta["city"]
        pg = response.meta["pg"]

        timeout_date = self.timeout_date
        timeout = False

        content = json.loads(response.body)

        for cell in content['content']["positionResult"]["result"]:
            post_item = JobShortItem()
            date = dateformatting.parse(cell["createTime"])
            if date and date < timeout_date:
                timeout = True
                logger.info("Timeout: %s < %s" % (date, timeout_date))
                break
            elif not date:
                logger.warn("parse time badly  please check dateformatting {} ".format(time_it))
                continue
            post_item["job_name"] = cell["positionName"]
            post_item["url"] = "https://www.lagou.com/jobs/{}.html".format(cell["positionId"])
            post_item["city"] = cell["city"]
            post_item["source"] = "拉勾网"
            post_item["district"] = cell["district"]
            post_item["month_salary"] = cell["salary"]
            post_item["day_salary"] = ""
            post_item["job_direction"] = key_words[kw]
            post_item["job_exp"] = cell["workYear"]
            post_item["job_edu"] = cell["education"]
            post_item["publish_man"] = cell["companyShortName"]
            post_item["publish_man_post"] = cell["companyShortName"]
            post_item["publish_time"] = dateformatting.parse(cell["createTime"]).strftime(date_format)
            post_item["company_name"] = cell["companyFullName"]
            post_item["company_addr"] = cell["district"]
            post_item["company_industry"] = cell["industryField"]
            logger.info("crawled list {} {}".format(post_item["url"], post_item["job_name"]))
            yield post_item

        if pg < 10 and not timeout:
            pg = pg + 1
            url = list_url_tem.format(ct=city)
            post_body = {
                'first': "false",
                'pn': str(pg),
                'kd': kw + "实习"
            }
            logger.info("will crawl url {}".format(url))
            yield FormRequest(url=url, callback=self.parse_list, priority=6, formdata=post_body,
                              meta={"city": city, "kw": kw, "pg": pg}, headers=headers)
Example #2
    def parse_list(self, response):
        logger.info("job list url {}".format(response.url))
        kw = response.meta["kw"]
        cid = response.meta["cid"]
        pg = response.meta["pg"]
        direct = response.meta["direct"]

        timeout_date = self.timeout_date
        timeout = False

        content = response.xpath('//div[@class="dw_table"]/div[@class="el"]')
        if not content:
            logger.warning("what a bad url. {}".format(response.url))
            return
        for cell in content:
            post_item = JobShortItem()
            time_it = cell.xpath('./span[@class="t5"]/text()').extract_first()
            date = dateformatting.parse(time_it)
            if date and date < timeout_date:
                timeout = True
                logger.info("Timeout: %s < %s" % (date, timeout_date))
                break
            elif not date:
                logger.warn("parse time badly  please check dateformatting {} ".format(time_it))
                continue
            post_item["job_name"] = cell.xpath('./p[starts-with(@class, "t1")]//a/@title').extract_first()
            post_item["url"] = cell.xpath('./p[starts-with(@class, "t1")]//a/@href').extract_first()
            post_item["city"] = city_ids[cid]
            post_item["source"] = "51job"
            post_item["district"] = cell.xpath('./span[@class="t3"]/text()').extract_first()
            salary = cell.xpath('./span[@class="t4"]/text()').extract_first()
            post_item["month_salary"] = salary if salary else "面议"
            post_item["day_salary"] = ""
            post_item["job_direction"] = directions[direct]
            post_item["job_exp"] = ""
            post_item["job_edu"] = ""
            post_item["publish_man"] = ""
            post_item["publish_man_post"] = ""
            post_item["publish_time"] = dateformatting.parse(time_it).strftime(date_format)
            post_item["company_name"] = cell.xpath('./span/a/@title').extract_first()
            post_item["company_addr"] = cell.xpath('./span[@class="t3"]/text()').extract_first()
            post_item["company_industry"] = ""
            logger.info("crawled list {} {}".format(post_item["url"], post_item["job_name"]))
            yield post_item
        next_page = response.xpath('//div[@class="rt"]/a/@href').extract_first()
        if next_page and not timeout:
            pg = pg + 1
            url = next_page
            logger.info("will crawl url {}".format(url))
            yield Request(url=url, callback=self.parse_list, priority=6,
                          meta={"cid": cid, "kw": kw, "pg": pg, "direct": direct}, headers=headers)
Example #3
    def parse_live_detail(self, response):
        logger.info("live url {}".format(response.url))
        info = re.findall(r"window\.(anchor = .*?);", response.text, re.S)[0]
        post_info = js2py.eval_js(info)

        post_item = LiveItem()
        post_item["author_id"] = post_info["memberid"]
        post_item["author_name"] = post_info["nickname"]
        post_item["url"] = response.url
        post_item["title"] = response.xpath("//h1/text()").extract_first()
        post_item["site_id"] = 1223
        post_item["site_name"] = "一直播"
        # post_item["read_num"] = post_info["online"]
        post_item["online_num"] = post_info["online"]  # 文章阅读数 视频观看数 live参加数
        post_item["like_num"] = response.xpath('//div[@class="hide"]').re_first(u"共有(\d+)条点赞")  # 点赞数
        post_item["comment_num"] = response.xpath('//div[@class="hide"]').re_first(u"共有(\d+)条评论")  # 评论数
        post_item["post_time"] = dateformatting.parse(post_info["starttime"]).strftime(date_format)  # 发布时间
        post_item["include_time"] = self.crawled_time  # 抓取时间
        post_item["content_tags"] = response.xpath('//div[@class="hide"]').re_first(u"认证类型:(.*?)。")
        post_item["video"] = post_info["play_url"]
        post_item["image"] = post_info["covers"]
        yield post_item
        # logger.info(post_item)

        logger.info(u"{} live view people {}".format(post_item["author_name"], post_item["online_num"]))
Example #4
def stand_time(time_str):
    logger.debug('time_str: %s', time_str)
    post_time = dateformatting.parse(time_str)
    if post_time is None:
        # fall back to a sentinel date for unparseable strings
        post_time = '2000-1-1 00:00:00'
    else:
        post_time = post_time.strftime('%Y-%m-%d %H:%M:%S')
    return post_time
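Usage sketch, assuming `dateformatting.parse` returns None for strings it cannot interpret (which is what the fallback branch implies):

stand_time('2019-05-01 08:30:00')    # -> '2019-05-01 08:30:00'
stand_time('definitely not a date')  # -> '2000-1-1 00:00:00' (sentinel fallback)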
Example #5
def _check_post_time(dt):
    ndt = dateformatting.parse(dt)
    if ndt is None:
        logger.error("Unparseable post time: %s" % dt)
        return False
    if ndt.strftime("%Y-%m-%d %H:%M:%S") != dt:
        # round-trip mismatch: dt was not in canonical form
        logger.error("Error post time of data:\n%s" % dt)
        return False
    if ndt < D20160101:
        logger.warning("Post time timeout: %s" % dt)
        return False
    return True
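The check validates a timestamp by round-tripping it: parse, re-format, and compare with the input, so anything not already in canonical `%Y-%m-%d %H:%M:%S` form is rejected. A stdlib-only sketch of the same idea:

from datetime import datetime

def check_exact_format(s, fmt="%Y-%m-%d %H:%M:%S"):
    # Passes only if parsing and re-formatting reproduces the input
    # exactly, i.e. the string was already in canonical form.
    try:
        return datetime.strptime(s, fmt).strftime(fmt) == s
    except ValueError:
        return False

check_exact_format("2016-02-01 10:00:00")  # True
check_exact_format("2016-2-1 10:00:00")    # False: not zero-padded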
Example #6
 def parse(self, response):
     logger.info(response.url)
     entry_id = response.meta['entry_id']
     data = json.loads(response.body)['data']
     for it in data:
         detail_url = response.urljoin(it['question_url'])
         post = PostItem()
         post_time = dateformatting.parse(
             it['question_date']).strftime('%Y-%m-%d %H:%M:%S')
         post['author_id'] = it['question_uid']
         post['url'] = detail_url
         post['title'] = it['question_title']
         post['comment_num'] = it['question_renum']
         post['data_type'] = 'first'
         post['post_time'] = post_time
         post['site_type'] = 15
         post['author_name'] = it['question_username']
         post['text'] = it['question_content']
         # post['img_url'] = img_url
         post['entry_id'] = entry_id
         post['include_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime(time.time()))
         yield post
         logger.info('post_louzhu:%s' % post)
         yield Request(detail_url,
                       callback=self.parse_content,
                       meta={
                           'parent_url': detail_url,
                           'entry_id': entry_id
                       })
     # pagination: stop when the oldest post on this page is too old
     if not data:
         return
     dest_time = data[-1]['question_date']
     dest_time = dateformatting.parse(dest_time).strftime(
         '%Y-%m-%d %H:%M:%S')
     flag = self.cal_time(dest_time)
     if flag:
         logger.info('crawled post time: %s' % dest_time)
         self.page_num += 1
         dat = {'page': str(self.page_num)}
         yield scrapy.FormRequest(url=self.wenda_url,
                                  callback=self.parse,
                                  formdata=dat,
                                  meta={'entry_id': entry_id})
Example #7
 def parse(self, response):
     logger.info(response.url)
     entry_id = response.meta['entry_id']
     # # fetch the community list (disabled)
     # shequ = response.xpath('//div[@class="q-layer q-layer-section"]//dd/a')
     # for x in shequ:
     #     shequ_name = ''.join(x.xpath('./text()').extract()).encode('utf-8')
     #     shequ_url = response.urljoin(''.join(x.xpath('./@href').extract()))
     #     she = {'name': shequ_name, 'url': shequ_url, 'site_type': 2}
     #     with open('qiongyou.json', 'a') as f:
     #         f.write(json.dumps(she) + '\n')
     # detail pages
     bl = response.xpath('//ul[@id="list-id"]/li')
     for i in bl:
         detail = i.xpath('.//a[@class="txt"]/@href').extract_first()
         comment_num = i.xpath(
             './/span[@class="reply"]//text() | .//span[@class="lbvch xnum"]//text()'
         ).extract_first()
         detail_url = response.urljoin(detail)
         yield Request(detail_url,
                       callback=self.parse_content,
                       meta={
                           'parent_url': detail_url,
                           'entry_id': entry_id,
                           'comment_num': comment_num
                       })
     # pagination
     dest_time = response.xpath('//span[@class="zdate"]/text()').extract()
     dest_time = dest_time[-1].encode('utf-8').split(' ')[1]
     dest_time = dateformatting.parse(dest_time).strftime(
         '%Y-%m-%d %H:%M:%S')
     flag = self.cal_time(dest_time)
     if flag:
         logger.info('crawled post time: %s' % dest_time)
         page_urls = response.xpath(
             '//div[@class="ui_page"]/a/@href').extract()
         # for ur in page_urls:
         page_url = response.urljoin(page_urls[-1])
         # logger.info(page_url)
         yield Request(page_url,
                       callback=self.parse,
                       meta={'entry_id': entry_id})
Example #8
 def parse(self, response):
     entry_id = response.meta['entry_id']
     # pagination
     dest_time = response.xpath(
         '//div[@class="by poster"]/em/a/text()').extract()
     dest_time = dest_time[-1].encode('utf-8')
     dest_time = dateformatting.parse(dest_time).strftime(
         '%Y-%m-%d %H:%M:%S')
     flag = self.cal_time(dest_time)
     if flag:
         logger.info('crawled post time: %s' % dest_time)
         page_urls = response.xpath(
             '//span[@id="fd_page_bottom"]//a[@class="nxt"]/@href').extract()
         if page_urls:  # no "next" link on the last page
             page_url = response.urljoin(page_urls[0])
             yield Request(page_url,
                           callback=self.parse,
                           meta={'entry_id': entry_id})
     # detail pages
     details = response.xpath('//tr//a[@class="s xst"]/@href').extract()
     for detail in details:
         detail_url = response.urljoin(detail)
         yield Request(detail_url,
                       callback=self.parse_content,
                       meta={
                           'parent_url': detail_url,
                           'entry_id': entry_id
                       })
     # user profiles
     user_urls = response.xpath(
         '//div[@class="by author"]/cite/a/@href').extract()
     for usr in user_urls:
         usr = response.urljoin(usr)
         yield Request(usr,
                       callback=self.parse_user,
                       meta={'entry_id': entry_id})
Example #9
    def parse_content(self, response):
        items1 = response.xpath('//div[@id="postlist"]/div[1]')
        items = response.xpath('//div[@id="comment_list"]/div')
        if items:
            items.pop()
        items = items1 + items
        parent_url = response.meta['parent_url']
        for item in items:
            # post time
            post = PostItem()
            post_time = item.xpath(
                './/div[@class="authi"]//span/@title').extract_first()
            if not post_time:
                parts = item.xpath(
                    './/div[@class="authi"]/em/text()').extract_first(default='').split(' ')
                if len(parts) == 3:
                    parts.pop(0)  # drop a leading label (e.g. "发表于", "posted at")
                post_time = ' '.join(parts).encode('utf-8')
            if not post_time.strip():
                continue
            post_time = dateformatting.parse(post_time).strftime(
                '%Y-%m-%d %H:%M:%S')
            site_type = 2
            target = ''.join(
                item.xpath(
                    './/td[@class="plc plcon"]//strong/a//text()').extract())
            url = re.findall(ur"\d+[\u4e00-\u9fa5]+", target)
            if url == []:
                url = re.findall(ur"[\u4e00-\u9fa5]+", target)
            url = response.url + '#' + ''.join(url).encode('utf-8')
            author_name = item.xpath(
                './/div[@class="authi"]/a[@class="xw1"]/text()').extract_first(default='')
            author_name = author_name.encode('utf-8')
            text = item.xpath('.//td[@class="t_f"]//text()').extract()
            text = ''.join(text).encode('utf-8')
            img_url = item.xpath(
                './/td[@class="t_f"]//img/@zoomfile').extract()
            # join to absolute urls
            img_url = [response.urljoin(u) for u in img_url]

            post['post_time'] = post_time
            post['site_type'] = site_type
            post['author_name'] = author_name
            x = ''.join(item.xpath(
                './/a[@class="show"]/text()').extract()).encode('utf-8')
            if '阅读模式' in x:  # a "阅读模式" ("reading mode") link marks the thread starter
                title = response.xpath(
                    '//h1[@class="ts"]/a/text()').extract()[0]
                read_num = response.xpath(
                    '//div[@class="authi"]//span[@class="xi1 views"][1]/text()'
                ).extract()[0]
                comment_num = response.xpath(
                    '//div[@class="authi"]//span[@class="xi1 replies"]/text()'
                ).extract()[0]
                post['url'] = parent_url
                post['title'] = title
                post['read_num'] = read_num
                post['comment_num'] = comment_num
                post['data_type'] = 'first'
            else:
                post['url'] = url
                post['data_type'] = 'comment'
                post['parent_url'] = parent_url
            post['text'] = text
            post['img_url'] = img_url
            post['entry_id'] = response.meta['entry_id']
            post['include_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(time.time()))
            uid = item.xpath('.//a[@class="xw1"]/@href').extract()
            uid = ''.join(re.findall(r'uid-(\d+)', ''.join(uid)))
            post['author_id'] = uid
            logger.info('post:%s' % post)
            yield post
        # detail pagination
        detail_urls = response.xpath(
            '//div[@class="pgs mtm mbm cl"]/div[@class="pg"]/a/@href').extract()
        for detail_url in detail_urls:
            yield Request(response.urljoin(detail_url),
                          callback=self.parse_content,
                          meta={
                              'parent_url': parent_url,
                              'entry_id': response.meta['entry_id']
                          })
Example #10
 def timeout_date(self):
     return dateformatting.parse("10天前")
Example #11
 def parse_content(self, response):
     entry_id = response.meta['entry_id']
     items = response.xpath('//div[@class="bbs_detail_list"]/div')
     parent_url = response.meta['parent_url']
     for item in items:
         # post time
         post = PostItem()
         post_time = ''.join(
             item.xpath(
                 './/div[@class="bbs_detail_title clearfix"]/p/text()').
             extract()).encode('utf-8').split(' ')
         post_time.pop(0)
         post_time = ' '.join(post_time)
         post_time = dateformatting.parse(post_time).strftime(
             '%Y-%m-%d %H:%M:%S')
         # logger.info(post_time)
         site_type = 15
         target = ''.join(
             item.xpath(
                 './/div[@class="bbs_detail_title clearfix"]/a/text()').
             extract()).encode('utf-8')
         target = target.replace('\n', '').replace('\t',
                                                   '').replace('\r', '')
         url = response.url + '#' + target
         author_name = item.xpath(
             './/h3[@class="titles"]/a/text()').extract()
         if author_name == []:
             continue
         author_name = ''.join(author_name).encode('utf-8')
         text = item.xpath(
             './/td[@class="editor bbsDetailContainer"]//text()').extract()
         if text == []:
             text = item.xpath('.//ul[@class="xpc"]//text()').extract()
         text = ''.join(text).encode('utf-8')
         img_url = item.xpath(
             './/td[@class="editor bbsDetailContainer"]//@data-original'
         ).extract()
         if not img_url:
             img_url = item.xpath(
                 './/ul[@class="xpc"]//img/@data-original').extract()
         # join to absolute urls; fall back to '' when there are none
         img_url = [response.urljoin(img) for img in img_url] or ''
         post['post_time'] = post_time
         post['site_type'] = site_type
         post['author_name'] = author_name
         if '#1楼' in url:  # "1楼" ("floor 1") marks the thread starter
             title = response.xpath(
                 '//h3[@class="b_tle"]/text()').extract()[-1]
             title = ''.join(title)
             read_num = ''.join(
                 response.xpath(
                     '//span[@class="viewtxt"]/text()').extract())
             if read_num == '':
                 read_num = ''.join(
                     response.xpath(
                         '//span[@class="poi"]/text()').extract())
             read_num = ''.join(re.findall(r'\d+', read_num))
             try:
                 comment_num = response.meta["comment_num"]
                 comment_num = ''.join(re.findall(r'\d+', comment_num))
             except Exception as e:
                 comment_num = 0
                 logger.info('comment_num Exception: %s' % e)
             post['url'] = parent_url
             post['title'] = title
             post['read_num'] = read_num
             post['comment_num'] = comment_num
             post['data_type'] = 'first'
         else:
             post['url'] = url
             post['data_type'] = 'comment'
             post['parent_url'] = parent_url
         post['text'] = text
         post['img_url'] = img_url
         post['entry_id'] = entry_id
         post['include_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime(time.time()))
         # user id
         uid = item.xpath('.//h3[@class="titles"]/a/@href').extract()
         uid = ''.join(re.findall(r'/u/(\d+)', uid[0]))
         post['author_id'] = uid
         logger.info('post:%s' % post)
         yield post
         # user profile
         yield Request(self.user_url.format(uid),
                       callback=self.parse_user,
                       meta={
                           "entry_id": entry_id,
                           'uid': uid,
                           'author_name': author_name
                       })
     # detail pagination (outside the item loop so each page is queued once)
     detail_urls = response.xpath(
         '//div[@class="ui_page"]/a/@href').extract()
     for detail_url in detail_urls:
         yield Request(response.urljoin(detail_url),
                       callback=self.parse_content,
                       meta={
                           'parent_url': parent_url,
                           'entry_id': entry_id
                       })
Example #12
 def parse_content(self, response):
     entry_id = response.meta['entry_id']
     items = response.xpath('//div[@class="mod_discuss_box"]')
     parent_url = response.meta['parent_url']
     for item in items:
         # post time
         post = PostItem()
         post_time = ''.join(
             item.xpath('.//span[@class="answer_time"]/a/text()').extract()
         ).encode('utf-8')
         logger.info(post_time)
         post_time = dateformatting.parse(post_time).strftime(
             '%Y-%m-%d %H:%M:%S')
         site_type = 15
         url = response.url + '#' + post_time
         author_name = item.xpath(
             './/div[@class="mod_discuss_box_name"]/a/text()').extract()
         author_name = ''.join(author_name).encode('utf-8')
         text = item.xpath(
             './/div[@class="mod_discuss_box_text qyer_spam_text_filter"]//text()'
         ).extract()
         text = ''.join(text).encode('utf-8')
         img_url = item.xpath(
             './/div[@class="mod_discuss_box_text qyer_spam_text_filter"]//img/@data-original'
         ).extract()
         if not img_url:
             img_url = item.xpath(
                 './/ul[@class="xpc"]//img/@data-original').extract()
         # join to absolute urls; fall back to '' when there are none
         img_url = [response.urljoin(img) for img in img_url] or ''
         post['post_time'] = post_time
         post['site_type'] = site_type
         post['author_name'] = author_name
         post['url'] = url
         post['data_type'] = 'comment'
         post['parent_url'] = parent_url
         post['text'] = text
         post['img_url'] = img_url
         post['entry_id'] = entry_id
         post['include_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime(time.time()))
         # user id
         uid = item.xpath(
             './/div[@class="mod_discuss_box_name"]/a/@href').extract()
         uid = ''.join(re.findall(r'/u/(\d+)', uid[0]))
         post['author_id'] = uid
         logger.info('post:%s' % post)
         yield post
         # user profile
         yield Request(self.user_url.format(uid),
                       callback=self.parse_user,
                       meta={
                           "entry_id": entry_id,
                           'uid': uid,
                           'author_name': author_name
                       })
     # thread starter ("楼主") user profile
     louzhu_url = ''.join(
         response.xpath('//a[@class="avatar"]/@href').extract())
     uid = ''.join(re.findall(r'/u/(\d+)', louzhu_url))
     author_name = ''.join(
         response.xpath(
             '//div[@class="question-info clearfix mt10"]/a/text()').
         extract())
     yield Request(self.user_url.format(uid),
                   callback=self.parse_user,
                   meta={
                       "entry_id": entry_id,
                       'uid': uid,
                       'author_name': author_name
                   })
Example #13
 def timeout_date(self):
     return dateformatting.parse(self.settings.get("AFTER_DATE", u"3天前"))  # default cutoff: "3天前" = "3 days ago"
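The cutoff comes from the project settings, defaulting to "3天前" ("3 days ago"). Presumably it is overridden the usual Scrapy way, e.g. in settings.py (value illustrative):

# settings.py
AFTER_DATE = u"7天前"  # keep posts from the last 7 days instead of 3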