def parse_list(self, response):
    logger.info("job list url {}".format(response.url))
    kw = response.meta["kw"]
    city = response.meta["city"]
    pg = response.meta["pg"]
    timeout_date = self.timeout_date
    timeout = False
    content = json.loads(response.body)
    for cell in content['content']["positionResult"]["result"]:
        post_item = JobShortItem()
        date = dateformatting.parse(cell["createTime"])
        if date and date < timeout_date:
            timeout = True
            logger.info("Timeout: %s < %s" % (date, timeout_date))
            break
        elif not date:
            logger.warning("failed to parse time, please check dateformatting: {}".format(cell["createTime"]))
            continue
        post_item["job_name"] = cell["positionName"]
        post_item["url"] = "https://www.lagou.com/jobs/{}.html".format(cell["positionId"])
        post_item["city"] = cell["city"]
        post_item["source"] = "拉勾网"  # Lagou
        post_item["district"] = cell["district"]
        post_item["month_salary"] = cell["salary"]
        post_item["day_salary"] = ""
        post_item["job_direction"] = key_words[kw]
        post_item["job_exp"] = cell["workYear"]
        post_item["job_edu"] = cell["education"]
        post_item["publish_man"] = cell["companyShortName"]
        post_item["publish_man_post"] = cell["companyShortName"]
        post_item["publish_time"] = date.strftime(date_format)
        post_item["company_name"] = cell["companyFullName"]
        post_item["company_addr"] = cell["district"]
        post_item["company_industry"] = cell["industryField"]
        logger.info("crawled list {} {}".format(post_item["url"], post_item["job_name"]))
        yield post_item
    if pg < 10 and not timeout:
        pg = pg + 1
        url = list_url_tem.format(ct=city)
        post_body = {
            'first': "false",
            'pn': str(pg),
            'kd': kw + "实习",  # "实习" = "internship"
        }
        logger.info("will crawl url {}".format(url))
        yield FormRequest(url=url, callback=self.parse_list, priority=6,
                          formdata=post_body,
                          meta={"city": city, "kw": kw, "pg": pg},
                          headers=headers)
def parse_list(self, response):
    logger.info("job list url {}".format(response.url))
    kw = response.meta["kw"]
    cid = response.meta["cid"]
    pg = response.meta["pg"]
    direct = response.meta["direct"]
    timeout_date = self.timeout_date
    timeout = False
    content = response.xpath('//div[@class="dw_table"]/div[@class="el"]')
    if not content:
        logger.warning("no job rows found, bad url: {}".format(response.url))
        return
    for cell in content:
        post_item = JobShortItem()
        time_it = cell.xpath('./span[@class="t5"]/text()').extract_first()
        date = dateformatting.parse(time_it)
        if date and date < timeout_date:
            timeout = True
            logger.info("Timeout: %s < %s" % (date, timeout_date))
            break
        elif not date:
            logger.warning("failed to parse time, please check dateformatting: {}".format(time_it))
            continue
        post_item["job_name"] = cell.xpath('./p[starts-with(@class, "t1")]//a/@title').extract_first()
        post_item["url"] = cell.xpath('./p[starts-with(@class, "t1")]//a/@href').extract_first()
        post_item["city"] = city_ids[cid]
        post_item["source"] = "51job"
        post_item["district"] = cell.xpath('./span[@class="t3"]/text()').extract_first()
        salary = cell.xpath('./span[@class="t4"]/text()').extract_first()
        post_item["month_salary"] = salary if salary else "面议"  # "面议" = "salary negotiable"
        post_item["day_salary"] = ""
        post_item["job_direction"] = directions[direct]
        post_item["job_exp"] = ""
        post_item["job_edu"] = ""
        post_item["publish_man"] = ""
        post_item["publish_man_post"] = ""
        post_item["publish_time"] = date.strftime(date_format)
        post_item["company_name"] = cell.xpath('./span/a/@title').extract_first()
        post_item["company_addr"] = cell.xpath('./span[@class="t3"]/text()').extract_first()
        post_item["company_industry"] = ""
        logger.info("crawled list {} {}".format(post_item["url"], post_item["job_name"]))
        yield post_item
    next_page = response.xpath('//div[@class="rt"]/a/@href').extract_first()
    if next_page and not timeout:
        pg = pg + 1
        url = next_page
        logger.info("will crawl url {}".format(url))
        yield Request(url=url, callback=self.parse_list, priority=6,
                      meta={"cid": cid, "kw": kw, "pg": pg, "direct": direct},
                      headers=headers)
def parse_live_detail(self, response):
    logger.info("live url {}".format(response.url))
    info = re.findall(r"window\.(anchor = .*?);", response.body, re.S)[0]
    post_info = js2py.eval_js(info)
    post_item = LiveItem()
    post_item["author_id"] = post_info["memberid"]
    post_item["author_name"] = post_info["nickname"]
    post_item["url"] = response.url
    post_item["title"] = response.xpath("//h1/text()").extract_first()
    post_item["site_id"] = 1223
    post_item["site_name"] = "一直播"  # Yizhibo
    # post_item["read_num"] = post_info["online"]
    post_item["online_num"] = post_info["online"]  # article reads / video views / live attendance
    post_item["like_num"] = response.xpath('//div[@class="hide"]').re_first(u"共有(\d+)条点赞")  # like count
    post_item["comment_num"] = response.xpath('//div[@class="hide"]').re_first(u"共有(\d+)条评论")  # comment count
    post_item["post_time"] = dateformatting.parse(post_info["starttime"]).strftime(date_format)  # publish time
    post_item["include_time"] = self.crawled_time  # crawl time
    post_item["content_tags"] = response.xpath('//div[@class="hide"]').re_first(u"认证类型:(.*?)。")
    post_item["video"] = post_info["play_url"]
    post_item["image"] = post_info["covers"]
    yield post_item
    # logger.info(post_item)
    logger.info(u"{} live online viewers: {}".format(post_item["author_name"], post_item["online_num"]))
def stand_time(time_str):
    logger.debug('time_str: %s', time_str)
    post_time = dateformatting.parse(time_str)
    if post_time is None:
        post_time = '2000-1-1 00:00:00'
    else:
        post_time = post_time.strftime('%Y-%m-%d %H:%M:%S')
    return post_time
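# Usage sketch (illustrative, not part of the original module; assumes
# dateformatting.parse returns a datetime for parseable input and None otherwise,
# as the function above relies on):
#
#   stand_time('2018-05-01 12:30:00')  # -> '2018-05-01 12:30:00'
#   stand_time('not a date')           # -> '2000-1-1 00:00:00' (fallback sentinel)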
def _check_post_time(dt):
    ndt = dateformatting.parse(dt)
    if ndt and ndt.strftime("%Y-%m-%d %H:%M:%S") != dt:
        logger.error("Error post time of data:\n%s" % str(dt))
        return False
    if not ndt or ndt < D20160101:
        logger.warning("Post time timeout: %s" % dt)
        return False
    return True
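# Usage sketch (illustrative; assumes D20160101 = datetime(2016, 1, 1) is defined
# elsewhere in the module): _check_post_time accepts only strings that are already
# in canonical '%Y-%m-%d %H:%M:%S' form and dated 2016-01-01 or later:
#
#   _check_post_time('2017-03-02 10:00:00')  # -> True
#   _check_post_time('2015-12-31 23:59:59')  # -> False, logs "Post time timeout"
#   _check_post_time('2017/03/02')           # -> False, re-formatted value differs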
def parse(self, response):
    logger.info(response.url)
    entry_id = response.meta['entry_id']
    data = json.loads(response.body)['data']
    for it in data:
        detail_url = response.urljoin(it['question_url'])
        post = PostItem()
        post_time = dateformatting.parse(it['question_date']).strftime('%Y-%m-%d %H:%M:%S')
        post['author_id'] = it['question_uid']
        post['url'] = detail_url
        post['title'] = it['question_title']
        post['comment_num'] = it['question_renum']
        post['data_type'] = 'first'
        post['post_time'] = post_time
        post['site_type'] = 15
        post['author_name'] = it['question_username']
        post['text'] = it['question_content']
        # post['img_url'] = img_url
        post['entry_id'] = entry_id
        post['include_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        yield post
        logger.info('post_louzhu:%s' % post)  # louzhu = thread starter (original poster)
        yield Request(detail_url, callback=self.parse_content,
                      meta={'parent_url': detail_url, 'entry_id': entry_id})
    # pagination
    dest_time = data[-1]['question_date']
    dest_time = dateformatting.parse(dest_time).strftime('%Y-%m-%d %H:%M:%S')
    flag = self.cal_time(dest_time)
    if flag:
        logger.info('crawled post time: %s' % dest_time)
        self.page_num += 1
        dat = {'page': str(self.page_num)}
        yield scrapy.FormRequest(url=self.wenda_url, callback=self.parse,
                                 formdata=dat, meta={'entry_id': entry_id})
def parse(self, response):
    logger.info(response.url)
    entry_id = response.meta['entry_id']
    # # fetch community sections
    # shequ = response.xpath('//div[@class="q-layer q-layer-section"]//dd/a')
    # for x in shequ:
    #     shequ_name = ''.join(x.xpath('./text()').extract()).encode('utf-8')
    #     shequ_url = response.urljoin(''.join(x.xpath('./@href').extract()))
    #     she = {'name': shequ_name, 'url': shequ_url, 'site_type': 2}
    #     with open('qiongyou.json', 'a') as f:
    #         f.write(json.dumps(she) + '\n')
    # match detail pages
    bl = response.xpath('//ul[@id="list-id"]/li')
    for i in bl:
        detail = i.xpath('.//a[@class="txt"]/@href').extract_first()
        comment_num = i.xpath(
            './/span[@class="reply"]//text() | .//span[@class="lbvch xnum"]//text()'
        ).extract()[0]
        detail_url = response.urljoin(detail)
        yield Request(detail_url, callback=self.parse_content,
                      meta={'parent_url': detail_url, 'entry_id': entry_id,
                            'comment_num': comment_num})
    # pagination
    dest_time = response.xpath('//span[@class="zdate"]/text()').extract()
    dest_time = ''.join(dest_time[-1]).encode('utf-8').split(' ')[1]
    dest_time = dateformatting.parse(dest_time).strftime('%Y-%m-%d %H:%M:%S')
    flag = self.cal_time(dest_time)
    if flag:
        logger.info('crawled post time: %s' % dest_time)
        page_urls = response.xpath('//div[@class="ui_page"]/a/@href').extract()
        # for ur in page_urls:
        page_url = response.urljoin(page_urls[-1])
        # logger.info(page_url)
        yield Request(page_url, callback=self.parse, meta={'entry_id': entry_id})
def parse(self, response):
    entry_id = response.meta['entry_id']
    # pagination
    dest_time = response.xpath('//div[@class="by poster"]/em/a/text()').extract()
    dest_time = ''.join(dest_time[-1]).encode('utf-8')
    dest_time = dateformatting.parse(dest_time).strftime('%Y-%m-%d %H:%M:%S')
    flag = self.cal_time(dest_time)
    if flag:
        logger.info('crawled post time: %s' % dest_time)
        page_urls = response.xpath('//span[@id="fd_page_bottom"]//a[@class="nxt"]/@href').extract()
        page_url = response.urljoin(page_urls[0])
        yield Request(page_url, callback=self.parse, meta={'entry_id': entry_id})
    # match detail pages
    details = response.xpath('//tr//a[@class="s xst"]/@href').extract()
    for detail in details:
        detail_url = response.urljoin(detail)
        yield Request(detail_url, callback=self.parse_content,
                      meta={'parent_url': detail_url, 'entry_id': entry_id})
    # match users
    user_urls = response.xpath('//div[@class="by author"]/cite/a/@href').extract()
    for usr in user_urls:
        usr = response.urljoin(usr)
        yield Request(usr, callback=self.parse_user, meta={'entry_id': entry_id})
def parse_content(self, response):
    items1 = response.xpath('//div[@id="postlist"]/div[1]')
    items = response.xpath('//div[@id="comment_list"]/div')
    if items:
        items.pop()
    items = items1 + items
    parent_url = response.meta['parent_url']
    for item in items:
        # post time
        post = PostItem()
        post_time = item.xpath('.//div[@class="authi"]//span/@title').extract()
        if not post_time:
            post_time = item.xpath('.//div[@class="authi"]/em/text()').extract()
        if post_time:
            post_time = post_time[0].split(' ')
            if len(post_time) == 3:
                post_time.pop(0)
            post_time = ' '.join(post_time).encode('utf-8')
        if not post_time:
            continue
        post_time = dateformatting.parse(post_time).strftime('%Y-%m-%d %H:%M:%S')
        site_type = 2
        target = ''.join(item.xpath('.//td[@class="plc plcon"]//strong/a//text()').extract())
        url = re.findall(ur"\d+[\u4e00-\u9fa5]+", target)
        if not url:
            url = re.findall(ur"[\u4e00-\u9fa5]+", target)
        url = response.url + '#' + ''.join(url).encode('utf-8')
        author_name = item.xpath('.//div[@class="authi"]/a[@class="xw1"]/text()').extract()
        if author_name:
            author_name = author_name[0].encode('utf-8')
        text = item.xpath('.//td[@class="t_f"]//text()').extract()
        text = ''.join(text).encode('utf-8')
        img_url = item.xpath('.//td[@class="t_f"]//img/@zoomfile').extract()
        img_list = []
        for ur in img_url:
            ur = response.urljoin(ur)
            img_list.append(ur)
        img_url = img_list
        post['post_time'] = post_time
        post['site_type'] = site_type
        post['author_name'] = author_name
        x = ''.join(item.xpath('.//a[@class="show"]/text()').extract()).encode('utf-8')
        if '阅读模式' in x:  # "阅读模式" = "reading mode"; its presence marks the thread-starter post
            title = response.xpath('//h1[@class="ts"]/a/text()').extract()[0]
            read_num = response.xpath('//div[@class="authi"]//span[@class="xi1 views"][1]/text()').extract()[0]
            comment_num = response.xpath('//div[@class="authi"]//span[@class="xi1 replies"]/text()').extract()[0]
            post['url'] = parent_url
            post['title'] = title
            post['read_num'] = read_num
            post['comment_num'] = comment_num
            post['data_type'] = 'first'
        else:
            post['url'] = url
            post['data_type'] = 'comment'
            post['parent_url'] = parent_url
        post['text'] = text
        post['img_url'] = img_url
        post['entry_id'] = response.meta['entry_id']
        post['include_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        uid = item.xpath('.//a[@class="xw1"]/@href').extract()
        uid = ''.join(re.findall(r'uid-(\d+)', ''.join(uid)))
        post['author_id'] = uid
        logger.info('post:%s' % post)
        yield post
    # match detail pagination
    detail_urls = response.xpath('//div[@class="pgs mtm mbm cl"]/div[@class="pg"]/a/@href').extract()
    for detail_url in detail_urls:
        yield Request(response.urljoin(detail_url), callback=self.parse_content,
                      meta={'parent_url': parent_url, 'entry_id': response.meta['entry_id']})
def timeout_date(self):
    return dateformatting.parse("10天前")  # "10天前" = "10 days ago"
def parse_content(self, response):
    entry_id = response.meta['entry_id']
    items = response.xpath('//div[@class="bbs_detail_list"]/div')
    parent_url = response.meta['parent_url']
    for item in items:
        # post time
        post = PostItem()
        post_time = ''.join(
            item.xpath('.//div[@class="bbs_detail_title clearfix"]/p/text()').extract()
        ).encode('utf-8').split(' ')
        post_time.pop(0)
        post_time = ' '.join(post_time)
        post_time = dateformatting.parse(post_time).strftime('%Y-%m-%d %H:%M:%S')
        # logger.info(post_time)
        site_type = 15
        target = ''.join(
            item.xpath('.//div[@class="bbs_detail_title clearfix"]/a/text()').extract()
        ).encode('utf-8')
        target = target.replace('\n', '').replace('\t', '').replace('\r', '')
        url = response.url + '#' + target
        author_name = item.xpath('.//h3[@class="titles"]/a/text()').extract()
        if not author_name:
            continue
        author_name = ''.join(author_name).encode('utf-8')
        text = item.xpath('.//td[@class="editor bbsDetailContainer"]//text()').extract()
        if not text:
            text = item.xpath('.//ul[@class="xpc"]//text()').extract()
        text = ''.join(text).encode('utf-8')
        img_url = item.xpath('.//td[@class="editor bbsDetailContainer"]//@data-original').extract()
        if not img_url:
            img_url = item.xpath('.//ul[@class="xpc"]//img/@data-original').extract()
        li = []
        if img_url:
            for img in img_url:
                img = response.urljoin(img)
                li.append(img)
            img_url = li
        if not img_url:
            img_url = ''
        post['post_time'] = post_time
        post['site_type'] = site_type
        post['author_name'] = author_name
        if '#1楼' in url:  # "#1楼" = floor #1, i.e. the thread-starter post
            title = response.xpath('//h3[@class="b_tle"]/text()').extract()[-1]
            title = ''.join(title)
            read_num = ''.join(response.xpath('//span[@class="viewtxt"]/text()').extract())
            if not read_num:
                read_num = ''.join(response.xpath('//span[@class="poi"]/text()').extract())
            read_num = ''.join(re.findall(r'\d+', read_num))
            try:
                comment_num = response.meta["comment_num"]
                comment_num = ''.join(re.findall(r'\d+', comment_num))
            except Exception as e:
                comment_num = 0
                logger.info('comment_num Exception: %s' % e)
            post['url'] = parent_url
            post['title'] = title
            post['read_num'] = read_num
            post['comment_num'] = comment_num
            post['data_type'] = 'first'
        else:
            post['url'] = url
            post['data_type'] = 'comment'
            post['parent_url'] = parent_url
        post['text'] = text
        post['img_url'] = img_url
        post['entry_id'] = entry_id
        post['include_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # match user
        uid = item.xpath('.//h3[@class="titles"]/a/@href').extract()
        uid = ''.join(re.findall(r'/u/(\d+)', uid[0]))
        post['author_id'] = uid
        logger.info('post:%s' % post)
        yield post
        # match user
        yield Request(self.user_url.format(uid), callback=self.parse_user,
                      meta={"entry_id": entry_id, 'uid': uid, 'author_name': author_name})
    # match detail pagination
    detail_urls = response.xpath('//div[@class="ui_page"]/a/@href').extract()
    for detail_url in detail_urls:
        yield Request(response.urljoin(detail_url), callback=self.parse_content,
                      meta={'parent_url': parent_url, 'entry_id': entry_id})
def parse_content(self, response):
    entry_id = response.meta['entry_id']
    items = response.xpath('//div[@class="mod_discuss_box"]')
    parent_url = response.meta['parent_url']
    for item in items:
        # post time
        post = PostItem()
        post_time = ''.join(
            item.xpath('.//span[@class="answer_time"]/a/text()').extract()
        ).encode('utf-8')
        logger.info(post_time)
        post_time = dateformatting.parse(post_time).strftime('%Y-%m-%d %H:%M:%S')
        site_type = 15
        # logger.info(post_time)
        url = response.url + '#' + post_time
        author_name = item.xpath('.//div[@class="mod_discuss_box_name"]/a/text()').extract()
        author_name = ''.join(author_name).encode('utf-8')
        text = item.xpath(
            './/div[@class="mod_discuss_box_text qyer_spam_text_filter"]//text()').extract()
        text = ''.join(text).encode('utf-8')
        img_url = item.xpath(
            './/div[@class="mod_discuss_box_text qyer_spam_text_filter"]//img/@data-original').extract()
        if not img_url:
            img_url = item.xpath('.//ul[@class="xpc"]//img/@data-original').extract()
        li = []
        if img_url:
            for img in img_url:
                img = response.urljoin(img)
                li.append(img)
            img_url = li
        if not img_url:
            img_url = ''
        post['post_time'] = post_time
        post['site_type'] = site_type
        post['author_name'] = author_name
        post['url'] = url
        post['data_type'] = 'comment'
        post['parent_url'] = parent_url
        post['text'] = text
        post['img_url'] = img_url
        post['entry_id'] = entry_id
        post['include_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # match user
        uid = item.xpath('.//div[@class="mod_discuss_box_name"]/a/@href').extract()
        uid = ''.join(re.findall(r'/u/(\d+)', uid[0]))
        post['author_id'] = uid
        logger.info('post:%s' % post)
        yield post
        # match user
        yield Request(self.user_url.format(uid), callback=self.parse_user,
                      meta={"entry_id": entry_id, 'uid': uid, 'author_name': author_name})
    # thread starter (louzhu) user
    louzhu_url = ''.join(response.xpath('//a[@class="avatar"]/@href').extract())
    uid = ''.join(re.findall(r'/u/(\d+)', louzhu_url))
    author_name = ''.join(
        response.xpath('//div[@class="question-info clearfix mt10"]/a/text()').extract())
    yield Request(self.user_url.format(uid), callback=self.parse_user,
                  meta={"entry_id": entry_id, 'uid': uid, 'author_name': author_name})
def timeout_date(self):
    return dateformatting.parse(self.settings.get("AFTER_DATE", u"3天前"))  # default "3天前" = "3 days ago"
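# Configuration sketch (illustrative): AFTER_DATE is read from the Scrapy settings
# and passed to dateformatting.parse, which the spiders above rely on to turn
# relative Chinese phrases such as u"3天前" ("3 days ago") into a datetime cutoff;
# the parse_list callbacks stop paginating once post dates fall before it.
# A hypothetical override in settings.py:
#
#   AFTER_DATE = u"7天前"   # crawl nothing older than seven days (example value)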