def parse(self, response):
    start_url = response.url
    try:
        data = htmlparser.Parser(response.body.decode(self.encoding))
    except Exception as e:
        print('response failed %s' % e)
        return
    org_list = data.xpathall('''//ul[@class="list_009"]/li''')
    # for org in org_list[:5]:
    for org in org_list:
        if org:
            title = org.xpath('''//a/text()''').text().strip()
            # The list page shows "MM月DD日 HH:MM"; normalise it to "MM-DD HH:MM".
            ctime = org.xpath('''//span/text()''').replace(
                u'月', '-').replace(u'日', '').regex(r'(\d+-\d+ \d+:\d+)').text().strip()
            # The list page omits the year, so prepend the current one.
            y_ctime = datetime.datetime.now().strftime("%Y")
            ctime = y_ctime + '-' + ctime
            c_time = ctime
            org_url = org.xpath('''//a/@href''').text().strip()
            if title:
                url = urljoin(start_url, org_url)
                print(url)
                ctime = local_timestamp(ctime)
                item = {'ctime': ctime, 'title': title}
                print(item)
                yield scrapy.Request(url,
                                     callback=self.detail_parse,
                                     meta={'item': item, "c_time": c_time},
                                     headers=self.headers,
                                     dont_filter=True)
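
# local_timestamp() is imported from a shared helper module and is not shown in
# this file. A minimal sketch of what it is assumed to do, based only on the
# call sites above: parse the normalised "YYYY-MM-DD HH:MM" string and return a
# Unix timestamp. The body below is an illustrative assumption, not the
# project's actual implementation.
import time

def local_timestamp(time_str, fmt='%Y-%m-%d %H:%M'):
    # Convert the local-time string into an integer Unix timestamp.
    return int(time.mktime(time.strptime(time_str, fmt)))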
def parse_detail_page(self, response=None, url=None):
    try:
        response.encoding = self.encoding
        unicode_html_body = response.text
        data = htmlparser.Parser(unicode_html_body)
    except Exception as e:
        return []
def parse(self, response):
    start_url = response.url
    try:
        data = htmlparser.Parser(response.body.decode(self.encoding))
    except Exception as e:
        print('response failed %s' % e)
        return
    org_list = data.xpathall('''//div[@id="newslist"]/a''')
    # for org in org_list[:5]:
    for org in org_list:
        if org:
            title = org.xpath('''//h2/text()''').text().strip()
            ctime = org.xpath('''//div[@class="author"]/span''').regex(
                r'(\d+-\d+-\d+ \d+:\d+)').text().strip()
            org_url = org.xpath('''//@href''').text().strip()
            if title:
                url = urljoin(start_url, org_url)
                print(url)
                ctime = local_timestamp(ctime)
                item = {'ctime': ctime, 'title': title}
                print(item)
                yield scrapy.Request(url,
                                     callback=self.detail_parse,
                                     meta={'item': item},
                                     headers=self.headers,
                                     dont_filter=True)
def handle_city_job(self, city):
    first_requets_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s' % city
    first_response = self.handle_requets(method="GET", url=first_requets_url)
    first_html = htmlparser.Parser(first_response)
    try:
        # The total page count sits in the last span of the pager.
        page = first_html.xpath(
            '''//div[@class="page-number"]/span[last()]''').text()
    except Exception:
        return
    else:
        for i in range(1, int(page) + 1):
            print(i)
            data = {"pn": i, "kd": "python"}
            post_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false' % city
            refer_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s' % city
            self.headers['Referer'] = refer_url.encode()
            post_response = self.handle_requets(method='POST',
                                                url=post_url,
                                                data=data,
                                                info=city)
            datas = json.loads(post_response)
            job_list = datas["content"]["positionResult"]["result"]
            for job in job_list:
                lagou_mysql.insert_item(job)
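
# handle_requets() is defined elsewhere on this spider class. A minimal sketch,
# assuming it is a thin wrapper around requests that sends the shared headers
# and returns the response body as text; the real project likely adds retries,
# proxies and Lagou anti-crawl handling, all omitted here.
import requests

def handle_requets(self, method, url, data=None, info=''):
    # Issue a GET or POST with the spider's headers and return the body text.
    if method.upper() == 'POST':
        resp = requests.post(url, data=data, headers=self.headers)
    else:
        resp = requests.get(url, headers=self.headers)
    return resp.text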
def process(self, html_path):
    pic_small, pic_dict = self.get_pic_fn(html_path)
    # print(pic_small)
    with open(html_path, 'r', encoding='utf8') as F:
        html_output = F.read()
    ############################################
    # Replace every qualifying small image with '%'
    all_imgs = re.findall(r'<img.*?>', html_output, re.S | re.I | re.M)  # pull out all <img> fragments first
    for i in all_imgs:
        for j in pic_small:
            if j in i:
                html_output = html_output.replace(i, '%')
    #########################################
    # Handle tables first
    table_list_parttern = '(<table.*?</table>)'
    table_list = re.findall(table_list_parttern, html_output, re.S | re.I | re.M)
    label = {}
    for count, table_item in enumerate(table_list):
        index_table_num = count + 1
        node = '#table_0_0_{}#'.format(index_table_num)
        html_output = html_output.replace(table_item, '<p>' + node + '</p>')
        table_item = re.sub(r'<img.*?>', '', table_item, flags=re.S | re.I | re.M)
        label[node] = table_item
    ##########################################
    # Handle images that are not part of a table label
    for i in all_imgs:
        for k in pic_dict.keys():
            if k in i:
                node = '#img_0_0_' + k.replace('Image_', '').replace('.png', '').replace('.jpg', '').replace('.gif', '') + '#'
                html_output = html_output.replace(i, node)
    pic_dict_new = {}
    # Sort from largest to smallest so the biggest images are kept and the smallest dropped
    dict2sort = sorted(pic_dict.items(), key=lambda item: len(item[1]), reverse=True)
    for j, k in enumerate(dict2sort):
        node = '#img_0_0_' + k[0].replace('Image_', '').replace('.png', '').replace('.jpg', '').replace('.gif', '') + '#'
        if j <= 50:  # images beyond the first 50 are not kept
            pic_dict_new[node] = k[1]
        else:
            # Drop the placeholder of every image past the limit
            html_output = html_output.replace(node, '')
    web_contents = []
    if html_output == '':
        pass
    else:
        xml = htmlparser.Parser(html_output)
        try:
            content_list = xml.xpathall('''//p | //table | //h1 | //h2 | //h3 | //h4 | //h5 | //h6''')
            for con in content_list:
                con = con.text().strip()
                if type(con) == bytes:
                    con = str(con, encoding='utf8')
                if con and con != '%':  # ignore blocks that are only '%'
                    web_contents.append('\n\n\n' + con)
        except Exception:
            pass
    return label, web_contents, pic_dict_new
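
# get_pic_fn() is another method of the same class and is not shown in this
# file. A minimal sketch of its assumed contract, inferred from how process()
# uses the return values: pic_small lists the file names of images considered
# too small to keep, and pic_dict maps each remaining image file name to its
# binary content (whose len() is used above to rank images by size). The size
# threshold and folder layout below are illustrative assumptions.
import os

def get_pic_fn(self, html_path, min_bytes=5120):
    pic_small, pic_dict = [], {}
    img_dir = os.path.splitext(html_path)[0] + '_files'   # assumed image folder
    if os.path.isdir(img_dir):
        for name in os.listdir(img_dir):
            with open(os.path.join(img_dir, name), 'rb') as fh:
                content = fh.read()
            if len(content) < min_bytes:
                pic_small.append(name)      # too small: replaced by '%' in process()
            else:
                pic_dict[name] = content    # kept: ranked by size in process()
    return pic_small, pic_dict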
def detail_parse(self, response):
    item = response.meta['item']
    try:
        data = htmlparser.Parser(response.body.decode(self.encoding))
    except Exception as e:
        print('second response failed %s' % e)
        return
    c_time = response.meta['c_time']
    url = response.url
    contents = []  # all text content
    content_list = data.xpathall('''//div[@id='artibody']/p//text()''')
    for con in content_list:
        con = con.text().strip()
        if con:
            contents.append(con)
    content_x = data.xpath('''//div[@id='artibody']''').data
    content_xml = content_x
    label = {}
    img_list = data.xpathall('''//div[@id='artibody']//img''')
    if img_list:
        for count, image in enumerate(img_list):
            image_dict = {}
            image_url = image.xpath('//@src').text().strip()
            if image_url:
                image_url = urljoin(url, image_url)
                node = '#image{}#'.format(count)
                file_name = image_url.split('/')[-1]
                image_dict['url'] = image_url
                image_dict['name'] = ''
                image_dict['file_name'] = file_name
                label[node] = image_dict
    table_list = data.xpathall('''//div[@id='artibody']//table''')
    if table_list:
        for count, table in enumerate(table_list):
            table_dict = {}
            node = "#table{}#".format(count)
            table_sele = table.data
            table_dict['table_xml'] = table_sele
            node_p = "<p>" + node + "</p>"
            content_x = content_x.replace(table_sele, node_p)
            label[node] = table_dict
    xml = htmlparser.Parser(content_x)
    web_contents = []  # content shown on the web (tables replaced with node placeholders)
    content_list = xml.xpathall('''//p''')
    for con in content_list:
        con = con.text().strip()
        if con:
            web_contents.append(con)
    breadcrumb = ["首页", "科创板"]
    article_info = {}
    channel = '科创板'
    accessory = []  # attachments
    # all_acc = data.xpathall('''//div[@class="ewb-info-con"]//a''')
    # if all_acc:
    #     for acc in all_acc:
    #         temp = {}
    #         acc_url = acc.xpath('//@href').text().strip()
    #         if acc_url and '@' not in acc_url:
    #             acc_url = urljoin(url, acc_url)
    #             name = acc.text().strip()
    #             file_name = acc_url.split('/')[-1].split('=')[-1]
    #             temp['url'] = acc_url
    #             temp['name'] = name
    #             temp['file_name'] = file_name
    #             dir_path = os.path.join(self.ori_path, self.dir_name)
    #             if not os.path.isdir(dir_path):
    #                 os.makedirs(dir_path)
    #             path = os.path.join(dir_path, file_name)
    #             dow_img_acc(path, acc_url)
    #             # file_content = parse_main(path)
    #             temp['file_content'] = ''  # file_content
    #             accessory.append(temp)
    gtime = int(time.time())
    main_business = ''
    source = data.xpath(
        '''//span[@class="source ent-source"]/text()|//a[@class="source ent-source"]/text()'''
    ).text().strip()
    webname = '新浪财经'
    domain = self.allowed_domains[0]
    uid = add_uuid(url)
    item["collection_name"] = "news_finance_sina_raw"  # collection name
    item["url"] = url  # link
    item["uid"] = uid  # dedup id
    item["contents"] = contents  # content used for data processing
    item["web_contents"] = web_contents  # content used by the front end
    item["article_info"] = article_info  # article metadata
    item["label"] = label  # images and tables
    item["accessory"] = accessory  # attachments
    item["gtime"] = gtime  # crawl time
    item['breadcrumb'] = breadcrumb  # breadcrumb navigation
    item['channel'] = channel  # channel
    item["spider_name"] = self.name  # spider name
    item["webname"] = webname  # site name
    item["domain"] = domain  # domain
    item["source"] = source  # source
    item["main_business"] = main_business  # related industry
    item['path'] = ''  # attachment path
    yield item
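
# add_uuid() is imported from a project helper module and is not shown here. A
# minimal sketch, assuming it derives a stable dedup id from the article URL;
# the real helper may hash differently or also consult the database.
import hashlib

def add_uuid(url):
    # Hash the URL so the same article always maps to the same uid.
    return hashlib.md5(url.encode('utf-8')).hexdigest()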
def parse(self, response, url):
    # try:
    #     response.encoding = self.encoding
    #     unicode_html_body = response.text
    #     data = htmlparser.Parser(unicode_html_body)
    # except Exception as e:
    #     return ([], None, None)
    areas = [
        620100, 620200, 620300, 620400, 620500, 620600, 620700, 620800,
        620900, 621000, 621100, 621200, 622900, 623000, 620001
    ]
    projecttypes = ['A', 'D']
    for projecttype in projecttypes:
        for area in areas:
            postData = {
                "pageNo": "0",
                "pageSize": "",
                "area": area,
                "projecttype": projecttype,
                "prjpropertynewA": "A",
                "prjpropertynewD": "D",
                "prjpropertynewC": "C",
                "prjpropertynewB": "B",
                "prjpropertynewE": "E",
                "prjpropertynewZ": "Z",
                "projectname": "",
            }
            response = requests.post(
                'http://ggzyjy.gansu.gov.cn/f/province/annogoods/getAnnoList',
                data=postData).text
            data = htmlparser.Parser(response)
            li_content = data.xpathall('''//dd''')
            for item in li_content:
                title = item.xpath('''//a/text()''').text().strip()
                link = item.xpath('''//a/@href''').text().strip()
                # Request the page that carries the announcement body
                tradingCode = re.findall('cityTenderProject/(.*?)/', link)[0]
                if projecttype == 'A':
                    tableName = ''
                else:
                    tableName = re.findall('&table=(.*)', link)[0]
                postdata1 = {
                    "tradingCode": tradingCode,
                    "index": "0",
                    "projectType": projecttype,
                    "tableName": tableName
                }
                detailResponse = requests.post(
                    'http://ggzyjy.gansu.gov.cn/f/cityTenderProject/flowpage',
                    data=postdata1).text
                detaildata = htmlparser.Parser(detailResponse)
                detaillink = detaildata.xpath(
                    '//div[@class="jxTradingPublic"]//iframe/@src').text().strip()
                detaillink = re.findall(r'\?(.*)', detaillink)[0]
                detaillink = 'http://lzggzyjy.lanzhou.gov.cn/TPBidder/zhmanagemis/pages/webgonggao/webgonggao_DetailAction.action?cmd=page_Load&{}'.format(
                    detaillink)
                date = item.xpath('''//i/text()''').text().strip()
                date = str(date).replace("-", "")
                publicTime = date
                if self.getdumps(link):
                    continue
                uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, link)) + str(
                    uuid.uuid3(uuid.NAMESPACE_DNS, link))
                ctime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                location = "甘肃省"
                service = ''
                industry = ""
                post = {
                    "uuid": uid,  # md5
                    "detailUrl": detaillink,  # url
                    "name": title,  # title
                    "location": location,  # region
                    "publicTime": publicTime,  # publication time
                    "tag": "中标公告",  # tag
                    "site": self.site_domain,
                    "siteName": self.siteName,
                    "ctime": ctime,
                    "industry": industry,
                    "service": service
                }
                dic = self.handle_post(post)
                try:
                    self.db.table(self.table).add(dic)
                except Exception as e:
                    print(e)
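
# getdumps() is the spider's duplicate check and is not defined in this file.
# A minimal sketch, assuming it answers "has this announcement link been seen
# before"; the real project probably consults self.db rather than the
# in-memory set used here for illustration.
def getdumps(self, link):
    # Return True for links that were already collected, so parse() skips them.
    seen = getattr(self, '_seen_links', set())
    already = link in seen
    seen.add(link)
    self._seen_links = seen
    return already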