Example #1
 def parse(self, response):
     start_url = response.url
     try:
         data = htmlparser.Parser(response.body.decode(self.encoding))
     except Exception as e:
         print('response failed %s' % e)
         return
     org_list = data.xpathall('''//ul[@class="list_009"]/li''')
     # for org in org_list[:5]:
     for org in org_list:
         if org:
             title = org.xpath('''//a/text()''').text().strip()
             ctime = org.xpath('''//span/text()''').replace(
                 u'月', '-').replace(u'日', '').regex(
                     r'(\d+-\d+ \d+:\d+)').text().strip()
             y_ctime = datetime.datetime.now().strftime("%Y")
             ctime = y_ctime + '-' + ctime
             c_time = ctime
             org_url = org.xpath('''//a/@href''').text().strip()
             if title:
                 url = urljoin(start_url, org_url)
                 print(url)
                 ctime = local_timestamp(ctime)
                 item = {'ctime': ctime, 'title': title}
                 print(item)
                 yield scrapy.Request(url,
                                      callback=self.detail_parse,
                                      meta={
                                          'item': item,
                                          "c_time": c_time
                                      },
                                      headers=self.headers,
                                      dont_filter=True)
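local_timestamp is a project helper that is not part of the snippet above. A minimal sketch of what it might do, assuming it turns the reconstructed "%Y-%m-%d %H:%M" string into a Unix timestamp in local time (the name and the call site come from the example; the implementation is a guess):

import time


def local_timestamp(time_str, fmt='%Y-%m-%d %H:%M'):
    # Hypothetical helper: parse the local-time string and return a Unix timestamp.
    return int(time.mktime(time.strptime(time_str, fmt)))
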
Example #2
 def parse_detail_page(self, response=None, url=None):
     try:
         response.encoding = self.encoding
         unicode_html_body = response.text
         data = htmlparser.Parser(unicode_html_body)
     except Exception as e:
         return []
Example #3
 def parse(self, response):
     start_url = response.url
     try:
         data = htmlparser.Parser(response.body.decode(self.encoding))
     except Exception as e:
         print('response failed %s' % e)
         return
     org_list = data.xpathall('''//div[@id="newslist"]/a''')
     # for org in org_list[:5]:
     for org in org_list:
         if org:
             title = org.xpath('''//h2/text()''').text().strip()
             ctime = org.xpath('''//div[@class="author"]/span''').regex(
                 r'(\d+-\d+-\d+ \d+:\d+)').text().strip()
             org_url = org.xpath('''//@href''').text().strip()
             if title:
                 url = urljoin(start_url, org_url)
                 print(url)
                 ctime = local_timestamp(ctime)
                 item = {'ctime': ctime, 'title': title}
                 print(item)
                 yield scrapy.Request(url,
                                      callback=self.detail_parse,
                                      meta={'item': item},
                                      headers=self.headers,
                                      dont_filter=True)
Example #4
    def handle_city_job(self, city):
        first_requets_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s' % city
        first_response = self.handle_requets(method="GET",
                                             url=first_requets_url)
        first_html = htmlparser.Parser(first_response)
        try:
            page = first_html.xpath(
                '''//div[@class="page-number"]/span[last()]''').text()
        except Exception:
            return
        else:
            for i in range(1, int(page) + 1):
                print(i)
                data = {"pn": i, "kd": "python"}

                post_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false' % city
                refer_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s' % city
                self.headers['Referer'] = refer_url.encode()
                post_response = self.handle_requets(method='POST',
                                                    url=post_url,
                                                    data=data,
                                                    info=city)
                datas = json.loads(post_response)
                job_list = datas["content"]["positionResult"]["result"]
                for job in job_list:
                    lagou_mysql.insert_item(job)
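handle_requets is a method of the same spider that the snippet does not include. A rough stand-alone stand-in, assuming it simply issues the request with the shared headers and returns the body text; the retry and anti-crawl handling the real project likely has is omitted, and the parameters mirror the call sites above:

import requests


def handle_requets(method='GET', url=None, data=None, info=None, headers=None):
    # Hypothetical stand-in for the spider's handle_requets method:
    # send the request with the shared headers and return the body text, or '' on failure.
    try:
        if method.upper() == 'POST':
            resp = requests.post(url, headers=headers, data=data, timeout=10)
        else:
            resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException as e:
        print('request for %s failed: %s' % (info or url, e))
        return ''
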
Example #5
    def process(self, html_path):
        pic_small, pic_dict = self.get_pic_fn(html_path)
        # print(pic_small)
        with open(html_path, 'r', encoding='utf8') as F:
            html_output = F.read()
        ############################################
        # Replace every matching image with %
        all_imgs = re.findall('<img.*?>', html_output, re.S | re.I | re.M)  # first collect every <img> fragment
        for i in all_imgs:
            for j in pic_small:
                if j in i:
                    html_output = html_output.replace(i, '%')
        #########################################
        # Handle tables first

        table_list_pattern = '(<table.*?</table>)'
        table_list = re.findall(table_list_pattern, html_output, re.S | re.I | re.M)
        label = {}
        for count, table_item in enumerate(table_list):
            index_table_num = count + 1
            node = '#table_0_0_{}#'.format(index_table_num)
            html_output = html_output.replace(table_item, '<p>' + node + '</p>')
            # flags must be passed by keyword; the fourth positional argument of re.sub is count
            table_item = re.sub('<img.*?>', '', table_item, flags=re.S | re.I | re.M)
            label[node] = table_item

        ##########################################
        # Handle images that are not labels

        for i in all_imgs:
            for k in pic_dict.keys():
                if k in i:
                    node = '#img_0_0_' + k.replace('Image_', '').replace('.png', '').replace('.jpg', '').replace('.gif', '') + '#'
                    html_output = html_output.replace(i, node)
        pic_dict_new = {}
        # Sort from largest to smallest: keep the largest, drop the smallest
        dict2sort = sorted(pic_dict.items(), key=lambda item: len(item[1]), reverse=True)
        for j, k in enumerate(dict2sort):
            node = '#img_0_0_' + k[0].replace('Image_', '').replace('.png', '').replace('.jpg', '').replace('.gif', '') + '#'
            if j <= 50:
                pic_dict_new[node] = k[1]
            else:  # images beyond the 50 largest are not kept
                html_output = html_output.replace(node, '')
        web_contents = []
        if html_output == '':
            pass
        else:
            xml = htmlparser.Parser(html_output)
            try:
                content_list = xml.xpathall('''//p | //table | //h1 | //h2 | //h3 | //h4 | //h5 | //h6''')
                for con in content_list:
                    con = con.text().strip()
                    if type(con) == bytes:
                        con = str(con, encoding='utf8')
                    if con and con != '%':  # skip fragments that are nothing but the % placeholder
                        web_contents.append('\n\n\n' + con)
            except Exception:
                pass
        return label, web_contents, pic_dict_new
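The process method above depends on a get_pic_fn helper that is not shown. One plausible reading, given how its return values are used (pic_small entries are matched inside <img> tags and pic_dict values are sorted by length), is that it splits the page's images into "too small to keep" names and a name-to-content mapping. A hedged sketch under that assumption; the directory layout and size threshold are guesses:

import os


def get_pic_fn(html_path, small_limit=5 * 1024):
    # Hypothetical helper: split images stored next to the HTML file into
    # tiny ones (returned as a list of file names) and the rest (name -> raw bytes).
    pic_small, pic_dict = [], {}
    img_dir = os.path.splitext(html_path)[0] + '_files'  # assumed "saved page" layout
    if not os.path.isdir(img_dir):
        return pic_small, pic_dict
    for name in os.listdir(img_dir):
        if not name.lower().endswith(('.png', '.jpg', '.gif')):
            continue
        with open(os.path.join(img_dir, name), 'rb') as f:
            content = f.read()
        if len(content) < small_limit:
            pic_small.append(name)    # tiny decorations get replaced with '%'
        else:
            pic_dict[name] = content  # candidates for '#img_0_0_N#' placeholders
    return pic_small, pic_dict
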
Example #6
    def detail_parse(self, response):
        item = response.meta['item']
        try:
            data = htmlparser.Parser(response.body.decode(self.encoding))
        except Exception as e:
            print('second response failed %s' % e)
            return
        c_time = response.meta['c_time']
        url = response.url
        contents = []  # all of the article text
        content_list = data.xpathall('''//div[@id='artibody']/p//text()''')
        for con in content_list:
            con = con.text().strip()
            if con:
                contents.append(con)
        content_x = data.xpath('''//div[@id='artibody']''').data
        content_xml = content_x
        label = {}
        img_list = data.xpathall('''//div[@id='artibody']//img''')
        if img_list:
            for count, image in enumerate(img_list):
                image_dict = {}
                image_url = image.xpath('//@src').text().strip()
                if image_url:
                    image_url = urljoin(url, image_url)
                    node = '#image{}#'.format(count)
                    file_name = image_url.split('/')[-1]
                    image_dict['url'] = image_url
                    image_dict['name'] = ''
                    image_dict['file_name'] = file_name
                    label[node] = image_dict

        table_list = data.xpathall('''//div[@id='artibody']//table''')
        if table_list:
            for count, table in enumerate(table_list):
                table_dict = {}
                node = "#table{}#".format(count)
                table_sele = table.data
                table_dict['table_xml'] = table_sele
                node_p = "<p>" + node + "</p>"
                content_x = content_x.replace(table_sele, node_p)
                label[node] = table_dict
        xml = htmlparser.Parser(content_x)
        web_contents = []  # content shown directly on the web (tables replaced by node placeholders)
        content_list = xml.xpathall('''//p''')
        for con in content_list:
            con = con.text().strip()
            if con:
                web_contents.append(con)
        breadcrumb = ["首页", "科创板"]
        article_info = {}
        channel = '科创板'
        accessory = []  # attachments
        # all_acc = data.xpathall('''//div[@class="ewb-info-con"]//a''')
        # if all_acc:
        #     for acc in all_acc:
        #         temp = {}
        #         acc_url = acc.xpath('//@href').text().strip()
        #         if acc_url and '@' not in acc_url:
        #             acc_url = urljoin(url, acc_url)
        #             name = acc.text().strip()
        #             file_name = acc_url.split('/')[-1].split('=')[-1]
        #             temp['url'] = acc_url
        #             temp['name'] = name
        #             temp['file_name'] = file_name
        #             dir_path = os.path.join(self.ori_path, self.dir_name)
        #             if not os.path.isdir(dir_path):
        #                 os.makedirs(dir_path)
        #             path = os.path.join(dir_path, file_name)
        #             dow_img_acc(path, acc_url)
        #             # file_content = parse_main(path)
        #             temp['file_content'] = '' # file_content
        #             accessory.append(temp)
        gtime = int(time.time())
        main_business = ''
        source = data.xpath(
            '''//span[@class="source ent-source"]/text()|//a[@class="source ent-source"]/text()'''
        ).text().strip()
        webname = '新浪财经'
        domain = self.allowed_domains[0]
        uid = add_uuid(url)
        item["collection_name"] = "news_finance_sina_raw"  # 集合名
        item["url"] = url  # 链接
        item["uid"] = uid  # 去重id
        item["contents"] = contents  # 数据处理的内容
        item["web_contents"] = web_contents  # 前端使用的内容
        item["article_info"] = article_info  # 文章的相关信息
        item["label"] = label  # 图片、表格
        item["accessory"] = accessory  # 附件
        item["gtime"] = gtime  # 爬虫时间
        item['breadcrumb'] = breadcrumb  # 导航
        item['channel'] = channel  # 频道
        item["spider_name"] = self.name  # 爬虫名
        item["webname"] = webname  # 网站名
        item["domain"] = domain  # 域名
        item["source"] = source  # 来源
        item["main_business"] = main_business  # 相关行业
        item['path'] = ''  # 附件路径
        yield item
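Examples #1 and #6 also rely on an add_uuid helper for the dedup id, which is not shown. A minimal sketch, assuming it just derives a stable digest from the article URL (the hashing scheme is a guess; only the name and call site come from the example):

import hashlib


def add_uuid(url):
    # Hypothetical dedup-id helper: a stable hex digest derived from the URL.
    return hashlib.md5(url.encode('utf-8')).hexdigest()
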
Example #7
    def parse(self, response, url):
        # try:
        #     # response.encoding = self.encoding
        #     # unicode_html_body = response.text
        #     # data = htmlparser.Parser(unicode_html_body)
        # except Exception, e:
        #     return ([], None, None)
        areas = [
            620100, 620200, 620300, 620400, 620500, 620600, 620700, 620800,
            620900, 621000, 621100, 621200, 622900, 623000, 620001
        ]
        projecttypes = ['A', 'D']
        for projecttype in projecttypes:
            for area in areas:
                postData = {
                    "pageNo": "0",
                    "pageSize": "",
                    "area": area,
                    "projecttype": projecttype,
                    "prjpropertynewA": "A",
                    "prjpropertynewD": "D",
                    "prjpropertynewC": "C",
                    "prjpropertynewB": "B",
                    "prjpropertynewE": "E",
                    "prjpropertynewZ": "Z",
                    "projectname": "",
                }
                response = requests.post(
                    'http://ggzyjy.gansu.gov.cn/f/province/annogoods/getAnnoList',
                    data=postData).text
                data = htmlparser.Parser(response)
                li_content = data.xpathall('''//dd''')
                for item in li_content:
                    title = item.xpath('''//a/text()''').text().strip()
                    link = item.xpath('''//a/@href''').text().strip()

                    # request the page that contains the announcement body
                    tradingCode = re.findall('cityTenderProject/(.*?)/',
                                             link)[0]
                    if projecttype == 'A':
                        tableName = ''
                    else:
                        tableName = re.findall('&table=(.*)', link)[0]
                    postdata1 = {
                        "tradingCode": tradingCode,
                        "index": "0",
                        "projectType": projecttype,
                        "tableName": tableName
                    }
                    detailResponse = requests.post(
                        'http://ggzyjy.gansu.gov.cn/f/cityTenderProject/flowpage',
                        data=postdata1).text
                    detaildata = htmlparser.Parser(detailResponse)
                    detaillink = detaildata.xpath(
                        '//div[@class="jxTradingPublic"]//iframe/@src').text(
                        ).strip()
                    detaillink = re.findall(r'\?(.*)', detaillink)[0]
                    detaillink = 'http://lzggzyjy.lanzhou.gov.cn/TPBidder/zhmanagemis/pages/webgonggao/webgonggao_DetailAction.action?cmd=page_Load&{}'.format(
                        detaillink)

                    date = item.xpath('''//i/text()''').text().strip()
                    date = str(date).replace("-", "")
                    publicTime = date
                    if self.getdumps(link):
                        continue
                    uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, link)) + str(
                        uuid.uuid3(uuid.NAMESPACE_DNS, link))
                    ctime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    location = "甘肃省"
                    service = ''
                    industry = ""
                    post = {
                        "uuid": uid,  # md5
                        "detailUrl": detaillink,  # url
                        "name": title,  # 标题
                        "location": location,  # 地区
                        "publicTime": publicTime,  # 公布时间
                        "tag": "中标公告",  # 标签
                        "site": self.site_domain,
                        "siteName": self.siteName,
                        "ctime": ctime,
                        "industry": industry,
                        "service": service
                    }

                    dic = self.handle_post(post)
                    try:
                        self.db.table(self.table).add(dic)
                    except Exception as e:
                        print(e)
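Example #7 also calls self.getdumps(link) and self.handle_post(post), which are not included; the real project presumably checks its database for links it has already stored. A simplified in-memory stand-in for the dedup check, only to illustrate its role (the actual implementation is unknown):

_seen_links = set()


def getdumps(link):
    # Hypothetical, in-memory version of the dedup check: True means "already collected, skip it".
    if link in _seen_links:
        return True
    _seen_links.add(link)
    return False
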