def __init__(self, url):
    # Keep a scheme-less copy of the URL (str.lstrip strips characters, not a
    # prefix, so split on '://' instead).
    self.url = url.split('://', 1)[-1]
    self.host = urlsplit(url).netloc
    req = Request(url, headers={
        'content-type': 'text/html',
        'User-Agent': 'Mozilla/5.0'
    })
    res = urlopen(req)
    self.access = localtime()
    date = res.headers['Last-Modified']
    self.date = None if date is None else strptime(
        date, '%a, %d %b %Y %H:%M:%S %Z')
    page = res.read()
    html = HTML(page.decode(detect(page)['encoding']))
    author = html.xpath(
        '//meta[@name="author" or @property="author"][1]/@content')
    self.author = None if author == [] else author[0]
    site = html.xpath('//meta[@property="og:site_name"][1]/@content')
    self.site = None if site == [] else site[0]
    title = html.xpath('//meta[@property="og:title"][1]/@content')
    self.title = html.xpath(
        '//title[1]/text()')[0] if title == [] else title[0]
def parse_serp(self, html):
    elements = HTML(html)
    container = CSSSelector('div#isr_mc')(elements)[0]
    results = CSSSelector('div.rg_di')(container)
    for result in results:
        result_containers = CSSSelector('a.rg_l')(result)
        if not result_containers:
            continue
        result_container = result_containers[0]
        result_href = result_container.get('href')
        if not result_href:
            continue
        double_quoted_link = self.link_re.match(result_href).group(1)
        link = self.double_unquote(double_quoted_link)
        double_quoted_visible_link = self.visible_link_re.match(
            result_href).group(1)
        visible_link = self.double_unquote(double_quoted_visible_link)
        yield link, visible_link
def get_yiparts_detail(self):
    sql = 'select * from yiparts'
    results = self.db.find_all(sql)
    for res in results:
        url = 'http://www.yiparts.com/parts/{yiparts_name_en}/'.format(
            yiparts_name_en=res[2])
        print(url)
        response = self.download.get_html(url)
        doc = HTML(response)
        names = doc.xpath('//div[@id="sort2"]/div/div/a/span[2]/text()')
        name_ens = doc.xpath('//div[@id="sort2"]/div/div/a/@href')
        imgs = doc.xpath('//div[@id="sort2"]/div/div/a/span[1]/img/@src')
        for name, name_en, img in zip(names, name_ens, imgs):
            item_name = name.strip()
            item_name_en = name_en[11:-1]
            item_img = img
            sql = 'insert into yiparts_detail(pid, detail_name, detail_name_en, detail_img) VALUES ("{pid}", "{detail_name}", "{detail_name_en}", "{detail_img}")'.format(
                pid=res[0],
                detail_name=item_name,
                detail_name_en=item_name_en,
                detail_img=item_img)
            print(sql)
            self.db.save(sql)
def get_keyword(self, response):
    html = HTML(response.text)
    url_list = html.xpath('//a/@href')
    self.parse_keyword(response)
    exits_url = []
    for url in url_list:
        if re.match('^/.*?aspx$', url):
            two_url = 'http://www.aliwuxi.com' + url
        elif re.match('http://.*?aspx$', url):
            two_url = url
        else:
            continue
        if two_url in exits_url:
            continue
        else:
            exits_url.append(two_url)
        print(two_url)
        two_response = self.down.get_html(two_url)
        self.parse_keyword(two_response)
    self.kw_list = list(filter(None, self.kw_list))
    self.kw_list = list(set(self.kw_list))
    return self.kw_list
def deal(json_obj):
    # print(json.dumps(json_obj))
    for data in json_obj['pageRow']:
        if 'article_id' in data and data['article_id']:
            article_id = data['article_id']
            _type = 'perio'
        else:
            article_id = data['report_id']
            _type = 'tech'
        link = 'http://www.wanfangdata.com.cn/details/detail.do?_type={_type}&id={article_id}'.format(
            _type=_type, article_id=article_id)
        print(link)
        response = requests.get(link)
        # print(response.text)
        html = HTML(response.text)
        title = html.xpath(
            'string(//div[@class="left_con_top"]//div[@class="title"]/text())').strip()
        address_xpath_list = html.xpath(
            '//ul[@class="info"]//div[@class="info_right info_right_newline"]/a/text()')
        address_list = []
        flag = True
        for addressStr in address_xpath_list:
            addressStr = addressStr.replace(' ', '').replace(' ', '')
            # print(addressStr)
            searchRes = re.search(r'.*?(,|,)(.*?)(,|,)\d+$', addressStr)
            if searchRes:
                address = searchRes.group(2)
                if ',' in address:
                    address = address.split(',')[-1]
                save_res = title + ',' + address + '\n'
                print(save_res)
                flag = False
                with open('结果.csv', 'a', encoding='gbk') as f:
                    f.write(save_res)
        if flag:
            print('无匹配数据')
        print('暂停10秒')
        time.sleep(10)
def req_for_name(self, wechat_id):
    url = self.url.format(wechat_id)
    # response = WanDou().http_client(url=url, param=self.headers)
    resp1 = requests.get(
        url=r"http://h.wandouip.com/get/ip-list?pack=853&num=1&xy=1&type=2&lb=\r\n&mr=1&"
    )
    resp2 = resp1.json()["data"][0]
    # print(resp2)
    # resp1.close()
    time.sleep(2)
    try:
        response = requests.get(
            url=url,
            headers=self.headers,
            proxies={"http": "{}:{}".format(resp2["ip"], resp2["port"])})
    except Exception as e:
        print(1, e)
        self.logger.info("error ip: {}".format(resp2))
        time.sleep(5)
        return self.req_for_name(wechat_id)
    html = HTML(response.content.decode())
    # response.close()
    name = html.xpath('//p[@class="tit"]/a/text()')
    if name:
        # print(name)
        self.error_count = 0
        return name[0]
    else:
        self.error_count += 1
        if self.error_count == 5:
            self.logger.info("wechat id error: \"{}\"".format(wechat_id))
            return "None"
        else:
            time.sleep(2)
            # propagate the retried result instead of falling through to None
            return self.req_for_name(wechat_id)
def start():
    with open('url.txt') as f:
        results = f.readlines()
    for res in results:
        try:
            url = res.strip()
            print(url)
            response = requests.get(url)
            # print(response.text)
            html = HTML(response.text)
            comName = html.xpath(
                'string(//table[1]//tr[2]/td[2])').replace(
                    '\n', '').replace('\r', '').replace('\t', ' ').strip()
            comAddress = html.xpath(
                'string(//table[1]//tr[3]/td[2])').replace(
                    '\n', '').replace('\r', '').replace('\t', ' ').strip()
            positionName = html.xpath(
                'string(//table[2]//tr[2]/td[2])').replace(
                    '\n', '').replace('\r', '').replace('\t', ' ').strip()
            jobType = html.xpath(
                'string(//table[2]//tr[2]/td[4])').replace(
                    '\n', '').replace('\r', '').replace('\t', ' ').strip()
            zhize = html.xpath('string(//table[2]//tr[9]/td[2])').replace(
                '\n', '').replace('\r', '').replace('\t', ' ').strip()
            price = html.xpath('string(//table[2]//tr[5]/td[4])').replace(
                '\n', '').replace('\r', '').replace('\t', ' ').strip()
            save_res = (comName + '||' + comAddress + '||' + positionName +
                        '||' + jobType + '||' + zhize + '||' + price + '\n')
            save_res = save_res.replace(',', ',').replace('||', ',')
            print(save_res)
            with open('岗位信息.csv', 'a', encoding='gbk',
                      errors='ignore') as f:
                f.write(save_res)
        except:
            print('error...' + str(res))
            continue
def get_trie_data(self, product_url):
    """Fetch tyre detail data."""
    log_init().info(f'{product_url}数据请求中...')
    response = self._parse_url(product_url)
    html = HTML(response.text)
    # tyre name
    title = html.xpath('//*[@id="product_detail"]/div[2]/h1/text()')
    title = [i.strip() for i in title if i.strip()][0]
    properties = html.xpath(
        '//*[@id="product_detail"]/div[2]/div[1]/ul/li')
    # parse the tyre parameters
    reltus = {}
    for propertie in properties:
        TireBrand = propertie.xpath('.//text()')
        reltus[TireBrand[0].replace(':', '')] = TireBrand[1]
    TireBrand = self.is_null(reltus.get('轮胎品牌'))  # tyre brand
    Productspec = self.is_null(reltus.get('产品规格'))  # product specification
    Speedlevel = self.is_null(reltus.get('速度级别'))  # speed rating
    LoadIndex = self.is_null(reltus.get('载重指数'))  # load index
    ProductOrigin = self.is_null(reltus.get('产品产地'))  # place of origin
    Tyrecategory = self.is_null(reltus.get('轮胎类别'))  # tyre category
    Tirepattern = self.is_null(reltus.get('轮胎花纹'))  # tread pattern
    price = html.xpath(
        '//*[@id="product_detail"]/div[2]/div[2]/div[2]/strong/text()')[
            0]  # price
    data = [[
        title, TireBrand, Productspec, Speedlevel, LoadIndex, ProductOrigin,
        Tyrecategory, Tirepattern, price, product_url
    ]]
    log_init().info(f'{product_url}数据获取成功!')
    self.csv_save(data)
def deal(html):
    urls = html.xpath(
        '//ol[@class="article-list"]/li//a[@class="anchor article-content-title u-margin-xs-top u-margin-s-bottom"]/@href | //ol[@class="js-jl-aip-list article-list-items"]/li//a[@class="anchor article-content-title u-margin-xs-top u-margin-s-bottom"]/@href'
    )
    for url in urls:
        link = 'https://www.sciencedirect.com' + url
        print(link)
        response = requests.get(link, headers=headers)
        html = HTML(response.text)
        title = html.xpath('string(//h1/span/text())')
        keyword_list = html.xpath(
            '//div[@class="keywords-section"]/div[@class="keyword"]/span/text()'
        )
        # print(keyword_list)
        keywordStr = ','.join(keyword_list)
        qikanName = html.xpath(
            'string(//h2/a[@class="publication-title-link"]/text())')
        authors_a_list = html.xpath('//div[@id="author-group"]/a')
        authorsList = []
        for span in authors_a_list:
            nameList = span.xpath('.//span[@class="content"]//span/text()')
            nameStr = ' '.join(nameList)
            authorsList.append(nameStr)
        authorsStr = ','.join(authorsList)
        abstractList = html.xpath(
            '//p[@id="sp0010"]//text()|//p[@id="d1e631"]//text()|//div[@id="abstracts"]/div/div/p//text()'
        )
        abstract = ''.join(abstractList)
        save_res = (id + '||' + qikanName + '||' + title + '||' + authorsStr +
                    '||' + abstract + '||' + keywordStr + '||' + link)
        save_res = save_res.replace(',', ',').replace('\n', '').replace(
            '||', ',') + '\n'
        print(save_res)
        with open('sciencedirect.csv', 'a', encoding='gbk',
                  errors='ignore') as f:
            f.write(save_res)
def xpathpages(self, resp):
    """Rule-based parsing of the result table."""
    etre = HTML(resp)
    trlt = etre.xpath('//tr[1]/following-sibling::tr')
    try:
        for _ in trlt:
            item = {}
            item["name"] = "".join(_.xpath('.//td[1]//text()')).strip(" ")
            item["userID"] = "".join(
                _.xpath('.//td[2]//text()')).strip(" ")
            item["sex"] = "".join(_.xpath('.//td[3]//text()')).strip(" ")
            item["education"] = "".join(
                _.xpath('.//td[4]//text()')).strip(" ")
            item["_id"] = hashlib.md5(
                (item["name"] + item["userID"]).encode('utf-8')).hexdigest()
            GD_RC.save(item)
            self.log.info(f"数据{item['_id']}存入mongo")
    except Exception as e:
        print(e)
def get_canInfo(item, EntType):
    url = 'http://www.aepb.gov.cn:8080/WRYJG/STZXGK/STAuto_Data.aspx?NewsID={id}&zdlx={EntType}'.format(
        id=item['id'], EntType=EntType)
    response = requests.get(url, timeout=80)
    # print(response.text)
    html = HTML(response.text)
    jianceCodeList = html.xpath('//select[@id="DropPk"]/option/@value')
    jianceCodeNameList = html.xpath('//select[@id="DropPk"]/option/text()')
    jianceCodeObjList = []
    for jianceCode, jianceCodeName in zip(jianceCodeList, jianceCodeNameList):
        obj = {
            'jianceCode': jianceCode,
            'jianceCodeName': jianceCodeName,
        }
        jianceCodeObjList.append(obj)
    __VIEWSTATE = re.search('id="__VIEWSTATE" value="(.*?)"',
                            response.text).group(1)
    __VIEWSTATE = quote(__VIEWSTATE).replace('/', '%2F')
    # totalPage = int(re.search('当前第1/(\d+)页', response.text).group(1))
    # print(jianceCodeList)
    # print(__VIEWSTATE)
    return jianceCodeObjList, __VIEWSTATE
def wzws_cid_decrypt(text: Union[str, bytes]) -> str:
    """
    :param text: response text of the interstitial page that says
                 "请开启JavaScript并刷新该页" (enable JavaScript and refresh)
    :return: the redirect url; requesting it returns the wzws_cid cookie
    """
    base_url = "http://wenshu.court.gov.cn"
    custom_js = """
    window = {};
    document = {
        createElement: () => ({ style: "", appendChild: () => ({}), submit: () => ({}) }),
        body: { appendChild: obj => { window.location = obj.action } }
    };
    atob = str => Buffer.from(str, "base64").toString("binary");
    get_location = () => window.location;
    """
    html = HTML(text)
    js = html.xpath("//script/text()")[0]
    ctx = nodejs.compile(custom_js + js)
    location = ctx.call("get_location")
    redirect_url = parse.urljoin(base_url, location)
    return redirect_url
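# A minimal usage sketch (an assumption, not part of the original source):
# fetch a page from wenshu.court.gov.cn, and if the anti-bot interstitial is
# returned, run wzws_cid_decrypt() on it and follow the computed redirect so
# the session picks up the wzws_cid cookie before retrying the real request.
import requests

def fetch_with_wzws_cid(session: requests.Session, url: str) -> requests.Response:
    first = session.get(url)
    if "wzws_cid" not in session.cookies:
        # The interstitial embeds a script that builds the redirect URL.
        redirect_url = wzws_cid_decrypt(first.text)
        session.get(redirect_url)  # this response sets the wzws_cid cookie
        return session.get(url)
    return first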
def get_comment(movieId, name):
    for i in range(1, 20):  # starting page
        try:
            print('评论当前页:' + str(i))
            pageToken = i * 20
            start_url = 'https://movie.douban.com/subject/{movieId}/comments?start={pageToken}&limit=20&sort=new_score&status=P'
            url = start_url.format(movieId=movieId, pageToken=pageToken)
            response = requests.get(url, headers=headers)
            html = HTML(response.text)
            alldiv = html.xpath(
                '//div[@id="comments"]/div[@class="comment-item"]')
            # print(allText)
            for div in alldiv:
                commentId = div.xpath('string(./@data-cid)')
                commentInfo = div.xpath(
                    'string(.//span[@class="short"]/text())')
                commentAuthor = div.xpath(
                    'string(.//div[@class="avatar"]/a/@title)')
                commentVote = div.xpath('string(.//span[@class="votes"])')
                commentForMovie = name
                sql = ("insert into comments(commentId,commentInfo,commentAuthor,commentVote,commentForMovie)"
                       " VALUES ('%s', '%s', '%s', '%s', '%s')"
                       % (commentId, commentInfo, commentAuthor, commentVote, commentForMovie)
                       + " ON DUPLICATE KEY UPDATE commentVote='%s'" % (commentVote))
                print(sql)
                dbCli.save(sql)
            print('暂停10秒')
            time.sleep(10)
        except:
            print('comment,error..')
            continue
def parse_poi_detail(self, response):
    """
    Parse a tourist-attraction (POI) detail page.
    eg: https://place.qyer.com/poi/V2UJYVFkBzJTZVI9/
    """
    html = HTML(response.text)
    item = items.PoiDetailItem()
    item['raw'] = {'html': str(lzma.compress(response.body))}
    item['url'] = response.request.url
    item['id'] = response.request.meta.get('id')
    item['catename'] = response.request.meta.get('catename')
    item['head'] = utils.get_text_by_xpath(
        html, './/div[@class="qyer_head_crumb"]/span//text()')
    item['title'] = utils.get_text_by_xpath(
        html, './/div[@class="poi-largeTit"]/h1[@class="cn"]//text()')
    item['title_en'] = utils.get_text_by_xpath(
        html, './/div[@class="poi-largeTit"]/h1[@class="en"]//text()')
    item['rank'] = utils.get_text_by_xpath(
        html, './/div[@class="infos"]//ul/li[@class="rank"]/span//text()')
    item['poi_detail'] = utils.get_text_by_xpath(
        html, './/div[@class="compo-detail-info"]/div[@class="poi-detail"]//text()')
    item['poi_tips'] = utils.get_text_by_xpath(
        html, './/div[@class="compo-detail-info"]/ul[@class="poi-tips"]//text()')
    lis = html.xpath('.//div[@class="compo-detail-info"]/ul[@class="poi-tips"]/li')
    for li in lis:
        title = utils.get_text_by_xpath(li, './/span[@class="title"]/text()')
        content = utils.get_text_by_xpath(li, './/div[@class="content"]//text()')
        if '地址' in title:  # address
            item['address'] = content
        elif '到达方式' in title:  # how to get there
            item['arrive_method'] = content
        elif '开放时间' in title:  # opening hours
            item['open_time'] = content
        elif '门票' in title:  # tickets
            item['ticket'] = content
        elif '电话' in title:  # phone
            item['phone'] = content
        elif '网址' in title:  # website
            item['website'] = content
    item['poi_tip_content'] = utils.get_text_by_xpath(
        html, './/div[@class="compo-detail-info"]/div[@class="poi-tipContent"]//text()')
    yield item
async def func():
    async with contextlib.AsyncExitStack() as stack:
        if not RUN_IMMEDIATELY:
            await sleep(random() * 30 * 60)
        account_id = account['account-id']
        password = account['password']
        context, page = await new_context()
        stack.push_async_callback(context.close)
        stack.push_async_callback(page.close)
        logged = await login(page, account_id, password)
        if not logged:
            return
        html = HTML(await page.content())
        source = await get_script_source(html=html)
        if SCRIPT_SOURCE:
            if diff := '\n'.join(
                    get_diff(SCRIPT_SOURCE, source.splitlines())):
                await handle_page_changing(diff, source)
                return
        name = html.xpath("//input[@id='xm']/@value")[0]
        await page.evaluate(js_codes.submit())
        await sleep(2)
        async with page.expect_response('**/tbBcJkxx.zf') as response:
            await page.click("//button[text()='提交']")
            await sleep(5)
        response = await (await response.value).json()
        if response['status'] == 'success':
            await page.wait_for_selector("//div[text()='保存数据成功']",
                                         state='attached')
            LOGGER.warning(f'Success: {account_id} {name}')
        else:
            LOGGER.warning(
                f'Submit failed: {account_id} {name} <<<{source=}>>>')
            await notify('Submit failed', f'{account_id} {name}')
def parse(response):
    html = HTML(response.text)
    div_list = html.xpath('//div[@id="zdlist"]/div[@class="zddiv"]')
    # print(len(div_list))
    for div in div_list:
        link = div.xpath('string(.//div[@class="gsname"]/a/@href)')
        name = re.search(r'https://www.tianyancha.com/search\?key=(.*?)$',
                         link).group(1)
        # email = div.xpath('string(.//div[@class="other"]/text()[2])').replace('邮箱:','').strip()
        # phone = div.xpath('string(.//div[@class="other"]/text()[3])').replace('电话:','').strip()
        # reg = div.xpath('string(.//div[@class="other"]/text()[4])').replace('注册资本:','').strip()
        deatilText = tostring(div, encoding='utf8').decode('utf8')
        # print(deatilText)
        email = re.search('邮箱:(.*?)<br', deatilText)
        if email:
            email = email.group(1).strip()
        else:
            email = '|'
        phone = re.search('电话:(.*?)<br', deatilText)
        if phone:
            phone = phone.group(1).strip()
        else:
            phone = '|'
        reg = re.search('注册资本:(.*?)注册时间', deatilText)
        if reg:
            reg = reg.group(1).strip()
        else:
            reg = '|'
        save_res = name + '---' + email + '---' + phone + '---' + reg + '\n'
        print(save_res)
        with open('结果.txt', 'a') as f:
            f.write(save_res)
async def __get_proxies_from_sslproxies(self, session):
    urls = [
        'https://www.sslproxies.org/', 'https://www.us-proxy.org/',
        'https://free-proxy-list.net/',
        'https://free-proxy-list.net/uk-proxy.html',
        'https://free-proxy-list.net/anonymous-proxy.html'
    ]
    idx = 0
    proxies = self.get_https_proxy()
    for url in urls:
        i = 5
        while i > 0:
            await asyncio.sleep(3)
            try:
                if len(proxies) <= idx:
                    idx = 0
                res = await session.get(
                    url,
                    proxy='' if len(proxies) == 0 else proxies[idx],
                    timeout=10)
                html = HTML(await res.text())
                addresses = html.xpath(
                    '//*[@id="raw"]/div/div/div[2]/textarea/text()'
                )[0].split('\n')[3:]
                for adr in addresses:
                    await self.put_proxy('http://' + adr, 'sslproxies')
                break
            except Exception:
                i -= 1
                if idx + 1 > len(proxies):
                    proxies = self.get_https_proxy()
                idx += 1
                if idx >= len(proxies):
                    # wrap the proxy index back to the start of the list
                    idx = 0
                logger.exception(f"Parse {url} Fail")
                await asyncio.sleep(1)
def _get_data(self, html):
    html = HTML(html)
    li_list = html.xpath('//ul[@class="list05"]/li')
    data = dict()
    item_list = list()
    # synchronous version
    # for li in li_list:
    #     item = self._parse_detail(li)
    #     item_list.append(item)
    #     time.sleep(3)
    # asynchronous version using the coroutine pool
    coroutine_list = [
        self.pool.spawn(self._parse_detail, li) for li in li_list
    ]
    gevent.joinall(coroutine_list)
    for coroutine in coroutine_list:
        item_list.append(coroutine.value)
    data['item_list'] = item_list
    data['next_url'] = html.xpath(
        '//form[@name="pageform"]/div/a[@class="xyy"]/@href').pop()
    return data
def main(wf):
    parse = argparse.ArgumentParser()
    parse.add_argument('--app', dest='app')
    parse.add_argument('query', nargs='*', default=None)
    args = parse.parse_args()
    query = args.query[0]
    log.warn(query)
    if query:
        id = query.rsplit('/', 1)[-1].split('.')[0]
        url = 'http://soft.macx.cn/downloado.do?softid={}&cpus=2&urls=3'.format(
            id)
        r = web.get(url)
        r.raise_for_status()
        a = r.text
        node = HTML(a).find('.//a[@rel="facebox"][last()]')
        log.info(node.text)
        open = ['open']
        if args.app:
            open.extend(['-a', args.app])
        if node is not None and node.text == '浏览器直接下载':
            open.append(node.get('href'))
        else:
            open.append(url)
        call(open)
def get_weather(url, weather):
    """
    Fetch the weather.

    :param url: page URL.
    :param weather: object that stores the weather information.
    """
    while True:
        try:
            response = requests.get(
                url,
                headers={
                    'Host': 'www.weather.com.cn',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                                  'Chrome/55.0.2883.87 Safari/537.36'
                },
                timeout=7)
        except Exception as e:
            print('【{}】抓取天气失败,失败信息:{},尝试重新加载……'.format(
                time.strftime('%Y-%m-%d %H:%M:%S'), e))
            continue
        if 200 == response.status_code:
            break
        print('【{}】抓取天气失败,状态码:{},尝试重新加载……'.format(
            time.strftime('%Y-%m-%d %H:%M:%S'), response.status_code))
    li = HTML(response.content.decode('utf8')).xpath(
        '//*[@id="today"]/div[1]/ul/li[1]')[0]
    weather['date'] = li.xpath('h1/text()')[0]
    weather['weather'] = li.xpath('p[1]/text()')[0]
    weather['temp'] = li.xpath('p[2]/span/text()')[0].replace('-', '—')
    weather['wind'] = li.xpath('p[3]/span/@title')[0]
    weather['wind_speed'] = li.xpath('p[3]/span/text()')[0].replace('-', '~')
    sun = li.xpath('p[last()]/span/text()')[0]
    weather['sun'] = sun[sun.find(' ') + 1:]
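# A minimal usage sketch for get_weather (an assumption, not from the original
# source): `weather` is just a plain dict that the function fills in place, and
# the city-page URL below (101010100, Beijing) is only an illustrative example.
if __name__ == '__main__':
    weather = {}
    get_weather('http://www.weather.com.cn/weather1d/101010100.shtml', weather)
    print(weather['date'], weather['weather'], weather['temp'], weather['wind'])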
def barcode(self, value, code='Code128', drawOpts=None, htmlAttrs=None):
    """
    Generate a <img /> tag with embedded barcode

    Params:
        - value: barcode value, must be valid for barcode type
        - code: barcode type, as per reportlab.graphics.barcode.getCodes()
        - drawOpts: options for the reportlab barcode
        - htmlAttrs: attributes for <img /> tag
    """
    drawOpts = (drawOpts or {})
    imgtype = drawOpts.pop('format', 'png')
    attrs = (htmlAttrs or {})
    drawOpts['value'] = value
    for k in ('width', 'height'):
        # Attempt to unify drawing and image sizes to prevent accidental
        # scaling, and reduce parameter duplication
        if k in drawOpts and k not in attrs:
            attrs[k] = "{0}px".format(drawOpts[k])
        elif k in attrs and k not in drawOpts:
            # reportlab expects a float
            value = str(attrs[k])
            if value.endswith("px"):
                value = value[:-2].strip()
            try:
                value = float(value)
            except ValueError:
                # Ignore values that we can't handle
                pass
            else:
                drawOpts[k] = value
    data = createBarcodeDrawing(code, **drawOpts).asString(imgtype)
    attrs['src'] = "data:image/{1};base64,{0}".format(
        data.encode('base64'),
        imgtype,
    )
    return HTML(Element('img', attrs))
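# A standalone sketch of the same idea as barcode() above (an assumption, not
# the original helper): render a barcode with reportlab and embed it in an
# <img /> tag as a base64 data: URI, using Python 3's base64 module instead of
# the Python 2 str.encode('base64') call.
import base64
from reportlab.graphics.barcode import createBarcodeDrawing

def barcode_data_uri(value, code='Code128', imgtype='png', **drawOpts):
    # createBarcodeDrawing() builds a Drawing; asString() rasterises it to bytes.
    data = createBarcodeDrawing(code, value=value, **drawOpts).asString(imgtype)
    return "data:image/{0};base64,{1}".format(
        imgtype, base64.b64encode(data).decode('ascii'))

# e.g. '<img src="%s" alt="order barcode" />' % barcode_data_uri('0123456789')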
def getFinalResult(self):
    try:
        self.resp.append(httpx.post(
            self.url, data=self.data, cookies=self.cookies))
    except:
        self.result = self.errmsg
        return None
    self.doc.append(HTML(self.resp[2].text))
    try:
        self.finalResult = self.doc[2].xpath(
            '//*[@id="lableft"]/text()')[0]
        df = self.findPat("账户余额:.*元", 5, -1, float)
        du = self.findPat("剩余电量:.*度", 5, -1, float)
        result = {
            "账户余额": df,
            "剩余电量": du
        }
        return result
    except IndexError:
        result = {
            "账户余额": -1.0,
            "剩余电量": -1.0
        }
        return result
def start():
    url = 'http://wx.jd120.com/HqReg-Register.action?code=023Xrq670fPK7F1153970JXj670Xrq6d&state=gh'
    response = requests.get(url)
    # print(response.text)
    html = HTML(response.text)
    urls = html.xpath('//div[@id="appointRegTabContent"]/div/ul/li/a/@href')
    titles = html.xpath('//div[@id="appointRegTabContent"]/div/ul/li/a/text()')
    # print(len(urls))
    # print(len(titles))
    item_list = []
    for url, title in zip(urls, titles):
        link = 'http://wx.jd120.com/' + url
        catName = title.strip()
        # print(link, title)
        obj = {
            'url': link,
            'catName': catName,
        }
        item_list.append(obj)
    # date_list = []
    # for i in range(10):
    #     addTime = i * 3600 * 24
    #     userTime = time.strftime('%Y-%m-%d', time.localtime(time.time() + addTime))
    #     date_list.append(userTime)
    # print(date_list)
    # for date in date_list:
    #     with open(date + '.csv', 'w', encoding='gbk') as f:
    #         pass
    #     print(date)
    allObj_list = []
    for item in item_list:
        url = item['url']
        catName = item['catName']
        print(url, catName)
        try:
            response = requests.get(url, timeout=15)
        except:
            print('请求失败')
            continue
        html = HTML(response.text)
        # print(response.text)
        td_list = html.xpath('//table[@class="table appoint-table"]//tr//td')
        for td in td_list:
            hrefValue_list = td.xpath('.//a/@href')
            # print(hrefValue_list)
            if len(hrefValue_list) >= 1:
                num = 1
                for hrefValue in hrefValue_list:
                    # print(hrefValue)
                    searchRes = re.search(
                        r'/HqReg-select_time.action\?workSessionId=.*?&dateId=(.*?)&.*?doctorId=(.*?)$',
                        hrefValue)
                    if searchRes:
                        if searchRes.group(2) != '':
                            dateName = searchRes.group(1)
                            doctorName = td.xpath(
                                'string(.//a[' + str(num) + ']/span/text())'
                            ).replace('\n', '').replace('\t', '').replace(
                                '\r', '').strip()
                            print(dateName, doctorName)
                            obj = {
                                'dateName': dateName,
                                'doctorName': doctorName,
                                'catName': catName,
                            }
                            allObj_list.append(obj)
                    num += 1
    saveFileDate_list = []
    for obj in allObj_list:
        if obj['dateName'] not in saveFileDate_list:
            with open(obj['dateName'] + '.csv', 'w', encoding='gbk') as f:
                save_res = obj['catName'] + ',' + obj['doctorName'] + '\n'
                f.write(save_res)
            saveFileDate_list.append(obj['dateName'])
        else:
            with open(obj['dateName'] + '.csv', 'a', encoding='gbk') as f:
                save_res = obj['catName'] + ',' + obj['doctorName'] + '\n'
                f.write(save_res)
def run(self):
    count = DB.find({"qcc_supplement": 0}).count()
    cookie_count = 0
    while count:
        # initialise the driver
        self.borser = self.chrome_driver()
        # minimise the window
        self.borser.minimize_window()
        # clear cache
        # self.borser.delete_all_cookies()
        mogodata = DB.find_one({"qcc_supplement": 0})
        # company_key = "佳木斯益隆煤矿机械制造有限公司"
        company_key = mogodata["companyName"]
        self.borser.get(url="https://www.qichacha.com/")
        self.borser.find_element_by_xpath(
            "//*[@id='index']/preceding-sibling::input").send_keys(
                company_key)
        self.borser.find_element_by_id("V3_Search_bt").click()
        action = ActionChains(self.borser)
        if "您的操作过于频繁,验证后再操作" in self.borser.page_source:
            self.czpf(action)
        elif "法人或股东" not in self.borser.page_source:
            self.smdl(action)
        cookie = self.borser.get_cookies()
        print(cookie)
        cookies = ""
        for items in cookie:
            jioncook = items["name"] + "=" + items["value"] + "; "
            cookies += jioncook
        print(cookies)
        time.sleep(2)
        HTMLTEXT = self.borser.page_source
        etre = HTML(HTMLTEXT)
        info_parmas = etre.xpath(
            '//*[contains(@id,"search-result")]//td[contains(@class,"imgtd")]/following-sibling::*[1]/a/@onclick'
        )
        # desktop (PC) page
        company_infos = etre.xpath(
            '//*[contains(@id,"search-result")]//td[contains(@class,"imgtd")]/following-sibling::*[1]/a/@href'
        )
        self.f.session.headers.update({
            "Cookie": cookies,
        })
        self.proxy = self.get_pros()
        item = mogodata
        for _ in range(len(info_parmas)):
            fol_result = (
                info_parmas[_].split("addSearchIndex")[1]).replace(
                    '(', '').replace(')', '').replace("'", '').replace(";", '')
            if mogodata["companyName"] == fol_result.split(',')[0]:
                fol_results = fol_result.split(',')
                company_info_url = company_infos[_]
                data = {
                    "search_key": fol_results[0],
                    "search_index": fol_results[1],
                    "search_url": '',
                    "company_name": fol_results[2],
                    "type": fol_results[-1],
                }
                # basic company information
                company_info_urls = self.server_auth(data, company_info_url)
                html_text = self.company_info_req(company_info_urls)
                self.company_info_parse(html_text, item, company_info_urls)
        self.borser.close()
        count -= 1
def parse_page(url):
    response = fetch(url)
    selector = HTML(response.text)
    href = selector.xpath(
        '//div[@class="list-article list-short"]/ul/li/a/@href')
    return [urljoin(BASE_URL, url) for url in href]
url = 'https://twitter.com/search?f=tweets&vertical=default&q=%23CIIE&src=typd'
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=' + '127.0.0.1:1087')
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get(url)
time.sleep(3)
while True:
    try:
        driver.execute_script("window.scrollBy(0,5000)")
        time.sleep(5)
        # print(driver.page_source)
        html = HTML(driver.page_source)
        results = html.xpath('//div[@class="stream"]/ol/li')
        for res in results:
            try:
                detail_html_text = etree.tostring(res)
                detail_html = HTML(detail_html_text.decode())
                content_list = detail_html.xpath(
                    '//p[@class="TweetTextSize js-tweet-text tweet-text"]//text()'
                )
                content = ''.join(content_list).replace('\n', '').replace(
                    '\r', '').replace('\t', '').replace(',', ',').strip()
                commentCount = detail_html.xpath(
                    'string(//div[@class="ProfileTweet-action ProfileTweet-action--reply"]//span[@class="ProfileTweet-actionCountForPresentation"]//text())'
                )
                shareCount = detail_html.xpath(
                    'string(//div[@class="ProfileTweet-action ProfileTweet-action--retweet js-toggleState js-toggleRt"]//span[@class="ProfileTweet-actionCountForPresentation"]//text())'
                )
def parse_item(self, response):
    data_json = json.loads(response.body)
    if 'cards' in data_json.keys():
        for item in data_json['cards']:
            category = response.meta['category']
            title = item['item']['title']
            pic_url = item['item']['displayImages'][0][
                'urlTemplate'].replace(
                    'w=#{width}&h=#{height}&quality=#{quality}', '')
            describe = item['item']['trailText']
            app_name = '英国卫报'
            try:
                selector = HTML(item['item']['body'])
            except:
                return
            content = selector.xpath('//text()')
            content = ''.join(content)
            content = content.replace('\t', '').replace('\n', '').replace('\r', '')
            publishedDate = item['item']['webPublicationDate'].replace(
                'T', ' ').replace('Z', '')
            author = item['item']['byline']
            crawlTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.localtime(time.time()))
            home_url = response.url
            url = 'https://www.theguardian.com/' + item['item']['id']
            pic_more_url = []
            for pic in item['item']['bodyImages']:
                pic_more_url.append(pic['urlTemplate'].replace(
                    'w=#{width}&h=#{height}&quality=#{quality}', ''))
            print "app名称", app_name
            print "主图片url", pic_url
            print "子图片url", pic_more_url
            print "作者", author
            print "详情页地址", url
            print "所属类型", category
            print "标题", title
            print "描述", describe
            print "内容", content
            print "主url", home_url
            print "发布时间", publishedDate
            print "爬取时间", crawlTime
            print '\n\n'
            item = NewsItem()
            item['app_name'] = app_name
            item['pic_url'] = pic_url
            item['pic_more_url'] = pic_more_url
            item['author'] = author
            item['url'] = url
            item['category'] = category
            item['title'] = title
            item['describe'] = describe
            item['content'] = content
            item['home_url'] = home_url
            item['publishedDate'] = publishedDate
            item['crawlTime'] = crawlTime
            timeArray = time.strptime(publishedDate, "%Y-%m-%d %H:%M:%S")
            timenum = int(time.mktime(timeArray))
            if timenum >= self.timeStamp:
                self.count += 1
                item['count'] = self.count
                publishedDate = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(float(timenum)))
                item['publishedDate'] = publishedDate
                yield item
def parseData(urlList):
    urlW = open("/usr/product/qujiaozhi/url.txt", 'a')
    for u in urlList:
        url = u.get("href").strip()
        print url
        urlW.write(url)
        urlW.write("\n")
        h = HTML(getHtml(url).decode('gbk'))
        try:
            dTxt = h.xpath('//h3')
            name = dTxt[0].text.strip().split()[0] + " " + dTxt[0].text.strip().split()[1]  # name
            brand = dTxt[0].text.strip().split()[0]  # brand
        except Exception:
            errorTxt.write(url)
        # print brand
        # print name
        try:
            pCpgg = h.xpath('//p[@class="pCpgg"]')
            td = h.xpath('//td[@class="td2"]')
        except Exception:
            errorTxt.write(url)
        try:
            if td:
                price = list(td[0].itertext())[1].strip()
            else:
                price = list(pCpgg[0].itertext())[1].strip()  # price
            # print price
        except Exception:
            errorTxt.write(url)
        try:
            norms = list(pCpgg[-1].itertext())[1].strip()  # specification
            # print norms
        except Exception:
            errorTxt.write(url)
        try:
            spePs = h.xpath('//p[@class="speP"]/a')
            effect = ''
            for speP in spePs:
                effect += speP.text.strip() + " "  # effects
            # print effect
        except Exception:
            errorTxt.write(url)
        try:
            awrap = h.xpath('//div[@class="Awrap"]/ul/li/a')
            imgUrl = awrap[0].find("img").attrib.get("src")  # image URL
            # print imgUrl
        except Exception:
            errorTxt.write(url)
        try:
            troCon = h.xpath('//div[@class="troCon"]')
            des = list(troCon[0].itertext())
            description = ''
            for d in des:
                if len(d.strip()) > 20:
                    description += d.strip() + ""  # product description
            # print description
        except Exception:
            errorTxt.write(url)
        try:
            dTxt = h.xpath('//div[@class="dTxt"]/p/a')
            series = dTxt[1].text.strip()  # series
        except Exception:
            errorTxt.write(url)
        insertData(name, brand, price, norms, effect, imgUrl, description, series)
    db.set_character_set('utf8')
    cursor.execute('SET NAMES utf8;')
    cursor.execute('SET CHARACTER SET utf8;')
    cursor.execute('SET character_set_connection=utf8;')
    cursor.execute(sql)
    db.commit()
except MySQLdb.Error, e:
    print "Mysql Error %d: %s" % (e.args[0], e.args[1])
cursor.close()
db.close()


# urlHtml=getHtml("http://cosme.pclady.com.cn/products_list/br0_bs0_bi1_sm12_ef0_pb0_pe0_or0.html")
for i in range(58, 59):
    i = str(i)
    print i
    htmls = "http://cosme.pclady.com.cn/products_list/br0_bs0_bi1_sm12_ef0_pb0_pe0_or0_p" + i + ".html#productList"
    urlHtml = getHtml(htmls)
    try:
        html = HTML(urlHtml.decode('gbk'))
        urlList = html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
        parseData(urlList)
    except Exception:
        errorTxt.write("\n")
        errorTxt.write(i)
        errorTxt.write("\n")
        continue
# html= HTML(urlHtml.decode('gbk'))
# urlList=html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
# parseData(urlList)
async def start():
    writeList = [
        '时间', '榜单类型', '姓名', '总分数', '阅读人数', '阅读人数得分', '阅读人数排名', '互动数',
        '互动数得分', '互动数排名', '社会影响力', '社会影响力得分', '社会影响力排名', '爱慕值', '爱慕值得分',
        '爱慕值排名', '正能量', '正能量得分', '正能量排名', '搜索量', '搜索量得分', '搜索量排名', '提及量',
        '提及量得分', '提及量排名', '阅读数', '阅读数得分', '阅读数排名'
    ]
    # with open('微博数据.csv', 'w', encoding='gbk') as f:
    #     f.write('时间,榜单类型,姓名,总分数,阅读人数,阅读人数得分,阅读人数排名,互动数,互动数得分,互动数排名,社会影响力,社会影响力得分,社会影响力排名,爱慕值,爱慕值得分,爱慕值排名,正能量,正能量得分,正能量排名,搜索量,搜索量得分,搜索量排名,提及量,提及量得分,提及量排名,阅读数,阅读数得分,阅读数排名\n')
    date_list = get_date()
    print(date_list)
    url = "http://chart.weibo.com/aj/ranklist"
    rank_type_list = ['5', '3', '6']
    item_list = []
    for rank_type in rank_type_list:
        for dateObj in date_list:
            date = dateObj['date']
            period = dateObj['period']
            for pageToken in range(1, 5):
                # payload = "time_type={date}&rank_type={rank_type}&version=v1&_t=0"
                payload = "datatype=&page={pageToken}&pagesize=25&rank_type={rank_type}&time_type={date}&period={period}&version=v1&_t=0"
                data = payload.format(pageToken=pageToken,
                                      date=date,
                                      rank_type=rank_type,
                                      period=period)
                # data = 'date=2019%2f1%2f1&type=realTimeHotSearchList'
                print(data)
                try:
                    response = requests.request("POST",
                                                url,
                                                data=data,
                                                headers=headers,
                                                verify=False)
                    print(response.text)
                    json_obj = json.loads(response.text)
                    html = HTML(json_obj['data'])
                except:
                    print('errors..' + date + ',' + str(pageToken) + ',' +
                          rank_type + '\n')
                    with open('errors.txt', 'a') as f:
                        f.write('errors..' + date + ',' + str(pageToken) +
                                ',' + rank_type + '\n')
                    continue
                div_list = html.xpath(
                    '//div[@class="sr_ranking_type clearfix"]')
                for div in div_list:
                    if rank_type == '5':
                        bangdanType = '内地榜'
                    elif rank_type == '3':
                        bangdanType = '港澳台榜'
                    else:
                        bangdanType = '新星榜'
                    name = div.xpath(
                        'string(.//div[@class="sr_name S_func1"]/a/text())'
                    ).strip()
                    zongfenshu = div.xpath(
                        'string(.//div[@class="sr_text W_f16"]/span/b/text())')
                    len_li = div.xpath('.//ul/li')
                    item = {}
                    item['时间'] = date
                    item['榜单类型'] = bangdanType
                    item['姓名'] = name
                    item['总分数'] = zongfenshu
                    for liNum in range(1, len(len_li) + 1):
                        spanName = div.xpath(
                            'string(.//ul/li[' + str(liNum) +
                            ']//div[@class="propor sr_fl"]/span[@class="pro_txt"]/text())'
                        ).replace(':', '').strip()
                        if spanName == '互动量':
                            spanName = '互动数'
                        spanNameValue = div.xpath(
                            'string(.//ul/li[' + str(liNum) +
                            ']//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())'
                        )
                        spanNamedefen = div.xpath(
                            'string(.//ul/li[' + str(liNum) +
                            ']//div[@class="civi score sr_fl"]/span/i[@class="ci_num"]/text())'
                        )
                        spanNamepaiming = div.xpath(
                            'string(.//ul/li[' + str(liNum) +
                            ']//div[@class="civi sr_fl"]/span/i[@class="ci_num"]/text())'
                        )
                        # print(spanName)
                        spanNamedefenName = spanName + '得分'
                        spanNamepaimingName = spanName + '排名'
                        item[spanName] = spanNameValue
                        item[spanNamedefenName] = spanNamedefen
                        item[spanNamepaimingName] = spanNamepaiming
                    # print(item)
                    for key in writeList:
                        if key not in item.keys():
                            item[key] = ''
                    item_list.append(item)
                    print(item)
                    # yuedurenshu = div.xpath('string(.//ul/li[1]//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())')
                    #
                    # hudongshu = div.xpath('string(.//ul/li[2]//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())')
                    # hudongshudefen = div.xpath('string(.//ul/li[2]//div[@class="civi score sr_fl"]/span/i[@class="ci_num"]/text())')
                    # hudongshupaiming = div.xpath('string(.//ul/li[2]//div[@class="civi sr_fl"]/span/i[@class="ci_num"]/text())')
                    #
                    # shehui = div.xpath('string(.//ul/li[3]//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())')
                    # shehuidefen = div.xpath('string(.//ul/li[3]//div[@class="civi score sr_fl"]/span/i[@class="ci_num"]/text())')
                    # shehuipaiming = div.xpath('string(.//ul/li[3]//div[@class="civi sr_fl"]/span/i[@class="ci_num"]/text())')
                    #
                    # aiamu = div.xpath('string(.//ul/li[4]//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())')
                    # aiamudefen = div.xpath('string(.//ul/li[4]//div[@class="civi score sr_fl"]/span/i[@class="ci_num"]/text())')
                    # aiamupaiming = div.xpath('string(.//ul/li[4]//div[@class="civi sr_fl"]/span/i[@class="ci_num"]/text())')
                    #
                    # zhengnengliang = div.xpath('string(.//ul/li[5]//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())')
                    # zhengnengliangdefen = div.xpath('string(.//ul/li[5]//div[@class="civi score sr_fl"]/span/i[@class="ci_num"]/text())')
                    # zhengnengliangpaiming = div.xpath('string(.//ul/li[5]//div[@class="civi sr_fl"]/span/i[@class="ci_num"]/text())')
                    #
                    # save_res = date+','+bangdanType+','+name+','+zongfenshu+','+yuedurenshu+','+yuedurenshudefen+','+yuedurenshupaiming+','+hudongshu+','+hudongshudefen+','+hudongshupaiming+','+shehui+','+shehuidefen+','+shehuipaiming+','+aiamu+','+aiamudefen+','+aiamupaiming+','+zhengnengliang+','+zhengnengliangdefen+','+zhengnengliangpaiming+'\n'
                    # print(save_res)
                    # with open('微博数据.csv','a', encoding='gbk',errors='ignore') as f:
                    #     f.write(save_res)
                time.sleep(5)
    mongo_config = WriterConfig.WXLSXConfig('结果.xlsx', headers=writeList)
    with ProcessFactory.create_writer(mongo_config) as mongo_writer:
        mongo_writer.write(item_list)