def parse_state_url(self, response):
    # Extract the states of the current country
    sel = Selector(response)
    tempcountryname = sel.xpath(
        '//div[@id="MediaWeatherRegion"]/div[@class="hd"]/div[@class="yom-bread"]/text()').extract()
    match = re.search(r'[\w\s]+$', tempcountryname[0])
    if match:
        countryname = match.group().strip()
    else:
        self.log('country name not found', log.WARNING)
        return
    data_1 = response.meta['data']
    for node in sel.xpath('//div[@id="page1"]/ul/li/a'):
        state_name = node.xpath('./span/text()').extract()[0].strip()
        state_href = node.xpath('./@href').extract()[0]
        yield Request(url='https://weather.yahoo.com' + state_href,
                      callback=self.parse_city,
                      meta={'data': {'data_1': data_1,
                                     'countryname': countryname,
                                     'state': state_name}})
        country_code = data_1['countrycode']
        # Get states and provinces
        item = YahooCityItem()
        item['country'] = {'countrycode': country_code, 'countryname': countryname}
        item['state'] = state_name
        item['level'] = 1
        item['abroad'] = data_1['abroad']
        yield item
def parse_torrent(self, response):
    all_content = BeautifulSoup(response.body, 'html5lib')
    sel = Selector(text=all_content.prettify(), type="html")
    topic_item = response.meta['topic_item']
    topic_item['thread_content'] = response.body
    topic_item['topic_board'] = '凯迪社区'
    print '+++++++++++++++++++'
    try:
        homepage = sel.xpath(
            '//div[re:test(@class,"postspecific")]//span[re:test(@class,"c-main")]/a/@href'
        ).extract()[0].strip()
        topic_item['homepage'] = homepage
        user_id = re.findall(self.urser_id_pa, homepage)[0]
        topic_item['poster_id'] = user_id
    except Exception:
        topic_item['homepage'] = ''
        topic_item['poster_id'] = '111'
    topic_item['data_type'] = 2
    topic_item['site_id'] = 8
    scratch_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    topic_item['scratch_time'] = scratch_time
    return topic_item
def parse(self, response): sel = Selector(response) sites = sel.xpath('//div[@class="item"]/div[@class="info"]') items = [] for site in sites: item = Doubantop250FilmItem() item['name'] = str("".join(site.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract())) item['rate'] = str("".join(site.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract())) item['url'] = str("".join(site.xpath('div[@class="hd"]/a/@href').extract())) item['rate_num'] = str(site.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()[1]) item['summary'] = str("".join(site.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract())) direct_actor = str(site.xpath('div[@class="bd"]/p/text()').extract()[0]).replace("\n", "") if direct_actor.__contains__("主演"): item['director'] = direct_actor.split("主演")[0].strip().split("导演")[1].strip().replace(":", "") item['actor'] = direct_actor.split("主演")[1].strip().replace(":", "") else: item['director'] = direct_actor.split("导演")[1].strip().replace(":", "") item['actor'] = 'unknown' releaseDate_nation_type = str(site.xpath('div[@class="bd"]/p/text()').extract()[1]).replace("\n", "") item['releaseDate'] = releaseDate_nation_type.split("/")[0].strip() item['nation'] = releaseDate_nation_type.split("/")[1].strip() item['type'] = releaseDate_nation_type.split("/")[2].strip() items.append(item) return items
def parse(self, response): """ The lines below is a spider contract. For more info see: http://doc.scrapy.org/en/latest/topics/contracts.html @url http://pathofexile.gamepedia.com/List_of_unique_XXX @scrapes pathofexile.gamepedia.com """ if not self._is_valid_url(response.url): return None url_parts = urlparse(response.url) self.set_path(url_parts) #self.log('A response from %s just arrived!' % response.url) sel = Selector(response) items = sel.xpath(".//tr[@id]") unique_items = [] for an_item in items: unique_item = UniqueItem() unique_item['name'] = an_item.xpath("./td[1]/a[1]/@title").extract()[0] num_spans = len(an_item.xpath("./td[last()]//div[@class='itemboxstatsgroup']/span")) if num_spans == 1: unique_item['implicit_mods'] = [] else: unique_item['implicit_mods'] = an_item.xpath("./td[last()]//div[@class='itemboxstatsgroup'][1]//span/text()").extract() affix_mods = an_item.xpath("./td[last()]//div[@class='itemboxstatsgroup'][last()]//span/text()").extract() unique_item['affix_mods'] = affix_mods unique_item['url'] = "{}://{}{}".format(url_parts.scheme, url_parts.netloc, an_item.xpath("./td[1]/a[1]/@href").extract()[0]) unique_item['category'] = self.get_category() unique_items.append(unique_item) return unique_items
def parse_job_details(self, response):
    hxs = Selector(response)
    item = BrightermondaySampleItem()
    item['link'] = response.url
    item['title'] = hxs.xpath('//h2/text()').extract()[0]
    item['desc'] = hxs.xpath('//article[@class="resultDetail"]/p/text()').extract()[0]
    return item
def parse(self, response):
    item = Bet()
    item['bookmaker'] = 'TheGreek'
    item['sport'] = 'Soccer'
    item['eventDate'] = '23'
    item['moneyLine'] = {}
    item['totals'] = {}
    item['spreads'] = {}
    leagues = Selector(response).xpath('//div[@class="table-container"]')
    for league in leagues:
        item['league'] = league.xpath('h4/text()').extract()[0].strip()
        # iterate over the lines of the current league, not the whole SelectorList
        lines = league.xpath('div[@class="lines"]')
        for line in lines:
            item['homeTeam'] = line.xpath('ul/li[@class="name"]/a/text()').extract()[0].strip()
            item['awayTeam'] = line.xpath('ul/li[@class="name"]/a/text()').extract()[1].strip()
            item['moneyLine']['home'] = line.xpath('ul/li[@id="ml"]/a/text()').extract()[0].strip()
            item['moneyLine']['draw'] = line.xpath('ul/li[@id="ml"]/a/text()').extract()[1].strip()
            item['moneyLine']['away'] = line.xpath('ul/li[@id="ml"]/a/text()').extract()[2].strip()
            item['totals']['points'] = line.xpath('ul/li[@id="gt"]/a/text()').extract()[0].strip().encode("utf8")
            item['totals']['over'] = line.xpath('ul/li[@id="gt"]/a/text()').extract()[1].strip()
            item['totals']['under'] = line.xpath('ul/li[@id="gt"]/a/text()').extract()[3].strip()
            item['spreads']['hdp'] = line.xpath('ul/li[@id="spread_home"]/a/text()').extract()[0].strip().encode("utf8")
            item['spreads']['home'] = line.xpath('ul/li[@id="spread_home"]/a/text()').extract()[1].strip()
            item['spreads']['away'] = line.xpath('ul/li[@id="spread_away"]/a/text()').extract()[1].strip()
            yield item
def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath('//li[@class="item"]/div[@class="item-wrap"]')
    items = []
    info = str(len(sites)) + " mobile info have been found"
    logging.info(info.encode('utf-8'))
    for site in sites:
        item = WebCrawlingItem()
        site1 = site.xpath('div[@class="item-detail"]/div[@class="item-title"]')
        site2 = site.xpath('div[@class="item-detail"]/div[@class="item-rela"]/a')
        site3 = site.xpath('div[@class="item-sales"]')
        name = site1.xpath('h3/a/text()').extract()
        describe = site1.xpath('span/text()').extract()
        level = site2.xpath('div[@class="score-num"]/text()').extract()
        price = site3.xpath('div[@class="price price-now"]/a/text()').extract()
        # print(name)
        item['name'] = self.str_join([d.encode('utf-8') for d in name])
        item['describe'] = self.str_join([d.encode('utf-8') for d in describe])
        item['level'] = self.str_join([d.encode('utf-8') for d in level])
        item['price'] = self.str_join([d.encode('utf-8') for d in price])
        items.append(item)
        logging.info("Appending item " + item['name'])
    logging.info("Append done.")
    return items
def parse_item(self, response): selector = Selector(response) companyInfo = selector.xpath('//td[@class="cont_company"]//td[@class="td_r"]/text()') jobInfo = selector.xpath('//*[@id="DataList1"]//table/tr') contactInfo = selector.xpath('//td[@class="cont_contact"]') contact_text = contactInfo.xpath('text()').extract()[0] + ' ' + contactInfo.xpath('text()').extract()[1] + ' ' + contactInfo.xpath('text()').extract()[2] #print self.mailre.findall(contact_text) #print self.phonePartern.match(contactInfo.xpath('text()').extract()[0]) #print self.emainPartern(contactInfo.xpath('text()').extract()[1]) #print (contactInfo.xpath('text()').extract()[2]).replace(' ','') for each in jobInfo: item = TsrcwItem() print each.extract() jobList = [] try: for i in each.xpath('td[@class="td-grey"]/text()'): if not (i.extract()).strip() == "": jobList.append((i.extract()).strip()) item['email'] = self.mailre.findall(contact_text)[0] item['companyName'] = (companyInfo.extract()[0]).strip() item['industryName'] = (companyInfo.extract()[1]).strip() item['companyNature'] = (companyInfo.extract()[2]).strip() item['jobName'] = (each.xpath('td[@class="td-grey"]/a/text()').extract()[0]).strip() item['jobDetail'] = self.baseUrl+(each.xpath('td[@class="td-grey"]/a/@href').extract()[0]).strip() item['jobRegion'] = jobList[0] item['requiredDegree'] = jobList[1] item['salary'] = jobList[2] item['endDate'] = jobList[3] yield item except Exception,e: continue
def get_node(text, namespaces=None):
    """Get a scrapy selector for the given text node."""
    node = Selector(text=text, type="xml")
    if namespaces:
        for ns in namespaces:
            node.register_namespace(ns[0], ns[1])
    return node
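# A minimal usage sketch for get_node; the XML payload and the "atom" namespace
# binding below are illustrative assumptions, not part of the original code.
xml = '<feed xmlns:atom="http://www.w3.org/2005/Atom"><atom:title>demo</atom:title></feed>'
node = get_node(xml, namespaces=[('atom', 'http://www.w3.org/2005/Atom')])
print(node.xpath('//atom:title/text()').extract_first())  # -> 'demo'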
async def _parse_result(host, resp):
    """Parse the homepage response; return an error code string if anything
    looks wrong, otherwise return None."""
    rurl = urlparse(str(resp.url))

    # check: does the site return a friendly error page, e.g. http://www.test.com/504.htm
    m = _url_error_re.search(rurl.path)
    if m:
        return "SITE:%s" % m.group(1)

    # check: was the site redirected to another domain; maybe the nginx config is wrong
    # or the host was recycled by the DNS provider
    if not rurl.netloc.lower().endswith(host.lower()):
        return "SITE:REDIRECT"

    # body check
    html = await resp.text()
    sel = Selector(text=html)
    emlist = sel.xpath('//body/*').extract()
    sbody = ''.join(emlist)

    # check: is the homepage blank
    if len(sbody) == 0:
        return "BODY:Blank"
    else:
        m = _nginx_error_page.search(sbody)
        if m:
            return "NGX:%s" % m.group(1)
        elif len(sbody) < _url_body_min:
            return "BODY:Min"
    return None
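# A minimal driver sketch for _parse_result. Assumption: the response comes from
# aiohttp; any object exposing .url and an awaitable .text() would work as well.
import asyncio
import aiohttp

async def check_host(host):
    # Fetch the homepage and run the error heuristics above.
    async with aiohttp.ClientSession() as session:
        async with session.get("http://%s/" % host) as resp:
            return await _parse_result(host, resp)

# asyncio.run(check_host("www.example.com"))  # -> None, or an error code such as "SITE:REDIRECT"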
def parse_alpha(self, response):
    """Extract the alpha letter links."""
    sel = Selector(response)
    urls = sel.css("ul.alpha li a::attr(href)").extract()
    for url in urls:
        yield Request(url, callback=self.parse_page)
def pages(self, response):
    """Extract the page count for each case type and issue the follow-up requests."""
    sel = Selector(text=response.body)
    self.cases(response)  # extract the content of the first page
    total = sel.xpath("//table/tbody//script/text()").re(u"共[\D]*?([\d]*?)[\D]*?页")
    try:
        total = int(total[0]) + 1
        for i in xrange(2, total):
            self.data['page'] = str(i)
            con = ["=".join(item) for item in self.data.items()]
            tail = "&".join(con)
            url = self.model_urls + "?" + tail
            fp = self.url_fingerprint(url)
            isexist = self.myRedis.sadd(self.url_have_seen, fp)
            if isexist:
                # sadd returns 1 if the fingerprint was not yet in the redis set
                # (and inserts it), otherwise it returns 0
                yield Request(url, callback=self.cases, dont_filter=False)
            else:
                pass
    except Exception, e:
        log.msg("only_one url==%s== error=%s" % (response.url, e), level=log.ERROR)
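# The snippet above calls self.url_fingerprint, which is not shown. A minimal
# sketch, assuming it only needs to produce a stable hash of the URL string for
# redis-based deduplication (the name and placement on the spider are assumptions):
import hashlib

def url_fingerprint(self, url):
    # Hypothetical helper: stable SHA1 fingerprint of a URL.
    return hashlib.sha1(url.encode('utf-8')).hexdigest()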
def parse_item(self, response):
    sel = Selector(response)
    item = ZapposItem()
    self._enrich_base_data(item, response, is_update=False)
    item['productId'] = ''.join(sel.xpath('//form[@id="prForm"]/input[@name="productId"]/@value').extract()).strip()
    if item['productId'] in self.seen_products:
        self.crawler.stats.inc_total_pages(response.meta['crawlid'],
                                           response.meta['spiderid'],
                                           response.meta['appid'],
                                           -1)
        return
    else:
        self.seen_products.add(item['productId'])
    self._enrich_same_part(item, response)
    positions = ['p', '1', '2', '3', '4', '5', '6']
    all_images = []
    image_urls = []
    for one_colorId in item['colorIds']:
        for one_position in positions:
            reg_str = r"pImgs\[%s\]\[\'4x\'\]\[\'%s\'\] = . filename: '(.*?)'," % (one_colorId, one_position)
            image_file = re_search(reg_str, response.body, dotall=False)
            # str.replace returns a new string, so the result must be reassigned
            image_file = image_file.replace("'", "")
            image_file = image_file.replace('"', "")
            all_images.append([one_colorId, one_position, image_file])
            if len(image_file) > 0:
                image_urls.append(image_file)
    item['color_images'] = all_images
    item['image_urls'] = image_urls
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid']
    )
    return item
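# The snippet above uses a re_search helper that is not shown. A minimal sketch,
# assuming it returns the first capture group of the first match, or an empty
# string when nothing matches (name and exact semantics are assumptions):
import re

def re_search(pattern, text, dotall=True):
    # Hypothetical helper: first capture group of the first match, else ''.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')
    m = re.search(pattern, text, re.S if dotall else 0)
    return m.group(1) if m else ''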
def parse_country_helper(self, response):
    if 'cont' in self.param:
        self.cont_map = {tmp: self.cont_map[tmp] for tmp in self.param['cont']}
    if 'country' in self.param:
        self.country_filter = [int(tmp) for tmp in self.param['country']]
    sel = Selector(response)
    for cont in self.cont_map:
        cont_node = sel.xpath('//div[@class="pla_indcountrylists"]/div[@id="%s"]' % self.cont_map[cont])[0]
        for region_node in cont_node.xpath('.//li[@class="item"]'):
            is_hot = bool(region_node.xpath('./p[@class="hot"]').extract())
            tmp = region_node.xpath('.//a[@href and @data-bn-ipg]')
            if not tmp:
                continue
            region_node = tmp[0]
            zh_name = region_node.xpath('./text()').extract()[0].strip()
            en_name = region_node.xpath('./span[@class="en"]/text()').extract()[0].strip()
            tmp = region_node.xpath('./@data-bn-ipg').extract()[0]
            pid = int(re.search(r'place-index-countrylist-(\d+)', tmp).group(1))
            href = region_node.xpath('./@href').extract()[0]
            url = self.build_href(response.url, href)
            if self.country_filter and pid not in self.country_filter:
                continue
            item = {'type': 'country'}
            data = {'zhName': zh_name, 'enName': en_name,
                    'alias': {zh_name.lower(), en_name.lower()},
                    'isHot': is_hot, 'id': pid, 'url': url}
            item['data'] = data
            yield item
def parse_homepage(self, response):
    sel = Selector(response)

    def func(node, hot):
        country_url = node.xpath('./@href').extract()[0].strip()
        country_name = node.xpath('./text()').extract()[0].strip()
        ret = node.xpath('./span[@class="en"]/text()').extract()
        country_engname = ret[0].lower().strip() if ret else None
        if 'country' in self.param and country_engname.lower() not in self.param['country']:
            return None
        sights_url = urlparse.urljoin(country_url, './sight')
        m = {"country_name": country_name, "country_url": country_url,
             "country_popular": hot, "country_engname": country_engname,
             "sights_url": sights_url}
        return Request(url=sights_url, callback=self.parse_countrysights, meta={"country": m})

    for req in map(lambda node: func(node, False),
                   sel.xpath('//div[@id="allcitylist"]/div[contains(@class,"line")]/ul/li/a[@href]')):
        yield req
    for req in map(lambda node: func(node, True),
                   sel.xpath('//div[@id="allcitylist"]/div[contains(@class,"line")]/ul/li/p[@class="hot"]/a[@href]')):
        yield req
def stepTwo(self, response): hxs = Selector(response) translatedRows = hxs.xpath('//table[@id="translations"]/tr[ contains(@class, "status-current") ]/td[@class="original"]') # print ( len(untranslatedRows) ) # pdb.set_trace() for rows in translatedRows: aux = "" for r in rows.xpath('./child::node()').extract(): aux = aux + r.strip() + ' ' i = self.compareStrings(aux) if i is not None: #scrapy item # traductionItem = W # traductionItem['originalString'] = aux self.untranslated[i]['translatedString'] = rows.xpath('./..//td[@class="translation foreign-text"]/text()').extract()[0].strip() paginaSiguiente = [] paginaSiguiente = hxs.xpath('//div[@class="paging"]/a[@class="next"]/@href') try: fullUrl_toNextPage = response.urljoin( paginaSiguiente[0].extract() ) return fullUrl_toNextPage except Exception: return None
def parseAreas(data):
    global pURL4
    hxs = Selector(text=data)
    for x in hxs.xpath("//div[@id='location_area']/div[@class='facet-values']/a/@href").extract():
        pURL4.append(addURL + x)
def parse_reply(self, response): items = [] sel = Selector(response) item = MafengwoYoujiItem() item_data = response.meta['item'] item['author'] = item_data['author'] item['title'] = item_data['title'] item['reply'] = item_data['reply'] item['place'] = item_data['place'] item['public_time'] = item_data['public_time'] item['way'] = item_data['way'] item['days'] = item_data['days'] item['contents'] = item_data['contents'] item['cost'] = item_data['cost'] page = response.meta['page'] max_page = response.meta['max_page'] template_url = response.meta['url'][:-1] reply = sel.xpath('//div[@class="post_item"]/div/div[contains(@class,"a_con_text reply")]').extract() if reply: item['reply'].extend(reply) max_page = int(max_page) if page < max_page: page += 1 url = "%s%d" % (template_url, page) return Request(url=url, callback=self.parse_reply, meta={'item': item, 'page': page, 'max_page': max_page, 'url': url}) else: items.append(item) return items
def stepOne(self, response): hxs = Selector(response) # untranslatedRows = hxs.xpath('//table[@id="translations"]/tr[@class="preview untranslated priority-normal no-warnings"]/td[@class="original"]') untranslatedRows = hxs.xpath('//table[@id="translations"]/tr[ contains(@class, "untranslated") ]/td[@class="original"]') for rows in untranslatedRows: aux = WordpressTranslationHackItem() aux['originalString'] = '' for r in rows.xpath('./child::node()').extract(): aux['originalString'] = aux['originalString'] + r.strip() + ' ' self.untranslated.append( aux ) # print ( self.untranslated[-1] ) # print ( '------------------' ) # pdb.set_trace() paginaSiguiente = [] paginaSiguiente = hxs.xpath('//div[@class="paging"]/a[@class="next"]/@href') try: fullUrl_toNextPage = response.urljoin( paginaSiguiente[0].extract() ) return fullUrl_toNextPage except Exception: return None
def parse_fans(self, response): fans_page = response.meta['fans_page'] fans_level = response.meta['fans_level'] item = response.meta['item'] if u'暂无霸王票' in response.body.decode('gbk', 'ignore'): item['fans'] = [] if fans_level: counter = Counter(fans_level) item['fans'] = [{'name': k, 'value': counter[k]} for k in counter] yield item return sel = Selector(text=response.body.decode('gbk', 'ignore')) fans_level.extend(sel.xpath('//*[@id="rank"]/div[2]/table/tr/td[2]/text()').extract()) fans_page += 1 if fans_page > 5: counter = Counter(fans_level) item['fans'] = [{'name': k, 'value': counter[k]} for k in counter] yield item else: yield Request( url='http://www.jjwxc.net/reader_kingticket.php?novelid={0}&page={1}'.format( item['book_id'], fans_page), meta={'item': item, 'fans_page': fans_page, 'fans_level': fans_level}, callback=self.parse_fans, dont_filter=True )
def parse(self, response): zip_file = open('CANADA_ZIPCODES.txt', 'r+') zip_list = filter(None, zip_file.read().split('\n')) for zip_item in zip_list: print "*** zip_item" print zip_item geo_url = 'https://maps.google.com/?q=%s canada'%(zip_item) try: map_url_content = requests.get(geo_url).content except: sleep(15) map_url_content = requests.get(geo_url).content sleep(3) sell = Selector(text=map_url_content) map_error_1 = sell.xpath( '//div[@class="sp-error-msg"]|//div[@class="noprint res"]/div//div[contains(@id,"marker_B")]') latlong = ' '.join(sell.xpath('//script').extract()) if not map_error_1 else '' lat_lng = re.findall(r'",\[(-?\d+\.?\d*),(-?\d+\.?\d*)\]\]', latlong, re.I) venue_latitude, venue_longitude = lat_lng[0] if lat_lng else ('', '') print venue_latitude, venue_longitude if not venue_latitude or not venue_longitude: with open('missing_lat_lng.txt', 'a+') as d: print "*** DROPPED ZIP - %s"%(zip_item) d.write(zip_item+'\n') print "NO LATITUDE OR LONGITUDE" else: fetch_url = 'http://api.invisalign.com/svc/rd?pc=%s&cl=CA&lat=%s&lng=%s&it=us'%(zip_item, venue_latitude, venue_longitude) meta_data = {'venue_latitude': venue_latitude, 'venue_longitude': venue_longitude, 'zip_code': zip_item} yield Request(url = fetch_url, dont_filter=True, callback=self.parse_result, meta=meta_data)
def parse_item(self, response): items = [] sel = Selector(response) print("test1") products = sel.xpath('//*[@id="coreProductInfos"]/div[2]') # breadcrumbs = sel.xpath('//div[@id ="contentWrapper"]')\ table = sel.xpath('//tr[contains(td, "techDataCol")]') category = sel.xpath('//*[@id="contentWrapper"]/div[1]/span[2]/a/span/text()').extract() print(category) for product in products: if 'Geheugen' in category: item = Memory() print (table.xpath('//td/text()').extract()) item['Category'] = category item['Name'] = product.xpath('//td[contains(td[1], "Modelnaam")]/td[2]/table/tbody/tr/td/text()').extract() item['Brand'] = product.xpath('//*[@id="details"]/div[4]/div/table/tbody/tr[2]/td[2]/table/tbody/tr/td/text()').extract() item['Quantity'] = product.xpath('//tr[contains(td[1], "Aantal")]/td[2]/text()').extract() item['Size'] = product.xpath('//tr[contains(td[1], "Modulegrootte")]/td[2]/text()').extract() item['PriceGB'] = product.xpath('//tr[contains(td[1], "Prijs per GB")]/td[2]/text()').extract() item['Type'] = product.xpath('//tr[contains(td[1], "Geheugentype")]/td[2]/text()').extract() item['Specification'] = product.xpath('//tr[contains(td[1], "Geheugen Specificatie")]/td[2]/text()').extract() item['LowVoltage'] = product.xpath('//tr[contains(td[1], "Low Voltage DDR")]/td[2]/text()').extract() item['Voltage'] = product.xpath('//tr[contains(td[1], "Spanning")]/td[2]/text()'). extract() item['Warranty'] = product.xpath('//tr[contains(td[1], "Fabrieksgarantie")]/td[2]/text()').extract() item['Ean'] = product.xpath('//tr[contains(td[1], "EAN")]/td[2]/text()').extract() item['Sku'] = product.xpath('//tr[contains(td[1], "SKU")]/td[2]/text()').extract() print("Geheugen!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") items.append(item) return items
def detail(self, response): """ extract detail info """ sel = Selector(text=response.body) condition = sel.xpath(self.xpathSen["brand"]).extract() if len(condition) != 0: xpath_keys = ["type_auto","brand","level","BSX", "CSJG","ZWGS","PL","RLXS","QDFS"] xpath_conf = ["DDTC","DDTJZY","ESP","GPS","DSXH", "DCLD","DGLFXP"] keys_info = [] for xpath_str in xpath_keys: tmp = sel.xpath(self.xpathSen[xpath_str]).extract() try: keys_info.append(tmp[0]) except Exception, e: keys_info.append("") log.msg("error info=%s keys_info=%s" %(e, "\001".join(keys_info)), level=log.ERROR) conf_info = [] for xpath_s in xpath_conf: tmp = sel.xpath(self.xpathSen[xpath_s]).extract() try: conf_info.append(tmp[0]) except Exception, e: conf_info.append("-") log.msg("error info=%s conf_info=%s"%(e, \ "\001".join(conf_info)), level=log.ERROR)
def parse_location(self, response): sel = Selector(response) print(" **************** LOCATION LIST *************") print(response.url) print(" **************** LOCATION LIST *************") location = sel.xpath("//ul[@class='geoList']") for loc in location: state_link = loc.xpath("li/a/@href").extract() print(" **************** Attraction List starts *************") for link in state_link: url_link = response.urljoin(link) print(url_link) # "https://www.tripadvisor.com/Attractions-g34345-Activities-Key_West_Florida_Keys_Florida.html" yield scrapy.Request(url_link, callback=self.parse_attraction) print(" **************** Attraction List ends *************") # yield scrapy.Request(url_link,callback=self.parse_test) locations = sel.xpath("//a[@class='guiArw sprite-pageNext pid0']/@href").extract() print(" **************** LOCATION LIST PAGINATION starts *************") print(locations) print(" **************** LOCATION Link *************") for location in locations: if location: location_link = response.urljoin(location) print(location_link) yield scrapy.Request(location_link, callback=self.parse_location) print(" **************** LOCATION Link *************") print(" **************** LOCATION LIST PAGINATION ends *************")
def parse_channel(self, response):
    hxs = Selector(response)
    item = response.meta['record']
    item['video_url'] = hxs.xpath("body//div[@id='divVideoHolder']/@videosrc").extract()[0]
    item["title"] = hxs.xpath("body//div[@id='divTitrGrid']/text()").extract()[0]
    return item
def load_annotations(body):
    """Create slybot annotations from annotated html."""
    if not body:
        return {'annotations-plugin': {'extracts': []}}
    sel = Selector(text=add_tagids(body))
    existing_ids = set()
    annotations = []
    for elem in sel.xpath('//*[@data-scrapy-annotate]'):
        attributes = elem._root.attrib
        annotation = json.loads(unquote(attributes['data-scrapy-annotate']))
        if (isinstance(elem._root, _Element) and
                elem._root.tag.lower() == 'ins'):
            annotation.update(find_generated_annotation(elem))
        else:
            annotation['tagid'] = attributes.get('data-tagid')
        if 'id' not in annotation:
            annotation['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(annotation['id'])
        annotations.append(annotation)
    for elem in sel.xpath('//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)):
        attributes = elem._root.attrib
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attributes:
                break
        ignore = {attribute[len('data-scrapy-'):]: True}
        if 'id' not in ignore:
            ignore['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(ignore['id'])
        annotations.append(ignore)
    return {'annotations-plugin': {'extracts': annotations}}
def parse_page_2(self, response):
    hxs = Selector(response)
    url = hxs.xpath("//a[@class='post-get-it-button--primary']/@href").extract()[0]
    return Request(
        url=url,
        callback=self.parse_end_page,
        meta={'item': response.meta['item']})
def parse(self, response): driver = response.meta['driver'] for _, value in self.df.iterrows(): driver.get(value['url']) time.sleep(2) html = driver.page_source resp_obj = Selector(text=html) check1 = resp_obj.xpath("//div[@data-type='items']") check2 = resp_obj.xpath( "//span[text()='Shop by Category' or text()='Shop by category']/parent::span/parent::button/following-sibling::div/div/ul/li" ) check3 = resp_obj.xpath( "//h2[text()='Shop by category']/parent::div/parent::div/following-sibling::div//div[@class='TempoCategoryTile-tile valign-top']" ) if check1: cntr = 1 while True: html = driver.page_source resp_obj = Selector(text=html) listings = resp_obj.xpath("//div[@data-type='items']") for prods in listings: product_url = f'''https://www.walmart.com{prods.xpath(".//div[@class='search-result-product-title gridview']/a/@href").get()}''' product_name = prods.xpath( "normalize-space(.//div[@class='search-result-product-title gridview']/a/span/text())" ).get() price = prods.xpath( "normalize-space(.//span[@class='price-main-block']/span/span/text())" ).get() if not product_name: product_url = f'''https://www.walmart.com{prods.xpath(".//span[text()='Product Title']/parent::div/a/@href").get()}''' product_name = prods.xpath( "normalize-space(.//span[text()='Product Title']/parent::div/a/span/text())" ).get() if not price: price = f'''{prods.xpath("normalize-space(.//span[@class='price price-main'][1]/span/text())").get()} - {prods.xpath("normalize-space(.//span[@class='price price-main'][2]/span/text())").get()}''' yield { 'product_url': product_url, 'product_name': product_name, 'product_price': price, 'lvl1_cat': value['lvl1_cat'], 'lvl2_cat': value['lvl2_cat'], 'lvl3_cat': value['lvl3_cat'], 'lvl4_cat': None } next_page = resp_obj.xpath( "//span[text()='Next Page']/parent::button") cntr += 1 if next_page: next_page = resp_obj.xpath( f"//ul[@class='paginator-list']/li/a[text()='{cntr}']/@href" ).get() driver.get(f"https://www.walmart.com{next_page}") time.sleep(2) else: break elif check2: driver.execute_script("window.open('');") driver.switch_to.window(driver.window_handles[1]) for listings in check2: lvl4_cat = listings.xpath(".//a/span/text()").get() url = listings.xpath(".//a/@href").get() driver.get(f"https://www.walmart.com{url}") cntr = 1 while True: html = driver.page_source resp_obj = Selector(text=html) listings = resp_obj.xpath("//div[@data-type='items']") for prods in listings: product_url = f'''https://www.walmart.com{prods.xpath(".//div[@class='search-result-product-title gridview']/a/@href").get()}''' product_name = prods.xpath( "normalize-space(.//div[@class='search-result-product-title gridview']/a/span/text())" ).get() price = prods.xpath( "normalize-space(.//span[@class='price-main-block']/span/span/text())" ).get() if not product_name: product_url = f'''https://www.walmart.com{prods.xpath(".//span[text()='Product Title']/parent::div/a/@href").get()}''' product_name = prods.xpath( "normalize-space(.//span[text()='Product Title']/parent::div/a/span/text())" ).get() if not price: price = f'''{prods.xpath("normalize-space(.//span[@class='price price-main'][1]/span/text())").get()} - {prods.xpath("normalize-space(.//span[@class='price price-main'][2]/span/text())").get()}''' yield { 'product_url': product_url, 'product_name': product_name, 'product_price': price, 'lvl1_cat': value['lvl1_cat'], 'lvl2_cat': value['lvl2_cat'], 'lvl3_cat': value['lvl3_cat'], 'lvl4_cat': lvl4_cat } next_page = resp_obj.xpath( "//span[text()='Next Page']/parent::button") cntr += 1 if next_page: 
next_page = resp_obj.xpath( f"//ul[@class='paginator-list']/li/a[text()='{cntr}']/@href" ).get() driver.get(f"https://www.walmart.com{next_page}") time.sleep(2) else: break driver.close() driver.switch_to.window(driver.window_handles[0]) elif check3: driver.execute_script("window.open('');") driver.switch_to.window(driver.window_handles[1]) for listings in check3: lvl4_cat = listings.xpath(".//span/text()").get() url = listings.xpath(".//following-sibling::a/@href").get() driver.get(f"https://www.walmart.com{url}") cntr = 1 while True: html = driver.page_source resp_obj = Selector(text=html) listings = resp_obj.xpath("//div[@data-type='items']") for prods in listings: product_url = f'''https://www.walmart.com{prods.xpath(".//div[@class='search-result-product-title gridview']/a/@href").get()}''' product_name = prods.xpath( "normalize-space(.//div[@class='search-result-product-title gridview']/a/span/text())" ).get() price = prods.xpath( "normalize-space(.//span[@class='price-main-block']/span/span/text())" ).get() if not product_name: product_url = f'''https://www.walmart.com{prods.xpath(".//span[text()='Product Title']/parent::div/a/@href").get()}''' product_name = prods.xpath( "normalize-space(.//span[text()='Product Title']/parent::div/a/span/text())" ).get() if not price: price = f'''{prods.xpath("normalize-space(.//span[@class='price price-main'][1]/span/text())").get()} - {prods.xpath("normalize-space(.//span[@class='price price-main'][2]/span/text())").get()}''' yield { 'product_url': product_url, 'product_name': product_name, 'product_price': price, 'lvl1_cat': value['lvl1_cat'], 'lvl2_cat': value['lvl2_cat'], 'lvl3_cat': value['lvl3_cat'], 'lvl4_cat': lvl4_cat } next_page = resp_obj.xpath( "//span[text()='Next Page']/parent::button") cntr += 1 if next_page: next_page = resp_obj.xpath( f"//ul[@class='paginator-list']/li/a[text()='{cntr}']/@href" ).get() driver.get(f"https://www.walmart.com{next_page}") time.sleep(2) else: break driver.close() driver.switch_to.window(driver.window_handles[0]) else: pass
def parse_detail(self, response): try: # 数据获取不全 data = Selector(text=response.body.decode('gbk')) items = str(data.xpath('string(.)').extract()[0]).replace( '\xa0', '').replace('\u3000', '') # 共有字段 fileTitle = data.xpath( '//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()' ).extract_first() # 正文标题 textTitle = data.xpath( '//td[@class="fh vat bw f8-0 b1"]/table[1]//tr[1]/td[@align="center"]/text()' ).extract_first() supllyType = response.meta.get('supllyType').strip() administration = response.meta.get('administration').strip() supplyNoticeTitle = response.meta.get('supplyNoticeTitle').strip() publishTime = response.meta.get('publishTime').strip() projectName = '' parcelNumber = '' parcelLocation = '' landPurpose = '' landArea = '' transferTimeLimit = '' transferPrice = '' landPurposeDetail = '' transferUnit = '' remark = '' publicityPeriod = '' contactUnit = '' unitAddr = '' postalCode = '' contactTel = '' contacter = '' email = '' lanServiceCondition = '' # 公告类型 # noticeType = # 公示期 publicityPeriod = reFunction(u'公示期:([\s\S]*)三、', reFunction('四、[\s\S]*', items)).strip() # 联系单位 contactUnit = reFunction(u'联系单位:([\s\S]*)单位地址', reFunction('四、[\s\S]*', items)).strip() # 单位地址 unitAddr = reFunction(u'单位地址:([\s\S]*)邮政编码', reFunction('四、[\s\S]*', items)).strip() # 邮政编码 postalCode = reFunction(u'邮政编码:([\s\S]*)联系电话', reFunction('四、[\s\S]*', items)).strip() # 联系电话 contactTel = reFunction(u'联系电话:([\s\S]*)联 系 人', reFunction('四、[\s\S]*', items)).strip() # 联系人 contacter = reFunction(u'联 系 人:([\s\S]*)电子邮件', reFunction('四、[\s\S]*', items)).strip() # 电子邮件 email = reFunction(u'电子邮件:([\w\.\@]*)(?:[\S]*)', reFunction('四、[\s\S]*', items)).strip() if '宗地编号' in items: for item in [ '宗地编号' + _ for _ in re.findall('([\s\S]*)二、', items) [0].split('宗地编号')[1:] ]: # 宗地编号 parcelNumber = reFunction('宗地编号:(?:\s*)([\s\S]*)地块位置', item).strip() # 地块位置 parcelArea parcelLocation = reFunction('地块位置:(?:\s*)([\s\S]*)土地用途:', item).strip() # 土地用途 landPurpose = reFunction('土地用途:(?:\s*)([\s\S]*)土地面积\(公顷\)', item).strip() # 土地面积(公顷) landArea = reFunction( '土地面积\(公顷\):(?:\s*)([\w}/\.{]*)(?:\s*)', item).strip() # 项目名称 projectName = reFunction('项目名称:(?:\s*)([\s\S]*)土地用途明细', item).strip() # 出让年限 transferTimeLimit = reFunction( '出让年限:(?:\s*)([\s\S]*)成交价\(万元\)', item).strip() # 成交价(万元) transferPrice = reFunction( '成交价\(万元\):(?:\s*)([\s\S]*)土地用途明细', item).strip() # 土地用途明细(用途名称、面积) landPurposeDetail = reFunction( '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位', item).strip() if reFunction( '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位', item).strip() else reFunction( '(?:\s*)([\d\.]*)(?:[\s]*)受让单位', item).strip() # 受让单位 transferUnit = reFunction('受让单位:(?:\s*)([\w}/{]*)(?:\s*)', item).strip() # 土地使用条件 lanServiceCondition = reFunction( '土地使用条件:(?:\s*)([\s\S]*)备注', item).strip() # 备注 # remark = reFunction(u'备注:(?:\s*)([\w}/,、\u4e00-\uffe5()《》:\-\.<≤。{\u3002\uff1f\uff01\uff0c\u3001\uff1b\uff1a\u201c\u201d\u2018\u2019\uff08\uff09\u300a\u300b\u3008\u3009\u3010\u3011\u300e\u300f\u300c\u300d\ufe43\ufe44\u3014\u3015\u2026\u2014\uff5e\ufe4f\uffe5]*)(?:\s*)', item).strip() remark = reFunction(u'备注:(?:\s*)([\s\S]*)(?:\s*)[二、]?', item).strip() # 爬取时间 crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # 爬取地址url url = response.url # 唯一标识 md5Mark = encrypt_md5(parcelNumber + publishTime + parcelLocation + url) # 存储数据 csvFile = [ administration, supplyNoticeTitle, publishTime, fileTitle, textTitle, projectName, parcelNumber, parcelLocation, landPurpose, landArea, transferTimeLimit, transferPrice, landPurposeDetail, transferUnit, remark, publicityPeriod, 
contactUnit, unitAddr, postalCode, contactTel, contacter, email, lanServiceCondition, crawlingTime, url, md5Mark ] self.fileDetail.write(','.join([ _.replace(',', ' ').replace('\n', '').replace( '\r', '') if _ else _ for _ in csvFile ])) self.fileDetail.write('\n') yield #TODO elif '地块编号' in items: for item in [ '地块编号' + _ for _ in re.findall('([\s\S]*)二、', items) [0].split('地块编号')[1:] ]: # 地块编号 parcelNumber = reFunction('地块编号:(?:\s*)([\s\S]*)地块位置', item).strip() # 地块位置 parcelArea parcelLocation = reFunction('地块位置:(?:\s*)([\s\S]*)土地用途:', item).strip() # 土地用途 landPurpose = reFunction('土地用途:(?:\s*)([\s\S]*)土地面积\(公顷\)', item).strip() # 土地面积(公顷) landArea = reFunction( '土地面积\(公顷\):(?:\s*)([\w}/\.{]*)(?:\s*)', item).strip() # 项目名称 projectName = reFunction('项目名称:(?:\s*)([\s\S]*)土地用途明细', item).strip() # 出让年限 transferTimeLimit = reFunction( '出让年限:(?:\s*)([\s\S]*)成交价\(万元\)', item).strip() # 成交价(万元) transferPrice = reFunction( '成交价\(万元\):(?:\s*)([\s\S]*)土地用途明细', item).strip() # 土地用途明细(用途名称、面积) landPurposeDetail = reFunction( '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位', item).strip() if reFunction( '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位', item).strip() else reFunction( '(?:\s*)([\d\.]*)(?:[\s]*)受让单位', item).strip() # 受让单位 transferUnit = reFunction('受让单位:(?:\s*)([\w}/{]*)(?:\s*)', item).strip() # 土地使用条件 lanServiceCondition = reFunction( '土地使用条件:(?:\s*)([\s\S]*)备注', item).strip() # 备注 remark = reFunction(u'备注:(?:\s*)([\s\S]*)(?:\s*)[二、]?', item).strip() # 爬取时间 crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # 爬取地址url url = response.url # 唯一标识 md5Mark = encrypt_md5(parcelNumber + publishTime + parcelLocation + url) # 存储数据 csvFile = [ administration, supplyNoticeTitle, publishTime, fileTitle, textTitle, projectName, parcelNumber, parcelLocation, landPurpose, landArea, transferTimeLimit, transferPrice, landPurposeDetail, transferUnit, remark, publicityPeriod, contactUnit, unitAddr, postalCode, contactTel, contacter, email, lanServiceCondition, crawlingTime, url, md5Mark ] self.fileDetail.write(','.join([ _.replace(',', ' ').replace('\n', '').replace( '\r', '') if _ else _ for _ in csvFile ])) self.fileDetail.write('\n') #TODO except Exception as e: self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
def parseNews(self, response): self.response_body_decode(response) sel = Selector(response) homeurl = tools.getHomeUrl(response.url) brandname = response.meta['brandname'] news = None # news保存新闻主体部分的SelectorList pagerule = None # 判断是否已经可以确定页面规则 if response.meta.has_key('pagerule'): pagerule = response.meta['pagerule'] news = sel.xpath(pagerule['pageform']) else: # 对于新闻页面规则库的每条规则进行匹配,然后对该类型的新闻页面进行爬取 for each_rule in newspage_type.page_rules: news = sel.xpath(each_rule['pageform']) if len(news) > 0: pagerule = each_rule break if pagerule is None: raise ValueError('Error processing (' + response.url + ') This page do not have corresponding rules') # 获得allpage 和 nextpage url if pagerule['allpage'] is None: allpage = [] else: allpage = news.xpath(pagerule['allpage']).extract() if pagerule['nextpage'] is None: nextpage = [] else: nextpage = news.xpath(pagerule['nextpage']).extract() # 如果包含全页阅读的url,则进行该处理 if len(allpage) > 0: if tools.isCompleteUrl(allpage[0]): url = allpage[0] else: url = homeurl + allpage[0] r = Request(url, callback=self.parseNews) r.meta['brandname'] = brandname r.meta['pagerule'] = pagerule yield r elif len(nextpage) > 0: # 如果包含下一页,则进行该处理 if tools.isCompleteUrl(nextpage[0]): url = nextpage[0] else: url = homeurl + nextpage[0] # 提取当前页面的title, date, content,保存到article中,传递至下一请求 title = news.xpath(pagerule['title']).extract() date = self.getDate(news, response.url, pagerule['date']) content = self.getContent(news, pagerule['content']) article = { 'brandname': brandname, 'title': title, 'date': date, 'content': content } r = Request(url, callback=self.parseNextPage) r.meta['article'] = article r.meta['pagerule'] = pagerule yield r else: # 如果新闻只有一页,则直接提取新闻内容 title = news.xpath(pagerule['title']).extract() date = self.getDate(news, response.url, pagerule['date']) content = self.getContent(news, pagerule['content']) item = NewsItem() item['brandname'] = brandname item['date'] = date item['title'] = "".join(title) item['content'] = "".join(content) yield item
def preview_result(Xpath, inputtext):
    sel = Selector(text=inputtext)
    result = sel.xpath(Xpath).extract()
    n = len(result)
    for idx, element in enumerate(result[:min(4, n)], start=1):
        print(f"Element {idx}: {element}", end=sp)
def number_of_element(Xpath, inputtext):
    sel = Selector(text=inputtext)
    print(f"Number of selected element(s): {len(sel.xpath(Xpath))} elements", end=sp)
def preview_result(Xpath, inputtext):
    sel = Selector(text=inputtext)
    result = sel.xpath(Xpath).extract()
    n = len(result)
    for idx, element in enumerate(result[:min(4, n)], start=1):
        print(f"Element {idx}: {element}", end=sp)


sp = '\n\n'
url = 'https://www.cdc.gov/nchs/tutorials/NHANES/index_continuous.htm'
html = requests.get(url).text  # Selector(text=...) expects a str, so use .text rather than .content
xpath = '//p'
xpath2 = '//*'

sel = Selector(text=html)
sll = sel.xpath('//p')[2].extract()   # extract the 3rd matched element (here a paragraph) as a string
sll_ = sel.xpath('//p')               # without extract(), this stays a SelectorList preview
slla = sel.xpath('//p').extract()
sllf = sel.xpath('//p').extract_first()
# print(sll, slla, sllf, sep=sp)

# The helpers print their results themselves and return None,
# so call them directly instead of printing their return values.
number_of_element(xpath, html)
number_of_element(xpath2, html)
preview_result(xpath, html)
def parse(self, response): sites = json.loads(response.text) spider_name = response.meta['spider_name'] #网页html data = sites["items_html"] min_position = sites["min_position"] #第一条twitter position = '' if 'max_position' in sites: position = sites["max_position"] else: position = min_position.split('-')[2] if data == "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n": print ("抓取完成!!!,更新种子") self.db.updateSeedTag(spider_name) self.db.updateSeedCountLocation(spider_name, position) else: #是否还有下一页 #has_more_items = sites["has_more_items"] item = SpiderTwitterItem() # 获得贴文作者 twitter_author = re.compile('data-name="(.+)" data-user-id=').findall(data)[0] selector_app = Selector(text=data) twitter_group = selector_app.xpath("//li[@class='js-stream-item stream-item stream-item\n']").extract() twitter_group_count = len(twitter_group) next_page_id = "" for twitter_personal in twitter_group: selector_content = Selector(text=twitter_personal) twitter_id = selector_content.xpath("//li[@class='js-stream-item stream-item stream-item\n']/@data-item-id").extract() if len(twitter_id) > 0: next_page_id = twitter_id[0] if self.db.getTwitterById(next_page_id): # 判断是否是爬取到之前记录位置 if self.db.isSeedLocation(spider_name, next_page_id): print ("%s最新推文抓取完毕"%spider_name) self.db.updateSeedCountLocation(spider_name, position) return print ("%s已存在,进行去重过滤"%next_page_id) continue else: item['twitter_id'] = twitter_id else: item['twitter_id'] = '' twitter_content_whole = "" twitter_content_list = selector_content.xpath("//div[@class='js-tweet-text-container']").extract() for twitter_content in twitter_content_list: selector_content_text = Selector(text=twitter_content) twitter_content_text = selector_content_text.xpath("//text()").extract() twitter_content_text_num = len(twitter_content_text) for i in range(twitter_content_text_num): if twitter_content_text[i] != " " and twitter_content_text[i] != "\n ": twitter_content_add = twitter_content_text[i].replace("\n","") twitter_content_whole += twitter_content_add twitter_content_whole_trun = twitter_content_whole.replace('"','\\"') twitter_href = selector_content.xpath("//small[@class='time']/a/@href").extract() twitter_time = selector_content.xpath("//small[@class='time']/a/@title").extract() twitter_num = selector_content.xpath("//span[@class='ProfileTweet-actionCountForAria']/text()").extract() if len(twitter_num) > 0: twitter_reply = twitter_num[0] twitter_trunsmit = twitter_num[1] twitter_zan = twitter_num[2] else: twitter_reply = '' twitter_trunsmit = '' twitter_zan = '' twitter_img = selector_content.xpath("//div[@class='AdaptiveMedia-photoContainer js-adaptive-photo ']/@data-image-url").extract() print ("目标:%s" % twitter_id[0]) print ("内容:%s" % twitter_content_whole_trun) if len(twitter_author) > 0: author = twitter_author item['twitter_author'] = author else: item['twitter_author'] = '' if len(twitter_id) > 0: tw_id = twitter_id[0] item['twitter_id'] = tw_id else: item['twitter_id'] = '' if twitter_content_whole: content = twitter_content_whole_trun item['twitter_content'] = content else: item['twitter_content'] = '' if len(twitter_href) > 0: href = "https://twitter.com%s"%twitter_href[0] item['twitter_href'] = href else: item['twitter_href'] = '' if len(twitter_time) > 0: time = twitter_time[0] item['twitter_time'] = time else: item['twitter_time'] = '' if len(twitter_num) > 0: reply = twitter_reply item['twitter_reply'] = reply else: item['twitter_reply'] = '' if len(twitter_num) > 0: trunsmit = twitter_trunsmit item['twitter_trunsmit'] = trunsmit else: item['twitter_trunsmit'] = '' 
if len(twitter_num) > 0: zan = twitter_zan item['twitter_zan'] = zan else: item['twitter_zan'] = '' if len(twitter_img) == 1: img = twitter_img[0] item['twitter_img'] = img elif len(twitter_img) > 1: img_list = [] for img in twitter_img: img_list.append(img) item['twitter_img'] = img_list else: item['twitter_img'] = '' yield item print ("下一页等待中...") #has_more_items 为true 代表还有下一页 yield Request(url=self.next_page_url.format(spider_name,self.now_time, next_page_id, position), callback=self.parse,headers={'Referer': "https://twitter.com/"}, meta={'spider_name': spider_name})
def parse(self, response): driver = response.meta['driver'] driver.maximize_window() driver.execute_script("window.open('');") driver.switch_to.window(driver.window_handles[0]) for _, value in self.df.iterrows(): cntr = 199 while True: location = value['Location'] category = value['Category'] subCat = value['Subcategory'] url = f"{value['URL']}{cntr}" driver.get(url) cntr += 1 WebDriverWait(driver, 15).until( EC.visibility_of_element_located(( By.XPATH, "(//div[@class='js-project-group'])[2]//h3/parent::a[@class='soft-black mb3']" ))) html = driver.page_source respObj = Selector(text=html) count = respObj.xpath( "normalize-space(//b[contains(@class, 'count')]/text())" ).get() pCount = int("".join(re.findall(r'\d+', count))) driver.switch_to.window(driver.window_handles[1]) items = respObj.xpath( "(//div[@class='js-project-group'])[2]//h3/parent::a[@class='soft-black mb3']" ) for item in items: title = item.xpath("normalize-space(.//h3/text())").get() if title not in self.li: self.li.append(title) url = item.xpath(".//@href").get() driver.get(url) time.sleep(1) WebDriverWait(driver, 15).until( EC.visibility_of_element_located(( By.XPATH, "//a[@data-modal-title='About the creator']"))) html1 = driver.page_source respObj1 = Selector(text=html1) title = respObj1.xpath( "normalize-space(//h2/span/a/text())").get() creator = respObj1.xpath( "normalize-space(//a[@data-modal-title='About the creator']/text())" ).get() backers = respObj1.xpath( "normalize-space(//b[contains(text(), 'backers')]/text())" ).get() money = respObj1.xpath( "normalize-space(//span[@class='money']/text())" ).get() driver.find_element_by_xpath( "//a[@data-modal-title='About the creator']" ).click() time.sleep(2) html2 = driver.page_source respObj2 = Selector(text=html2) yield { 'Title': title, 'Creator': creator, 'Backers': backers.replace(" backers", ""), 'Money': money, 'Website': respObj2.xpath( "//h4[contains(text(), 'Websites')]/following-sibling::ul/li/a/@href" ).getall(), 'Location': location, 'Category': category, 'Sub Category': subCat } else: pass driver.switch_to.window(driver.window_handles[0]) a = pCount // 12 if pCount % 12 != 0: a += 1 else: a += 0 if cntr > 200: break
def person_registered(self, response): test = Selector( response=response).xpath('//dl/div/text()').extract_first() if test is not '没有数据': if response.meta['staff_type'] == 1: info_data = Selector(response=response).xpath('//dl') for i in info_data: person_data = { 'companyName': response.meta['company_name'], 'licenseNum': '', 'name': response.meta['person_name'], 'area': '青海省', 'sex': '', 'idCard': response.meta['id_card'], 'grade': '', 'major': '', 'num': '', 'regNum': '', 'validTime': '', 'tel': '', 'tokenKey': self.token } dd = i.xpath('./dd') # 注册类别 grade = dd[0].xpath('./b/text()').extract_first() if grade: person_data['grade'] = grade # 注册专业 major = dd[1].xpath('./text()').extract_first() if major: person_data['major'] = major # 注册号 num = dd[3].xpath('./text()').extract_first() if num: person_data['num'] = num print(person_data, '注册人员') yield scrapy.FormRequest( url= 'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm', formdata=person_data, callback=self.person_post, meta={ 'data': person_data, 'company_name': response.meta['company_name'] }, dont_filter=True, ) elif response.meta['staff_type'] == 2: info_data = Selector(response=response).xpath('//dl') for i in info_data: person_data = { 'companyName': response.meta['company_name'], 'licenseNum': '', 'name': response.meta['person_name'], 'area': '青海省', 'sex': '', 'idCard': response.meta['id_card'], 'grade': '', 'major': '', 'num': '', 'regNum': '', 'validTime': '', 'tel': '', 'tokenKey': self.token } dd = i.xpath('./dd') # 注册类别 try: grade = dd[0].xpath('./b/text()').extract_first() except IndexError: continue if grade: person_data['grade'] = grade # 注册专业 major = dd[1].xpath('./text()').extract_first() if major: person_data['major'] = major # 职称编号 num = dd[2].xpath('./text()').extract_first() if num: person_data['num'] = num print(person_data, '职业人员') yield scrapy.FormRequest( url= 'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm', formdata=person_data, callback=self.person_post, meta={ 'data': person_data, 'company_name': response.meta['company_name'] }, dont_filter=True, ) elif response.meta['staff_type'] == 3: info_data = Selector(response=response).xpath('//dl') for i in info_data: person_data = { 'companyName': response.meta['company_name'], 'licenseNum': '', 'name': response.meta['person_name'], 'area': '青海省', 'sex': '', 'idCard': response.meta['id_card'], 'grade': '', 'major': '', 'num': '', 'regNum': '', 'validTime': '', 'tel': '', 'tokenKey': self.token } dd = i.xpath('./dd') # 注册专业 number = dd[3].xpath('./text()').extract_first() if number: person_data['num'] = number # # 职称编号 # validTime = dd[4].xpath('./text()').extract_first() # if validTime: # person_data['validTime'] = validTime.replace('/') print(person_data, '安全三类人员') yield scrapy.FormRequest( url= 'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm', formdata=person_data, callback=self.person_post, meta={ 'data': person_data, 'company_name': response.meta['company_name'] }, dont_filter=True, ) elif response.meta['staff_type'] == 4: info_data = Selector(response=response).xpath('//dl') for i in info_data: person_data = { 'companyName': response.meta['company_name'], 'licenseNum': '', 'name': response.meta['person_name'], 'area': '青海省', 'sex': '', 'idCard': response.meta['id_card'], 'grade': '', 'major': '', 'num': '', 'regNum': '', 'validTime': '', 'tel': '', 'tokenKey': self.token } dd = i.xpath('./dd') # 注册专业 try: major = dd[0].xpath('./text()').extract_first() except IndexError: continue if major: 
person_data['major'] = major # 资格专业 grade = dd[1].xpath('./text()').extract_first() if grade: person_data['grade'] = grade # 证书编号 num = dd[3].xpath('./text()').extract_first() if num: person_data['num'] = num # # 职称编号 # validTime = dd[4].xpath('./text()').extract_first() # if validTime: # person_data['validTime'] = validTime print(person_data, '专业岗位证书') yield scrapy.FormRequest( url= 'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm', formdata=person_data, callback=self.person_post, meta={ 'data': person_data, 'company_name': response.meta['company_name'] }, dont_filter=True, ) elif response.meta['staff_type'] == 5: info_data = Selector(response=response).xpath('//dl') for i in info_data: person_data = { 'companyName': response.meta['company_name'], 'licenseNum': '', 'name': response.meta['person_name'], 'area': '青海省', 'sex': '', 'idCard': response.meta['id_card'], 'grade': '', 'major': '', 'num': '', 'regNum': '', 'validTime': '', 'tel': '', 'tokenKey': self.token } dd = i.xpath('./dd') # 注册专业 major = dd[0].xpath('./text()').extract_first() if major: person_data['major'] = major # 资格专业 grade = dd[1].xpath('./text()').extract_first() if grade: person_data['grade'] = grade # 证书编号 num = dd[3].xpath('./text()').extract_first() if num: person_data['num'] = num # 有效期至 validTime = dd[4].xpath('./text()').extract_first() if validTime: person_data['validTime'] = validTime print(person_data, '技术人员') yield scrapy.FormRequest( url= 'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm', formdata=person_data, callback=self.person_post, meta={ 'data': person_data, 'company_name': response.meta['company_name'] }, dont_filter=True, ) elif response.meta['staff_type'] == 6: info_data = Selector(response=response).xpath('//dl') for i in info_data: person_data = { 'companyName': response.meta['company_name'], 'licenseNum': '', 'name': response.meta['person_name'], 'area': '青海省', 'sex': '', 'idCard': response.meta['id_card'], 'grade': '', 'major': '', 'num': '', 'regNum': '', 'validTime': '', 'tel': '', 'tokenKey': self.token } dd = i.xpath('./dd') # 资格类别 try: major = dd[0].xpath('./text()').extract_first() except IndexError: continue if major: person_data['major'] = major # 等级 grade = dd[1].xpath('./text()').extract_first() if grade: person_data['grade'] = grade # 等级 num = dd[3].xpath('./text()').extract_first() if num: person_data['num'] = num # 职称编号 validTime = dd[4].xpath('./text()').extract_first() if validTime: person_data['validTime'] = validTime print(person_data, '职业技能人员') yield scrapy.FormRequest( url= 'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm', formdata=person_data, callback=self.person_post, meta={ 'data': person_data, 'company_name': response.meta['company_name'] }, dont_filter=True, )
def parse(self, response): ''' Scrape archive for articles Parameters ---------- self: the PostillonSpider object response: The response from a scrapy request ''' def init_selenium_driver(): ''' Initialize and return a firefox or chorme selenium driver depending on the option SELENIUM_DRIVER Returns ------- A firefox or chrome selenium driver depending on the option SELENIUM_DRIVER ''' if SELENIUM_DRIVER == 'Firefox': firefoxOptions = webdriver.FirefoxOptions() firefoxOptions.headless = True desired_capabilities = firefoxOptions.to_capabilities() driver = webdriver.Firefox( desired_capabilities=desired_capabilities) else: # Chrome driver chrome_options = Options() chrome_options.headless = True driver = webdriver.Chrome(options=chrome_options) return driver def get_closed_elements(): ''' Returns all or some closed year and month elements, depending on the limit definitions. Returns ------- All or some closed year and month elements, depending on the limit definitions. ''' # Get all closed months of year to crawl, that are newer or equal to the limit specified by LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL if LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL: # get year element_of_YEAR_TO_CRAWL = driver.find_element_by_class_name( 'year-' + str(YEAR_TO_CRAWL)) # Get closed months xpath = ".//li[contains(@class, 'closed') and (contains(@class, 'month-12')" for month in range(LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL - 1, 12): month_plus_1 = month + 1 xpath += " or contains(@class, 'month-" + "{:02d}".format( month + 1) + "')" xpath = xpath + ")]" closed_elements = element_of_YEAR_TO_CRAWL.find_elements_by_xpath( xpath) closed_elements.append(element_of_YEAR_TO_CRAWL) # Get all closed months of year to crawl elif YEAR_TO_CRAWL: element_of_YEAR_TO_CRAWL = driver.find_element_by_class_name( 'year-' + str(YEAR_TO_CRAWL)) closed_elements = element_of_YEAR_TO_CRAWL.find_elements_by_class_name( 'closed') closed_elements.append(element_of_YEAR_TO_CRAWL) # Get all closed years/months of the entire archive else: # also finds closed months inside closed years closed_elements = driver.find_elements_by_class_name('closed') return closed_elements def waitForLoad(): ''' Wait until at 1 article per year has been loaded. If the current year is being crawled wait until an article of january or LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL has been loaded (Because the current month of the current year is already loaded on page load). ''' CURRENT_YEAR = datetime.now().year TIMEOUT = 20 wait = WebDriverWait(driver, TIMEOUT) try: # xpath for tag that with class 'date' and content that includes '2020' or '1.2020' or '<LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL>.2020', # depending on what is to be crawled xpath = "//a/div/div/div[contains(@class, 'date') and contains(string(), '" if YEAR_TO_CRAWL: # If the current year is crawled wait for an article of the first month to be loaded. # This is necessary because the current month is already loaded on page load. if YEAR_TO_CRAWL == CURRENT_YEAR: if LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL: xpath += str( LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL) + "." else: xpath += "1." 
xpath += str(YEAR_TO_CRAWL) + "')]" wait.until( EC.presence_of_element_located((By.XPATH, xpath))) # Wait for 1 artile per year else: base_xpath = xpath for i in range(2008, CURRENT_YEAR + 1): # xpath for tag with class 'date' and the content that includes the year i xpath = base_xpath + str(i) + "')]" wait.until( EC.presence_of_element_located((By.XPATH, xpath))) except TimeoutException as e: logging.warning( "TimeoutException has been thrown while waiting for articles to load: %s", e) def click_elements(elements): '''" Click all elements in elements Parameters ---------- elements: HTML Elements to be clicked ''' for element in elements: try: # element.click() causes Exception: "could not be scrolled into view" driver.execute_script("arguments[0].click();", element) # print("click: " + element.get_attribute('class').split()[1]) except Exception as e: logging.warning( "An exception has been thrown while clicking closed years/months: %s", e) driver = init_selenium_driver() driver.get(root) # Close all years/months click_elements(driver.find_elements_by_class_name('open')) # Open closed years/months to load articles click_elements(get_closed_elements()) # Wait for articles to be loaded waitForLoad() # Hand-off between Selenium and Scrapy sel = Selector(text=driver.page_source) # for all ul tags with class 'month-inner' get all contained li tags and get their direct a-tag children articleList = sel.xpath('//ul[@class="month-inner"]//li/a') articleList = utils.limit_crawl(articleList, TESTRUN_ARTICLES_LIMIT) if articleList: for article in articleList: # extract the value of the href attribute from article long_url = article.xpath('./@href').extract()[0] # extract the content of div-tags with class 'date' contained by article published_time = article.xpath( './/div[@class="date"]/text()').extract() published_time = published_time[0] if len( published_time) > 0 else '' if long_url and not utils.is_url_in_db(long_url): yield scrapy.Request(long_url, callback=self.parse_article, cb_kwargs=dict( long_url=long_url, published_time=published_time)) else: utils.log_event(utils(), self.name, long_url, 'exists', 'info') logging.info('%s already in db', long_url) # Quit the selenium driver and close every associated window driver.quit()
def parse(self, response): user_item = UserItem() user_item['crawl_time'] = int(time.time()) selector = Selector(response) user_item['_id'] = re.findall('(\d+)/info', response.url)[0] user_info_text = ";".join( selector.xpath('body/div[@class="c"]//text()').extract()) nick_name = re.findall('昵称;?:?(.*?);', user_info_text) gender = re.findall('性别;?:?(.*?);', user_info_text) place = re.findall('地区;?:?(.*?);', user_info_text) brief_introduction = re.findall('简介;?:?(.*?);', user_info_text) birthday = re.findall('生日;?:?(.*?);', user_info_text) sex_orientation = re.findall('性取向;?:?(.*?);', user_info_text) sentiment = re.findall('感情状况;?:?(.*?);', user_info_text) vip_level = re.findall('会员等级;?:?(.*?);', user_info_text) authentication = re.findall('认证;?:?(.*?);', user_info_text) labels = re.findall('标签;?:?(.*?)更多>>', user_info_text) if nick_name and nick_name[0]: user_item["nick_name"] = nick_name[0].replace(u"\xa0", "") if gender and gender[0]: user_item["gender"] = gender[0].replace(u"\xa0", "") if place and place[0]: place = place[0].replace(u"\xa0", "").split(" ") user_item["province"] = place[0] if len(place) > 1: user_item["city"] = place[1] if brief_introduction and brief_introduction[0]: user_item["brief_introduction"] = brief_introduction[0].replace( u"\xa0", "") if birthday and birthday[0]: user_item['birthday'] = birthday[0] if sex_orientation and sex_orientation[0]: if sex_orientation[0].replace(u"\xa0", "") == gender[0]: user_item["sex_orientation"] = "同性恋" else: user_item["sex_orientation"] = "异性恋" if sentiment and sentiment[0]: user_item["sentiment"] = sentiment[0].replace(u"\xa0", "") if vip_level and vip_level[0]: user_item["vip_level"] = vip_level[0].replace(u"\xa0", "") if authentication and authentication[0]: user_item["authentication"] = authentication[0].replace( u"\xa0", "") if labels and labels[0]: user_item["labels"] = labels[0].replace(u"\xa0", ",").replace(';', '').strip(',') education_info = selector.xpath('//div[contains(text(),"学习经历")]/following-sibling::div[1]'). \ xpath('string(.)').extract() if education_info: user_item['education'] = education_info[0].replace(u"\xa0", "") work_info = selector.xpath('//div[contains(text(),"工作经历")]/following-sibling::div[1]'). \ xpath('string(.)').extract() if work_info: user_item['work'] = work_info[0].replace(u"\xa0", "") request_meta = response.meta request_meta['item'] = user_item yield Request(self.base_url + '/u/{}'.format(user_item['_id']), callback=self.parse_further_information, meta=request_meta, dont_filter=True, priority=1)
def _select_data_table_from_page(self, pupdate_table): table_section = pupdate_table.xpath(f'//div[@class="{self.table_name}"]').extract() return Selector(text=table_section[0])
class JCpenneySpider(BaseCheckoutSpider): name = 'jcpenney_checkout_products' allowed_domains = ['jcpenney.com' ] # do not remove comment - used in find_spiders() SHOPPING_CART_URL = 'http://www.jcpenney.com/jsp/cart/viewShoppingBag.jsp' CHECKOUT_PAGE_URL = "https://www.jcpenney.com/dotcom/" \ "jsp/checkout/secure/checkout.jsp" def start_requests(self): yield scrapy.Request('http://www.jcpenney.com/') def _get_colors_names(self): swatches = self._find_by_xpath( '//ul[@class="small_swatches"]' '/li[not(@class="sku_not_available_select")]' '//a[not(span[@class="no_color"]) and ' 'not(span[@class="color_illegal"])]/img') return [x.get_attribute("name") for x in swatches] def select_size(self, element=None): default_attr_xpath = '*//div[@id="skuOptions_size"]//' \ 'li[@class="sku_select"]' avail_attr_xpath = '*//*[@id="skuOptions_size"]//' \ 'li[not(@class="sku_not_available" or @class="sku_illegal")]/a' self.select_attribute(default_attr_xpath, avail_attr_xpath, element) def select_color(self, element=None, color=None): default_attr_xpath = '*//li[@class="swatch_selected"]' avail_attr_xpath = ('*//*[@class="small_swatches"]' '//a[not(span[@class="no_color"]) and ' 'not(span[@class="color_illegal"])]') if color and color in self.available_colors: default_attr_xpath = '*//*[@class="small_swatches"]//a' \ '[img[@name="%s"]]' % color self.select_attribute(default_attr_xpath, avail_attr_xpath, element) self._find_by_xpath('//h1')[0].click() time.sleep(1) def click_condition(self, default_xpath, all_xpaths): return self._find_by_xpath(default_xpath) or self._find_by_xpath( all_xpaths) def select_attribute(self, default_attr_xpath, avail_attr_xpath, element): max_retries = 20 retries = 0 if self.click_condition(default_attr_xpath, avail_attr_xpath): self._click_attribute(default_attr_xpath, avail_attr_xpath, element) while self.driver.find_elements( By.ID, 'page_loader') and retries < max_retries: time.sleep(1) retries += 1 print(inspect.currentframe().f_back.f_code.co_name) def select_width(self, element=None): default_attr_xpath = '*//div[@id="skuOptions_width"]//' \ 'li[@class="sku_select"]' avail_attr_xpath = '*//*[@id="skuOptions_width"]//' \ 'li[not(@class="sku_not_available" or @class="sku_illegal")]/a' self.select_attribute(default_attr_xpath, avail_attr_xpath, element) def select_waist(self, element=None): default_attr_xpath = ( '*//*[@id="skuOptions_waist"]//li[@class="sku_select"]') avail_attr_xpath = ('*//*[@id="skuOptions_waist"]//' 'li[not(@class="sku_not_available" ' 'or @class="sku_illegal")]') self.select_attribute(default_attr_xpath, avail_attr_xpath, element) def select_inseam(self, element=None): default_attr_xpath = ( '*//*[@id="skuOptions_inseam"]//li[@class="sku_select"]') avail_attr_xpath = ('*//*[@id="skuOptions_inseam"]//' 'li[not(@class="sku_not_available" ' 'or @class="sku_illegal")]') self.select_attribute(default_attr_xpath, avail_attr_xpath, element) def select_neck(self, element=None): default_attr_xpath = ( '*//*[@id="skuOptions_neck size"]//li[@class="sku_select"]') avail_attr_xpath = ('*//*[@id="skuOptions_neck size"]//' 'li[not(@class="sku_not_available" ' 'or @class="sku_illegal")]') self.select_attribute(default_attr_xpath, avail_attr_xpath, element) def select_sleeve(self, element=None): default_attr_xpath = ( '*//*[@id="skuOptions_sleeve"]//li[@class="sku_select"]') avail_attr_xpath = ('*//*[@id="skuOptions_sleeve"]//' 'li[not(@class="sku_not_available" ' 'or @class="sku_illegal")]') self.select_attribute(default_attr_xpath, avail_attr_xpath, element) def 
_parse_attributes(self, product, color, quantity): time.sleep(10) self.select_color(product, color) self.select_size(product) self.select_width(product) self.select_waist(product) self.select_inseam(product) self.select_neck(product) self.select_sleeve(product) self._set_quantity(product, quantity) def _get_products(self): return self._find_by_xpath( '//*[@id="regularPP"]|//*[contains(@class,"product_row")]') def _add_to_cart(self): addtobagbopus = self._find_by_xpath('//*[@id="addtobagbopus"]') addtobag = self._find_by_xpath('//*[@id="addtobag"]') if addtobagbopus: self._click_on_element_with_id('addtobagbopus') elif addtobag: self._click_on_element_with_id('addtobag') time.sleep(5) def _do_others_actions(self): skip_this_offer = self._find_by_xpath( '//a[contains(@href,"javascript:skipThisOffer")]') if skip_this_offer: skip_this_offer[0].click() time.sleep(4) def _set_quantity(self, product, quantity): quantity_option = Select( self.driver.find_element_by_xpath('*//*[@name="prod_quantity"]')) try: quantity_option.select_by_value(str(quantity)) quantity_selected = quantity_option.first_selected_option.text if quantity_selected != str(quantity): time.sleep(4) self.log('Quantity "{}" selected'.format(quantity)) except: pass def _get_product_list_cart(self): time.sleep(1) self.page_source = self.driver.page_source self.page_selector = Selector(text=self.page_source) try: item_info = re.findall('var jcpORDERJSONjcp = (\{.+?\});', self.page_source, re.MULTILINE)[0] self.item_info = json.loads(item_info) return self.item_info except IndexError: return None def _get_products_in_cart(self, product_list): return product_list.get('purchasedItems') def _get_subtotal(self): return self.item_info.get('merchantTotalWithSavings') def _get_total(self): return self.item_info.get('orderTotal') def _get_item_name(self, item): return item.get('displayName') def _get_item_id(self, item): return item.get('itemNumber')[2:] def _get_item_price(self, item): return str(item.get('lineTotalPrice')) def _get_item_price_on_page(self, item): price_on_page_from_json = float(item.get('lineUnitPrice')) price_on_page_from_html = self.page_selector.xpath( '//span[contains(@data-anid, "product_CurrentSellingPrice")]/text()' ).re(FLOATING_POINT_RGEX) price_on_page_from_html = float(is_empty(price_on_page_from_html, 0)) return price_on_page_from_json if price_on_page_from_json >= 0 else price_on_page_from_html def _get_item_color(self, item): selector = scrapy.Selector(text=self.page_source) color_new = is_empty( selector.xpath( '//span[@class="size" and ' 'contains(text(),"color:")]/text()').re('color\:\n(.+)')) color_old = is_empty( selector.xpath( '//span[@class="size" and contains(text(),"color:")]' '/strong/text()').extract()) return color_new or color_old def _get_item_quantity(self, item): return item.get('quantity') def _enter_promo_code(self, promo_code): self.log('Enter promo code: {}'.format(promo_code)) promo_field = self._find_by_xpath('//*[@id="cr-code"]')[0] promo_field.send_keys(promo_code) time.sleep(2) promo_field.send_keys(Keys.ENTER) time.sleep(5) self.driver.refresh() time.sleep(5) self.item_info = self._get_product_list_cart() def _remove_promo_code(self): self.log('Remove promo code') try: remove_field = self._find_by_xpath( '//a[@title="remove" and @class="cr-remove"]')[0] remove_field.click() time.sleep(10) except IndexError: self.log('Invalid promo code') def _get_promo_total(self): return self._get_total() def _get_promo_subtotal(self): return str(self._get_subtotal()) def 
_parse_no_longer_available(self): return bool(self._find_by_xpath('//*[@class="error_holder"]'))
def parse(self, response): page = Selector(response) review_boxes = page.xpath( '//ul[@class="ylist ylist-bordered reviews"]/li') del review_boxes[0] for review_box in review_boxes: rv = Review() rv.business_id = self.biz_id rv.user_id = review_box.xpath( './/li[@class="user-name"]/a/@href').extract_first() if rv.user_id != None: user_url = rv.user_id rv.user_id = rv.user_id[rv.user_id.rfind("=") + 1:] if (self.session.query(YelpUser).filter( YelpUser.yelp_id == rv.user_id).count() == 0): user = self.fetch_userdata('https://www.yelp.com' + user_url) self.session.add(user) else: user = YelpUser() user.yelp_id = None user.name = "Qype User" user.location = review_box.xpath( './/li[@class="user-location responsive-hidden-small"]/b/text()' ).extract_first().strip() user.photos_count = review_box.xpath( './/li[@class="photo-count responsive-small-display-inline-block"]/b/text()' ).extract_first() user.friends_count = review_box.xpath( './/li[@class="friend-count responsive-small-display-inline-block"]/b/text()' ).extract_first() user.reviews_count = review_box.xpath( './/li[@class="review-count responsive-small-display-inline-block"]/b/text()' ).extract_first() user.meta = None self.session.add(user) rv.text = review_box.xpath( './/div[@class="review-content"]/p/text()').extract_first() rv.rating = review_box.xpath( './/div[@class="review-content"]/div[@class="biz-rating biz-rating-large clearfix"]/div/div/@title' ).extract_first() rv.rating = rv.rating[0:rv.rating.find(" ")] rv.date = review_box.xpath( './/div[@class="review-content"]/span[@class="rating-qualifier"]/text()' ).extract_first() self.session.add(rv) if (self.session.query(CrawlData).filter( CrawlData.url == response.url).count() != 0): crawl_data = CrawlData() crawl_data.body = response.body crawl_data.requestHeader = str(response.request.headers) crawl_data.url = response.url self.session.add(crawl_data) self.session.commit() next_page = page.xpath('//link[@rel="next"]/@href').extract_first() if (next_page != None): yield response.follow(next_page, self.parse)
def parse_house_info(self, resp):
    """
    Parse a second-hand house detail page
    :return:
    """
    item = dict()
    response = Selector(resp)
    generalXpath = "//span[text()='{}']/../text()"
    # Lianjia listing number
    item['houseCode'] = response.xpath(
        "//div[@class='houseRecord']/span[2]/text()").extract_first(
            "").strip()
    # Residential community name
    item['houseName'] = response.xpath(
        "//div[@class='communityName']/a[1]/text()").extract_first(
            "").strip()
    # Orientation
    item['houseDirection'] = response.xpath(
        generalXpath.format("房屋朝向")).extract_first("").strip()
    # Floor plan
    item['houseType'] = response.xpath(
        generalXpath.format("房屋户型")).extract_first("").strip()
    # Elevator
    item['houseElevator'] = response.xpath(
        generalXpath.format("配备电梯")).extract_first("").strip()
    # Location (district / area)
    item['houseAddress'] = response.xpath(
        "//div[@class='areaName']/a/text()").extract_first("").strip()
    item['houseDistrict'] = response.xpath(
        "//div[@class='areaName']/span[@class='info']/a[2]/text()"
    ).extract_first("").strip()
    item['houseRegion'] = response.xpath(
        "//div[@class='areaName']/span[@class='info']/a[1]/text()"
    ).extract_first("").strip()
    # Floor
    item['houseFloor'] = response.xpath(
        generalXpath.format("所在楼层")).extract_first("").strip()
    # Built area
    item['houseSize'] = response.xpath(
        generalXpath.format("建筑面积")).extract_first("").strip()
    # Renovation status
    item['houseStatus'] = response.xpath(
        generalXpath.format("装修情况")).extract_first("").strip()
    # Price per square meter
    item['houseUnitPrice'] = response.xpath(
        "//span[@class='unitPriceValue']/text()").extract_first(
            "").strip()
    # Total price
    item['houseAllPrice'] = response.xpath(
        "//div[@class='price ']/span[@class='total']/text()"
    ).extract_first("").strip()
    # Year built
    item['houseYear'] = response.xpath(
        "//div[@class='area']/div[@class='subInfo']/text()").re_first(
            r"(\d+)")
    # Source URL
    item['url'] = resp.url
    # Longitude and latitude
    positions = self.pattern_position.search(resp.text)  # extract coordinates
    item['Longitude'] = positions.group(1)
    item['Latitude'] = positions.group(2)
    self.db.update_set('houseCode', item)
    self.lianjia_spider_log.info(f'parse item success:{resp.url}')
def get_hxs(url):
    text = requests.get(url).text
    hxs = Selector(text=text)
    return hxs
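# A short, hypothetical usage sketch for the helper above. It assumes only that requests and scrapy
# are installed and that get_hxs is defined as shown; the URL and the XPath expressions are
# illustrative placeholders, not taken from the original code. The imports are repeated here because
# the helper itself relies on them.
from scrapy import Selector
import requests

hxs = get_hxs('https://example.com')
title = hxs.xpath('//title/text()').get()    # page <title> text, or None if missing
links = hxs.xpath('//a/@href').getall()      # every href on the page
print(title, links[:5])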
def fetch_userdata(self, url):
    user = YelpUser()
    response = requests.get(url)
    page = Selector(response)
    user.yelp_id = url[url.rfind('=') + 1:]
    user.name = page.xpath(
        '//div[@class="user-profile_info arrange_unit"]/h1/text()'
    ).extract_first()
    user.location = page.xpath(
        '//div[@class="user-profile_info arrange_unit"]/h3/text()'
    ).extract_first()
    user.tagline = page.xpath(
        '//p[@class="user-tagline"]/text()').extract_first()
    user.friends_count = page.xpath(
        '//li[@class="friend-count"]/strong/text()').extract_first()
    user.reviews_count = page.xpath(
        '//li[@class="review-count"]/strong/text()').extract_first()
    user.photos_count = page.xpath(
        '//li[@class="photo-count"]/strong/text()').extract_first()
    user.image_url = page.xpath(
        '//div[@class="user-profile_avatar"]//img/@src').extract_first()

    if MUST_DOWNLOAD_USER_IMAGE:
        if not os.path.exists(BASE_DIR + '/UserImages'):
            os.mkdir(BASE_DIR + '/UserImages')
        with open(BASE_DIR + 'UserImages/' + user.yelp_id + '.jpg',
                  'wb') as f:
            # write the raw image bytes, not the Response object itself
            f.write(requests.get(user.image_url).content)
        user.image_path = BASE_DIR + 'UserImages/' + user.yelp_id + '.jpg'

    sidebar = page.xpath('//div[@class="user-details-overview_sidebar"]')
    extra_data = {}
    for ysection in sidebar.xpath('.//div[@class="ysection"]'):
        key = ysection.xpath('.//h4/text()').extract_first()
        if key == 'Rating Distribution':
            stars_distribution = ysection.xpath(
                './/td[@class="histogram_count"]/text()').extract()
            extra_data[key] = dict()
            extra_data[key]['5 stars'] = stars_distribution[0]
            extra_data[key]['4 stars'] = stars_distribution[1]
            extra_data[key]['3 stars'] = stars_distribution[2]
            extra_data[key]['2 stars'] = stars_distribution[3]
            extra_data[key]['1 stars'] = stars_distribution[4]
        elif key == 'Review Votes' or key == 'Stats':
            items = ysection.xpath('.//ul/li')
            items_title = ysection.xpath(
                './/ul/li/text()[not(normalize-space(.)="")]').extract()
            extra_data[key] = dict()
            # titles are stripped when used as keys below
            for title, item in dict(zip(items_title, items)).items():
                extra_data[key][title.strip()] = item.xpath(
                    './/strong/text()').extract_first()
        elif key.find('Compliments') != -1:
            items = ysection.xpath('.//li')
            extra_data['Compliments'] = dict()
            for item in items:
                compliment = item.xpath('.//span/@class').extract_first()
                extra_data['Compliments'][
                    self.compliments[compliment]] = item.xpath(
                        './/small/text()').extract_first()
    user.meta = json.dumps(extra_data)
    return user
def _all_exhibit_rows(self): return Selector(text=self._content).xpath( '//table[@class="apexir_WORKSHEET_DATA"]/tr[@class="even"] | ' + \ '//table[@class="apexir_WORKSHEET_DATA"]/tr[@class="odd"]')
def parse_details(self, response): product = response.meta["product"] hxs = Selector(response) # Get standard shipping fee shipping_fee = hxs.xpath( './/div[@class="shpp_opt"]/p[@name="delivery_option_no" and text()[contains(.,"Qxpress")]]/em/text()').extract_first() if not shipping_fee: shipping_fee = hxs.xpath( './/div[@class="shpp_opt"]/ul/li/label[text()[contains(.,"Qxpress")]]/em/text()').extract_first() if not shipping_fee: shipping_fee = hxs.xpath( './/div[@class="shpp_opt"]/ul/li/label/em/text()').extract_first() if shipping_fee: product['shipping_fee'] = shipping_fee # Get number of reviews review = hxs.xpath('.//a[@tab_name="CustomerReview"]/em/text()').extract_first() if review: product['review'] = review # Get seller rating # Format 4.1 / 5 seller_rating = hxs.xpath('//span[@class="on"]/text()').extract_first() if seller_rating: product["seller_rating"] = seller_rating.split(" ")[-1] # Get oversea location location = hxs.xpath('//dl[@name="shipping_panel_area"]/dd/text()').extract_first() if location: product["local_overseas"] = location # Get sku category category_list = hxs.xpath('//span[@itemprop="name"]/text()').extract() if category_list: for level in range(0, len(category_list)): if level > 2: break product["category_level%s" % str(level + 1)] = category_list[level] # Get variations variation_list = hxs.xpath( '//div[@id="inventory_layer_0"]/div[@class="innerWrap"]/div[@class="select_inner"]/ul/li/a/span/text()').extract() if not variation_list: variation_list = hxs.xpath( '//div[@id="opt_layer_0"]/div[@class="innerWrap"]/div[@class="select_inner"]/ul/li/a/span/text()').extract() if variation_list: max_variations = 10 for i in range(0, min(max_variations, len(variation_list))): if '----' not in variation_list[i]: product["V%s" % str(i + 1)] = variation_list[i] quantity = re.search(r'(Qty\s\:\s)([0-9]+)([\w]*)', variation_list[i]) if quantity: product["Q%s" % str(i + 1)] = quantity.group(2) price = re.search(r'(.+\()([+-]\$[0-9]+\.[0-9]+)(\)\w*)', variation_list[i]) if price: product["P%s" % str(i + 1)] = price.group(2) yield product
) driver.get("https://www.matrimonio.com/wedding-planner") time.sleep(2) try: cookiesBtnElem = driver.find_element_by_xpath("//button[text()='Accetta']") driver.execute_script("arguments[0].click()", cookiesBtnElem) time.sleep(1) except: pass while True: pageCntr += 1 html = driver.page_source respObj = Selector(text=html) #if pageCntr > 27: cards = respObj.xpath("//div[@data-list-type='Catalog']/div[@id]") for card in cards: urlList.append(card.xpath(".//a[contains(@id, 'app_lnk')]/@href").get()) nextPageType1 = respObj.xpath(f"//a[@data-page and text()='{pageCntr}']") nextPageType2 = respObj.xpath(f"//span[contains(@class, 'pagination') and text()='{pageCntr}']") if nextPageType1: nextBtnElem = driver.find_element_by_xpath(f"//a[@data-page and text()='{pageCntr}']") driver.execute_script("arguments[0].click()", nextBtnElem) time.sleep(2) print(f"\n\n PAGE-{pageCntr}") elif nextPageType2:
def _all_principal_td(self): page_selector = Selector(text=self._content) return page_selector.xpath( '//td[starts-with(@headers, "LINK BREAK_COUNTRY_NAME")]')
# Beautiful Soup, pyquery and regular expressions can all be used to extract data from web pages.
# Scrapy also ships its own extraction tool: Selector. It is built on lxml and supports XPath
# selectors, CSS selectors and regular expressions, with very high parsing speed and accuracy.
# 1. Standalone use: Selector is an independent module, so a selector object can be built directly
#    from the Selector class and data extracted by calling methods such as xpath or css.
# For a piece of HTML, a Selector object can be constructed like this to extract data:
from scrapy import Selector

body = '<html><head><title>Hello World</title></head><body></body></html>'
selector = Selector(text=body)
# look up the text inside <title>; appending text() to the XPath extracts the text node
title = selector.xpath('//title/text()').extract_first()
print(title)

# This does not run inside the Scrapy framework: Selector is taken out of Scrapy and used on its own.
# Passing the text argument at construction time produces a Selector object, and data is then
# extracted with xpath or css exactly as in a Scrapy callback.
# 2. Scrapy shell: Selector is mostly used together with Scrapy. In a Scrapy callback, xpath or css
#    can be called directly on the response, so the Scrapy shell is a convenient way to simulate the
#    Scrapy request process and get familiar with the extraction methods.
# Using the sample page from the official documentation: http://doc.scrapy.org/en/latest/_static/selectors-sample1.html
# Start the Scrapy shell by typing on the command line:
scrapy shell http://doc.scrapy.org/en/latest/_static/selectors-sample1.html
# This enters Scrapy shell mode: Scrapy issues one request to the URL given on the command line and
# hands back the request and response variables. Commands can then be typed at the prompt to call
# methods on these objects, and the results are shown immediately after pressing Enter.
# The examples below all analyse the source of that page, which looks like this:
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
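# A hedged continuation of the notes above: these lines would be typed at the Scrapy shell prompt
# opened against the sample page, where `response` is already available. The selectors rely only on
# the markup shown above; the expected results are indicated in the comments.
response.xpath('//title/text()').extract_first()           # 'Example website'
response.css('title::text').extract_first()                # same result via a CSS selector
response.xpath('//div[@id="images"]/a/@href').extract()    # hrefs of the image links
response.css('a[href^="image"]::attr(href)').re(r'image(\d+)\.html')   # just the digits, via a regex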
def parse(self, response): """ 解析 """ sel = Selector(text=response.body) print len(sel.xpath(u"//b[text()='单位名称']"))!= 0, "parse 条件" log.msg("parse 条件=%s"%str(len(sel.xpath(u"//b[text()='单位名称']")) != 0), level=log.INFO) if (len(sel.xpath(u"//b[text()='单位名称']")) != 0): #判别是否为要输入验证码 pass else: log.msg("code=%s, %s"%(str(response.status),response.body), level=log.INFO) raise UnknownResponseError #======================================================== """ 第一部分:企业信用档案 """ item = DetailInformation() item['basic_info'] = fundation_info_extract(response) #======================================================== #======================================================== """ 第一部分 政府监管信息 """ item['regulatory_info'] = extract_combine_JCXX(response) #======================================================== #======================================================== """ 第三部分 行业评价信息 """ keywords_list = ['2-1.体系/产品/行业认证信息', '2-2.行业协会(社会组织)评价信息',\ '2-3.水电气通讯等公共事业单位评价'] item['envaluated_info'] = block_info_extract(response,\ keywords_list) #======================================================== """ 第四部分 媒体评价信息 """ keywords_list = ['3-1.媒体评价信息'] item['media_env'] = block_info_extract(response, keywords_list) #======================================================== """ 第五部分 金融信贷信息 """ #url = 'http://www.11315.com/\ #getTradeLendingCount?companyId=%s'%response.url[7:15] #header = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36", # 'Referer':response.url} #req = urllib2.Request(url=url, headers=header) #xtml = urllib2.urlopen(req) #Nums = xtml.read() #print Nums, "this is Nums" #Nums = eval(Nums).split(",") #print Nums, "this is anothor Nums" #total = str(sum([int(i) for i in Nums])) #Nums.insert(0, total) #在头部插入 #if total == '0': # t_url = "" #else: # t_url = sel.xpath(u"//script").re(ur"html\(\'<a href=\"([\w\W]*?)\"")[0] #Nums.append(t_url) #Nums_re = "|".join(Nums) keywords_list = ['4-2.民间借贷评价信息'] item["credit_fin"] = block_info_extract(response, keywords_list) #======================================================= """ 第六部分 企业运营信息 """ #keywords_list = ['5-3.水电煤气电话费信息', #'5-4.纳税信息'] #要么运行js,要么模拟请求,破网站,就两行数据至于吗 #item['operation_info'] = block_info_extract(response, keywords_list) #======================================================== """ 第七部分 市场反馈信息 """ keywords_list = ['6-1.消费者评价信息', '6-2.企业之间履约评价','6-3.员工评价信息', '6-4.其他'] item['feedback_info'] = block_info_extract(response, keywords_list) #======================================================== return item
def main(): adsl = ADSL() result = [] df_input = pd.read_excel('sku.xlsx') sku_list = df_input['sku'].values start = 0 length = len(sku_list) while 1: if start == length: break print('正在爬取第{}条'.format(start + 1)) sku = sku_list[start] options = webdriver.ChromeOptions() options.add_argument( '--user-agent=Mozilla/5.0 (Windows NT 999999.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36' ) options.add_argument('--headless') options.add_argument('--ignore-certificate-errors') options.add_argument('--disable-gpu') driver = webdriver.Chrome(executable_path=r'chromedriver.exe', chrome_options=options) wait = WebDriverWait(driver, TIMEOUT) # 等待加载最长时间 url = 'https://item.jd.com/{}.html'.format(sku) try: driver.get(url) except Exception as e: print(e) start += 1 continue try: wait.until( EC.presence_of_element_located( (By.XPATH, '//a[@id="InitCartUrl"]'))) except: print('访问超时,重试') start += 1 continue text = driver.page_source resp = Selector(text=text) title = resp.xpath('//div[@class="sku-name"]/text()').extract() if len(title) > 1: title = title[1].strip() else: title = title[0].strip() price = resp.xpath( '//span[@class="p-price"]/span[2]/text()').extract_first() comment = resp.xpath( '//div[@id="comment-count"]/a/text()').extract_first() try: activity_type = resp.xpath( '//div[@class="activity-type"]/strong/text()').extract_first() except: activity_type = None area = resp.xpath( '//div[@class="ui-area-text"]/text()').extract_first() store = resp.xpath( '//div[@id="store-prompt"]/strong/text()').extract_first() d = {} d['title'] = title d['price'] = price d['comment'] = comment d['activity_type'] = activity_type d['area'] = area d['store'] = store d['sku'] = str(sku) d['url'] = url result.append(d) time.sleep(2 * random.randint(2, 6)) driver.close() start += 1 adsl.reconnect() df = pd.DataFrame(result) df.to_csv(output_filename, encoding='gbk', mode='a', header=False) print('爬取结束,共爬取了{}条'.format(length))
# -*- coding: utf-8 -*- from scrapy import Selector import requests response = requests.get("https://www.baidu.com").text select = Selector(text=response) title = select.xpath("//title/text()").extract_first() print(title)
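# For comparison, a minimal sketch of the same standalone pattern using the CSS selector API and the
# newer .get() shorthand; the target URL is the same as above and nothing else is assumed (encoding
# quirks of the page are ignored here).
from scrapy import Selector
import requests

html = requests.get("https://www.baidu.com").text
sel = Selector(text=html)
# ::text selects the text node; .get() is the modern alias for .extract_first()
print(sel.css("title::text").get())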
def parse(self, response):
    pagesource = Selector(response)
    tax_rate = .01
    interest = 0.0435
    loan_term = 30
    insurance = .5
    dp_percentage = 0.25
    total_page = re.findall(
        r"\d+",
        response.xpath('//span[@class="pageText"]//text()').extract()[0])[1]
    current_page = re.findall(
        r"\d+",
        response.xpath('//span[@class="pageText"]//text()').extract()[0])[0]
    search_results = pagesource.xpath(
        "//div[@class='MapHomeCardReact HomeCard']")
    for search in search_results:
        entry = RedfinTestItem()
        entry['price'] = float(''.join(
            re.findall(
                r"\d+",
                search.xpath(
                    './/span[@data-rf-test-name="homecard-price"]//text()'
                ).extract()[0])))
        entry['street'] = search.xpath(
            './/span[@data-rf-test-id="abp-streetLine"]//text()').extract(
            )[0]
        entry['citystatezip'] = search.xpath(
            './/span[@data-rf-test-id="abp-cityStateZip"]//text()'
        ).extract()[0]
        entry['zipcode'] = re.findall(
            r"\d+",
            search.xpath(
                './/span[@data-rf-test-id="abp-cityStateZip"]//text()').
            extract()[0])
        entry['HOA'] = ''.join(
            re.findall(
                r"\d+",
                search.xpath(
                    './/span[@data-rf-test-name="homecard-amenities-hoa"]//text()'
                ).extract()[0]))
        entry['Beds'] = ''.join(
            search.xpath('.//div[@class="value"]//text()').extract()[0])
        entry['Baths'] = ''.join(
            search.xpath('.//div[@class="value"]//text()').extract()[1])
        entry['SQFT'] = ''.join(
            search.xpath('.//div[@class="value"]//text()').extract()[2])
        entry['year_built'] = search.xpath(
            './/span[@data-rf-test-name="homecard-amenities-year-built"]//text()'
        ).extract()[0]
        entry['rent'] = get_rent(str(entry['street']), str(entry['zipcode']))
        # loan principal is the price minus the down payment percentage
        entry['mortgage_pmt'] = float(
            Loan(entry['price'] * (1 - dp_percentage), interest,
                 loan_term).monthly_payment)
        entry['insurance'] = insurance * make_float(entry['SQFT'])
        if entry['insurance'] == 0:
            entry['insurance'] = 60
        entry['tax'] = entry['price'] * tax_rate / 12
        entry['total_pmt'] = make_float(
            entry['HOA']
        ) + entry['mortgage_pmt'] + entry['insurance'] + entry['tax']
        entry['cashflow'] = get_cashflow(entry['rent'], entry['total_pmt'])  #, entry['price_estimate']
        yield entry

    if int(total_page) > int(current_page):
        if int(current_page) == 1:
            next_page = response.url + "/page-2"
        else:
            next_page = re.sub(r"[page-][\d]+",
                               "-" + str(int(current_page) + 1),
                               response.url)
        yield Request(next_page, callback=self.parse)
def amica(report_label, product, model): from globals import file_path if product[7].startswith('http'): page_address = product[7] driver.get(product[7]) html = requests.get(product[7]).content sel = Selector(text=html) else: search = product[1][product[1].lower().find('amica') + len('amica') + 1:] amica_link = f'https://www.amica.pl/szukaj/{search}' driver.get(amica_link) html = requests.get(amica_link).content sel = Selector(text=html) # Znajdź model na stronie Amica try: for i in range(len(sel.xpath('//div[@class="container"]'))): if driver.find_element_by_xpath( f'//h3[@class="prodSymbol"][{i + 1}]').text == model: page_address = driver.find_element_by_xpath( f'//h3[@class="prodSymbol"][{i + 1}]/a').get_attribute( 'href') break except NoSuchElementException: report_label[ 'text'] += f"Nie znaleziono {model} na stronie Amica. Pomijam go." return -1 driver.find_element_by_css_selector( '#produkty > div.moreProducts > div > div > div > div > div > div > div.image > a' ).click() sleep(1) driver.find_element_by_css_selector( '#menu01 > div > div.product-view__media > img').click() first = driver.find_element_by_css_selector( '#prod_app > div.medialightbox__overlay > div > div.cool-lightbox__inner > div.cool-lightbox__wrapper > ' 'div > div > img').get_attribute('src') # Zapisywanie i obrabianie zdjęc do miniaturek i = 0 while i < 15: if i == 0: res = requests.get(first) else: desc_img = driver.find_element_by_css_selector( '#prod_app > div.medialightbox__overlay > div > div.cool-lightbox__inner > div.cool-lightbox__wrapper ' '> div > div > img').get_attribute('src') if desc_img == first: break res = requests.get(desc_img) with open(f'{file_path}/{model}/obrazki_produktu/{i}.jpg', 'wb') as file_format: file_format.write(res.content) try: driver.find_element_by_xpath( '//*[@id="prod_app"]/div[4]/div/div[2]/div[2]/button[2]/div' ).click() except ElementNotInteractableException: pass sleep(1) i = i + 1 for y in range(i): im = Image.open(f'{file_path}/{model}/obrazki_produktu/{y}.jpg') file_format = im.format width, height = im.size if width > height: ratio = width / 600 else: ratio = height / 600 new_width = round(width / ratio) new_height = round(height / ratio) im = im.resize((new_width, new_height)) if file_format == 'PNG': im.save(f'{file_path}/{model}/obrazki_produktu/{y}.jpg', 'PNG') elif file_format == 'JPEG': im.save(f'{file_path}/{model}/obrazki_produktu/{y}.jpg', 'JPEG') else: print(f"Nie umiem zrobić zdjęcia nr {y} :'( (typ {file_format})") driver.find_element_by_tag_name('body').send_keys(Keys.ESCAPE) html = requests.get(page_address).content sel = Selector(text=html) raw = sel.xpath('/html/body/div[1]/script[4]/text()').extract() for i in range(len(raw)): raw[i] = raw[i].replace('\n', '') raw[i] = raw[i].replace('\t', '') raw[i] = raw[i].replace('\xa0', '') raw[i] = raw[i].replace('\r', '') raw[i] = raw[i].replace(' ', '') t = raw[0] t = t[t.find('"descTitle":'):] t = t[:t.find('}]}')] desc = [] imgs = [] while t.find('"descTitle":') != -1: t = t[t.find('"descTitle":') + 13:] desc.append(t[:t.find('"')]) t = t[t.find('"descIconUrl":') + 15:] imgs.append(t[:t.find('"')]) t = t[t.find('"descText":') + 12:] desc.append(t[:t.find('"')]) for i in range(len(imgs)): imgs[i] = imgs[i].replace('\\', '') # pobieranie zdjęć z opisu na dysk lokalny for i, img in enumerate(imgs): res = requests.get(img) with open(f'{file_path}/{model}/obrazki_opisu/{i}.jpg', 'wb') as file_format: file_format.write(res.content) for i in range(len(desc)): desc[i] = desc[i].replace('\\u0105', 'ą') desc[i] = 
desc[i].replace('\\u0119', 'ę') desc[i] = desc[i].replace('\\u0107', 'ć') desc[i] = desc[i].replace('\\u0144', 'ń') desc[i] = desc[i].replace('\\u015b', 'ś') desc[i] = desc[i].replace('\\u015a', 'Ś') desc[i] = desc[i].replace('\\u00f3', 'ó') desc[i] = desc[i].replace('\\u0141', 'Ł') desc[i] = desc[i].replace('\\u0142', 'ł') desc[i] = desc[i].replace('\\u017a', 'ź') desc[i] = desc[i].replace('\\u017b', 'Ż') desc[i] = desc[i].replace('\\u017c', 'ż') desc[i] = desc[i].replace('\\u017', 'Ź') desc[i] = desc[i].replace('\\u00ae', '®') desc[i] = desc[i].replace('\\u00b0', '°') desc[i] = desc[i].replace('\u00b0', '°') desc[i] = desc[i].replace('\u2070', '°') desc[i] = desc[i].replace('\\u2070', '°') desc[i] = desc[i].replace('\\u2013', '-') desc[i] = desc[i].replace('\u2013', '-') desc[i] = desc[i].replace('\\u2026', '...') desc[i] = desc[i].replace('\u2026', '...') desc[i] = desc[i].replace('\\n', '') desc[i] = desc[i].replace('\\/', '/') j = 0 fin = ['<div class="product-description-section">'] for i in range(0, len(desc), 6): fin.append('<div class="three-col-equaly">') try: fin.append( f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/' f'{model}/{j}.jpg"/><br/><h2 class="important-header">{desc[i]}</h2>' ) fin.append(f'<p style="font-size: large;">{desc[i + 1]}</p></div>') fin.append( f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/' f'{model}/{j + 1}.jpg"/><br/><h2 class="important-header"> {desc[i + 2]}</h2>' ) fin.append(f'<p style="font-size: large;">{desc[i + 3]}</p></div>') fin.append( f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/' f'{model}/{j + 2}.jpg"/><br/><h2 class="important-header"> {desc[i + 4]}</h2>' ) fin.append(f'<p style="font-size: large;">{desc[i + 5]}</p></div>') except IndexError: pass finally: fin.append('</div>') j = j + 3 fin.append('</div>') reg = ''.join(fin) reg = reg.replace( '*Zdjęcie ma charakter poglądowy i może nie przedstawiać dokładnego modelu produktu.', '') print("------------ OPIS GRAFICZNY ------------") print(reg + '\n\n') """ OPIS TECHNICZNY """ html = requests.get(page_address).content sel = Selector(text=html) tech_raw = sel.xpath('/html/body/div[1]/script[4]/text()').extract() tech_raw2 = tech_raw[0] tech_d = tech_raw2[tech_raw2.find('"attrGroupData"'):tech_raw2. 
find('"docFilesDataList"')] tech_desc_1 = [] while tech_d.find('"attrName":') != -1: tech_d = tech_d[tech_d.find('"attrName":') + 12:] tech_desc_1.append(tech_d[:tech_d.find('"')]) tech_d = tech_d[tech_d.find('"attrValue":') + 13:] tech_desc_1.append(tech_d[:tech_d.find('"')]) tech_d2 = tech_d[tech_d.find(tech_desc_1[-1]):] tech_desc_2 = [] while tech_d2.find('"attrValue":') != -1: tech_d2 = tech_d2[tech_d2.find('"attrValue":') + 13:] tech_desc_2.append(tech_d2[:tech_d2.find('"')]) tech_desc = [ '<table id="plan_b" class="data-table"><tbody><tr class="specs_category"><td ' 'colspan="2">Specyfikacja</td></tr>' ] for i in range(0, len(tech_desc_1), 2): tech_desc.append(f'<tr><td class="c_left">{tech_desc_1[i]}</td>') tech_desc.append(f'<td class="c_left">{tech_desc_1[i + 1]}</td></tr>') for i in range(len(tech_desc_2)): if i == 0: tech_desc.append(f'<tr><td class="c_left">Funkcje</td>') tech_desc.append(f'<td class="c_left">{tech_desc_2[i]}</td></tr>') else: tech_desc.append(f'<tr><td class="c_left"></td>') tech_desc.append(f'<td class="c_left">{tech_desc_2[i]}</td></tr>') tech_desc.append('</tbody></table>') for i in range(len(tech_desc)): tech_desc[i] = tech_desc[i].replace('\\u0105', 'ą') tech_desc[i] = tech_desc[i].replace('\\u0119', 'ę') tech_desc[i] = tech_desc[i].replace('\\u0107', 'ć') tech_desc[i] = tech_desc[i].replace('\\u0144', 'ń') tech_desc[i] = tech_desc[i].replace('\\u015b', 'ś') tech_desc[i] = tech_desc[i].replace('\\u015a', 'Ś') tech_desc[i] = tech_desc[i].replace('\\u00f3', 'ó') tech_desc[i] = tech_desc[i].replace('\\u0141', 'Ł') tech_desc[i] = tech_desc[i].replace('\\u0142', 'ł') tech_desc[i] = tech_desc[i].replace('\\u017a', 'ź') tech_desc[i] = tech_desc[i].replace('\\u017b', 'Ż') tech_desc[i] = tech_desc[i].replace('\\u017c', 'ż') tech_desc[i] = tech_desc[i].replace('\\u017', 'Ź') tech_desc[i] = tech_desc[i].replace('\\u00ae', '®') tech_desc[i] = tech_desc[i].replace('\\u00b0', '°') tech_desc[i] = tech_desc[i].replace('\u00b0', '°') tech_desc[i] = tech_desc[i].replace('\u2070', '°') tech_desc[i] = tech_desc[i].replace('\\u2070', '°') tech_desc[i] = tech_desc[i].replace('\\u2013', '-') tech_desc[i] = tech_desc[i].replace('\u2013', '-') tech_desc[i] = tech_desc[i].replace('\\u2026', '...') tech_desc[i] = tech_desc[i].replace('\u2026', '...') tech_desc[i] = tech_desc[i].replace('\\n', '') tech_desc[i] = tech_desc[i].replace('\\/', '/') tech_desc[i] = tech_desc[i].replace(':', '') tech = ''.join(tech_desc) print('------------ OPIS TECHNICZNY ------------') print(tech + '\n\n') """ OPIS KRÓTKI """ for i in range(len(tech_desc_1)): tech_desc_1[i] = tech_desc_1[i].replace('\\u0105', 'ą') tech_desc_1[i] = tech_desc_1[i].replace('\\u0119', 'ę') tech_desc_1[i] = tech_desc_1[i].replace('\\u0107', 'ć') tech_desc_1[i] = tech_desc_1[i].replace('\\u0144', 'ń') tech_desc_1[i] = tech_desc_1[i].replace('\\u015b', 'ś') tech_desc_1[i] = tech_desc_1[i].replace('\\u015a', 'Ś') tech_desc_1[i] = tech_desc_1[i].replace('\\u00f3', 'ó') tech_desc_1[i] = tech_desc_1[i].replace('\\u0141', 'Ł') tech_desc_1[i] = tech_desc_1[i].replace('\\u0142', 'ł') tech_desc_1[i] = tech_desc_1[i].replace('\\u017a', 'ź') tech_desc_1[i] = tech_desc_1[i].replace('\\u017b', 'Ż') tech_desc_1[i] = tech_desc_1[i].replace('\\u017c', 'ż') tech_desc_1[i] = tech_desc_1[i].replace('\\u017', 'Ź') tech_desc_1[i] = tech_desc_1[i].replace('\\u00ae', '®') tech_desc_1[i] = tech_desc_1[i].replace('\\u00b0', '°') tech_desc_1[i] = tech_desc_1[i].replace('\u00b0', '°') tech_desc_1[i] = tech_desc_1[i].replace('\u2070', '°') 
tech_desc_1[i] = tech_desc_1[i].replace('\\u2070', '°') tech_desc_1[i] = tech_desc_1[i].replace('\\u2013', '-') tech_desc_1[i] = tech_desc_1[i].replace('\u2013', '-') tech_desc_1[i] = tech_desc_1[i].replace('\\u2026', '...') tech_desc_1[i] = tech_desc_1[i].replace('\u2026', '...') tech_desc_1[i] = tech_desc_1[i].replace('\\n', '') tech_desc_1[i] = tech_desc_1[i].replace('\\/', '/') tech_desc_1[i] = tech_desc_1[i].replace(':', '') if len(tech_desc_1) < 12: n = len(tech_desc_1) else: n = 12 short = ['<ul>'] for i in range(0, n, 2): short.append(f'<li>{tech_desc_1[i]}: {tech_desc_1[i + 1]}</li>') short.append('</ul>') short = '\n'.join(short) print('------------ OPIS KRÓTKI ------------') print(short + '\n\n') return [reg, short, tech]
info_transit = [] info_school = [] info_hospital = [] info_food = [] info_shopping = [] info_environment = [] id = 3211228186 wait = WebDriverWait(browser, 10) browser.get( "http://esf.cd.fang.com/newsecond/map/NewHouse/NewProjMap.aspx?newcode={0}" .format(id)) time.sleep(1) t_selector = Selector(text=browser.page_source) def get_transit_detail(): distance_lists = t_selector.xpath("//td/text()").extract()[1:] tag_names_list = t_selector.xpath("//th/text()").extract() for tag_name, distance in zip(tag_names_list, distance_lists): match_tag_name = re.match('【(.*)】(.*)', tag_name) if match_tag_name: tag = match_tag_name.group(1).encode('gbk', 'ignore').decode('gbk') nearname = match_tag_name.group(2).encode('gbk', 'ignore').decode('gbk') else: tag = "" nearname = "" match_distance = re.match('.*?(\d+)米', distance)
def parse(self, response): root = Selector(response) item = HorseRacingItem() for each in root.xpath('//select[@id="raceDateSelect"]'): item['date_pages'] = each.xpath('.//option/@value').extract() return item
def postman(self, threadID, date, class_start=None, class_end=None): url = "https://bases-marques.inpi.fr/Typo3_INPI_Marques/marques_resultats_liste.html" payload = self.forge_payload(date, class_start, class_end) headers = { 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 'accept-encoding': "gzip, deflate, br", 'accept-language': "en-US,en;q=0.9", 'Cache-Control': "no-cache", 'connection': "keep-alive", 'content-length': "183", 'content-type': "application/x-www-form-urlencoded", 'host': "bases-marques.inpi.fr", 'origin': "https://bases-marques.inpi.fr", 'referer': "https://bases-marques.inpi.fr/Typo3_INPI_Marques/marques_recherche_avancee.html", 'upgrade-insecure-requests': "1", 'user-agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu \ Chromium/64.0.3282.167 Chrome/64.0.3282.167 Safari/537.36", } response = self.s.request("POST", url, data=payload, headers=headers) text_response = response.text soup = BeautifulSoup(text_response, features='html.parser') html_response = soup.prettify() with open(self.url_directory + date + '.txt', 'wb') as \ my_file: my_file.write(html_response.encode('utf-8')) sel = Selector(text=html_response) company_number = 0 try: company_number = sel.css( "div.csc-default:nth-child(2) div.tx-pitrechercheinpi-pi1:nth-child(1) form:nth-child(1) \ div.txtresultats:nth-child(3) p:nth-child(1) > strong:nth-child(1)::text" ).extract()[0].strip() company_number = self.is_int(company_number) except: pass print(str(date) + " : spider-" + str(threadID)) if company_number > 500: self.split_postman(date, self.threadID, 0, 10) self.split_postman(date, self.threadID, 11, 19) self.split_postman(date, self.threadID, 21, 27) self.split_postman(date, self.threadID, 28, 35) self.split_postman(date, self.threadID, 36, 45) return for i in range(1, company_number + 1): if i % 5 == threadID: self.detail_annonce(i, date) else: pass
def parse(self, response): sel = Selector(response=response) comment_tables = sel.xpath('//div[@class="sub_ins"]/table') movie_id = re.findall(u'/subject/(\d+?)/', response.url)[0] for comment_table in comment_tables: user_info = dict() user_info['movie_id'] = movie_id comment_user_img_ele = comment_table.xpath('.//img') if comment_user_img_ele: comment_user_img = comment_user_img_ele.xpath('@src') user_info['img'] = comment_user_img.extract()[0] comment_username = comment_table.xpath('.//div[@class="pl2"]/a') if comment_username: username_str = comment_username.xpath('text()').extract()[0] user_info['name'] = username_str.strip() username_href = comment_username.xpath('@href').extract()[0] user_info['url'] = username_href.strip() comment_user_addr = comment_username.xpath('.//span') if comment_user_addr: user_addr_str = comment_user_addr.xpath('text()').extract()[0] user_info['address'] = user_addr_str.strip()[1:-1] comment_date = comment_table.xpath('.//p[@class="pl"]') if comment_date: user_info['date'] = comment_date.xpath('text()').extract()[0].strip() comment_content = comment_table.xpath('.//tr/td/p') if len(comment_content) == 2: p_values = comment_content.xpath('text()').extract() user_info['comment_content'] = p_values[len(p_values) - 1]
def on_detail_page(self, response):
    if response.url == response.old_url:
        try:
            hxs = Selector(text=response.content)
            summary = hxs.xpath(
                '//div[@class="card-summary-content"]/*').extract()
            content = []
            for ctx in summary:
                text = clean_html_text(ctx)
                content.append(text)
            content_text = " ".join(content)
            content_text = content_text.replace("[1]", "")
            content_text = content_text.replace("[2]", "")
            item_dict = {}
            items = hxs.xpath('//div[@class="baseInfoWrap"]/div/div/*')
            for item in items:
                title = item.xpath('./span/text()').extract()
                title_value = item.xpath('./div/text()').extract()
                print("key:value", to_value(title), to_value(title_value))
                item_dict[to_value(title)] = to_value(title_value)
            item_dict['summary'] = content_text
            imgs = hxs.xpath(
                '//div[@class="lemma-picture summary-pic"]/a/img/@src').extract()
            item_dict['logo'] = to_value(imgs)
            print(item_dict)
            # save_content(self.site.name, url, json.dumps(item_dict))
            # update_url(self.site.name, url, 200)
            return item_dict
        except Exception as e:
            # update_url(self.site.name, url, 500)
            logging.error(e)
def reviews_parse(self,response): hxs = Selector(response) # print 11111111 item = reviewsItem() sites = hxs.xpath('//*[@class="article"]/div[2]/div[@class="item"]/div[@class="info"]/ul') # sites = hxs.xpath('//*[@class="article"]/div[2]/div[@class="item"]/div[@class="info"]') for site in sites: item['userID'] = re.findall('people/(.+)/collect',response.url) # print response.url item['moviename'] = site.xpath('li[@class="title"]/a/em/text()').extract() item['movieID'] = site.xpath('li[@class="title"]/a/@href').re('subject/(.+)/$') moviesUrl =site.xpath('li[@class="title"]/a/@href').extract()[0] yield Request(url=moviesUrl,callback=self.movie_parse) item['ratingdate'] = site.xpath('li[3]/span[@class="date"]/text()').extract() if re.findall('rating\d+-t',site.xpath('li[3]/span[1]/@class').extract()[0]): item['rating'] = site.xpath('li[3]/span[1]/@class').re('\d+') else: item['rating'] = [u''] if site.xpath('li[4]/span[@class="comment"]/text()').extract(): item['comment'] = site.xpath('li[4]/span[@class="comment"]/text()').extract() else: item['comment'] = [u''] yield item # print item if hxs.xpath('//*[@class="paginator"]/span[@class="next"]/a/@href').extract(): nextreviewsUrl = hxs.xpath('//*[@class="paginator"]/span[@class="next"]/a/@href').extract()[0] # print nextreviewsUrl yield Request(url=nextreviewsUrl, callback=self.reviews_parse) pass