# These snippets use the pre-1.0 Scrapy API; they assume
#   from scrapy.selector import XmlXPathSelector
#   from scrapy.http import Request
# plus the standard-library modules (json, re, logging, ...) used below.

def parse_travel_asy(self, response):
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    # The feed wraps a JSON payload inside an XML <string> element.
    json_object = json.loads(xxs.select("//string/text()").extract()[0])
    request_list = []
    for product in json_object['product']:
        if product['isYuyue'] == 'True':
            url = 'http://www.zhongmin.cn/Product/ProductDetails.aspx?pid=%s&bid=11' % product['Id']
        else:
            url = 'http://www.zhongmin.cn/Travel/Product/TravelDetailArr%(Id)s-%(age)sd%(day)s.html' % product
        request_list.append(Request(url=url))
    return request_list
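# XmlXPathSelector was removed in Scrapy 1.x. A minimal sketch of the same
# namespace-stripping extraction on modern Scrapy (the XML payload below is
# an illustrative assumption, not the real feed):
import json
from scrapy import Selector

xml = '<root xmlns="urn:example"><string>{"product": []}</string></root>'
sel = Selector(text=xml, type='xml')
sel.remove_namespaces()                       # same call exists on Selector
payload = json.loads(sel.xpath('//string/text()').get())
print(payload)                                # -> {'product': []}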
def parse(self, response):
    x = XmlXPathSelector(response)
    x.remove_namespaces()
    x.register_namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    items = x.select('//record/metadata/RDF')
    jsons = []
    for item in items:
        creator = item.select('MetaResource/creator/Agent/name/text()').extract()  # extracted but unused
        title = item.select('Resource/title/text()').extract()
        uri = item.select('Resource/screen/Image/@rdf:about').extract()
        tags = item.select('Resource/subject/Description/value/text()').extract()
        thumbnail = item.select('Resource/thumbnail/Image/@rdf:about').extract()
        lat = item.select('Resource/spatial/Description/lat/text()').extract()
        long = item.select('Resource/spatial/Description/long/text()').extract()
        locality = item.select('Resource/spatial/Description/locality/text()').extract()
        tags_string = '"' + '", "'.join(tags) + '"'
        newlat = lat[0] if lat else 'null'
        newlong = long[0] if long else 'null'
        newloc = locality[0] if locality else ''
        json_entry = ('{"title": "' + title[0] + '", "uri": "' + uri[0] +
                      '", "attribution_uri": "' + uri[0] +
                      '", "media_creator_username": "******", "thumbnail_url": "' + thumbnail[0] +
                      '", "media_geo_latitude": ' + newlat +
                      ', "media_geo_longitude": ' + newlong +
                      ', "location": "' + newloc +
                      '", "tags": [' + tags_string +
                      '], "archive":"Yahoo! Japan", "media_type": "Image", "layer_type": "Image", '
                      '"child_items_count":0, "published":1}, ')
        jsons.append(json_entry)
    resumptionToken = x.select('//resumptionToken/text()').extract()
    if not resumptionToken:
        # Last page: write the final batch and stop. Only yield a follow-up
        # request when a token exists; requesting an empty URL would fail.
        open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
    else:
        token = resumptionToken[0].encode('ascii')
        nextFileLink = ("http://search.shinrokuden.irides.tohoku.ac.jp/webapi/oaipmh"
                        "?verb=ListRecords&metadataPrefix=sdn&resumptionToken=" + token)
        open(token + '.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
        yield Request(nextFileLink, callback=self.parse)
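# The hand-built JSON above breaks as soon as a title or tag contains a
# double quote. A hedged sketch of the same entry built with json.dumps,
# which escapes every field correctly; the values are illustrative stand-ins
# for the extracted lists:
import json

title, uri, thumbnail = ['He said "hi"'], ['http://example.org/1'], ['http://example.org/t.jpg']
lat, lng, locality, tags = [], ['140.9'], ['Sendai'], ['tsunami', 'photo']

entry = {
    "title": title[0],
    "uri": uri[0],
    "attribution_uri": uri[0],
    "media_creator_username": "******",
    "thumbnail_url": thumbnail[0],
    "media_geo_latitude": float(lat[0]) if lat else None,   # None -> JSON null
    "media_geo_longitude": float(lng[0]) if lng else None,
    "location": locality[0] if locality else "",
    "tags": tags,
    "archive": "Yahoo! Japan",
    "media_type": "Image",
    "layer_type": "Image",
    "child_items_count": 0,
    "published": 1,
}
print(json.dumps(entry, ensure_ascii=False))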
def parse_rss(self, response):
    item = response.request.meta['item']
    if response.status != 500:
        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        item['date'] = xxs.select('.//channel/date/text()').extract()
        description = xxs.select('.//channel/description/text()').extract()
        # Only overwrite a missing or near-empty description.
        if len(item.get('description', '')) < 10 and description:
            item['description'] = ''.join(description).strip()
    del item['subpage_urls']
    return item
def parse(self, response):
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    products = xxs.select('//item')
    for product in products:
        mpn = product.xpath('mpn/text()')
        mpn = mpn[0].extract().upper().strip() if mpn else None
        row = self.monitored_products.get(mpn) if mpn else None
        # Skip products that are not monitored or are marked discontinued.
        if row is None or row['Discontinued'].lower().strip() == 'yes':
            continue
        loader = ProductLoader(selector=product, item=Product())
        loader.add_xpath('identifier', 'id/text()')
        loader.add_xpath('sku', 'mpn/text()')
        loader.add_xpath('brand', 'brand/text()')
        loader.add_xpath('image_url', 'image_link/text()')
        loader.add_xpath('url', 'link/text()')
        loader.add_xpath('name', 'title/text()')
        # Prefer the sale price when one is present.
        price = product.select('sale_price/text()').extract()
        if not price:
            price = product.select('price/text()').extract()
        loader.add_value('price', extract_price(price[0]))
        categories = product.select('product_type/text()').extract()[-1].split('>')
        categories = [c.strip() for c in categories]
        loader.add_value('category', categories)
        shipping_cost = product.select('shipping/price/text()').extract()
        loader.add_value('shipping_cost',
                         extract_price(shipping_cost[0]) if shipping_cost else '')
        in_stock = product.select('availability[contains(text(), "in stock")]').extract()
        if not in_stock:
            # Out-of-stock products are reported with a zero price.
            loader.add_value('price', 0)
        item = loader.load_item()
        item['metadata'] = RHSMeta()
        item['metadata']['cost_price'] = row['Cost Price']
        yield item
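# extract_price is a project helper that is not shown here; a hedged sketch
# of what such a helper typically does (an assumption, not the project's
# actual implementation):
import re
from decimal import Decimal

def extract_price_sketch(raw):
    """Pull the first decimal number out of a string like '£12.99 GBP'."""
    match = re.search(r'\d+(?:\.\d{1,2})?', raw.replace(',', ''))
    return Decimal(match.group(0)) if match else Decimal(0)

print(extract_price_sketch('£12.99 GBP'))   # -> 12.99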
def parse(self, response):
    item = ArxivOrgItem()
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    # The selector list must first be converted to a string before regex matching.
    xml_data = str(xxs.xpath('//link'))
    url_list = re.findall(r'http://arxiv\.org/abs/\d+\.\d+', xml_data)
    for url in url_list:
        logging.log(logging.INFO,
                    f'**************** crawling link: {url} ***************** ')
        yield Request(url=url, callback=self.parse_single_page,
                      meta={'item': item}, dont_filter=True)
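# str() on a SelectorList returns its repr, which is why the regex above
# works at all. A hedged sketch pulling the same links directly via XPath
# instead of regexing over a repr (the XML is an illustrative assumption):
from scrapy import Selector

xml = '<feed><link href="http://arxiv.org/abs/2101.00001"/></feed>'
sel = Selector(text=xml, type='xml')
print(sel.xpath('//link/@href').getall())
# -> ['http://arxiv.org/abs/2101.00001']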
def parse(self, response):
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    urls = xxs.select('//loc/text()').extract()
    for url in urls:
        if 'brands-sitemap.xml' in url:
            continue
        if 'productbrand' in url:
            prod_id = re.findall(r'productbrand_(\d+)\.html', url)
            prod_id = prod_id[0] if prod_id else ''
            if prod_id:
                # Skip product ids that have already been seen.
                if prod_id in self.product_ids:
                    continue
                self.product_ids.append(prod_id)
            yield Request(url, callback=self.parse_product,
                          meta={"dont_merge_cookies": True})
        else:
            yield Request(url, meta={"dont_merge_cookies": True})
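# self.product_ids is a list, so each membership test scans it linearly; for
# large sitemaps a set gives O(1) lookups. A hedged sketch of the same dedup
# logic in isolation:
seen_ids = set()

def is_new(prod_id):
    """Return True the first time an id is seen, False on repeats."""
    if prod_id in seen_ids:
        return False
    seen_ids.add(prod_id)
    return True

print(is_new('123'), is_new('123'))   # -> True False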
def parse(self, response):
    # --- Setup ---
    x = XmlXPathSelector(response)
    x.remove_namespaces()
    x.register_namespace('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#')

    category = getCAT().upper()
    output_path = self.PATH + category.lower() + '_output/'
    token_path = output_path + '.previous_resumption_token'
    dup_path = output_path + '.dup_list'

    items = x.select('//record/metadata/RDF')
    result = []
    id_list = []
    nextFileLink = ''

    resumption_token = x.select('//resumptionToken/text()').extract()
    saveResumptionToken(resumption_token, token_path)

    # --- Parse items ---
    print '****** PARSING FILE... ******'
    for item in items:
        # creator / archive / layer_type
        media_creator_username = '******'
        archive = 'Kahoku Shimpo Disasters Archive'
        layer_type = 'Image'

        # media_type
        if category == 'MOVIE':
            media_type = 'Video'
        if category == 'DOCUMENT' or category == 'OTHER':
            media_type = 'Headline'
        if category == 'IMAGE':
            media_type = 'Image'

        # media_date_created
        media_date_created = item.select('Resource/created/text()').extract()
        media_date_created = processField(media_date_created)

        # abstract / title: the abstract tends to be more unique but is not
        # always present; the title is repetitive but more consistently there.
        abstract = item.select('Resource/abstract/text()').extract()
        title = item.select('Resource/title/text()').extract()
        title = processField(title)
        abstract = processField(abstract).replace('\r\n', '')
        if not abstract:
            abstract = title

        # unique_id (used for de-duping)
        unique_id = str(item.select('Resource/identifier/text()').extract()[0])
        id_list.append(unique_id)

        # source
        source = item.select('Resource/@rdf:about').extract()
        source = processField(source)

        # URI: download the image if it has not already been downloaded.
        if category == 'IMAGE':
            uri = item.select('Resource/screen/Image/@rdf:about').extract()
            downloaded = os.path.exists(output_path + unique_id + '.jpg')
            if uri and not downloaded:
                uri = uri[0]
                urllib.urlretrieve(uri, output_path + unique_id + '.jpg')
            uri = 'https://s3.amazonaws.com/JDA-Files/' + unique_id
        if category == 'MOVIE':
            uri = item.select('Resource/ogg/Image/@rdf:about').extract()
        if category == 'DOCUMENT' or category == 'OTHER':
            uri = source
        uri = processField(uri)

        # thumbnail
        thumbnail_url = item.select('Resource/thumbnail/Image/@rdf:about').extract()
        thumbnail_url = processField(thumbnail_url)

        # tags
        tags = item.select('Resource/subject/Description/value/text()').extract()
        if not tags:
            tags_string = '[]'
        else:
            tags_string = '"' + '", "'.join(tags) + '"'

        # location: handles comma placement and attribute existence variability.
        region = item.select('Resource/spatial/Description/region/text()').extract()
        locality = item.select('Resource/spatial/Description/locality/text()').extract()
        street_address = item.select('Resource/spatial/Description/street-address/text()').extract()
        location = ''
        if region or locality or street_address:
            region = processField(region)
            locality = processField(locality)
            street_address = processField(street_address)
            for part in [street_address, locality, region]:
                if part:
                    location = part if location == '' else location + ', ' + part
            if location and location[-1] == ',':
                location = location[:-1]

        # lat/long: find coordinates using the Google Maps geocoding API.
        lat = ''
        lng = ''
        if location != '':
            key = '&key=AIzaSyCGF2BwNPNckrbx6L2tQRATBcjKv0C3xCo'
            google_uri = 'https://maps.googleapis.com/maps/api/geocode/json?address='
            location_url_ready = urllib.quote_plus(location.encode('utf8'), safe='')
            request_uri = google_uri + location_url_ready + key
            with contextlib.closing(urllib.urlopen(request_uri)) as geo_response:
                data = json.load(geo_response)
                if json.dumps(data['results']) != '[]':
                    lat = json.dumps(data['results'][0]['geometry']['location']['lat'])
                    lng = json.dumps(data['results'][0]['geometry']['location']['lng'])
                else:
                    lat = 'null'
                    lng = 'null'

        # JSONify
        json_entry = (
            '{"title": "' + abstract + '", "uri": "' + uri +
            '", "attribution_uri": "' + source +
            '", "media_date_created": "' + media_date_created +
            '", "media_creator_username": "******", "thumbnail_url": "' + thumbnail_url +
            '", "media_geo_latitude": "' + lat +
            '", "media_geo_longitude": "' + lng +
            '", "location": "' + location +
            '", "tags": [' + tags_string +
            '], "archive": "' + archive +
            '", "media_type": "' + media_type +
            '", "layer_type": "' + layer_type +
            '", "child_items_count": 0, "published": 1}, '
        )

        # Duplicate checker: check for duplicates only in the "final-..."
        # file, since that is the only file without a resumption token and
        # thus the only one that could contain duplicates.
        if not resumption_token and os.path.exists(dup_path):
            dup_list = open(dup_path, 'r').read()
            if unique_id not in dup_list:
                print 'not in dup'
                result.append(json_entry)
        else:
            result.append(json_entry)

    # --- Save item URI list for de-duping ---
    with open(dup_path, 'w+') as f:
        print '****** (OVER)WRITING DEDUP LIST ******'
        for uid in id_list:
            print >> f, uid

    # --- Done, or queue the next job ---
    if resumption_token == []:
        print '****** DONE ******'
        nextFileLink = ''
        path = output_path + 'final-' + getDateString() + '.json'
        open(path, 'wb').write(''.join(result).encode('UTF-8'))
        removeEmptyFiles(output_path)
    else:
        url = self.template_url + category + '&resumptionToken='
        nextFileLink = url + resumption_token[0].encode('ascii')
        path = output_path + resumption_token[0].encode('ascii') + '.json'
        open(path, 'wb').write(''.join(result).encode('UTF-8'))
        yield Request(nextFileLink, callback=self.parse)
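# The geocoding call above uses the Python 2 urllib API. A hedged Python 3
# sketch of the same Google Maps lookup; the API key is a placeholder you
# would substitute:
import json
import urllib.parse
import urllib.request

def geocode_sketch(location, key='YOUR_API_KEY'):
    """Return (lat, lng) for an address, or (None, None) when no match."""
    url = ('https://maps.googleapis.com/maps/api/geocode/json?address='
           + urllib.parse.quote_plus(location) + '&key=' + key)
    with urllib.request.urlopen(url) as resp:
        data = json.load(resp)
    if data.get('results'):
        loc = data['results'][0]['geometry']['location']
        return loc['lat'], loc['lng']
    return None, None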
def _extract_links(self, response):
    xxs = XmlXPathSelector(response)
    if self.remove_namespaces:
        xxs.remove_namespaces()
    for url in xxs.select(self.xpath).extract():
        yield Link(url.encode(response.encoding))
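# A hedged modern equivalent of this link extractor for Scrapy >= 1.0, where
# Selector replaces XmlXPathSelector and Link accepts a plain string URL:
from scrapy import Selector
from scrapy.link import Link

def extract_links_sketch(xml_text, xpath='//loc/text()', strip_ns=True):
    sel = Selector(text=xml_text, type='xml')
    if strip_ns:
        sel.remove_namespaces()
    return [Link(url) for url in sel.xpath(xpath).getall()]

links = extract_links_sketch('<urlset><loc>http://example.org/a</loc></urlset>')
print(links)   # -> [Link(url='http://example.org/a', ...)]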