Example #1
    def parse_travel_asy(self, response):
        # The XML response wraps a JSON payload in a single <string> element;
        # unwrap it before parsing.
        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        json_object = json.loads(xxs.select("//string/text()").extract()[0])
        request_list = []
        for product in json_object['product']:
            # Pre-order ("yuyue") products use the generic product-details URL;
            # everything else gets a travel-detail URL built from its fields.
            if product['isYuyue'] == 'True':
                url = 'http://www.zhongmin.cn/Product/ProductDetails.aspx?pid=%s&bid=11' % product['Id']
            else:
                url = 'http://www.zhongmin.cn/Travel/Product/TravelDetailArr%(Id)s-%(age)sd%(day)s.html' % product
            request_list.append(Request(url=url))
        return request_list
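
All of these examples target the legacy Scrapy API: XmlXPathSelector and its select() method were deprecated long ago in favour of scrapy.Selector and xpath(). A minimal sketch of the equivalent call in current Scrapy (assuming Scrapy >= 1.0; variable names are illustrative):

    import json
    from scrapy.selector import Selector

    def parse_travel_asy(self, response):
        sel = Selector(response)
        sel.remove_namespaces()
        # Same XPath as above, with the modern method names.
        json_object = json.loads(sel.xpath('//string/text()').extract()[0])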
Example #2
    def parse(self, response):
        x = XmlXPathSelector(response)
        x.remove_namespaces()
        x.register_namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        items = x.select('//record/metadata/RDF')

        jsons = []

        for item in items:
            creator = item.select('MetaResource/creator/Agent/name/text()').extract()
            title = item.select('Resource/title/text()').extract()
            uri = item.select('Resource/screen/Image/@rdf:about').extract()
            tags = item.select('Resource/subject/Description/value/text()').extract()
            thumbnail = item.select('Resource/thumbnail/Image/@rdf:about').extract()
            lat = item.select('Resource/spatial/Description/lat/text()').extract()
            lng = item.select('Resource/spatial/Description/long/text()').extract()
            locality = item.select('Resource/spatial/Description/locality/text()').extract()

            tags_string = '"' + '", "'.join(tags) + '"'

            # Fall back to JSON null / an empty string when a field is absent.
            newlat = lat[0] if lat else 'null'
            newlong = lng[0] if lng else 'null'
            newloc = locality[0] if locality else ''

            json_entry = ('{"title": "' + title[0]
                          + '", "uri": "' + uri[0]
                          + '", "attribution_uri": "' + uri[0]
                          + '", "media_creator_username": "******"'
                          + ', "thumbnail_url": "' + thumbnail[0]
                          + '", "media_geo_latitude": ' + newlat
                          + ', "media_geo_longitude": ' + newlong
                          + ', "location": "' + newloc
                          + '", "tags": [' + tags_string
                          + '], "archive":"Yahoo! Japan", "media_type": "Image"'
                          + ', "layer_type": "Image", "child_items_count":0'
                          + ', "published":1}, ')

            jsons.append(json_entry)

        # OAI-PMH paging: no resumptionToken means this was the last page, so
        # only follow up when a token is present (a Request with an empty URL
        # would be rejected by Scrapy).
        resumptionToken = x.select('//resumptionToken/text()').extract()
        if not resumptionToken:
            open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
        else:
            token = resumptionToken[0].encode('ascii')
            nextFileLink = ("http://search.shinrokuden.irides.tohoku.ac.jp/webapi/oaipmh"
                            "?verb=ListRecords&metadataPrefix=sdn&resumptionToken=" + token)
            open(token + '.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
            yield Request(nextFileLink, callback=self.parse)
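
Hand-concatenating JSON like this breaks as soon as a title or tag contains a quote or backslash. A safer sketch of the same entry built with the standard library (it slots into the loop above; json.dumps handles all escaping):

    import json

    entry = {
        "title": title[0],
        "uri": uri[0],
        "attribution_uri": uri[0],
        "media_creator_username": "******",
        "thumbnail_url": thumbnail[0],
        "media_geo_latitude": lat[0] if lat else None,
        "media_geo_longitude": lng[0] if lng else None,
        "location": locality[0] if locality else "",
        "tags": tags,
        "archive": "Yahoo! Japan",
        "media_type": "Image",
        "layer_type": "Image",
        "child_items_count": 0,
        "published": 1,
    }
    jsons.append(json.dumps(entry, ensure_ascii=False))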
Example #3
    def parse_rss(self, response):
        item = response.request.meta['item']

        if response.status != 500:
            xxs = XmlXPathSelector(response)
            xxs.remove_namespaces()

            item['date'] = xxs.select('.//channel/date/text()').extract()
            description = xxs.select('.//channel/description/text()').extract()
            # Only overwrite a description that is missing or too short to be useful.
            if len(item.get('description', '')) < 10 and description:
                item['description'] = ''.join(description).strip()

        del item['subpage_urls']

        return item
Example #4
    def parse(self, response):

        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        products = xxs.select('//item')
        for product in products:
            mpn = product.select('mpn/text()')
            mpn = mpn[0].extract().upper().strip() if mpn else None
            row = self.monitored_products.get(mpn) if mpn else None
            # Skip products that are not monitored or have been discontinued.
            if row is None or row['Discontinued'].lower().strip() == 'yes':
                continue
            loader = ProductLoader(selector=product, item=Product())
            loader.add_xpath('identifier', 'id/text()')
            loader.add_xpath('sku', 'mpn/text()')
            loader.add_xpath('brand', 'brand/text()')
            loader.add_xpath('image_url', 'image_link/text()')
            loader.add_xpath('url', 'link/text()')
            loader.add_xpath('name', 'title/text()')
            # Prefer the sale price; fall back to the regular price.
            price = product.select('sale_price/text()').extract()
            if not price:
                price = product.select('price/text()').extract()

            loader.add_value('price', extract_price(price[0]))

            # product_type holds a ">"-separated breadcrumb; use the last one.
            categories = product.select('product_type/text()').extract()[-1].split('>')
            categories = [c.strip() for c in categories]
            loader.add_value('category', categories)

            shipping_cost = product.select('shipping/price/text()').extract()
            shipping_cost = extract_price(shipping_cost[0]) if shipping_cost else ''
            loader.add_value('shipping_cost', shipping_cost)

            # Out-of-stock products are flagged by zeroing the price.
            in_stock = product.select(
                'availability[contains(text(), "in stock")]').extract()
            if not in_stock:
                loader.add_value('price', 0)

            item = loader.load_item()
            item['metadata'] = RHSMeta()
            item['metadata']['cost_price'] = row['Cost Price']
            yield item
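
extract_price is a project helper that is not part of the snippet. A plausible, purely hypothetical stand-in that pulls the first decimal number out of a price string such as "£1,299.00":

    import re
    from decimal import Decimal

    def extract_price(value):
        # Hypothetical: the real helper may behave differently.
        match = re.search(r'\d[\d,]*(?:\.\d+)?', value)
        if not match:
            return Decimal('0')
        return Decimal(match.group(0).replace(',', ''))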
Example #5
    def parse(self, response):
        item = ArxivOrgItem()
        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        # The Selector list must first be rendered to a str so the abstract
        # URLs can be pulled out with a regular expression.
        xml_data = str(xxs.xpath('//link'))
        url_list = re.findall(r'http://arxiv\.org/abs/\d+\.\d+', xml_data)
        for url in url_list:
            logging.log(
                logging.INFO,
                f'**************** crawling link: {url} ***************** ')
            yield Request(url=url,
                          callback=self.parse_single_page,
                          meta={'item': item},
                          dont_filter=True)
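
Regex-scanning the repr() of a selector list works, but it is brittle. A sketch of the same extraction done with XPath alone, assuming each <link> exposes its URL as an @href attribute (true of Atom feeds such as the arXiv API; verify against the real response):

    import re
    from scrapy.http import Request

    ABS_URL = re.compile(r'http://arxiv\.org/abs/\d+\.\d+')

    def parse(self, response):
        item = ArxivOrgItem()
        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        for url in xxs.xpath('//link/@href').extract():
            if ABS_URL.match(url):
                yield Request(url, callback=self.parse_single_page,
                              meta={'item': item}, dont_filter=True)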
Example #6
    def parse(self, response):

        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        urls = xxs.select('//loc/text()').extract()
        for url in urls:
            # Skip the brand index; everything else is either a product page
            # or a further sitemap URL to crawl.
            if 'brands-sitemap.xml' in url:
                continue

            if 'productbrand' in url:
                prod_id = re.findall(r'productbrand_(\d+)\.html', url)
                prod_id = prod_id[0] if prod_id else ''
                if prod_id:
                    # De-duplicate product pages by their numeric id.
                    if prod_id in self.product_ids:
                        continue
                    self.product_ids.append(prod_id)
                yield Request(url,
                              callback=self.parse_product,
                              meta={"dont_merge_cookies": True})
            else:
                yield Request(url, meta={"dont_merge_cookies": True})
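
Since self.product_ids exists only for de-duplication, a set gives O(1) membership tests instead of the list's O(n) scan; a sketch, assuming nothing else relies on insertion order:

    class SitemapDeduper(object):
        # Hypothetical helper; only the set-vs-list idea matters here.
        def __init__(self):
            self.product_ids = set()

        def is_new(self, prod_id):
            if prod_id in self.product_ids:
                return False
            self.product_ids.add(prod_id)
            return True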
Example #7
  def parse(self, response):

    #########
    # Setup #
    #########
    x = XmlXPathSelector(response)
    x.remove_namespaces()
    x.register_namespace('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#')

    category     = getCAT().upper()
    output_path  = self.PATH + category.lower() + '_output/'
    token_path   = output_path + '.previous_resumption_token'
    dup_path     = output_path + '.dup_list'
    items        = x.select('//record/metadata/RDF')
    result       = []
    id_list      = []
    nextFileLink = ''

    resumption_token = x.select('//resumptionToken/text()').extract()
    saveResumptionToken(resumption_token, token_path)

    ###############
    # Parse Items #
    ###############
    print '****** PARSING FILE... ******'
    for item in items:

      ####################
      ##### creator ######
      ##### archive ######
      #### layer_type ####
      ####################
      media_creator_username = '******'
      archive                = 'Kahoku Shimpo Disasters Archive'
      layer_type             = 'Image' 

      ####################
      #### media_type ####
      ####################
      if category == 'MOVIE': media_type = 'Video'
      if category == 'DOCUMENT' or category == 'OTHER': media_type = 'Headline'
      if category == 'IMAGE': media_type = 'Image'

      ######################
      # media_date_created #
      ######################
      media_date_created = item.select('Resource/created/text()').extract()
      media_date_created = processField(media_date_created)

      ##################
      #### abstract ####
      ##### title ######
      ##################
      abstract = item.select('Resource/abstract/text()').extract()
      title    = item.select('Resource/title/text()').extract()
      title    = processField(title)
      abstract = processField(abstract)
      abstract = abstract.replace('\r\n', '')

      # Abstract tends to be more unique, though not always there. Title is often repetitive but more consistently present.
      if not abstract: abstract = title

      ###################
      #### unique_id ##### 
      ###################
      unique_id = item.select('Resource/identifier/text()').extract()
      unique_id = str(unique_id[0])
      # Used for de-duping
      id_list.append(unique_id)

      ####################
      ###### Source ###### 
      ####################
      source = item.select('Resource/@rdf:about').extract()
      source = processField(source)

      ####################
      ####### URI ######## 
      ####################
      # Download image if it has not already been downloaded
      if category == 'IMAGE':
        uri = item.select('Resource/screen/Image/@rdf:about').extract()
        downloaded = os.path.exists(output_path + unique_id + '.jpg')
        if uri and not downloaded:
          uri = uri[0]
          urllib.urlretrieve(uri, output_path + unique_id + '.jpg')
          uri = 'https://s3.amazonaws.com/JDA-Files/' + unique_id

      if category == 'MOVIE':
        uri = item.select('Resource/ogg/Image/@rdf:about').extract()

      if category == 'DOCUMENT' or category == 'OTHER':
        uri = source

      uri = processField(uri)

      ####################
      #### Thumbnail ##### 
      ####################
      thumbnail_url = item.select('Resource/thumbnail/Image/@rdf:about').extract()
      thumbnail_url = processField(thumbnail_url)

      ####################
      ####### Tags ####### 
      ####################
      tags = item.select('Resource/subject/Description/value/text()').extract()
      if not tags:
        tags_string = '[]'
      else:
        tags_string = '"' + '", "'.join(tags) + '"'

      ####################
      ##### Location ##### 
      ####################
      region           = item.select('Resource/spatial/Description/region/text()').extract()
      locality         = item.select('Resource/spatial/Description/locality/text()').extract()
      street_address   = item.select('Resource/spatial/Description/street-address/text()').extract()

      if region or locality or street_address:
        region         = processField(region)
        locality       = processField(locality)
        street_address = processField(street_address)
        locationTemp   = [street_address, locality, region]
        location       = ''

        # Join whichever of street address, locality and region are present,
        # comma-separated.
        for part in locationTemp:
          if part:
            if location == '':
              location = part
            else:
              location += ', ' + part
        if location.endswith(','):
          location = location[:-1]
      else:
        location = ''

      ##########################
      ######## Lat/Long ########
      ##########################
      # Find coordinates using Google Maps API
      lat = '' 
      lng = ''
      if location != '':
        key                = '&key=AIzaSyCGF2BwNPNckrbx6L2tQRATBcjKv0C3xCo'
        google_uri         = 'https://maps.googleapis.com/maps/api/geocode/json?address=' 
        location_encoded   = location.encode('utf8')
        location_url_ready = urllib.quote_plus(location_encoded, safe='')
        request_uri        = google_uri + location_url_ready + key 
        # A distinct name avoids shadowing the method's Scrapy response argument.
        with contextlib.closing(urllib.urlopen(request_uri)) as geo_response:
          data = json.load(geo_response)
          if data['results']:
            lat = json.dumps(data['results'][0]['geometry']['location']['lat'])
            lng = json.dumps(data['results'][0]['geometry']['location']['lng'])
          else:
            lat = 'null'
            lng = 'null'

      ##########################
      ######## JSONify #########
      ##########################
      json_entry = ( '{"title": "' 
        + abstract + '", "uri": "' 
        + uri + '", "attribution_uri": "' 
        + source + '", "media_date_created": "' 
        + media_date_created + '", "media_creator_username": "******", "thumbnail_url": "' 
        + thumbnail_url + '", "media_geo_latitude": "' 
        + lat + '", "media_geo_longitude": "' 
        + lng + '", "location": "' 
        + location + '", "tags": [' 
        + tags_string + '], "archive": "' 
        + archive + '",  "media_type": "'
        + media_type + '", "layer_type": "'
        + layer_type + '", "child_items_count": 0, "published": 1}, '
      )

      #####################
      # Duplicate Checker #
      #####################
      # Check for duplicates only in the "final-..." file since that is the only 
      # file without a resumption token and thus could possibly contain duplicates.
      if not resumption_token and os.path.exists(dup_path):
        dup_list = open(dup_path, 'r').read()
        if unique_id not in dup_list:
          print 'not in dup'
          result.append(json_entry)
      else:
        result.append(json_entry)


    ###################
    # Save Duplicates #
    ###################
    # Save Item URI List
    with open(dup_path, 'w') as f:
      print '****** (OVER)WRITING DEDUP LIST ******'
      for uid in id_list:
        print>>f, uid

    ###########
    # If Done #
    ###########
    if not resumption_token:
      print '****** DONE ******'
      nextFileLink = ""
      path = output_path + 'final-' + getDateString() + '.json'
      open(path, 'wb').write(''.join(result).encode('UTF-8'))
      removeEmptyFiles(output_path)

    ###############
    # Or Next Job #
    ###############
    else: 
      url          = self.template_url + category + '&resumptionToken='
      nextFileLink = url + resumption_token[0].encode('ascii')
      path         = output_path + resumption_token[0].encode('ascii') + '.json'

      open(path, 'wb').write(''.join(result).encode('UTF-8'))
      yield Request(nextFileLink, callback=self.parse)
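
getCAT, saveResumptionToken, processField, getDateString and removeEmptyFiles are project helpers the snippet does not include. For readability, a plausible (hypothetical) sketch of processField, which the code above uses to collapse an extract() result into a single clean string:

    def processField(field):
        # Hypothetical stand-in: extract() returns a possibly-empty list;
        # the code above also passes through strings it has already unpacked.
        if not field:
            return ''
        if isinstance(field, list):
            field = field[0]
        return field.strip()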
Example #8
    def parse(self, response):
        x = XmlXPathSelector(response)
        x.remove_namespaces()
        x.register_namespace("rdf",
                             "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        items = x.select('//record/metadata/RDF')

        jsons = []

        for item in items:
            creator = item.select(
                'MetaResource/creator/Agent/name/text()').extract()
            title = item.select('Resource/title/text()').extract()
            uri = item.select('Resource/screen/Image/@rdf:about').extract()
            tags = item.select(
                'Resource/subject/Description/value/text()').extract()
            thumbnail = item.select(
                'Resource/thumbnail/Image/@rdf:about').extract()
            lat = item.select(
                'Resource/spatial/Description/lat/text()').extract()
            lng = item.select(
                'Resource/spatial/Description/long/text()').extract()
            locality = item.select(
                'Resource/spatial/Description/locality/text()').extract()

            tags_string = '"' + '", "'.join(tags) + '"'

            # Fall back to JSON null / an empty string when a field is absent.
            newlat = lat[0] if lat else 'null'
            newlong = lng[0] if lng else 'null'
            newloc = locality[0] if locality else ''

            json_entry = ('{"title": "' + title[0]
                          + '", "uri": "' + uri[0]
                          + '", "attribution_uri": "' + uri[0]
                          + '", "media_creator_username": "******"'
                          + ', "thumbnail_url": "' + thumbnail[0]
                          + '", "media_geo_latitude": ' + newlat
                          + ', "media_geo_longitude": ' + newlong
                          + ', "location": "' + newloc
                          + '", "tags": [' + tags_string
                          + '], "archive":"Yahoo! Japan", "media_type": "Image"'
                          + ', "layer_type": "Image", "child_items_count":0'
                          + ', "published":1}, ')

            jsons.append(json_entry)

        # As in Example #2: only follow up when a resumption token is present.
        resumptionToken = x.select('//resumptionToken/text()').extract()
        if not resumptionToken:
            open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
        else:
            token = resumptionToken[0].encode('ascii')
            nextFileLink = ("http://search.shinrokuden.irides.tohoku.ac.jp"
                            "/webapi/oaipmh?verb=ListRecords"
                            "&metadataPrefix=sdn&resumptionToken=" + token)
            open(token + '.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
            yield Request(nextFileLink, callback=self.parse)
Example #9
    def _extract_links(self, response):
        xxs = XmlXPathSelector(response)
        if self.remove_namespaces:
            xxs.remove_namespaces()
        # Every string matched by the configured XPath becomes a Link.
        for url in xxs.select(self.xpath).extract():
            yield Link(url.encode(response.encoding))
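
A sketch of how such an extractor might be wired up; the class name and constructor are assumptions, only _extract_links comes from the example:

    from scrapy.link import Link
    from scrapy.selector import XmlXPathSelector

    class XPathLinkExtractor(object):
        def __init__(self, xpath='//loc/text()', remove_namespaces=True):
            self.xpath = xpath                    # XPath yielding URL strings
            self.remove_namespaces = remove_namespaces

        def _extract_links(self, response):
            xxs = XmlXPathSelector(response)
            if self.remove_namespaces:
                xxs.remove_namespaces()
            for url in xxs.select(self.xpath).extract():
                yield Link(url.encode(response.encoding))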