def _parse_entity(self, item, entity=None): try: if entity is None: entity = BasicEntity() attributes = item.find('.//ItemAttributes') # parse the product group and shortcut the parsing process to return # None in the likely event that this entity doesn't belong to one of # the targeted product groups that we're interested in. try: product_group = attributes.find('.//ProductGroup').text.lower() entity.subcategory = self._subcategory_map[product_group] except: #print product_group #pprint(entity) return None attribute_elems = { 'Title' : 'title', 'Brand' : 'brand', 'Publisher' : 'publisher', 'Studio' : 'studio_name', 'ReleaseDate' : 'original_release_date', 'Title' : 'title', } item_elems = { 'ASIN' : 'asin', 'DetailPageURL' : 'amazon_link', } elems = [] for k, v in attribute_elems.iteritems(): elems.append((attributes, k, v)) for k, v in item_elems.iteritems(): elems.append((item, k, v)) # parse all optional fields which are relatively easy-to-extract for elem in elems: node = elem[0].find(elem[1]) if node is not None: entity[elem[2]] = node.text # ensure that every entity has a valid title and a valid asin if entity.title is None or entity.asin is None: return None # parse the author(s) authors = attributes.findall('Author') if len(authors) > 0: entity.author = string.joinfields(map(lambda a: a.text, authors), ', ') # parse the artist(s) artists = attributes.findall('Artist') if len(artists) > 0: entity.artist_display_name = string.joinfields(map(lambda a: a.text, artists), ', ') # parse the running time running_time = attributes.find('RunningTime') if running_time is not None: length = running_time.pyval if running_time.get('Units').lower() == 'minutes': # internally, duration is stored in seconds length = length * 60 entity.track_length = length # parse the manufacturer manufacturer = attributes.find('.//Manufacturer') if manufacturer: if entity.subcategory == 'book': entity.publisher = manufacturer.text else: entity.manufacturer = manufacturer.text # parse the price of this product price = attributes.find('ListPrice') if price is not None: entity.amount = price.find('Amount').pyval entity.currency_code = price.find('CurrencyCode').text entity.formatted_price = price.find('FormattedPrice').text # parse the amazon sales rank of this product sales_rank = item.find('SalesRank') if sales_rank is not None: entity.salesRank = sales_rank.pyval # parse the number of pages for a book num_pages = item.find('NumberOfPages') if num_pages is not None: entity.num_pages = num_pages.pyval # parse the track list for an album tracks = item.find('Tracks') if tracks is not None: tracks = tracks.findall('.//Track') tracks = list(track.text for track in tracks) if len(tracks) > 0: entity.tracks = tracks # parse the editorial review as the closest thing we have to a # real product description editorial_review = item.find('.//EditorialReview') if editorial_review is not None: desc = editorial_review.find('Content') if desc is not None: desc = desc.text soup = BeautifulSoup(desc) entity.desc = ''.join(soup.findAll(text=True)) # parse browse nodes to try and narrow in on a more accurate subcategory browse_nodes = item.find('BrowseNodes') if browse_nodes is not None: names = [] for node in browse_nodes.findall('.//BrowseNode'): name = node.find('Name') if name is not None: name = name.text.lower() names.append(name) #print "%s) %s" % (entity.title, names) if 'tv' in names: entity.subcategory = 'tv' #elif 'singer-songwriters' in names: # entity.subcategory = 'artist' # parse the binding to try and narrow in on a more accurate subcategory binding = attributes.find('Binding') if binding is not None: binding = binding.text.strip().lower() if binding in self._binding_blacklist: return None # parse the ProductTypeName to try and narrow in on a more accurate subcategory product_type_name = attributes.find('ProductTypeName') if product_type_name is not None: product_type_name = product_type_name.text.strip().lower() try: subcategory = self._product_type_names[product_type_name] if subcategory is None: return else: entity.subcategory = subcategory except KeyError: pass # parse images associated with this product potential_images = { 'SmallImage' : 'tiny', 'MediumImage' : 'small', 'LargeImage' : 'large', } for k, v in potential_images.iteritems(): image = item.find(k) if image is not None: entity[v] = image.find('URL').text return entity except (AttributeError, KeyError): utils.printException() raise return None