Beispiel #1
0
 def extractItems(self, response):
     """Build one DliMetaItem per usable row of the page's result table.

     The table is located with ``soup.find('table', width='90%')``. For
     every ``<tr>`` only the first ``<td>`` is inspected; its first text
     node becomes the title and its second text node supplies both the
     page count and the barcode.

     Args:
         response: object whose ``body`` attribute holds the HTML to parse.

     Returns:
         dict mapping each item's ``'barcode'`` value to its DliMetaItem.
         Empty when the table is missing or no row can be parsed. Rows
         that share a barcode overwrite each other (last one wins).
     """
     # Partially filled items, keyed by barcode. This local dict is what
     # gets returned, so items must be stored here (the previous code wrote
     # to ``self.temp_items`` and returned this dict still empty).
     temp_items = {}
     soup = BeautifulSoup.BeautifulSoup(response.body)
     table = soup.find('table', width='90%')
     if table is None:
         # No matching table on this page: nothing to extract.
         return temp_items

     for row in table.findAll('tr'):
         cell = row.find('td')
         if not cell:
             continue

         metaText = cell.findAll(text=True)
         if len(metaText) < 2:
             # Title and detail text are both required; skip rows that do
             # not have them instead of failing with IndexError.
             continue
         details = metaText[1]
         segments = details.split('.')
         if len(segments) < 2:
             # No '.' in the detail text, so there is no page segment.
             continue

         # A fresh item per row: reusing a single instance would make every
         # dictionary entry alias the same (last-written) object.
         item = DliMetaItem()

         anchorTag = cell.find('a')
         # NOTE(review): ``attrMap`` is a BeautifulSoup 3 detail and
         # attribute-style assignment differs from the item['...'] access
         # used below -- confirm both against DliMetaItem / the installed
         # BeautifulSoup version.
         if anchorTag and anchorTag.attrMap and 'href' in anchorTag.attrMap:
             item.metadataLink = anchorTag.attrMap['href']

         # segments[-1] is empty since there is a dot at the end, so the
         # page information is the second-to-last segment.
         item['pages'] = segments[-2].strip('\n \t')
         item['title'] = metaText[0]
         item['barcode'] = details.lstrip(', ')
         temp_items[item['barcode']] = item
     return temp_items