def extractItems(response):
    """Parse a listing page and build one ``DliMetaItem`` per table row.

    The markup in ``response.body`` is searched for the table whose
    ``width`` attribute is ``'90%'``.  For every row with a ``<td>``, the
    text nodes of that first cell are read: the first becomes the title,
    and the second supplies both the barcode and the page count.

    Args:
        response: object exposing the page markup as ``response.body``.

    Returns:
        dict mapping each row's barcode to the item built from that row.

    Raises:
        AttributeError: if the page contains no matching table.
        IndexError: if a cell has fewer than two text nodes, or its second
            text node contains no ``'.'``.
    """
    temp_items = {}
    soup = BeautifulSoup.BeautifulSoup(response.body)
    table = soup.find('table', width='90%')
    rows = table.findAll('tr')
    for row in rows:
        cell = row.find('td')
        if not cell:
            continue

        # A fresh item per row: reusing a single instance would make every
        # dictionary entry point at the same object (the last row's data).
        item = DliMetaItem()

        anchorTag = cell.find('a')
        # Only index the attribute map when it is actually populated.
        if anchorTag and anchorTag.attrMap:
            item.metadataLink = anchorTag.attrMap['href']

        metaText = cell.findAll(text=True)
        # -1 is empty since there is a dot at the end.
        item.pages = metaText[1].split('.')[-2]
        item.title = metaText[0]
        item.barcode = metaText[1].lstrip(', ')
        temp_items[item.barcode] = item
    return temp_items
def extractItems(self, response):
    """Parse a listing page and build one ``DliMetaItem`` per table row.

    The markup in ``response.body`` is searched for the table whose
    ``width`` attribute is ``'90%'``.  For every row with a ``<td>``, the
    text nodes of that first cell are read: the first becomes the title,
    and the second supplies both the barcode and the page count.

    Args:
        response: object exposing the page markup as ``response.body``.

    Returns:
        dict mapping each row's barcode to the item built from that row.
        When the instance already has a ``temp_items`` mapping, the same
        entries are also stored there.

    Raises:
        AttributeError: if the page contains no matching table.
        IndexError: if a cell has fewer than two text nodes, or its second
            text node contains no ``'.'``.
    """
    # we keep this dictionary for partially filled items.
    temp_items = {}
    # Earlier behaviour stored rows only on the instance; keep mirroring
    # there when the attribute exists so existing readers still see them.
    shared_items = getattr(self, 'temp_items', None)

    soup = BeautifulSoup.BeautifulSoup(response.body)
    table = soup.find('table', width='90%')
    rows = table.findAll('tr')
    for row in rows:
        cell = row.find('td')
        if not cell:
            continue

        # A fresh item per row: reusing a single instance would make every
        # dictionary entry point at the same object (the last row's data).
        item = DliMetaItem()

        anchorTag = cell.find('a')
        if anchorTag and anchorTag.attrMap:
            # NOTE(review): set via attribute while the other fields use
            # item[...] -- confirm this is intended for DliMetaItem.
            item.metadataLink = anchorTag.attrMap['href']

        metaText = cell.findAll(text=True)
        # -1 is empty since there is a dot at the end.
        item['pages'] = metaText[1].split('.')[-2].strip('\n \t')
        item['title'] = metaText[0]
        item['barcode'] = metaText[1].lstrip(', ')

        barcode = item['barcode']
        temp_items[barcode] = item
        if shared_items is not None:
            shared_items[barcode] = item
    return temp_items
def extractItems(self, response):
    """Collect the items listed on a page, keyed by barcode.

    Looks up the table with ``width='90%'`` in ``response.body`` and walks
    its rows.  Rows without a ``<td>`` are skipped.  For the remaining
    rows, the first text node of the first cell is used as the title; the
    second text node yields the barcode (leading ``', '`` characters
    stripped) and the page count (the second-to-last ``'.'``-separated
    piece, with surrounding whitespace removed).

    Args:
        response: object exposing the page markup as ``response.body``.

    Returns:
        dict of ``barcode -> DliMetaItem`` with one distinct item per row.
        The entries are also written to ``self.temp_items`` when that
        attribute is present.

    Raises:
        AttributeError: if the page contains no matching table.
        IndexError: if a cell has fewer than two text nodes, or its second
            text node contains no ``'.'``.
    """
    # we keep this dictionary for partially filled items.
    temp_items = {}
    # Rows used to be stored only on the instance; continue to mirror them
    # there (if the attribute exists) so that side effect is not lost.
    instance_items = getattr(self, 'temp_items', None)

    soup = BeautifulSoup.BeautifulSoup(response.body)
    table = soup.find('table', width='90%')
    for row in table.findAll('tr'):
        cell = row.find('td')
        if not cell:
            continue

        # Build a new item for each row so entries do not alias one shared
        # object that ends up holding only the last row's values.
        item = DliMetaItem()

        anchorTag = cell.find('a')
        if anchorTag and anchorTag.attrMap:
            # NOTE(review): attribute assignment here vs. item[...] for the
            # other fields -- confirm this is intended for DliMetaItem.
            item.metadataLink = anchorTag.attrMap['href']

        metaText = cell.findAll(text=True)
        # -1 is empty since there is a dot at the end.
        item['pages'] = metaText[1].split('.')[-2].strip('\n \t')
        item['title'] = metaText[0]
        item['barcode'] = metaText[1].lstrip(', ')

        key = item['barcode']
        temp_items[key] = item
        if instance_items is not None:
            instance_items[key] = item
    return temp_items