def crawl_document(self, url):
    """Crawl a World Bank document detail page and emit its download links.

    Fetches the detail page at ``url``, extracts title/summary/citation and
    the labelled metadata rows, then emits each non-text document link with
    that metadata attached.

    :param url: absolute URL of the document detail page.
    """
    try:
        self.check_tag(url=url)
    except TagExists:
        # URL was already crawled — skip it instead of re-fetching.
        # (Previously the exception was swallowed with ``pass``, which
        # defeated de-duplication entirely.)
        return
    res = requests.get(url)
    doc = html.fromstring(res.content)
    data = {
        'details_url': url,
        'title': doc.findtext('.//div[@class="c00v3-introduction"]/h1'),
        'summary': doc.findtext('.//span[@id="detail_abstract"]') or
                   doc.findtext('.//span[@id="summary_abstract"]')
    }
    log.info("Crawling WB document: %s, %s", data['title'], url)
    citation = doc.find('.//div[@id="CitationHidDiv"]')
    if citation is not None:
        data['citation'] = clean(citation)
    for li in doc.findall('.//ul[@class="detail"]/li'):
        label = li.findtext('./label')
        if label is None:
            continue
        label = slugify(label, sep='_')
        span = li.find('./span')
        if span is None:
            # Some rows carry a label but no value cell; skip them.
            # (xpath('string()') never returns None, so guarding the
            # element itself is the check that actually matters.)
            continue
        value = span.xpath('string()').strip().strip(';')
        if label == 'rel_proj_id':
            # Value has the form "<doc id> -- <project id>"; keep both parts.
            values = value.split(' -- ')
            value = values[0]
            if len(values) > 1:
                data['project_id'] = values[1]
        if len(value):
            data[label] = clean(value)
    for li in doc.findall('.//ul[@class="documentLnks"]/li'):
        record = data.copy()
        if li.get('class') != 'textdoc':
            doc_url = li.find('a').get('href')
            self.emit_url(doc_url, title=data['title'],
                          summary=data['summary'], meta=record)
def crawl_document(self, url):
    """Crawl an IFC project disclosure page and emit the page plus attachments.

    Fetches the page at ``url``, extracts heading/report metadata from the
    labelled table rows, emits the HTML page itself, and then emits every
    attachment listed in the hidden ``AttachmentNames`` input.

    :param url: absolute URL of the disclosure page.
    """
    try:
        package_id = self.check_tag(url=url)
    except TagExists:
        # URL was already crawled — nothing to do.  Using a guard clause
        # keeps the ``try`` body minimal instead of wrapping the whole
        # method (the old local name ``id`` also shadowed the builtin).
        return
    res = requests.get(url)
    doc = html.fromstring(res.content)
    data = {
        'url': url,
        'title': doc.findtext('.//td[@class="pageHeading"]'),
        'report': doc.findtext('.//td[@class="pageSubHeading"]')
    }
    for row in doc.findall('.//tr'):
        label = row.find('./td[@class="labelCell"]')
        if label is None or label.text is None:
            continue
        label = clean(label.text)
        label = label.replace('.', '').replace('/', '').replace(' ', '_').lower()
        # The site labels the sector column "Sector1"; normalize the key.
        label = label.replace('sector1', 'sector')
        node = row.find('./td[@class="dataCell"]')
        if node is not None:
            data[label] = clean(node.xpath('string()'))
    self.emit_url(url, package_id=package_id, mime_type='text/html',
                  extension='html', article=True, meta=data)
    attachments = doc.find('.//input[@name="AttachmentNames"]')
    if attachments is not None:
        # All attachment file names are packed into one hidden input,
        # delimited by the literal token '^~'.
        attachments = attachments.get('value').split('^~')
        docid = doc.find('.//input[@name="DocID"]').get('value')
        for attachment in attachments:
            if not attachment.strip():
                continue
            aurl = 'http://ifcext.ifc.org/ifcext/spiwebsite1.nsf/0/%s/$File/%s'
            aurl = aurl % (docid, attachment)
            try:
                aid = self.check_tag(url=aurl)
                self.emit_url(aurl, package_id=aid, meta=data)
            except TagExists:
                # Attachment already emitted on a previous run; skip it.
                pass