def crawl_document(self, url):
    """Crawl a World Bank document detail page and emit its download links.

    Fetches the detail page at ``url``, extracts title/summary/citation and
    the labelled metadata rows, then emits each non-text document link with
    that metadata attached.

    :param url: absolute URL of the document detail page.
    """
    try:
        self.check_tag(url=url)
    except TagExists:
        # URL was already crawled — skip it instead of re-fetching.
        # (Previously the exception was swallowed with ``pass``, which
        # defeated de-duplication entirely.)
        return
    res = requests.get(url)
    doc = html.fromstring(res.content)
    data = {
        'details_url': url,
        'title': doc.findtext('.//div[@class="c00v3-introduction"]/h1'),
        'summary': doc.findtext('.//span[@id="detail_abstract"]') or
                   doc.findtext('.//span[@id="summary_abstract"]')
    }
    log.info("Crawling WB document: %s, %s", data['title'], url)
    citation = doc.find('.//div[@id="CitationHidDiv"]')
    if citation is not None:
        data['citation'] = clean(citation)
    for li in doc.findall('.//ul[@class="detail"]/li'):
        label = li.findtext('./label')
        if label is None:
            continue
        label = slugify(label, sep='_')
        span = li.find('./span')
        if span is None:
            # Some rows carry a label but no value cell; skip them.
            # (xpath('string()') never returns None, so guarding the
            # element itself is the check that actually matters.)
            continue
        value = span.xpath('string()').strip().strip(';')
        if label == 'rel_proj_id':
            # Value has the form "<doc id> -- <project id>"; keep both parts.
            values = value.split(' -- ')
            value = values[0]
            if len(values) > 1:
                data['project_id'] = values[1]
        if len(value):
            data[label] = clean(value)
    for li in doc.findall('.//ul[@class="documentLnks"]/li'):
        record = data.copy()
        if li.get('class') != 'textdoc':
            doc_url = li.find('a').get('href')
            self.emit_url(doc_url, title=data['title'],
                          summary=data['summary'], meta=record)
def crawl_document(self, url):
    """Crawl an IFC project disclosure page and emit the page plus attachments.

    Fetches the page at ``url``, extracts heading/report metadata from the
    labelled table rows, emits the HTML page itself, and then emits every
    attachment listed in the hidden ``AttachmentNames`` input.

    :param url: absolute URL of the disclosure page.
    """
    try:
        package_id = self.check_tag(url=url)
    except TagExists:
        # URL was already crawled — nothing to do.  Using a guard clause
        # keeps the ``try`` body minimal instead of wrapping the whole
        # method (the old local name ``id`` also shadowed the builtin).
        return
    res = requests.get(url)
    doc = html.fromstring(res.content)
    data = {
        'url': url,
        'title': doc.findtext('.//td[@class="pageHeading"]'),
        'report': doc.findtext('.//td[@class="pageSubHeading"]')
    }
    for row in doc.findall('.//tr'):
        label = row.find('./td[@class="labelCell"]')
        if label is None or label.text is None:
            continue
        label = clean(label.text)
        label = label.replace('.', '').replace('/', '').replace(' ', '_').lower()
        # The site labels the sector column "Sector1"; normalize the key.
        label = label.replace('sector1', 'sector')
        node = row.find('./td[@class="dataCell"]')
        if node is not None:
            data[label] = clean(node.xpath('string()'))
    self.emit_url(url, package_id=package_id, mime_type='text/html',
                  extension='html', article=True, meta=data)
    attachments = doc.find('.//input[@name="AttachmentNames"]')
    if attachments is not None:
        # All attachment file names are packed into one hidden input,
        # delimited by the literal token '^~'.
        attachments = attachments.get('value').split('^~')
        docid = doc.find('.//input[@name="DocID"]').get('value')
        for attachment in attachments:
            if not attachment.strip():
                continue
            aurl = 'http://ifcext.ifc.org/ifcext/spiwebsite1.nsf/0/%s/$File/%s'
            aurl = aurl % (docid, attachment)
            try:
                aid = self.check_tag(url=aurl)
                self.emit_url(aurl, package_id=aid, meta=data)
            except TagExists:
                # Attachment already emitted on a previous run; skip it.
                pass