Example #1
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        if self.timezone_granularity:
            start_date += 'T00:00:00Z'
            end_date += 'T00:00:00Z'

        url = furl(self.base_url)
        url.args['verb'] = 'ListRecords'
        url.args['metadataPrefix'] = 'oai_dc'
        url.args['from'] = start_date
        url.args['until'] = end_date

        records = self.get_records(url.url, start_date, end_date)

        rawdoc_list = []
        for record in records:
            doc_id = record.xpath('ns0:header/ns0:identifier',
                                  namespaces=self.namespaces)[0].text
            record = etree.tostring(record, encoding=self.record_encoding)
            rawdoc_list.append(
                RawDocument({
                    'doc': record,
                    'source': util.copy_to_unicode(self.short_name),
                    'docID': util.copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return rawdoc_list
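The OAI-PMH harvesters above all delegate paging to self.get_records, which is not shown. A minimal sketch of what such a helper might look like, following the OAI-PMH resumptionToken loop and reusing the same requests/furl/lxml imports; this is an assumption for illustration, not the project's actual implementation:

    def get_records(self, url, start_date, end_date):
        # Fetch one page of ListRecords, then follow the
        # resumptionToken until the repository reports no more pages.
        # (Sketch under assumed imports: requests, furl, lxml.etree.)
        data = requests.get(url)
        doc = etree.XML(data.content)
        records = doc.xpath('//ns0:record', namespaces=self.namespaces)
        token = doc.xpath('//ns0:resumptionToken/node()',
                          namespaces=self.namespaces)
        if token:
            # Per the OAI-PMH spec, a resumption request carries only
            # the verb and the token, not the original date arguments.
            next_url = furl(self.base_url)
            next_url.args['verb'] = 'ListRecords'
            next_url.args['resumptionToken'] = token[0]
            records += self.get_records(next_url.url, start_date, end_date)
        return records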
Example #2
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        if self.timezone_granularity:
            start_date += 'T00:00:00Z'
            end_date += 'T00:00:00Z'

        url = furl(self.base_url)
        url.args['verb'] = 'ListRecords'
        url.args['metadataPrefix'] = 'oai_dc'
        url.args['from'] = start_date
        url.args['until'] = end_date

        records = self.get_records(url.url, start_date, end_date)

        rawdoc_list = []
        for record in records:
            doc_id = record.xpath(
                'ns0:header/ns0:identifier', namespaces=self.namespaces)[0].text
            record = etree.tostring(record, encoding=self.record_encoding)
            rawdoc_list.append(RawDocument({
                'doc': record,
                'source': util.copy_to_unicode(self.short_name),
                'docID': util.copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return rawdoc_list
Example #3
 def process_raw(self, raw_doc):
     self.send_to_database(source=copy_to_unicode(raw_doc['source']),
                           docID=copy_to_unicode(raw_doc['docID']),
                           doc=six.text_type(
                               raw_doc['doc']).encode('utf-8'),
                           filetype=copy_to_unicode(raw_doc['filetype']),
                           timestamps=raw_doc.get('timestamps', {})).save()
Example #4
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        if self.timezone_granularity:
            start_date += 'T00:00:00Z'
            end_date += 'T00:00:00Z'

        records_url = self.base_url + self.RECORDS_URL
        request_url = records_url + self.META_PREFIX_DATE.format(start_date, end_date)

        records = self.get_records(request_url, start_date, end_date)

        rawdoc_list = []
        for record in records:
            doc_id = record.xpath(
                'ns0:header/ns0:identifier', namespaces=self.namespaces)[0].text
            record = etree.tostring(record, encoding=self.record_encoding)
            rawdoc_list.append(RawDocument({
                'doc': record,
                'source': util.copy_to_unicode(self.short_name),
                'docID': util.copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return rawdoc_list
Example #5
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        if self.timezone_granularity:
            start_date += 'T00:00:00Z'
            end_date += 'T00:00:00Z'

        records_url = self.base_url + self.RECORDS_URL
        request_url = records_url + self.META_PREFIX_DATE.format(
            start_date, end_date)

        records = self.get_records(request_url, start_date, end_date)

        rawdoc_list = []
        for record in records:
            doc_id = record.xpath('ns0:header/ns0:identifier',
                                  namespaces=self.namespaces)[0].text
            record = etree.tostring(record, encoding=self.record_encoding)
            rawdoc_list.append(
                RawDocument({
                    'doc': record,
                    'source': util.copy_to_unicode(self.short_name),
                    'docID': util.copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return rawdoc_list
Example #6
 def process_raw(self, raw_doc):
     self.send_to_database(
         source=copy_to_unicode(raw_doc['source']),
         docID=copy_to_unicode(raw_doc['docID']),
         doc=six.text_type(raw_doc['doc']).encode('utf-8'),
         filetype=copy_to_unicode(raw_doc['filetype']),
         timestamps=raw_doc.get('timestamps', {})
     ).save()
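Examples #3 and #6 are the storage half of the pipeline. A hypothetical wiring of the two halves, with harvester and processor as illustrative names rather than the project's actual entry points:

    # Assumed glue code: feed each RawDocument produced by harvest()
    # into process_raw() for storage.
    for raw_doc in harvester.harvest(start_date, end_date):
        processor.process_raw(raw_doc)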
Example #7
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        base_url = 'https://www.earthsystemgrid.org/oai/repository?verb=ListRecords&metadataPrefix=dif&from={}&until={}'

        url = base_url.format(start_date, end_date)

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('//OAI-PMH:record', namespaces=self.namespaces)

        xml_list = []
        for record in records:
            doc_id = record.xpath('OAI-PMH:header/OAI-PMH:identifier/node()',
                                  namespaces=self.namespaces)[0]
            record = etree.tostring(record)
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list
Example #8
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'
        url = base_url.format('1', format_date_with_slashes(start_date), format_date_with_slashes(end_date))
        initial_data = requests.get(url)
        record_encoding = initial_data.encoding
        initial_doc = etree.XML(initial_data.content)

        num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

        url = base_url.format(num_results, format_date_with_slashes(start_date), format_date_with_slashes(end_date))

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('records/record')

        xml_list = []
        for record in records:
            doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
            record = etree.tostring(record, encoding=record_encoding)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
Example #9
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            # This ID is unique per data package, but won't unify multiple packages for the same project
            doc_id = record.xpath("str[@name='id']")[0].text
            format_type = record.xpath("str[@name='formatType']")[0].text
            record = ElementTree.tostring(record,
                                          encoding=self.record_encoding)
            if format_type.lower() != 'metadata':
                logger.info(
                    'Not normalizing record with ID {}, type {}'.format(
                        doc_id, format_type))
            else:
                xml_list.append(
                    RawDocument({
                        'doc': record,
                        'source': self.short_name,
                        'docID': copy_to_unicode(doc_id),
                        'filetype': 'xml'
                    }))

        return xml_list
Example #10
    def harvest(self, start_date=None, end_date=None):
        """
        Return a list of RawDocuments
        """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://exporter.nih.gov/'
        table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'

        # get ExPORTER page html and rows storing records
        html = requests.get(table_url).content
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table', id="ContentPlaceHolder1_ProjectData_dgProjectData")
        rows = table.find_all('tr', class_="row_bg")
        urls = list(construct_urls(base_url, start_date, end_date, rows))

        return [
            RawDocument({
                'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING),
                'source': self.short_name,
                'docID': copy_to_unicode(record.xpath('.//APPLICATION_ID/node()', namespaces=self.namespaces)[0]),
                'filetype': 'xml'
            }) for record in xml_records(get_xml_files(urls))
        ]
Example #11
    def harvest(self, start_date=None, end_date=None):
        """
        Return a list of RawDocuments
        """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://exporter.nih.gov/'
        table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'

        # get ExPORTER page html and rows storing records
        html = requests.get(table_url).content
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table',
                          id="ContentPlaceHolder1_ProjectData_dgProjectData")
        rows = table.find_all('tr', class_="row_bg")
        urls = list(construct_urls(base_url, start_date, end_date, rows))

        return [
            RawDocument({
                'doc':
                etree.tostring(record, encoding=self.DEFAULT_ENCODING),
                'source':
                self.short_name,
                'docID':
                copy_to_unicode(
                    record.xpath('.//APPLICATION_ID/node()',
                                 namespaces=self.namespaces)[0]),
                'filetype':
                'xml'
            }) for record in xml_records(get_xml_files(urls))
        ]
Example #12
    def harvest(self, start_date=None, end_date=None):
        ''' First, get a list of all recently updated study urls,
        then fetch the xml for each one and save it into a list
        of docs along with other information '''

        start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        # grab each of those urls for full content
        xml_list = []
        xml_base_url = self.canonical_base_url + '&view=xml'
        for dataset_id in self.query_by_date(start_date, end_date):
            try:
                item_url = str(xml_base_url).format(dataset_id)
                content = requests.get(item_url, throttle=2)
            except exceptions.ConnectionError as e:
                logger.info('Connection error: {}, wait a bit...'.format(e))
                time.sleep(30)
                content = requests.get(item_url)
            doc = etree.XML(content.content)

            record = etree.tostring(doc, encoding=self.DEFAULT_ENCODING)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(dataset_id),
                'filetype': 'xml',
            }))

        return xml_list
Example #13
    def harvest(self, start_date=None, end_date=None):
        ''' First, get a list of all recently updated study urls,
        then fetch the xml for each one and save it into a list
        of docs along with other information '''

        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        # grab each of those urls for full content
        xml_list = []
        xml_base_url = self.canonical_base_url + '&view=xml'
        for dataset_id in self.query_by_date(start_date, end_date):
            try:
                item_url = str(xml_base_url).format(dataset_id)
                content = requests.get(item_url, throttle=2)
            except exceptions.ConnectionError as e:
                logger.info('Connection error: {}, wait a bit...'.format(e))
                time.sleep(30)
                content = requests.get(item_url)
            doc = etree.XML(content.content)

            record = etree.tostring(doc, encoding=self.DEFAULT_ENCODING)
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(dataset_id),
                    'filetype': 'xml',
                }))

        return xml_list
Example #14
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        base_url = 'https://www.earthsystemgrid.org/oai/repository?verb=ListRecords&metadataPrefix=dif&from={}&until={}'

        url = base_url.format(start_date, end_date)

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('//OAI-PMH:record', namespaces=self.namespaces)

        xml_list = []
        for record in records:
            doc_id = record.xpath('OAI-PMH:header/OAI-PMH:identifier/node()', namespaces=self.namespaces)[0]
            record = etree.tostring(record)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
Example #15
    def harvest(self, days_back=1):
        today = date.today()
        start_date = today - timedelta(days_back)
        base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'
        url = base_url.format('1', start_date.strftime('%m/%d/%Y'), today.strftime('%m/%d/%Y'))
        initial_data = requests.get(url)
        record_encoding = initial_data.encoding
        initial_doc = etree.XML(initial_data.content)

        num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

        url = base_url.format(num_results, start_date.strftime('%m/%d/%Y'), today.strftime('%m/%d/%Y'))
        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('records/record')

        xml_list = []
        for record in records:
            doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
            record = etree.tostring(record, encoding=record_encoding)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
Example #16
    def harvest(self, days_back=1):
        records = self.get_records(days_back)

        xml_list = []
        for record in records:
            doc_id = record.xpath("str[@name='id']")[0].text
            record = ElementTree.tostring(record, encoding=self.record_encoding)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
Example #17
    def get_ids(self, result, doc):
        """
        Gather the DOI and url from identifiers, if possible.
        Tries to save the DOI alone without a url extension.
        Tries to save a link to the original content at the source,
        instead of direct to a PDF, which is usually linked with viewcontent.cgi?
        in the url field
        """
        serviceID = doc.get('docID')

        url = 'http://core.tdar.org/document/' + serviceID.replace('oai:tdar.org:Resource:', '')

        return {
            'serviceID': serviceID,
            'url': util.copy_to_unicode(url),
            'doi': ''
        }
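A usage sketch for get_ids, with an illustrative tDAR identifier (the docID value is hypothetical):

    # doc = {'docID': 'oai:tdar.org:Resource:12345'}
    # self.get_ids(result, doc)
    # => {'serviceID': 'oai:tdar.org:Resource:12345',
    #     'url': u'http://core.tdar.org/document/12345',
    #     'doi': ''}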
Example #18
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            doc_id = record.xpath("str[@name='id']")[0].text
            record = ElementTree.tostring(record, encoding=self.record_encoding)
            xml_list.append(
                RawDocument(
                    {"doc": record, "source": self.short_name, "docID": copy_to_unicode(doc_id), "filetype": "xml"}
                )
            )

        return xml_list
Example #19
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            # This ID is unique per data package, but won't unify multiple packages for the same project
            doc_id = record.xpath("str[@name='id']")[0].text
            record = ElementTree.tostring(record, encoding=self.record_encoding)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
Example #20
    def harvest(self, start_date=None, end_date=None):

        url = 'http://dailyssrn.com/rss/rss-all-2.0.xml'

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('channel/item')

        xml_list = []
        for record in records:
            doc_id = parse_id_from_url(record.xpath('link/node()'))
            record = etree.tostring(record)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
Example #21
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            doc_id = record.xpath("str[@name='id']")[0].text
            record = ElementTree.tostring(record,
                                          encoding=self.record_encoding)
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list
Example #22
    def harvest(self, start_date=None, end_date=None):

        url = 'http://dailyssrn.com/rss/rss-all-2.0.xml'

        data = requests.get(url, force=True)
        doc = etree.XML(data.content)

        records = doc.xpath('channel/item')

        xml_list = []
        for record in records:
            doc_id = parse_id_from_url(record.xpath('link/node()'))
            record = etree.tostring(record)
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list
Example #23
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            # This ID is unique per data package, but won't unify multiple packages for the same project
            doc_id = record.xpath("str[@name='id']")[0].text
            format_type = record.xpath("str[@name='formatType']")[0].text
            record = ElementTree.tostring(record, encoding=self.record_encoding)
            if format_type.lower() != 'metadata':
                logger.info('Not normalizing record with ID {}, type {}'.format(doc_id, format_type))
            else:
                xml_list.append(RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list
Example #24
 def process_normalized(self, raw_doc, normalized):
     self.send_to_database(
         source=copy_to_unicode(raw_doc['source']),
         docID=copy_to_unicode(raw_doc['docID']),
         contributors=copy_to_unicode(json.dumps(normalized['contributors'])),
         description=copy_to_unicode(normalized.get('description')),
         uris=copy_to_unicode(json.dumps(normalized['uris'])),
         providerUpdatedDateTime=parse(normalized['providerUpdatedDateTime']).replace(tzinfo=None),
         freeToRead=copy_to_unicode(json.dumps(normalized.get('freeToRead', {}))),
         languages=normalized.get('language'),
         licenses=copy_to_unicode(json.dumps(normalized.get('licenseRef', []))),
         publisher=copy_to_unicode(json.dumps(normalized.get('publisher', {}))),
         sponsorships=copy_to_unicode(json.dumps(normalized.get('sponsorship', []))),
         title=copy_to_unicode(normalized['title']),
         version=copy_to_unicode(json.dumps(normalized.get('version', {}))),
         otherProperties=copy_to_unicode(json.dumps(normalized.get('otherProperties', {}))),
         shareProperties=copy_to_unicode(json.dumps(normalized['shareProperties']))
     ).save()
Example #25
    def harvest(self, start_date=None, end_date=None):
        """ First, get a list of all recently updated study urls,
        then get the xml one by one and save it into a list
        of docs including other information """

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        end_month = end_date.strftime('%m')
        end_day = end_date.strftime('%d')
        end_year = end_date.strftime('%Y')

        start_month = start_date.strftime('%m')
        start_day = start_date.strftime('%d')
        start_year = start_date.strftime('%Y')

        base_url = 'http://clinicaltrials.gov/ct2/results?lup_s='
        url_end = '{}%2F{}%2F{}&lup_e={}%2F{}%2F{}&displayxml=true'.\
            format(start_month, start_day, start_year, end_month, end_day, end_year)

        url = base_url + url_end

        # grab the total number of studies
        initial_request = requests.get(url)
        record_encoding = initial_request.encoding
        initial_request_xml = etree.XML(initial_request.content)
        count = int(initial_request_xml.xpath('//search_results/@count')[0])
        xml_list = []
        if count > 0:
            # get a new url with all results in it
            url = url + '&count=' + str(count)
            total_requests = requests.get(url)
            initial_doc = etree.XML(total_requests.content)

            # make a list of urls from that full list of studies
            study_urls = []
            for study in initial_doc.xpath('//clinical_study'):
                study_urls.append(
                    study.xpath('url/node()')[0] + '?displayxml=true')

            # grab each of those urls for full content
            logger.info("There are {} urls to harvest - be patient...".format(
                len(study_urls)))
            count = 0
            official_count = 0
            for study_url in study_urls:
                try:
                    content = requests.get(study_url)
                except requests.exceptions.ConnectionError as e:
                    logger.info(
                        'Connection error: {}, wait a bit...'.format(e))
                    time.sleep(30)
                    continue
                doc = etree.XML(content.content)
                record = etree.tostring(doc, encoding=record_encoding)
                doc_id = doc.xpath('//nct_id/node()')[0]
                xml_list.append(
                    RawDocument({
                        'doc': record,
                        'source': self.short_name,
                        'docID': copy_to_unicode(doc_id),
                        'filetype': 'xml',
                    }))
                official_count += 1
                count += 1
                if count % 100 == 0:
                    logger.info(
                        "You've requested {} studies, keep going!".format(
                            official_count))
                    count = 0

        return xml_list
Example #26
    def harvest(self, start_date=None, end_date=None):
        """ First, get a list of all recently updated study urls,
        then get the xml one by one and save it into a list
        of docs including other information """

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        end_month = end_date.strftime('%m')
        end_day = end_date.strftime('%d')
        end_year = end_date.strftime('%Y')

        start_month = start_date.strftime('%m')
        start_day = start_date.strftime('%d')
        start_year = start_date.strftime('%Y')

        base_url = 'http://clinicaltrials.gov/ct2/results?lup_s='
        url_end = '{}%2F{}%2F{}&lup_e={}%2F{}%2F{}&displayxml=true'.\
            format(start_month, start_day, start_year, end_month, end_day, end_year)

        url = base_url + url_end

        # grab the total number of studies
        initial_request = requests.get(url)
        record_encoding = initial_request.encoding
        initial_request_xml = etree.XML(initial_request.content)
        count = int(initial_request_xml.xpath('//search_results/@count')[0])
        xml_list = []
        if count > 0:
            # get a new url with all results in it
            url = url + '&count=' + str(count)
            total_requests = requests.get(url)
            initial_doc = etree.XML(total_requests.content)

            # make a list of urls from that full list of studies
            study_urls = []
            for study in initial_doc.xpath('//clinical_study'):
                study_urls.append(study.xpath('url/node()')[0] + '?displayxml=true')

            # grab each of those urls for full content
            logger.info("There are {} urls to harvest - be patient...".format(len(study_urls)))
            count = 0
            official_count = 0
            for study_url in study_urls:
                try:
                    content = requests.get(study_url)
                except requests.exceptions.ConnectionError as e:
                    logger.info('Connection error: {}, wait a bit...'.format(e))
                    time.sleep(30)
                    continue
                doc = etree.XML(content.content)
                record = etree.tostring(doc, encoding=record_encoding)
                doc_id = doc.xpath('//nct_id/node()')[0]
                xml_list.append(RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml',
                }))
                official_count += 1
                count += 1
                if count % 100 == 0:
                    logger.info("You've requested {} studies, keep going!".format(official_count))
                    count = 0

        return xml_list
Example #27
    def test_copy_to_unicode(self):
        converted = util.copy_to_unicode('test')

        assert converted == u'test'
        assert isinstance(converted, unicode)
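The test pins down only two behaviours: the result compares equal to u'test' and is a unicode instance. A minimal copy_to_unicode consistent with that, sketched under the assumption of a six-based Python 2/3 codebase (not necessarily the project's actual implementation):

    def copy_to_unicode(element, encoding='utf-8'):
        # Decode byte strings with the given encoding; coerce anything
        # else to the platform's text type (unicode on Python 2).
        if isinstance(element, six.binary_type):
            return six.text_type(element, encoding)
        return six.text_type(element)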