Example #1
class ClinicalTrialsHarvester(XMLHarvester):

    short_name = 'clinicaltrials'
    long_name = 'ClinicalTrials.gov'
    url = 'https://clinicaltrials.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None

    # TODO - clinicaltrials elements have a lot of extra metadata - at some
    # point in the future we should do a more thorough audit.
    schema = {
        "contributors":
        ('//overall_official/last_name/node()', default_name_parser),
        "uris": {
            "canonicalUri": ("//required_header/url/node()", single_result)
        },
        "providerUpdatedDateTime":
        ("lastchanged_date/node()", compose(datetime_formatter,
                                            single_result)),
        "title": ('//official_title/node()', '//brief_title/node()',
                  lambda x, y: single_result(x) or single_result(y)),
        "description": ('//brief_summary/textblock/node()',
                        '//brief_summary/textblock/node()',
                        lambda x, y: single_result(x) or single_result(y)),
        "tags":
        ("//keyword/node()", lambda tags: [tag.lower() for tag in tags]),
        "sponsorships": [{
            "sponsor": {
                "sponsorName":
                ("//sponsors/lead_sponsor/agency/node()", single_result)
            }
        }, {
            "sponsor": {
                "sponsorName":
                ("//sponsors/collaborator/agency/node()", single_result)
            }
        }],
        "otherProperties":
        build_properties(
            ("serviceID", "//nct_id/node()"),
            ('oversightAuthority', '//oversight_info/authority/node()'),
            ('studyDesign', '//study_design/node()'),
            ('numberOfArms', '//number_of_arms/node()'),
            ('source', '//source/node()'),
            ('verificationDate', '//verification_date/node()'),
            ('lastChanged', '//lastchanged_date/node()'),
            ('condition', '//condition/node()'),
            ('status', '//status/node()'),
            ('locationCountries', '//location_countries/country/node()'),
            ('isFDARegulated', '//is_fda_regulated/node()'),
            ('isSection801', '//is_section_801/node()'),
            ('hasExpandedAccess', '//has_expanded_access/node()'),
            ('leadSponsorAgencyClass', '//lead_sponsor/agency_class/node()'),
            ('collaborator', '//collaborator/agency/node()'),
            ('collaboratorAgencyClass', '//collaborator/agency_class/node()'),
            ('measure', '//primary_outcome/measure/node()'),
            ('timeFrame', '//primary_outcome/time_frame/node()'),
            ('safetyIssue', '//primary_outcome/safety_issue/node()'),
            ('secondaryOutcomes', '//secondary_outcome/measure/node()'),
            ('enrollment', '//enrollment/node()'),
            ('armGroup', '//arm_group/arm_group_label/node()'),
            ('intervention', '//intervention/intervention_type/node()'),
            ('eligibility', ('//eligibility/node()',
                             compose(lambda x: list(map(element_to_dict, x)),
                                     lambda x: list(filter(non_string, x))))),
            ('link', '//link/url/node()'),
            ('responsible_party',
             '//responsible_party/responsible_party_full_name/node()'))
    }

    @property
    def namespaces(self):
        return None

    def harvest(self, start_date=None, end_date=None):
        """ First, get a list of all recently updated study urls,
        then get the xml one by one and save it into a list
        of docs including other information """

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        end_month = end_date.strftime('%m')
        end_day = end_date.strftime('%d')
        end_year = end_date.strftime('%Y')

        start_month = start_date.strftime('%m')
        start_day = start_date.strftime('%d')
        start_year = start_date.strftime('%Y')

        base_url = 'http://clinicaltrials.gov/ct2/results?lup_s='
        url_end = '{}%2F{}%2F{}&lup_e={}%2F{}%2F{}&displayxml=true'.\
            format(start_month, start_day, start_year, end_month, end_day, end_year)

        url = base_url + url_end

        # grab the total number of studies
        initial_request = requests.get(url)
        record_encoding = initial_request.encoding
        initial_request_xml = etree.XML(initial_request.content)
        count = int(initial_request_xml.xpath('//search_results/@count')[0])
        xml_list = []
        if count > 0:
            # get a new url with all results in it
            url = url + '&count=' + str(count)
            total_requests = requests.get(url)
            initial_doc = etree.XML(total_requests.content)

            # make a list of urls from that full list of studies
            study_urls = []
            for study in initial_doc.xpath('//clinical_study'):
                study_urls.append(
                    study.xpath('url/node()')[0] + '?displayxml=true')

            # grab each of those urls for full content
            logger.info("There are {} urls to harvest - be patient...".format(
                len(study_urls)))
            count = 0
            official_count = 0
            for study_url in study_urls:
                try:
                    content = requests.get(study_url)
                except requests.exceptions.ConnectionError as e:
                    logger.info(
                        'Connection error: {}, wait a bit...'.format(e))
                    time.sleep(30)
                    continue
                doc = etree.XML(content.content)
                record = etree.tostring(doc, encoding=record_encoding)
                doc_id = doc.xpath('//nct_id/node()')[0]
                xml_list.append(
                    RawDocument({
                        'doc': record,
                        'source': self.short_name,
                        'docID': copy_to_unicode(doc_id),
                        'filetype': 'xml',
                    }))
                official_count += 1
                count += 1
                if count % 100 == 0:
                    logger.info(
                        "You've requested {} studies, keep going!".format(
                            official_count))
                    count = 0

        return xml_list
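
The harvest method above hand-builds the lup_s/lup_e query string with %2F-escaped dates. A minimal sketch of the same date-window request, letting requests handle the URL encoding; the endpoint and parameter names are taken from harvest above, everything else is illustrative:

from datetime import date, timedelta

import requests


def build_results_request(start_date, end_date, count=None):
    # lup_s / lup_e bound the "last updated" window; displayxml requests XML.
    params = {
        'lup_s': start_date.strftime('%m/%d/%Y'),
        'lup_e': end_date.strftime('%m/%d/%Y'),
        'displayxml': 'true',
    }
    if count is not None:
        params['count'] = count
    return requests.get('http://clinicaltrials.gov/ct2/results', params=params)

# e.g. the last seven days of updates:
# resp = build_results_request(date.today() - timedelta(days=7), date.today())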
Example #2
            isotope analysis.',
    'providerUpdatedDateTime':
    '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'test'
    }
}

TEST_SCHEMA = updated_schema(
    DOESCHEMA, {
        "title": ("//dc:title/node()", lambda x: "Title overwritten"),
        "otherProperties":
        build_properties(
            ("title1", ("//dc:title/node()", single_result)),
            ("title2",
             ("//dc:title/node()", lambda x: single_result(x).lower())),
            ("title3",
             ("//dc:title/node()", "//dc:title/node()",
              lambda x, y: single_result(x) + single_result(y).lower())))
    })

TEST_NAMESPACES = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcq': 'http://purl.org/dc/terms/'
}
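
These prefixes only take effect when they are passed to the XPath evaluator. A minimal sketch, assuming lxml (which the harvesters above use), of resolving the //dc:title expressions from TEST_SCHEMA against a document such as TEST_XML_DOC defined below:

from lxml import etree


def first_dc_title(xml_doc, namespaces=TEST_NAMESPACES):
    # The namespaces mapping lets the dc: prefix resolve to
    # http://purl.org/dc/elements/1.1/ as declared above.
    doc = etree.XML(xml_doc)
    titles = doc.xpath('//dc:title/node()', namespaces=namespaces)
    return titles[0] if titles else None

# e.g. first_dc_title(TEST_XML_DOC) returns the first dc:title text node.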

TEST_XML_DOC = '''
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcq="http://purl.org/dc/terms/">
        <records count="97" morepages="true" start="1" end="10">
            <record rownumber="1">
Example #3
    'description': 'This study seeks to understand how humans impact\
            the dietary patterns of eight free-ranging vervet monkey\
            (Chlorocebus pygerythrus) groups in South Africa using stable\
            isotope analysis.',
    'providerUpdatedDateTime': '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'test'
    }
}


TEST_SCHEMA = updated_schema(DOESCHEMA, {
    "title": ("//dc:title/node()", lambda x: "Title overwritten"),
    "otherProperties": build_properties(
        ("title1", ("//dc:title/node()", single_result)),
        ("title2", ("//dc:title/node()", lambda x: single_result(x).lower())),
        ("title3", ("//dc:title/node()", "//dc:title/node()", lambda x, y: single_result(x) + single_result(y).lower()))
    )
})
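
Each schema value here, as in the harvesters above, pairs one or more XPath expressions with a final callable. The sketch below shows how such an entry is presumably evaluated; this is an assumption about XMLHarvester's behaviour (run each XPath against the record, then pass the result lists positionally to the callable), not the library's actual code:

from lxml import etree


def apply_entry(doc, entry, namespaces=None):
    # Assumed convention: everything but the last element is an XPath
    # expression; the last element combines or cleans the raw results.
    *xpaths, transform = entry
    results = [doc.xpath(xp, namespaces=namespaces) for xp in xpaths]
    return transform(*results)

# e.g. for the "title3" property above:
# doc = etree.XML(TEST_XML_DOC)
# apply_entry(doc, ("//dc:title/node()", "//dc:title/node()",
#                   lambda x, y: single_result(x) + single_result(y).lower()),
#             namespaces=TEST_NAMESPACES)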


TEST_NAMESPACES = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcq': 'http://purl.org/dc/terms/'
}


TEST_XML_DOC = b'''
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcq="http://purl.org/dc/terms/">
        <records count="97" morepages="true" start="1" end="10">
Example #4
class DataOneHarvester(XMLHarvester):
    short_name = 'dataone'
    long_name = 'DataONE: Data Observation Network for Earth'
    url = 'https://www.dataone.org/'

    namespaces = {}

    record_encoding = None

    schema = {
        'otherProperties': build_properties(
            ('authorGivenName', "str[@name='authorGivenName']/node()"),
            ('authorSurName', "str[@name='authorSurName']/node()"),
            ('authoritativeMN', "str[@name='authoritativeMN']/node()"),
            ('checksum', "str[@name='checksum']/node()"),
            ('checksumAlgorithm', "str[@name='checksumAlgorithm']/node()"),
            ('dataUrl', "str[@name='dataUrl']/node()"),
            ('datasource', "str[@name='datasource']/node()"),
            ('dateModified', "date[@name='dateModified']/node()"),
            ('datePublished', "date[@name='datePublished']/node()"),
            ('dateUploaded', "date[@name='dateUploaded']/node()"),
            ('pubDate', "date[@name='pubDate']/node()"),
            ('updateDate', "date[@name='updateDate']/node()"),
            ('fileID', "str[@name='fileID']/node()"),
            ('formatId', "str[@name='formatId']/node()"),
            ('formatType', "str[@name='formatType']/node()"),
            ('identifier', "str[@name='identifier']/node()"),
            ('investigator', "arr[@name='investigator']/str/node()"),
            ('origin', "arr[@name='origin']/str/node()"),
            ('isPublic', "bool[@name='isPublic']/node()"),
            ('readPermission', "arr[@name='readPermission']/str/node()"),
            ('replicaMN', "arr[@name='replicaMN']/str/node()"),
            ('replicaVerifiedDate', "arr[@name='replicaVerifiedDate']/date/node()"),
            ('replicationAllowed', "bool[@name='replicationAllowed']/node()"),
            ('numberReplicas', "int[@name='numberReplicas']/node()"),
            ('preferredReplicationMN', "arr[@name='preferredReplicationMN']/str/node()"),
            ('resourceMap', "arr[@name='resourceMap']/str/node()"),
            ('rightsHolder', "str[@name='rightsHolder']/node()"),
            ('scientificName', "arr[@name='scientificName']/str/node()"),
            ('site', "arr[@name='site']/str/node()"),
            ('size', "long[@name='size']/node()"),
            ('sku', "str[@name='sku']/node()"),
            ('isDocumentedBy', "arr[@name='isDocumentedBy']/str/node()"),
            ('serviceID', "str[@name='id']/node()")),
        'freeToRead': {
            'startDate': ("bool[@name='isPublic']/node()",
                          "date[@name='dateModified']/node()",
                          lambda x, y: parse(y[0]).date().isoformat()
                          if x else None)
        },
        'contributors':
        ("str[@name='author']/node()", "str[@name='submitter']/node()",
         "arr[@name='origin']/str/node()", process_contributors),
        'uris': {
            'canonicalUri': ("str[@name='id']/node()",
                             "//str[@name='dataUrl']/node()", lambda x, y: y[0]
                             if 'http' in single_result(y) else x[0]
                             if 'http' in single_result(x) else ''),
        },
        'tags': ("//arr[@name='keywords']/str/node()", lambda x: x
                 if isinstance(x, list) else [x]),
        'providerUpdatedDateTime':
        ("str[@name='dateModified']/node()",
         compose(lambda x: parse(x).date().isoformat(), single_result)),
        'title': ("str[@name='title']/node()", single_result),
        'description': ("str[@name='abstract']/node()", single_result)
    }

    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            doc_id = record.xpath("str[@name='id']")[0].text
            record = ElementTree.tostring(record,
                                          encoding=self.record_encoding)
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list

    def get_records(self, start_date, end_date):
        ''' Helper that pages through the DataONE Solr API for records
        modified between start_date and end_date, yielding each result
        <doc> element. '''

        query = 'dateModified:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date.isoformat(), end_date.isoformat())
        doc = requests.get(DATAONE_SOLR_ENDPOINT,
                           params={
                               'q': query,
                               'start': 0,
                               'rows': 1
                           })
        doc = etree.XML(doc.content)
        rows = int(doc.xpath("//result/@numFound")[0])

        n = 0
        while n < rows:
            data = requests.get(DATAONE_SOLR_ENDPOINT,
                                params={
                                    'q': query,
                                    'start': n,
                                    'rows': 1000
                                })
            docs = etree.XML(data.content).xpath('//doc')
            for doc in docs:
                yield doc
            n += 1000
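
get_records pages through the Solr index in blocks of 1000 by advancing the start offset until numFound is reached. A compact sketch of the same date-range paging in isolation; DATAONE_SOLR_ENDPOINT and the q/start/rows parameters come from the code above, and stopping on an empty page is simply an alternative to pre-fetching numFound:

import requests
from lxml import etree


def iter_solr_docs(endpoint, start_date, end_date, page_size=1000):
    # Same dateModified range query as get_records above.
    query = 'dateModified:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
        start_date.isoformat(), end_date.isoformat())
    start = 0
    while True:
        resp = requests.get(endpoint, params={
            'q': query, 'start': start, 'rows': page_size})
        docs = etree.XML(resp.content).xpath('//doc')
        if not docs:
            break
        for doc in docs:
            yield doc
        start += page_size

# e.g. docs = list(iter_solr_docs(DATAONE_SOLR_ENDPOINT, start_date, end_date))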