Example #1
 def schema(self):
     return {
         "contributors":
         ('//PIS/PI/PI_NAME/node()', '//ORG_NAME', nih_name_parser),
         "uris": {
             "canonicalUri": ("//APPLICATION_ID/node()",
                              compose(self.construct_project_url,
                                      single_result)),
             "descriptorUris":
             ("//APPLICATION_ID/node()", "//FOA_NUMBER/node()",
              self.construct_descriptor_uris)
         },
         "providerUpdatedDateTime": ("AWARD_NOTICE_DATE/node()",
                                     compose(datetime_formatter,
                                             single_result)),
         "title": ('//PROJECT_TITLE/node()', single_result),
         "tags": ('//PROJECT_TERMSX/TERM/node()'),
         "otherProperties":
         build_properties(
             ("applicationID", "//APPLICATION_ID/node()"),
             ('activity', '//ACTIVITY/node()'),
             ('administeringIC', '//ADMINISTERING_IC/node()'),
             ('arraFunded', '//ARRA_FUNDED/node()'),
             ('budgetStart', '//BUDGET_START/node()'),
             ('budgetEnd', '//BUDGET_END/node()'),
             ('FOANumber', '//FOA_NUMBER/node()'),
             ('fullProjectNumber', '//FULL_PROJECT_NUM/node()'),
             ('fundingICs', '//FUNDING_ICs/node()'),
             ('fiscalYear', '//FY/node()'),
             ('NIHSpendingCats', '//NIH_SPENDING_CATS/@xsi:nil'),
             ('organizationCity', '//ORG_CITY/node()'),
             ('organizationCountry', '//ORG_CONTRY/node()'),
             ('organizationDistrict', '//ORG_DISTRICT/node()'),
             ('organizationDUNS', '//ORG_DUNS/node()'),
             ('organizationDept', '//ORG_DEPT/node()'),
             ('organizationFIPS', '//ORG_FIPS/node()'),
             ('organizationState', '//ORG_STATE/node()'),
             ('organizationZipcode', '//ORG_ZIPCODE/node()'),
             ('ICName', '//IC_NAME/node()'),
             ('organizationName', '//ORG_NAME/node()'),
             ('projectStart', '//PROJECT_START/node()'),
             ('projectEnd', '//PROJECT_END/node()'),
             ('PHR', '//PHR/node()'),
             ('serialNumber', '//SERIAL_NUMBER/node()'),
             ('studySection', '//STUDY_SECTION/node()'),
             ('studySectionName', '//STUDY_SECTION_NAME/node()'),
             ('supportYear', '//SUPPORT_YEAR/node()'),
             ('suffix', '//SUFFIX/node()'),
             ('subProjectID', '//SUBPROJECT_ID/@xsi:nil'),
             ('totalCost', '//TOTAL_COST/node()'),
             ('totalCostSubProject', '//TOTAL_COST_SUB_PROJECT/node()'),
             ('coreProjectNumber', '//CORE_PROJECT_NUM/node()'),
             ('CFDACode', '//CFDA_CODE/node()'),
             ('programOfficerName', '//PROGRAM_OFFICER_NAME/node()'),
             ('edInstType', '//ED_INST_TYPE/node()'),
             ('awardNoticeDate', '//AWARD_NOTICE_DATE/node()'),
             ('fundingMechanism', '//FUNDING_MECHANISM/node()'))
     }
Example #2
 def schema(self):
     return {
         "contributors": ('//PIS/PI/PI_NAME/node()', '//ORG_NAME', nih_name_parser),
         "uris": {
             "canonicalUri": ("//APPLICATION_ID/node()", compose(self.construct_project_url, single_result)),
             "descriptorUris": ("//APPLICATION_ID/node()", "//FOA_NUMBER/node()",
                                self.construct_descriptor_uris)
         },
         "providerUpdatedDateTime": ("AWARD_NOTICE_DATE/node()", compose(datetime_formatter, single_result)),
         "title": ('//PROJECT_TITLE/node()', single_result),
         "tags": ('//PROJECT_TERMSX/TERM/node()'),
         "otherProperties": build_properties(
             ("applicationID", "//APPLICATION_ID/node()"),
             ('activity', '//ACTIVITY/node()'),
             ('administeringIC', '//ADMINISTERING_IC/node()'),
             ('arraFunded', '//ARRA_FUNDED/node()'),
             ('budgetStart', '//BUDGET_START/node()'),
             ('budgetEnd', '//BUDGET_END/node()'),
             ('FOANumber', '//FOA_NUMBER/node()'),
             ('fullProjectNumber', '//FULL_PROJECT_NUM/node()'),
             ('fundingICs', '//FUNDING_ICs/node()'),
             ('fiscalYear', '//FY/node()'),
             ('NIHSpendingCats', '//NIH_SPENDING_CATS/@xsi:nil'),
             ('organizationCity', '//ORG_CITY/node()'),
             ('organizationCountry', '//ORG_CONTRY/node()'),
             ('organizationDistrict', '//ORG_DISTRICT/node()'),
             ('organizationDUNS', '//ORG_DUNS/node()'),
             ('organizationDept', '//ORG_DEPT/node()'),
             ('organizationFIPS', '//ORG_FIPS/node()'),
             ('organizationState', '//ORG_STATE/node()'),
             ('organizationZipcode', '//ORG_ZIPCODE/node()'),
             ('ICName', '//IC_NAME/node()'),
             ('organizationName', '//ORG_NAME/node()'),
             ('projectStart', '//PROJECT_START/node()'),
             ('projectEnd', '//PROJECT_END/node()'),
             ('PHR', '//PHR/node()'),
             ('serialNumber', '//SERIAL_NUMBER/node()'),
             ('studySection', '//STUDY_SECTION/node()'),
             ('studySectionName', '//STUDY_SECTION_NAME/node()'),
             ('supportYear', '//SUPPORT_YEAR/node()'),
             ('suffix', '//SUFFIX/node()'),
             ('subProjectID', '//SUBPROJECT_ID/@xsi:nil'),
             ('totalCost', '//TOTAL_COST/node()'),
             ('totalCostSubProject', '//TOTAL_COST_SUB_PROJECT/node()'),
             ('coreProjectNumber', '//CORE_PROJECT_NUM/node()'),
             ('CFDACode', '//CFDA_CODE/node()'),
             ('programOfficerName', '//PROGRAM_OFFICER_NAME/node()'),
             ('edInstType', '//ED_INST_TYPE/node()'),
             ('awardNoticeDate', '//AWARD_NOTICE_DATE/node()'),
             ('fundingMechanism', '//FUNDING_MECHANISM/node()')
         )
     }
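Note: the schemas in these examples lean on small helpers from scrapi.base.helpers (compose, single_result, build_properties, etc.) that are not shown on this page. Below is a minimal sketch of how compose and single_result behave, inferred only from the way they are chained above; the real helpers may take extra arguments and do more error handling.

def compose(*functions):
    # compose(f, g)(x) == f(g(x)): the rightmost function runs first,
    # which is why single_result usually sits at the end of the chains above.
    def composed(*args, **kwargs):
        result = functions[-1](*args, **kwargs)
        for fn in reversed(functions[:-1]):
            result = fn(result)
        return result
    return composed

def single_result(results, default=''):
    # XPath queries return lists; keep the first hit, or fall back to a default.
    return results[0] if results else default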
Example #3
 def schema(self):
     return {
         'contributors': ('/authors', process_contributors),
         'uris': {
             'objectUris': ('/url', '/full_dataset_url', compose(filter_none, lambda x, y: [x, y])),
             'descriptorUris': ('/DOI', '/paper_url', compose(filter_none, lambda x, y: [('http://dx.doi.org/{}'.format(x) if x else None), y])),
             'canonicalUri': '/url',
         },
         'title': '/name',
         'providerUpdatedDateTime': ('/modify_date', datetime_formatter),
         'description': '/description',
         'otherProperties': build_properties(
             ('owner_name', '/owner_name'),
         )
     }
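The objectUris and descriptorUris rules above gather two paths into a list and then strip out missing values. A plausible one-liner for filter_none, assuming it simply drops None entries (the actual scrapi helper is not shown here):

def filter_none(items):
    # '/DOI' or '/paper_url' may be absent, in which case the lambdas above
    # pass None through; keep only the URIs that actually resolved.
    return [item for item in items if item is not None]

# e.g. filter_none(['http://dx.doi.org/10.123/abc', None]) -> ['http://dx.doi.org/10.123/abc']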
Example #4
 def schema(self):
     return helpers.updated_schema(self._schema, {
         "uris": {
             "canonicalUri": ('//dc:identifier/node()', helpers.compose(create_icpsr_url, helpers.single_result)),
             "objectUris": [('//dc:identifier/node()', icpsr_exttract_doi)]
         }
     })
Example #5
 def schema(self):
     return {
         'contributors':
         ('/creators',
          compose(default_name_parser,
                  lambda authors: [author['creator']
                                   for author in authors])),
         'uris': ('/url', process_urls),
         'title':
         '/title',
         'providerUpdatedDateTime':
         ('/publicationDate', datetime_formatter),
         'description':
         '/abstract',
         'freeToRead': {
             'startDate': ('/openaccess', '/publicationDate', lambda x, y: y
                           if x == 'true' else None)
         },
         'publisher': {
             'name': '/publisher'
         },
         'subjects': ('/genre', lambda x: [x] if x else []),
         'otherProperties':
         build_properties(
             ('url', '/url'), ('doi', '/doi'), ('isbn', '/isbn'),
             ('printIsbn', '/printIsbn'),
             ('electronicIsbn', '/electronicIsbn'), ('volume', '/volume'),
             ('number', '/number'), ('startingPage', '/startingPage'),
             ('copyright', '/copyright'), ('identifier', '/identifier'))
     }
Example #6
 def schema(self):
     return {
         'contributors': (
             '/creators',
             compose(
                 default_name_parser,
                 lambda authors: [author['creator'] for author in authors]
             )
         ),
         'uris': ('/url', process_urls),
         'title': '/title',
         'providerUpdatedDateTime': ('/publicationDate', datetime_formatter),
         'description': '/abstract',
         'freeToRead': {
             'startDate': ('/openaccess', '/publicationDate', lambda x, y: y if x == 'true' else None)
         },
         'publisher': {
             'name': '/publisher'
         },
         'subjects': ('/genre', lambda x: [x] if x else []),
         'otherProperties': build_properties(
             ('url', '/url'),
             ('doi', '/doi'),
             ('isbn', '/isbn'),
             ('printIsbn', '/printIsbn'),
             ('electronicIsbn', '/electronicIsbn'),
             ('volume', '/volume'),
             ('number', '/number'),
             ('startingPage', '/startingPage'),
             ('copyright', '/copyright'),
             ('identifier', '/identifier')
         )
     }
Example #7
 def format_property(self, property):
     if property == "date":
         fn = compose(
             lambda x: list(map(null_on_error(datetime_formatter), x)), coerce_to_list, self.resolve_property
         )
     else:
         fn = self.resolve_property
     return (property, ("//dc:{}/node()".format(property), "//ns0:{}/node()".format(property), fn))
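format_property chains three steps: resolve the property, coerce the result into a list, then date-format each entry while swallowing parse failures. Sketches of the two helpers it assumes, based only on how they are called here (the signatures in scrapi.base.helpers may differ):

def coerce_to_list(value):
    # XPath can hand back a single node or a list; normalize to a list.
    if value is None:
        return []
    return value if isinstance(value, list) else [value]

def null_on_error(fn):
    # Wrap a formatter so one malformed date yields None instead of
    # aborting the whole record.
    def wrapped(value):
        try:
            return fn(value)
        except Exception:
            return None
    return wrapped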
Example #8
 def schema(self):
     return updated_schema(self._schema, {
         "description": ("//dc:description/node()", get_second_description),
         "uris": {
             "canonicalUri": ('//dc:identifier/node()', compose(single_result, oai_extract_dois)),
             "objectUris": ('//dc:identifier/node()', oai_extract_dois)
         }
     })
Example #9
 def schema(self):
     id_stanza = './gmd:identificationInfo/gmd:MD_DataIdentification/'
     cite_stanza = id_stanza + 'gmd:citation/gmd:CI_Citation/'
     return {
         'title':
         (cite_stanza + 'gmd:title', compose(xml_text_only, single_result)),
         'description': (id_stanza + 'gmd:abstract',
                         compose(xml_text_only, single_result)),
         'contributors':
         (cite_stanza + 'gmd:citedResponsibleParty/gmd:CI_ResponsibleParty',
          compose(parse_contributors, filter_to_contributors)),
         'uris': {
             'canonicalUri':
             ('./gmd:fileIdentifier',
              compose(lambda x: str(self.canonical_base_url).format(x),
                      xml_text_only, single_result)),
         },
         'publisher': (
             cite_stanza +
             'gmd:citedResponsibleParty/gmd:CI_ResponsibleParty',
             compose(extract_organization, single_result,
                     filter_to_publishers),
         ),
         'providerUpdatedDateTime': ('./gmd:dateStamp/gco:DateTime/node()',
                                     compose(datetime_formatter,
                                             single_result)),
         'languages': ('./gmd:language/gmd:LanguageCode',
                       compose(language_codes, xml_text_only_list,
                               coerce_to_list)),
         'subjects': (id_stanza + 'gmd:descriptiveKeywords/gmd:MD_Keywords',
                      lambda x: filter_keywords(x)),
     }
Example #10
 def format_property(self, property):
     if property == 'date':
         fn = compose(
             lambda x: list(map(null_on_error(datetime_formatter), x)),
             coerce_to_list, self.resolve_property)
     else:
         fn = self.resolve_property
     return (property, ('//dc:{}/node()'.format(property),
                        '//ns0:{}/node()'.format(property), fn))
Example #11
class DailyssrnHarvester(XMLHarvester):
    short_name = 'dailyssrn'
    long_name = 'Social Science Research Network'
    url = 'http://papers.ssrn.com/'

    schema = {
        "description":
        ('//description/node()', compose(lambda x: x.strip(), single_result)),
        "title": ('//title/node()', compose(lambda x: x.strip(),
                                            single_result)),
        "providerUpdatedDateTime": ('//pubDate/node()',
                                    compose(lambda x: x.isoformat(), parse,
                                            lambda x: x.strip(),
                                            single_result)),
        "contributors":
        '//contributors/node()',
        "uris": {
            "canonicalUri":
            ('//link/node()', compose(lambda x: x.strip(), single_result)),
        }
    }

    def harvest(self, start_date=None, end_date=None):

        url = 'http://dailyssrn.com/rss/rss-all-2.0.xml'

        data = requests.get(url, force=True)
        doc = etree.XML(data.content)

        records = doc.xpath('channel/item')

        xml_list = []
        for record in records:
            doc_id = parse_id_from_url(record.xpath('link/node()'))
            record = etree.tostring(record)
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list
Example #12
 def schema(self):
     return helpers.updated_schema(
         self._schema, {
             "uris": {
                 "canonicalUri":
                 ('//ns0:header/ns0:identifier/node()',
                  helpers.compose(oai_extract_url_pubmedcentral,
                                  helpers.single_result))
             }
         })
Example #13
 def schema(self):
     return {
         'title': ('/title', lambda x: x[0] if x else ''),
         'description': ('/subtitle', lambda x: x[0]
                         if (isinstance(x, list) and x) else x or ''),
         'providerUpdatedDateTime':
         ('/issued/date-parts',
          compose(datetime_formatter,
                  lambda x: ' '.join([str(part) for part in x[0]]))),
         'uris': {
             'canonicalUri': '/URL'
         },
         'contributors':
         ('/author',
          compose(
              lambda x: [
                  process_contributor(*[
                      '{} {}'.format(entry.get('given'), entry.get('family')
                                     ),
                      entry.get('ORCID')
                  ]) for entry in x
              ], lambda x: x or [])),
         'sponsorships': ('/funder', lambda x: process_sponsorships(x)
                          if x else []),
         'tags':
         ('/subject', '/container-title',
          lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
         'subjects':
         ('/subject', '/container-title',
          lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
         'otherProperties':
         build_properties(
             ('journalTitle', '/container-title'), ('volume', '/volume'),
             ('issue', '/issue'), ('publisher', '/publisher'),
             ('type', '/type'), ('ISSN', '/ISSN'), ('ISBN', '/ISBN'),
             ('member', '/member'), ('score', '/score'),
             ('issued', '/issued'), ('deposited', '/deposited'),
             ('indexed', '/indexed'), ('page', '/page'),
             ('issue', '/issue'), ('volume', '/volume'),
             ('referenceCount', '/reference-count'),
             ('updatePolicy', '/update-policy'),
             ('depositedTimestamp', '/deposited/timestamp'))
     }
Example #14
 def format_property(self, property):
     if property == 'date':
         fn = compose(lambda x: map(null_on_error(date_formatter), x), coerce_to_list, self.resolve_property)
     else:
         fn = self.resolve_property
     return (property, (
         '//dc:{}/node()'.format(property),
         '//ns0:{}/node()'.format(property),
         fn)
     )
Example #15
 def schema(self):
     return updated_schema(
         self._schema, {
             "description":
             ("//dc:description/node()", get_second_description),
             "uris": {
                 "canonicalUri": ('//dc:identifier/node()',
                                  compose(single_result, oai_extract_dois)),
                 "objectUris": ('//dc:identifier/node()', oai_extract_dois)
             }
         })
Example #16
 def schema(self):
     return {
         'contributors': ('/authors', process_contributors),
         'uris': {
             'objectUris': ('/url', '/full_dataset_url',
                            compose(filter_none, lambda x, y: [x, y])),
             'descriptorUris':
             ('/DOI', '/paper_url',
              compose(
                  filter_none,
                  lambda x, y: [('http://dx.doi.org/{}'.format(x)
                                 if x else None), y])),
             'canonicalUri':
             '/url',
         },
         'title': '/name',
         'providerUpdatedDateTime': ('/modify_date', datetime_formatter),
         'description': '/description',
         'otherProperties': build_properties(
             ('owner_name', '/owner_name'), )
     }
Example #17
 def schema(self):
     return helpers.updated_schema(
         self._schema, {
             "uris": {
                 "canonicalUri":
                 ('//dc:identifier/node()',
                  helpers.compose(create_icpsr_url, helpers.single_result)),
                 "objectUris": [
                     ('//dc:identifier/node()', icpsr_exttract_doi)
                 ]
             }
         })
Example #18
 def schema(self):
     return {
         'title': ('/title', lambda x: x[0] if x else ''),
         'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
         'providerUpdatedDateTime': ('/issued/date-parts',
                                     compose(datetime_formatter, lambda x: ' '.join([str(part) for part in x[0]]))),
         'uris': {
             'canonicalUri': '/URL'
         },
         'contributors': ('/author', compose(lambda x: [
             process_contributor(*[
                 '{} {}'.format(entry.get('given'), entry.get('family')),
                 entry.get('ORCID')
             ]) for entry in x
         ], lambda x: x or [])),
         'sponsorships': ('/funder', lambda x: process_sponsorships(x) if x else []),
         'tags': ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
         'subjects': ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
         'otherProperties': build_properties(
             ('journalTitle', '/container-title'),
             ('volume', '/volume'),
             ('issue', '/issue'),
             ('publisher', '/publisher'),
             ('type', '/type'),
             ('ISSN', '/ISSN'),
             ('ISBN', '/ISBN'),
             ('member', '/member'),
             ('score', '/score'),
             ('issued', '/issued'),
             ('deposited', '/deposited'),
             ('indexed', '/indexed'),
             ('page', '/page'),
             ('issue', '/issue'),
             ('volume', '/volume'),
             ('referenceCount', '/reference-count'),
             ('updatePolicy', '/update-policy'),
             ('depositedTimestamp', '/deposited/timestamp')
         )
     }
Example #19
class PubMedCentralHarvester(OAIHarvester):
    short_name = 'pubmedcentral'
    long_name = 'PubMed Central'
    url = 'http://www.ncbi.nlm.nih.gov/pmc/'

    schema = helpers.updated_schema(
        schemas.OAISCHEMA, {
            "uris": {
                "canonicalUri": ('//ns0:header/ns0:identifier/node()',
                                 helpers.compose(oai_extract_url_pubmedcentral,
                                                 helpers.single_result))
            }
        })

    base_url = 'http://www.pubmedcentral.nih.gov/oai/oai.cgi'
    property_list = [
        'type', 'source', 'rights', 'format', 'setSpec', 'date', 'identifier'
    ]
Example #20
 def schema(self):
     return {
         "title": ("/title", lambda x: x[0] if x else ""),
         "description": ("/subtitle", lambda x: x[0] if (isinstance(x, list) and x) else x or ""),
         "providerUpdatedDateTime": (
             "/issued/date-parts",
             lambda x: parse(" ".join([str(part) for part in x[0]])).date().isoformat(),
         ),
         "uris": {"canonicalUri": "/URL"},
         "contributors": (
             "/author",
             compose(
                 lambda x: [
                     process_contributor(
                         *["{} {}".format(entry.get("given"), entry.get("family")), entry.get("ORCID")]
                     )
                     for entry in x
                 ],
                 lambda x: x or [],
             ),
         ),
         "sponsorships": ("/funder", lambda x: process_sponsorships(x) if x else []),
         "otherProperties": build_properties(
             ("journalTitle", "/container-title"),
             ("volume", "/volume"),
             ("tags", ("/subject", "/container-title", lambda x, y: [tag.lower() for tag in (x or []) + (y or [])])),
             ("issue", "/issue"),
             ("publisher", "/publisher"),
             ("type", "/type"),
             ("ISSN", "/ISSN"),
             ("ISBN", "/ISBN"),
             ("member", "/member"),
             ("score", "/score"),
             ("issued", "/issued"),
             ("deposited", "/deposited"),
             ("indexed", "/indexed"),
             ("page", "/page"),
             ("issue", "/issue"),
             ("volume", "/volume"),
             ("referenceCount", "/reference-count"),
             ("updatePolicy", "/update-policy"),
             ("depositedTimestamp", "/deposited/timestamp"),
         ),
     }
Example #21
 def schema(self):
     return {
         'contributors': ('/contributors', process_contributors),
         'title': ('/title', lambda x: x or ''),
         'providerUpdatedDateTime': ('/date_registered', datetime_formatter),
         'description': '/description',
         'uris': {
             'canonicalUri': ('/url', url_from_guid),
             'providerUris': ('/url', compose(coerce_to_list, url_from_guid))
         },
         'tags': '/tags',
         'otherProperties': build_properties(
             ('parent_title', '/parent_title'),
             ('category', '/category'),
             ('wiki_link', '/wiki_link'),
             ('is_component', '/is_component'),
             ('is_registration', '/is_registration'),
             ('parent_url', '/parent_url'),
             ('journal Id', '/journal Id')
         )
     }
Example #22
 def schema(self):
     id_stanza = './gmd:identificationInfo/gmd:MD_DataIdentification/'
     cite_stanza = id_stanza + 'gmd:citation/gmd:CI_Citation/'
     return {
         'title': (cite_stanza + 'gmd:title', compose(xml_text_only, single_result)),
         'description': (id_stanza + 'gmd:abstract', compose(xml_text_only, single_result)),
         'contributors': (cite_stanza + 'gmd:citedResponsibleParty/gmd:CI_ResponsibleParty', compose(parse_contributors, filter_to_contributors)),
         'uris': {
             'canonicalUri': (
                 './gmd:fileIdentifier',
                 compose(lambda x: str(self.canonical_base_url).format(x), xml_text_only, single_result)
             ),
         },
         'publisher': (
             cite_stanza + 'gmd:citedResponsibleParty/gmd:CI_ResponsibleParty',
             compose(extract_organization, single_result, filter_to_publishers),
         ),
         'providerUpdatedDateTime': ('./gmd:dateStamp/gco:DateTime/node()', compose(datetime_formatter, single_result)),
         'languages': ('./gmd:language/gmd:LanguageCode', compose(language_codes, xml_text_only_list, coerce_to_list)),
         'subjects': (id_stanza + 'gmd:descriptiveKeywords/gmd:MD_Keywords', lambda x: filter_keywords(x)),
     }
Example #23
 def schema(self):
     return helpers.updated_schema(self._schema, {
         "uris": {
             "canonicalUri": ('//ns0:header/ns0:identifier/node()', helpers.compose(oai_extract_url_pubmedcentral, helpers.single_result))
         }
     })
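oai_extract_url_pubmedcentral turns the OAI header identifier into a canonical article URL. A hypothetical sketch of that mapping, assuming identifiers of the form 'oai:pubmedcentral.nih.gov:<id>' and the standard PMC article URL pattern; the real helper may differ:

def oai_extract_url_pubmedcentral(identifier):
    # e.g. 'oai:pubmedcentral.nih.gov:4321' -> 'http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4321/'
    pmc_id = identifier.strip().split(':')[-1]
    return 'http://www.ncbi.nlm.nih.gov/pmc/articles/PMC{}/'.format(pmc_id)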
Example #24
class ClinicalTrialsHarvester(XMLHarvester):

    short_name = 'clinicaltrials'
    long_name = 'ClinicalTrials.gov'
    url = 'https://clinicaltrials.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None

    # TODO - clinicaltrials elements have a lot of extra metadata - at some
    # point in the future we should do a more thorough audit.
    schema = {
        "contributors":
        ('//overall_official/last_name/node()', default_name_parser),
        "uris": {
            "canonicalUri": ("//required_header/url/node()", single_result)
        },
        "providerUpdatedDateTime":
        ("lastchanged_date/node()", compose(datetime_formatter,
                                            single_result)),
        "title": ('//official_title/node()', '//brief_title/node()',
                  lambda x, y: single_result(x) or single_result(y)),
        "description": ('//brief_summary/textblock/node()',
                        '//brief_summary/textblock/node()',
                        lambda x, y: single_result(x) or single_result(y)),
        "tags":
        ("//keyword/node()", lambda tags: [tag.lower() for tag in tags]),
        "sponsorships": [{
            "sponsor": {
                "sponsorName":
                ("//sponsors/lead_sponsor/agency/node()", single_result)
            }
        }, {
            "sponsor": {
                "sponsorName":
                ("//sponsors/collaborator/agency/node()", single_result)
            }
        }],
        "otherProperties":
        build_properties(
            ("serviceID", "//nct_id/node()"),
            ('oversightAuthority', '//oversight_info/authority/node()'),
            ('studyDesign', '//study_design/node()'),
            ('numberOfArms', '//number_of_arms/node()'),
            ('source', '//source/node()'),
            ('verificationDate', '//verification_date/node()'),
            ('lastChanged', '//lastchanged_date/node()'),
            ('condition', '//condition/node()'),
            ('verificationDate', '//verification_date/node()'),
            ('lastChanged', '//lastchanged_date/node()'),
            ('status', '//status/node()'),
            ('locationCountries', '//location_countries/country/node()'),
            ('isFDARegulated', '//is_fda_regulated/node()'),
            ('isSection801', '//is_section_801/node()'),
            ('hasExpandedAccess', '//has_expanded_access/node()'),
            ('leadSponsorAgencyClass', '//lead_sponsor/agency_class/node()'),
            ('collaborator', '//collaborator/agency/node()'),
            ('collaboratorAgencyClass', '//collaborator/agency_class/node()'),
            ('measure', '//primary_outcome/measure/node()'),
            ('timeFrame', '//primary_outcome/time_frame/node()'),
            ('safetyIssue', '//primary_outcome/safety_issue/node()'),
            ('secondaryOutcomes', '//secondary_outcome/measure/node()'),
            ('enrollment', '//enrollment/node()'),
            ('armGroup', '//arm_group/arm_group_label/node()'),
            ('intervention', '//intervention/intervention_type/node()'),
            ('eligibility', ('//eligibility/node()',
                             compose(lambda x: list(map(element_to_dict, x)),
                                     lambda x: list(filter(non_string, x))))),
            ('link', '//link/url/node()'),
            ('responsible_party',
             '//responsible_party/responsible_party_full_name/node()'))
    }

    @property
    def namespaces(self):
        return None

    def harvest(self, start_date=None, end_date=None):
        """ First, get a list of all recently updated study urls,
        then get the xml one by one and save it into a list
        of docs including other information """

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        end_month = end_date.strftime('%m')
        end_day = end_date.strftime('%d')
        end_year = end_date.strftime('%Y')

        start_month = start_date.strftime('%m')
        start_day = start_date.strftime('%d')
        start_year = start_date.strftime('%Y')

        base_url = 'http://clinicaltrials.gov/ct2/results?lup_s='
        url_end = '{}%2F{}%2F{}&lup_e={}%2F{}%2F{}&displayxml=true'.\
            format(start_month, start_day, start_year, end_month, end_day, end_year)

        url = base_url + url_end

        # grab the total number of studies
        initial_request = requests.get(url)
        record_encoding = initial_request.encoding
        initial_request_xml = etree.XML(initial_request.content)
        count = int(initial_request_xml.xpath('//search_results/@count')[0])
        xml_list = []
        if int(count) > 0:
            # get a new url with all results in it
            url = url + '&count=' + str(count)
            total_requests = requests.get(url)
            initial_doc = etree.XML(total_requests.content)

            # make a list of urls from that full list of studies
            study_urls = []
            for study in initial_doc.xpath('//clinical_study'):
                study_urls.append(
                    study.xpath('url/node()')[0] + '?displayxml=true')

            # grab each of those urls for full content
            logger.info("There are {} urls to harvest - be patient...".format(
                len(study_urls)))
            count = 0
            official_count = 0
            for study_url in study_urls:
                try:
                    content = requests.get(study_url)
                except requests.exceptions.ConnectionError as e:
                    logger.info(
                        'Connection error: {}, wait a bit...'.format(e))
                    time.sleep(30)
                    continue
                doc = etree.XML(content.content)
                record = etree.tostring(doc, encoding=record_encoding)
                doc_id = doc.xpath('//nct_id/node()')[0]
                xml_list.append(
                    RawDocument({
                        'doc': record,
                        'source': self.short_name,
                        'docID': copy_to_unicode(doc_id),
                        'filetype': 'xml',
                    }))
                official_count += 1
                count += 1
                if count % 100 == 0:
                    logger.info(
                        "You've requested {} studies, keep going!".format(
                            official_count))
                    count = 0

        return xml_list
Example #25
import logging
from datetime import date, timedelta

import xmltodict
from lxml import etree

from scrapi import requests
from scrapi import settings
from scrapi.base import XMLHarvester
from scrapi.util import copy_to_unicode
from scrapi.linter.document import RawDocument
from scrapi.base.helpers import (compose, single_result, build_properties,
                                 datetime_formatter, default_name_parser)

logger = logging.getLogger(__name__)

element_to_dict = compose(xmltodict.parse, etree.tostring)


def non_string(item):
    return not isinstance(item, str)


class ClinicalTrialsHarvester(XMLHarvester):

    short_name = 'clinicaltrials'
    long_name = 'ClinicalTrials.gov'
    url = 'https://clinicaltrials.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None
Example #26
class DataOneHarvester(XMLHarvester):
    short_name = 'dataone'
    long_name = 'DataONE: Data Observation Network for Earth'
    url = 'https://www.dataone.org/'

    namespaces = {}

    record_encoding = None

    schema = {
        'otherProperties':
        build_properties(
            ('authorGivenName', ("str[@name='authorGivenName']/node()")),
            ('authorSurName', ("str[@name='authorSurName']/node()")),
            ('authoritativeMN', ("str[@name='authoritativeMN']/node()")),
            ('checksum', ("str[@name='checksum']/node()")),
            ('checksumAlgorithm', ("str[@name='checksumAlgorithm']/node()")),
            ('datasource', ("str[@name='datasource']/node()")),
            ('datePublished', ("date[@name='datePublished']/node()")),
            ('dateUploaded', ("date[@name='dateUploaded']/node()")),
            ('pubDate', ("date[@name='pubDate']/node()")),
            ('updateDate', ("date[@name='updateDate']/node()")),
            ('fileID', ("str[@name='fileID']/node()")),
            ('formatId', ("str[@name='formatId']/node()")),
            ('formatType', ("str[@name='formatType']/node()")),
            ('identifier', ("str[@name='identifier']/node()")),
            ('readPermission', "arr[@name='readPermission']/str/node()"),
            ('replicaMN', "arr[@name='replicaMN']/str/node()"),
            ('replicaVerifiedDate',
             "arr[@name='replicaVerifiedDate']/date/node()"),
            ('replicationAllowed',
             ("bool[@name='replicationAllowed']/node()")),
            ('numberReplicas', ("int[@name='numberReplicas']/node()")),
            ('preferredReplicationMN',
             "arr[@name='preferredReplicationMN']/str/node()"),
            ('rightsHolder', ("str[@name='rightsHolder']/node()")),
            ('scientificName', "arr[@name='scientificName']/str/node()"),
            ('site', "arr[@name='site']/str/node()"),
            ('size', ("long[@name='size']/node()")),
            ('isDocumentedBy', "arr[@name='isDocumentedBy']/str/node()"),
            ('serviceID', "str[@name='id']/node()"),
            ('sku', "str[@name='sku']/node()")),
        'freeToRead': {
            'startDate': ("bool[@name='isPublic']/node()",
                          "date[@name='dateModified']/node()",
                          lambda x, y: parse(y[0]).date().isoformat()
                          if x else None)
        },
        'contributors':
        ("str[@name='author']/node()", "str[@name='submitter']/node()",
         "arr[@name='origin']/str/node()",
         "arr[@name='investigator']/str/node()", process_contributors),
        'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()",
                 "arr[@name='resourceMap']/str/node()",
                 partial(helpers.oai_process_uris, use_doi=True)),
        'tags': ("//arr[@name='keywords']/str/node()", lambda x: x
                 if isinstance(x, list) else [x]),
        'providerUpdatedDateTime': ("str[@name='dateModified']/node()",
                                    compose(datetime_formatter,
                                            single_result)),
        'title': ("str[@name='title']/node()", single_result),
        'description': ("str[@name='abstract']/node()", single_result)
    }

    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            # This ID is unique per data package, but won't unify multiple packages for the same project
            doc_id = record.xpath("str[@name='id']")[0].text
            format_type = record.xpath("str[@name='formatType']")[0].text
            record = ElementTree.tostring(record,
                                          encoding=self.record_encoding)
            if format_type.lower() != 'metadata':
                logger.info(
                    'Not normalizing record with ID {}, type {}'.format(
                        doc_id, format_type))
            else:
                xml_list.append(
                    RawDocument({
                        'doc': record,
                        'source': self.short_name,
                        'docID': copy_to_unicode(doc_id),
                        'filetype': 'xml'
                    }))

        return xml_list

    def get_records(self, start_date, end_date):
        ''' helper function to get a response from the DataONE
        API, with the specified number of rows.
        Returns an etree element with results '''

        query = 'dateModified:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date.isoformat(), end_date.isoformat())
        doc = requests.get(DATAONE_SOLR_ENDPOINT,
                           params={
                               'q': query,
                               'start': 0,
                               'rows': 1
                           })
        doc = etree.XML(doc.content)
        rows = int(doc.xpath("//result/@numFound")[0])

        n = 0
        while n < rows:
            data = requests.get(DATAONE_SOLR_ENDPOINT,
                                params={
                                    'q': query,
                                    'start': n,
                                    'rows': 1000
                                })
            docs = etree.XML(data.content).xpath('//doc')
            for doc in docs:
                yield doc
            n += 1000
Example #27
import logging

import xmltodict
from lxml import etree

from scrapi import settings
from scrapi.base import XMLHarvester
from scrapi.util import copy_to_unicode
from scrapi.linter.document import RawDocument
from scrapi.base.helpers import (
    compose,
    single_result,
    build_properties,
    datetime_formatter,
    default_name_parser
)

logger = logging.getLogger(__name__)


element_to_dict = compose(xmltodict.parse, etree.tostring)


def non_string(item):
    return not isinstance(item, str)


class ClinicalTrialsHarvester(XMLHarvester):

    short_name = 'clinicaltrials'
    long_name = 'ClinicalTrials.gov'
    url = 'https://clinicaltrials.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None
Example #28
class PlosHarvester(XMLHarvester):
    short_name = 'plos'
    long_name = 'Public Library of Science'
    url = 'http://www.plos.org/'

    namespaces = {}

    MAX_ROWS_PER_REQUEST = 999
    BASE_URL = 'http://api.plos.org/search'

    def fetch_rows(self, start_date, end_date):
        query = 'publication_date:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date, end_date)

        resp = requests.get(self.BASE_URL,
                            params={
                                'q': query,
                                'rows': '0',
                                'api_key': PLOS_API_KEY,
                            })

        total_rows = etree.XML(resp.content).xpath('//result/@numFound')
        total_rows = int(total_rows[0]) if total_rows else 0

        current_row = 0
        while current_row < total_rows:
            response = requests.get(self.BASE_URL,
                                    throttle=5,
                                    params={
                                        'q': query,
                                        'start': current_row,
                                        'api_key': PLOS_API_KEY,
                                        'rows': self.MAX_ROWS_PER_REQUEST,
                                    })

            for doc in etree.XML(response.content).xpath('//doc'):
                yield doc

            current_row += self.MAX_ROWS_PER_REQUEST

    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        if not PLOS_API_KEY:
            return []

        return [
            RawDocument({
                'filetype': 'xml',
                'source': self.short_name,
                'doc': etree.tostring(row),
                'docID': row.xpath("str[@name='id']")[0].text,
            }) for row in self.fetch_rows(start_date.isoformat(),
                                          end_date.isoformat())
            if row.xpath("arr[@name='abstract']")
            or row.xpath("str[@name='author_display']")
        ]

    schema = {
        'uris': {
            'canonicalUri': ('//str[@name="id"]/node()',
                             compose('http://dx.doi.org/{}'.format,
                                     single_result)),
        },
        'contributors':
        ('//arr[@name="author_display"]/str/node()', default_name_parser),
        'providerUpdatedDateTime':
        ('//date[@name="publication_data"]/node()',
         compose(lambda x: parse(x).date().isoformat(), single_result)),
        'title': ('//str[@name="title_display"]/node()', single_result),
        'description': ('//arr[@name="abstract"]/str/node()', single_result),
        'publisher': {
            'name': ('//str[@name="journal"]/node()', single_result)
        },
        'otherProperties':
        build_properties(('eissn', '//str[@name="eissn"]/node()'),
                         ('articleType', '//str[@name="article_type"]/node()'),
                         ('score', '//float[@name="score"]/node()'))
    }
Example #29
class ELifeHarvester(XMLHarvester):
    short_name = 'elife'
    long_name = 'eLife Sciences'
    url = 'http://elifesciences.org/'
    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None

    namespaces = {}

    MAX_ROWS_PER_REQUEST = 999
    BASE_URL = 'https://api.github.com/repos/elifesciences/elife-article-xml/commits?'
    BASE_COMMIT_URL = 'https://api.github.com/repos/elifesciences/elife-article-xml/commits/{}'
    BASE_DATA_URL = 'https://raw.githubusercontent.com/elifesciences/elife-article-xml/master/{}'

    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or datetime.date.today() - datetime.timedelta(settings.DAYS_BACK)
        end_date = end_date or datetime.date.today()

        shas = fetch_commits(self.BASE_URL, start_date.isoformat(), end_date.isoformat())

        files = list(set(chain.from_iterable([
            fetch_file_names(self.BASE_COMMIT_URL, sha)
            for sha in shas])))

        files = filter(lambda filename: filename.endswith('.xml'), files)

        xml_records = [
            fetch_xml(self.BASE_DATA_URL, filename)
            for filename in files
        ]

        return [
            RawDocument({
                'filetype': 'xml',
                'source': self.short_name,
                'doc': etree.tostring(record),
                'docID': record.xpath('//article-id[@*]')[0].text,
            }) for record in xml_records
        ]

    schema = {
        'uris': {
            'canonicalUri': ('//article-id/node()', compose('http://dx.doi.org/10.7554/eLife.{}'.format,
                                                            single_result)),
            'objectUri': ('//article-id/node()', compose('http://dx.doi.org/10.7554/eLife.{}'.format, single_result))
        },
        'contributors': ('//article-meta/contrib-group/contrib/name/*[not(self::suffix)]/node()', elife_name_parser),
        'providerUpdatedDateTime': ('//article-meta/pub-date[@publication-format="electronic"]/*/node()',
                                    compose(datetime_formatter, elife_date_parser)),
        'title': ('//article-meta/title-group/article-title//text()', collapse_list),
        'description': ('//abstract[not(@abstract-type="executive-summary")]/p[1]//text()', collapse_list),
        'publisher': {
            'name': ('//publisher-name/node()', single_result)
        },
        'subjects': '//article-meta/article-categories/descendant::text()',
        'freeToRead': {
            'startDate': ('//article-meta/pub-date[@publication-format="electronic"]/*/node()',
                          elife_date_parser)
        },
        'tags': '//kwd/text()',
        'otherProperties': build_properties(
                ('rights', ('//permissions/license/license-p/ext-link/text()', single_result))
        )
    }