コード例 #1
0
ファイル: elsevier_package.py プロジェクト: GiorgosPa/scoap3
 def get_references(self, xml):
     """Collect the bibliographic references found under ``xml``.

     Each ``ce:bib-reference`` element produces one tuple
     ``(label, authors, doi, issue, page, title, volume, year, textref,
     ext_link)``.  ``authors`` is a list of "surname, given-name" strings
     (the surname alone when no given name is present) and ``ext_link`` is
     the arXiv link of the reference passed through ``format_arxiv_id``.
     """
     def author_name(node):
         # "Surname, Given" when a given name exists, bare surname otherwise.
         first = get_value_in_tag(node, "ce:given-name")
         last = get_value_in_tag(node, "ce:surname")
         if first:
             return "%s, %s" % (last, first)
         return last

     collected = []
     for ref in xml.getElementsByTagName("ce:bib-reference"):
         # The year is the first four characters of the date found in the
         # first sb:issue element; None when the reference has no sb:issue.
         issue_nodes = ref.getElementsByTagName('sb:issue')
         if issue_nodes:
             year = get_value_in_tag(issue_nodes[0], "sb:date")[:4]
         else:
             year = None
         collected.append((
             get_value_in_tag(ref, "ce:label"),
             [author_name(node)
              for node in ref.getElementsByTagName("sb:author")],
             get_value_in_tag(ref, "ce:doi"),
             get_value_in_tag(ref, "sb:issue"),
             get_value_in_tag(ref, "sb:first-page"),
             get_value_in_tag(ref, "sb:maintitle"),
             get_value_in_tag(ref, "sb:volume-nr"),
             year,
             get_value_in_tag(ref, "ce:textref"),
             format_arxiv_id(self.get_ref_link(ref, 'arxiv')),
         ))
     return collected
コード例 #2
0
ファイル: app_utils.py プロジェクト: GiorgosPa/scoap3
 def get_authors(self, xml):
     """Return one dict per ``Author`` element found under ``xml``.

     Possible keys: ``surname``, ``given_name`` (newlines replaced by
     spaces), ``email``, ``affiliations_ids`` (always present) and
     ``affiliation`` (list of affiliation texts).

     If at least one author references a known ``Affiliation`` ID, every
     such author receives the texts of the affiliations it references and
     the other authors get no ``affiliation`` key.  If no author does and
     at least one affiliation exists, every author receives all
     affiliation texts; a warning is printed on stderr when there is more
     than one affiliation in that situation.
     """
     people = []
     for node in xml.getElementsByTagName("Author"):
         entry = {}
         family = get_value_in_tag(node, "FamilyName")
         if family:
             entry["surname"] = family
         given = get_value_in_tag(node, "GivenName")
         if given:
             entry["given_name"] = given.replace('\n', ' ')
         # Initials, ORCID and cross-references are not extracted for
         # this format.
         for mail in node.getElementsByTagName("Email"):
             # Keep the first address whose type is "email" or unset.
             if mail.getAttribute("type").encode('utf-8') in ('email', ''):
                 entry["email"] = xml_to_text(mail)
                 break
         entry["affiliations_ids"] = [
             aid.encode('utf-8')
             for aid in node.getAttribute("AffiliationIDS").split()]
         people.append(entry)

     # Affiliation ID -> affiliation text.
     known = {}
     for aff in xml.getElementsByTagName("Affiliation"):
         aff_id = aff.getAttribute("ID").encode('utf-8')
         known[aff_id] = xml_to_text(aff, delimiter=', ')

     explicit = False
     for person in people:
         matched = [known[aid] for aid in person.get("affiliations_ids")
                    if aid in known]
         if matched:
             explicit = True
             person["affiliation"] = matched

     if not explicit:
         if len(known) > 1:
             print >> sys.stderr, "Implicit affiliations are used, but there's more than one affiliation: %s" % known
         if len(known) >= 1:
             for person in people:
                 person["affiliation"] = list(known.values())
     return people
コード例 #3
0
ファイル: app_utils.py プロジェクト: GiorgosPa/scoap3
 def get_references(self, xml):
     """Collect the references found in the ``Citation`` elements of ``xml``.

     Each citation yields an 8-tuple ``(label, authors, doi, issue, page,
     title, volume, year)``.  A citation without a ``BibArticle`` child
     yields the ``BibUnstructured`` text in the first slot and empty
     strings in the seven others.  ``issue`` is always the empty string.
     """
     def author_name(node):
         # "Family, Initials" when initials exist, bare family name otherwise.
         initials = get_value_in_tag(node, "Initials")
         family = get_value_in_tag(node, "FamilyName")
         if initials:
             return "%s, %s" % (family, initials)
         return family

     collected = []
     for citation in xml.getElementsByTagName("Citation"):
         if not citation.getElementsByTagName("BibArticle"):
             raw = get_value_in_tag(citation, "BibUnstructured")
             collected.append((raw, '', '', '', '', '', '', ''))
             continue

         # When several Occurrence elements have Type "DOI", the last wins.
         doi = ""
         for occurrence in citation.getElementsByTagName("Occurrence"):
             if occurrence.getAttribute("Type") == "DOI":
                 doi = xml_to_text(occurrence)

         collected.append((
             get_value_in_tag(citation, "ArticleTitle"),
             [author_name(node)
              for node in citation.getElementsByTagName("BibAuthorName")],
             doi,
             "",  # the issue is not extracted
             get_value_in_tag(citation, "FirstPage"),
             get_value_in_tag(citation, "JournalTitle"),
             get_value_in_tag(citation, "VolumeID"),
             get_value_in_tag(citation, "Year"),
         ))
     return collected
コード例 #4
0
ファイル: app_utils.py プロジェクト: GiorgosPa/scoap3
 def get_publication_date(self, xml):
     article_info = xml.getElementsByTagName("ArticleInfo")[0]
     article_history = article_info.getElementsByTagName("ArticleHistory")[0]
     online_date = article_history.getElementsByTagName("OnlineDate")
     if online_date:
         online_date = online_date[0]
         year = get_value_in_tag(online_date, "Year")
         month = get_value_in_tag(online_date, "Month")
         day = get_value_in_tag(online_date, "Day")
         try:
             return "%04d-%02d-%02d" % (int(year), int(month), int(day))
         except Exception, err:
             print >> sys.stderr, "Can't reliably extract the publication date: %s" % err
             return ""
コード例 #5
0
ファイル: elsevier_package.py プロジェクト: GiorgosPa/scoap3
 def get_authors(self, xml):
     """Return one dict per ``ce:author`` element found under ``xml``.

     Possible keys: ``surname``, ``given_name``, ``initials``, ``orcid``,
     ``email``, ``cross_ref`` (list of ``refid`` values) and
     ``affiliation`` (list of affiliation texts).

     If at least one author cross-references a known ``ce:affiliation``
     id, every such author receives the texts of the affiliations it
     references and the other authors get no ``affiliation`` key.  If no
     author does and at least one affiliation exists, every author
     receives all affiliation texts; a warning is printed on stderr when
     there is more than one affiliation in that situation.
     """
     people = []
     for node in xml.getElementsByTagName("ce:author"):
         entry = {}
         # Text fields are stored only when they are non-empty.
         for key, tag in (("surname", "ce:surname"),
                          ("given_name", "ce:given-name"),
                          ("initials", "ce:initials")):
             value = get_value_in_tag(node, tag)
             if value:
                 entry[key] = value
         orcid = node.getAttribute('orcid').encode('utf-8')
         if orcid:
             entry["orcid"] = orcid
         for address in node.getElementsByTagName("ce:e-address"):
             # Keep the first address whose type is "email" or unset.
             if address.getAttribute("type").encode('utf-8') in ('email', ''):
                 entry["email"] = xml_to_text(address)
                 break
         refids = [xref.getAttribute("refid").encode('utf-8')
                   for xref in node.getElementsByTagName("ce:cross-ref")]
         if refids:
             entry["cross_ref"] = refids
         people.append(entry)

     # Affiliation id -> affiliation text with a leading number stripped.
     known = {}
     for aff in xml.getElementsByTagName("ce:affiliation"):
         aff_id = aff.getAttribute("id").encode('utf-8')
         known[aff_id] = re.sub(r'^(\d+\ ?)', "", get_value_in_tag(aff, "ce:textfn"))

     explicit = False
     for person in people:
         matched = [known[ref] for ref in person.get("cross_ref", [])
                    if ref in known]
         if matched:
             explicit = True
             person["affiliation"] = matched

     if not explicit:
         if len(known) > 1:
             print >> sys.stderr, "Implicit affiliations are used, but there's more than one affiliation: %s" % known
         if len(known) >= 1:
             for person in people:
                 person["affiliation"] = list(known.values())
     return people
コード例 #6
0
ファイル: app_utils.py プロジェクト: GiorgosPa/scoap3
 def get_publication_information(self, xml):
     """Read the article DOI from the ``ArticleDOI`` tag of ``xml``.

     An empty DOI raises ``ValueError``; any exception raised during the
     lookup is reported on stderr and then re-raised.

     NOTE(review): the lines visible here stop after the DOI check and
     return nothing -- this looks like a truncated excerpt; confirm
     against the complete method.
     """
     try:
         doi = get_value_in_tag(xml, "ArticleDOI")
         if not doi:
             raise ValueError("DOI not found")
     except Exception, err:
         # Report the failure, then let the caller see the same exception.
         print >> sys.stderr, "Can't find doi: %s" % err
         raise
コード例 #7
0
ファイル: app_utils.py プロジェクト: GiorgosPa/scoap3
 def get_arxiv_id(self, xml):
     """Return the arXiv identifier of the article as ``arXiv:<id>``.

     The candidate is the ``RefSource`` value of the first ``ArticleNote``
     element.  Returns "" when there is no ``ArticleNote`` or when the
     candidate does not match ``RE_ARXIV_ID``.
     """
     notes = xml.getElementsByTagName('ArticleNote')
     if not notes:
         return ""
     candidate = get_value_in_tag(notes[0], "RefSource")
     if not RE_ARXIV_ID.match(candidate):
         return ""
     return "arXiv:%s" % candidate
コード例 #8
0
ファイル: jats_utils.py プロジェクト: GiorgosPa/scoap3
 def get_publication_information(self, xml):
     """Start extracting publication information from ``xml``.

     Reads the ``journal-id`` value and looks up the first
     ``article-meta`` element.

     NOTE(review): the lines visible here end inside the error handling
     and return nothing -- this looks like a truncated excerpt; confirm
     against the complete method.
     """
     jid = get_value_in_tag(xml, "journal-id")
     journal = ""
     #journal = CFG_ELSEVIER_JID_MAP.get(jid, jid)
     try:
         art = xml.getElementsByTagName('article-meta')[0]
     except IndexError, err:
         # No article-meta element: the error is registered and reported,
         # but ``art`` is left unassigned by this handler.
         register_exception()
         print >> sys.stderr, "ERROR: XML corupted: %s" % err
         pass
コード例 #9
0
ファイル: jats_utils.py プロジェクト: GiorgosPa/scoap3
    def get_date(self, xml):
        """Return the publication year found under ``xml``.

        The ``year`` value of a ``pub-date`` element whose ``pub-type`` is
        ``epub`` is preferred (the last such element wins).  When no epub
        year is available, the ``year`` value of the first ``pub-date``
        element is used instead.  Returns None when ``xml`` contains no
        ``pub-date`` element.
        """
        dates = xml.getElementsByTagName('pub-date')
        year = None
        for date in dates:
            if date.getAttribute('pub-type').encode('utf-8') == 'epub':
                year = get_value_in_tag(date, 'year')

        if not year and dates:
            # Fall back to the first pub-date.  Return its year text like
            # the epub branch does, not the DOM element itself.
            year = get_value_in_tag(dates[0], 'year')
        return year
コード例 #10
0
ファイル: hindawi_bibfilter.py プロジェクト: GiorgosPa/scoap3
def convert_record(record, response_date, request):
    """Convert one harvested ``record`` element into a MARCXML record.

    Returns a ``(marcxml, new)`` pair.  ``new`` is True when
    ``find_records_from_extoaiid`` finds no existing record for the OAI
    identifier.  A record whose header status is ``deleted`` yields
    ``(None, True)`` when it is not known locally, and otherwise a record
    carrying only the provenance (035) and a 980 ``DELETED`` field,
    together with False.
    """
    header = record.getElementsByTagName("header")[0]
    oai_identifier = get_value_in_tag(header, "identifier")
    datestamp = get_value_in_tag(header, "datestamp")
    status = header.getAttribute("status").encode('utf8')

    marc = {}
    provenance = [
        ('a', oai_identifier),
        ('u', request),
        ('9', 'Hindawi'),
        ('d', datestamp),
        ('h', response_date),
        ('m', 'marc21'),
        ('t', 'false'),
    ]
    record_add_field(marc, tag="035", subfields=provenance)

    is_new = not find_records_from_extoaiid(oai_identifier, 'Hindawi')

    if status == 'deleted':
        if is_new:
            # Deletion of a record that was never stored: nothing to emit.
            return None, True
        deleted_marker = [('a', 'SCOAP3'), ('b', 'Hindawi'), ('c', 'DELETED')]
        record_add_field(marc, tag="980", subfields=deleted_marker)
        return record_xml_output(marc), False

    # Copy every datafield of the source record; missing indicators
    # default to a single blank.
    for datafield in record.getElementsByTagName("datafield"):
        tag = datafield.getAttribute("tag").encode('utf-8')
        ind1 = datafield.getAttribute("ind1").encode('utf-8') or ' '
        ind2 = datafield.getAttribute("ind2").encode('utf-8') or ' '
        subfields = [
            (node.getAttribute("code").encode('utf-8'), xml_to_text(node))
            for node in datafield.getElementsByTagName("subfield")]
        record_add_field(marc, tag=tag, ind1=ind1, ind2=ind2, subfields=subfields)
    return record_xml_output(marc), is_new
コード例 #11
0
ファイル: elsevier_package.py プロジェクト: GiorgosPa/scoap3
    def _build_doi_mapping(self):
        """Fill ``self._dois`` from the ``resolved_issue.xml`` of each issue.

        For every path in ``self._found_issues`` the issue file is parsed
        and each ``ce:include-item`` DOI is mapped to the tuple
        ``(journal, issn, volume, issue, first_page, last_page, year,
        start_date)``.  ``year`` is the first four characters of the raw
        ``start-date`` value; ``start_date`` is reformatted to
        ``YYYY-MM-DD`` (8-character input) or ``YYYY-MM`` (6-character
        input) and kept unchanged for any other length.
        """
        self._dois = {}
        for path in self._found_issues:
            # Close the issue file once it is parsed instead of leaking
            # the handle for every issue processed.
            issue_file = open(join(path, "resolved_issue.xml"))
            try:
                xml = parse(issue_file)
            finally:
                issue_file.close()

            jid = get_value_in_tag(xml, "jid")
            journal = CFG_ELSEVIER_JID_MAP.get(jid, jid)
            issn = get_value_in_tag(xml, "ce:issn")
            volume = get_value_in_tag(xml, "vol-first")
            issue = get_value_in_tag(xml, "iss-first")

            raw_date = get_value_in_tag(xml, "start-date")
            year = raw_date[:4]
            start_date = raw_date
            # Lengths are compared by value; ``is`` tests object identity
            # and only appears to work because of small-integer caching.
            if len(raw_date) == 8:
                start_date = time.strftime('%Y-%m-%d', time.strptime(raw_date, '%Y%m%d'))
            elif len(raw_date) == 6:
                start_date = time.strftime('%Y-%m', time.strptime(raw_date, '%Y%m'))

            for included_item in xml.getElementsByTagName("ce:include-item"):
                doi = get_value_in_tag(included_item, "ce:doi")
                first_page = get_value_in_tag(included_item, "ce:first-page")
                last_page = get_value_in_tag(included_item, "ce:last-page")
                self._dois[doi] = (journal, issn, volume, issue, first_page, last_page, year, start_date)
コード例 #12
0
ファイル: hindawi_bibfilter.py プロジェクト: GiorgosPa/scoap3
def bibfilter(filename):
    """Split the harvested records of ``filename`` into insert/correct files.

    Every ``record`` element is converted with ``convert_record``.
    Records reported as new are written to ``<filename>.insert.xml`` and
    the others to ``<filename>.correct.xml`` through
    ``create_record_file``; records converted to None are dropped.
    Progress messages are printed on stderr.
    """
    print >> sys.stderr, "Parsing %s" % filename
    # Close the input file once it has been parsed instead of leaking the
    # handle.  NOTE(review): assumes get_xml reads the whole file before
    # returning -- confirm against its definition.
    source = open(filename)
    try:
        xml = get_xml(source)
    finally:
        source.close()

    request = xml.getElementsByTagName("request")[0].toxml()
    response_date = get_value_in_tag(xml, "responseDate")
    new_records, updated_records = [], []
    records = xml.getElementsByTagName("record")
    print >> sys.stderr, "Found %s records" % len(records)
    for record in records:
        marcxml, new = convert_record(record, response_date, request)
        if marcxml is None:
            continue
        if new:
            new_records.append(marcxml)
        else:
            updated_records.append(marcxml)
    create_record_file(filename + '.insert.xml', new_records)
    create_record_file(filename + '.correct.xml', updated_records)
コード例 #13
0
ファイル: elsevier_package.py プロジェクト: GiorgosPa/scoap3
 def get_abstract(self, xml):
     """Return the ``ce:simple-para`` value of the first ``ce:abstract-sec``.

     On any error a message is printed on stderr and None is returned.
     """
     try:
         section = xml.getElementsByTagName("ce:abstract-sec")[0]
         return get_value_in_tag(section, "ce:simple-para")
     except Exception:
         print >> sys.stderr, "Can't find abstract"
     return None
コード例 #14
0
ファイル: jats_utils.py プロジェクト: GiorgosPa/scoap3
 def get_title(self, xml):
     """Return the ``article-title`` value of ``xml``.

     On any error a message is printed on stderr and None is returned.
     """
     try:
         title = get_value_in_tag(xml, "article-title")
     except Exception:
         print >> sys.stderr, "Can't find title"
         return None
     return title
コード例 #15
0
ファイル: app_utils.py プロジェクト: GiorgosPa/scoap3
    def get_title(self, xml):
        """Return the ``ArticleTitle`` value of ``xml``.

        On any error a message is printed on stderr and None is returned.
        """
        try:
            title = get_value_in_tag(xml, "ArticleTitle")
        except Exception:
            print >> sys.stderr, "Can't find title"
            return None
        return title

    def get_publication_information(self, xml):
        try:
            doi = get_value_in_tag(xml, "ArticleDOI")
            if not doi:
                raise ValueError("DOI not found")
        except Exception, err:
            print >> sys.stderr, "Can't find doi: %s" % err
            raise
        #journal, issn, volume, issue, first_page, last_page, year
        journal = get_value_in_tag(xml, "JournalAbbreviatedTitle")
        if journal == 'J. High Energ. Phys.':
            journal = 'JHEP'
        issn = get_value_in_tag(xml, "JournalAbbreviatedTitle")
        volume = get_value_in_tag(xml, "VolumeIDStart")[2:] + "%02d" % int(get_value_in_tag(xml, "IssueIDStart"))
        issue = ""
        first_page = "%03d" % int(get_value_in_tag(xml, "ArticleSequenceNumber"))
        pages = get_value_in_tag(xml, "ArticleLastPage")
        year = get_value_in_tag(xml, "VolumeIDStart")
        return journal, issn, volume, issue, first_page, pages, year, doi

    def get_authors(self, xml):
        """Start collecting the authors found in the ``Author`` elements.

        NOTE(review): the lines visible here stop after the family name is
        read and nothing is stored or returned -- this looks like a
        truncated excerpt; confirm against the complete method.
        """
        authors = []
        for author in xml.getElementsByTagName("Author"):
            tmp = {}
            surname = get_value_in_tag(author, "FamilyName")
コード例 #16
0
ファイル: jats_utils.py プロジェクト: GiorgosPa/scoap3
 def get_abstract(self, xml):
     """Return the ``abstract`` value of ``xml`` without its heading.

     Only the first occurrence of the word "Abstract" is removed.  On any
     error a message is printed on stderr and None is returned.
     """
     try:
         raw = get_value_in_tag(xml, "abstract")
         return raw.replace("Abstract", "", 1)
     except Exception:
         print >> sys.stderr, "Can't find abstract"
     return None
コード例 #17
0
ファイル: jats_utils.py プロジェクト: GiorgosPa/scoap3
 def get_copyright(self, xml):
     """Return the ``copyright-holder`` value of ``xml``.

     On any error a message is printed on stderr and None is returned.
     """
     try:
         holder = get_value_in_tag(xml, "copyright-holder")
     except Exception:
         print >> sys.stderr, "Can't find copyright"
         return None
     return holder
コード例 #18
0
ファイル: jats_utils.py プロジェクト: GiorgosPa/scoap3
        jid = get_value_in_tag(xml, "journal-id")
        journal = ""
        #journal = CFG_ELSEVIER_JID_MAP.get(jid, jid)
        try:
            art = xml.getElementsByTagName('article-meta')[0]
        except IndexError, err:
            register_exception()
            print >> sys.stderr, "ERROR: XML corupted: %s" % err
            pass
        except Exception, err:
            register_exception()
            print >> sys.stderr, "ERROR: Exception captured: %s" % err
            pass

        issn = self.get_issn(art)
        volume = get_value_in_tag(art, "volume")
        issue = get_value_in_tag(art, "issue")
        year = self.get_date(art)
        first_page = get_value_in_tag(art, "fpage")
        last_page = get_value_in_tag(art, "lpage")
        doi = self.get_doi(art)

        return (journal, issn, volume, issue, first_page, last_page, year, doi)

    def get_doi(self, xml):
        """Look for the DOI among the ``article-id`` elements of ``xml``.

        NOTE(review): the lines visible here end after the loop body and
        show no return statement -- this looks like a truncated excerpt;
        confirm against the complete method.
        """
        ids = xml.getElementsByTagName('article-id')
        ret = ""
        for i in ids:
            # Only elements whose pub-id-type is "doi" are used; a later
            # match overwrites an earlier one.
            if i.getAttribute('pub-id-type').encode('utf-8') == 'doi':
                ret = xml_to_text(i)
コード例 #19
0
ファイル: elsevier_package.py プロジェクト: GiorgosPa/scoap3
 def get_article_journal(self, xml):
     """Return the ``CFG_ELSEVIER_JID_MAP`` entry for the ``jid`` of ``xml``.

     A ``jid`` without an entry propagates the lookup error of the map.
     """
     jid = get_value_in_tag(xml, "jid")
     return CFG_ELSEVIER_JID_MAP[jid]
コード例 #20
0
ファイル: app_utils.py プロジェクト: GiorgosPa/scoap3
 def get_copyright(self, xml):
     try:
         return get_value_in_tag(xml.getElementsByTagName("ArticleCopyright")[0], "CopyrightHolderName")
     except Exception, err:
         print >> sys.stderr, "Can't find copyright. %s" % (err, )
コード例 #21
0
ファイル: elsevier_package.py プロジェクト: GiorgosPa/scoap3
 def _get_doi(self, xml):
     """Return the ``ce:doi`` value of ``xml``.

     On any error a message is printed on stderr and None is returned.
     """
     try:
         doi = get_value_in_tag(xml, "ce:doi")
     except Exception:
         print >> sys.stderr, "Can't find doi"
         return None
     return doi
コード例 #22
0
ファイル: jats_utils.py プロジェクト: GiorgosPa/scoap3
    def get_authors(self, xml):
        """Return one dict per ``contrib`` element found under ``xml``.

        Possible keys: ``surname``, ``given_name`` (newlines replaced by
        spaces), ``affiliations_ids`` and ``contact_ids`` (always present,
        taken from the ``rid`` of ``xref`` children of type ``aff`` and
        ``corresp``), ``affiliation`` (list of affiliation texts) and
        ``email`` (address of the first matching ``corresp`` element).

        If at least one author references a known ``aff`` id, every such
        author receives the texts of the affiliations it references and
        the other authors get no ``affiliation`` key.  If no author does
        and at least one affiliation exists, every author receives all
        affiliation texts; a warning is printed on stderr when there is
        more than one affiliation in that situation.
        """
        authors = []
        for contrib in xml.getElementsByTagName("contrib"):
            entry = {}
            surname = get_value_in_tag(contrib, "surname")
            if surname:
                entry["surname"] = surname
            given_name = get_value_in_tag(contrib, "given-names")
            if given_name:
                entry["given_name"] = given_name.replace('\n', ' ')

            # ORCID and ce:cross-ref data are not extracted for this format.
            entry["affiliations_ids"] = []
            entry["contact_ids"] = []
            for xref in contrib.getElementsByTagName("xref"):
                ref_type = xref.getAttribute('ref-type').encode('utf-8')
                rids = [rid.encode('utf-8')
                        for rid in xref.getAttribute('rid').split()]
                if ref_type == 'aff':
                    entry["affiliations_ids"].extend(rids)
                elif ref_type == 'corresp':
                    entry["contact_ids"].extend(rids)

            authors.append(entry)

        # Affiliation id -> affiliation text with a leading number stripped.
        affiliations = {}
        for affiliation in xml.getElementsByTagName("aff"):
            aff_id = affiliation.getAttribute("id").encode('utf-8')
            affiliations[aff_id] = re.sub(r'^(\d+\ ?)', "", xml_to_text(affiliation))

        # Contact id -> e-mail address.
        emails = {}
        for contact in xml.getElementsByTagName("corresp"):
            email_nodes = contact.getElementsByTagName('email')
            if not email_nodes:
                # A corresp element without an e-mail address carries
                # nothing to record; indexing [0] here would raise
                # IndexError and abort the whole author extraction.
                continue
            contact_id = contact.getAttribute("id").encode('utf-8')
            emails[contact_id] = xml_to_text(email_nodes[0])

        implicit_affilations = True
        for author in authors:
            matched = [affiliations[ref]
                       for ref in author.get("affiliations_ids")
                       if ref in affiliations]
            if matched:
                implicit_affilations = False
                author["affiliation"] = matched
            matching_contact = [cont for cont in author.get('contact_ids')
                                if cont in emails]
            if matching_contact:
                author["email"] = emails[matching_contact[0]]

        if implicit_affilations and len(affiliations) > 1:
            print >> sys.stderr, "Implicit affiliations are used, but there's more than one affiliation: %s" % affiliations
        if implicit_affilations and len(affiliations) >= 1:
            for author in authors:
                author["affiliation"] = list(affiliations.values())
        return authors
コード例 #23
0
ファイル: nlm_utils.py プロジェクト: GiorgosPa/scoap3
 def get_references(self, xml):
     """Parse the ``ref`` elements of ``xml`` into ``self.references``.

     Each reference becomes the tuple ``(label, authors, doi, issue, page,
     page_last, title, volume, year, ext_link, plain_text)``.  ``authors``
     is a list of "surname, given-names" strings, falling back to the
     ``string-name`` value when both parts are blank.  ``plain_text`` is
     the ``mixed-citation`` value for references whose publication type
     is not ``journal`` and None otherwise.  Nothing is returned.
     """
     references = []
     for reference in xml.getElementsByTagName("ref"):
         plain_text = None
         # A ref without a citation element used to raise IndexError here;
         # treat it as having no publication type so that it is kept as a
         # plain-text reference instead of aborting the whole parse.
         citations = reference.getElementsByTagName('citation')
         if citations:
             ref_type = citations[0].getAttribute('publication-type').encode('utf-8')
         else:
             ref_type = ''
         label = get_value_in_tag(reference, "label").strip('.')

         authors = []
         for author in reference.getElementsByTagName("name"):
             given_name = get_value_in_tag(author, "given-names")
             surname = get_value_in_tag(author, "surname")
             if given_name:
                 name = "%s, %s" % (surname, given_name)
             else:
                 name = surname
             if not name.split():
                 # Neither part present: use the unstructured name.
                 name = get_value_in_tag(author, "string-name")
             authors.append(name)

         # When several pub-id elements are DOIs, the last one wins.
         doi = ""
         for tag in reference.getElementsByTagName("pub-id"):
             if tag.getAttribute("pub-id-type") == "doi":
                 doi = xml_to_text(tag)

         issue = get_value_in_tag(reference, "issue")
         page = get_value_in_tag(reference, "fpage")
         page_last = get_value_in_tag(reference, "lpage")
         title = get_value_in_tag(reference, "source")
         volume = get_value_in_tag(reference, "volume")
         year = get_value_in_tag(reference, "year")
         ext_link = format_arxiv_id(super(NLMParser, self).get_ref_link(reference, "arxiv"))
         if ref_type != 'journal':
             plain_text = get_value_in_tag(reference, "mixed-citation")
         references.append((label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text))
     self.references = references
コード例 #24
0
ファイル: elsevier_package.py プロジェクト: GiorgosPa/scoap3
 def get_keywords(self, xml):
     """Return the ``ce:text`` value of every ``ce:keyword`` under ``xml``.

     On any error a message is printed on stderr and None is returned.
     """
     try:
         keywords = []
         for node in xml.getElementsByTagName("ce:keyword"):
             keywords.append(get_value_in_tag(node, "ce:text"))
         return keywords
     except Exception:
         print >> sys.stderr, "Can't find keywords"
     return None