Exemple #1
0
    def _get_metadata_and_fulltex_dir(self):
        """Collect the XML and PDF paths of the journal items of every package.

        For each name in ``self.files_list`` the ``dataset.xml`` file found
        under ``self.path_unpacked`` / <name up to its first dot> is parsed,
        and one ``{'xml': ..., 'pdf': ...}`` dictionary per ``journal-item``
        element is appended to ``self.found_articles``.  A package whose
        ``dataset.xml`` cannot be parsed is reported (exception register,
        logger and stdout) and skipped.
        """
        print >> sys.stdout, "\nRetrieving journal items directories."
        # The bar is sized on the number of packages; one step is written up
        # front and one more after each package that was parsed successfully.
        bar = progress_bar(len(self.files_list))
        sys.stdout.write(bar.next())
        sys.stdout.flush()

        for name in self.files_list:
            # Everything before the first dot of the file name is used as the
            # directory of the package inside self.path_unpacked.
            package_dir = name.split('.')[0]
            dataset_link = join(self.path_unpacked, package_dir, 'dataset.xml')

            try:
                dataset_xml = parse(dataset_link)
            except Exception:
                register_exception(alert_admin=True, prefix="Elsevier error reading dataset.xml file.")
                self.logger.error("Error reading dataset.xml file: %s" % (dataset_link,))
                print >> sys.stdout, "\nError reading dataset.xml file: %s" % (dataset_link,)
                continue

            journal_items = dataset_xml.getElementsByTagName('journal-item')
            self.logger.info("Getting metadata and fulltex directories for %i journal items." % (len(journal_items),))
            for item in journal_items:
                # The first 'pathname' inside the first 'ml' element gives the
                # metadata file, the one inside 'web-pdf' gives the full text.
                xml_pathname = join(self.path_unpacked, package_dir,
                                    xml_to_text(item.getElementsByTagName('ml')[0].getElementsByTagName('pathname')[0]))
                pdf_pathname = join(self.path_unpacked, package_dir,
                                    xml_to_text(item.getElementsByTagName('web-pdf')[0].getElementsByTagName('pathname')[0]))
                self.found_articles.append({'xml': xml_pathname, 'pdf': pdf_pathname})
            # The count logged here is the running total over all packages.
            self.logger.info("Got metadata and fulltex directories of %i journals." % (len(self.found_articles),))

            sys.stdout.write(bar.next())
            sys.stdout.flush()
Exemple #2
0
 def get_authors(self, xml):
     """Return the authors described in *xml* as a list of dictionaries.

     Every ``Author`` element yields a dictionary with the optional keys
     ``surname``, ``given_name`` (newlines replaced by spaces) and ``email``,
     plus ``affiliations_ids`` (the whitespace-separated values of the
     ``AffiliationIDS`` attribute).  An ``affiliation`` list is added when
     the author references known ``Affiliation`` elements; when no author
     does, every author receives all the affiliations of the document.
     Initials, ORCID and cross-references are not extracted here.
     """
     authors = []
     for author_node in xml.getElementsByTagName("Author"):
         entry = {}
         surname = get_value_in_tag(author_node, "FamilyName")
         if surname:
             entry["surname"] = surname
         given_name = get_value_in_tag(author_node, "GivenName")
         if given_name:
             entry["given_name"] = given_name.replace('\n', ' ')
         # Keep only the first e-mail whose "type" attribute is 'email' or
         # missing/empty.
         for email_node in author_node.getElementsByTagName("Email"):
             if email_node.getAttribute("type").encode('utf-8') in ('email', ''):
                 entry["email"] = xml_to_text(email_node)
                 break
         entry["affiliations_ids"] = [
             identifier.encode('utf-8')
             for identifier in author_node.getAttribute("AffiliationIDS").split()
         ]
         authors.append(entry)

     # Map affiliation ID -> affiliation text.
     affiliations = {}
     for aff_node in xml.getElementsByTagName("Affiliation"):
         key = aff_node.getAttribute("ID").encode('utf-8')
         affiliations[key] = xml_to_text(aff_node, delimiter=', ')

     explicit_links_found = False
     for entry in authors:
         linked = [affiliations[ref] for ref in entry.get("affiliations_ids") if ref in affiliations]
         if linked:
             explicit_links_found = True
             entry["affiliation"] = linked

     if not explicit_links_found:
         # No author points to a known affiliation: attach all of them to
         # everybody, warning when that is ambiguous.
         if len(affiliations) > 1:
             print >> sys.stderr, "Implicit affiliations are used, but there's more than one affiliation: %s" % affiliations
         if affiliations:
             for entry in authors:
                 entry["affiliation"] = list(affiliations.values())
     return authors
Exemple #3
0
 def get_keywords(self, xml):
     """Return the keywords of *xml* split into PACS codes and other keywords.

     Every ``kwd`` element of every ``kwd-group`` element is converted with
     ``xml_to_text``.  Keywords of groups whose ``kwd-group-type`` attribute
     is ``pacs`` are collected under ``"pacs"``, all the others under
     ``"other"``.  Keywords of several groups of the same kind are
     accumulated in document order.

     Returns a dictionary ``{"pacs": [...], "other": [...]}``.  If anything
     goes wrong a message is printed on stderr and ``None`` is returned.
     """
     try:
         pacs = []
         other = []
         for kwd_group in xml.getElementsByTagName('kwd-group'):
             keywords = [xml_to_text(keyword) for keyword in kwd_group.getElementsByTagName("kwd")]
             # Extend instead of assigning, so that a document with more
             # than one group of the same kind keeps all its keywords
             # rather than only those of the last group.
             if kwd_group.getAttribute('kwd-group-type').encode('utf-8') == "pacs":
                 pacs.extend(keywords)
             else:
                 other.extend(keywords)
         return {"pacs": pacs, "other": other}
     except Exception:
         # Best effort: keyword extraction problems are reported, not raised.
         print >> sys.stderr, "Can't find keywords"
Exemple #4
0
 def get_references(self, xml):
     """Parse the ``ref`` elements of *xml* into ``self.references``.

     Each reference is stored as the tuple ``(label, authors, doi, issue,
     page, page_last, title, volume, year, ext_link, plain_text)``.
     ``plain_text`` is the ``mixed-citation`` value for references whose
     ``publication-type`` is not ``journal`` and ``None`` otherwise.
     Nothing is returned; ``self.references`` is replaced once all
     references have been parsed.
     """
     parsed = []
     for ref_node in xml.getElementsByTagName("ref"):
         # The publication type is read from the first 'citation' element.
         citation = ref_node.getElementsByTagName('citation')[0]
         ref_type = citation.getAttribute('publication-type').encode('utf-8')
         label = get_value_in_tag(ref_node, "label").strip('.')

         authors = []
         for name_node in ref_node.getElementsByTagName("name"):
             given_name = get_value_in_tag(name_node, "given-names")
             surname = get_value_in_tag(name_node, "surname")
             full_name = "%s, %s" % (surname, given_name) if given_name else surname
             if not full_name.strip().split():
                 # Nothing but whitespace: fall back to the free-form name.
                 full_name = get_value_in_tag(name_node, "string-name")
             authors.append(full_name)

         # When several 'pub-id' elements are DOIs, the last one is kept.
         doi = ""
         for id_node in ref_node.getElementsByTagName("pub-id"):
             if id_node.getAttribute("pub-id-type") == "doi":
                 doi = xml_to_text(id_node)

         issue = get_value_in_tag(ref_node, "issue")
         page = get_value_in_tag(ref_node, "fpage")
         page_last = get_value_in_tag(ref_node, "lpage")
         title = get_value_in_tag(ref_node, "source")
         volume = get_value_in_tag(ref_node, "volume")
         year = get_value_in_tag(ref_node, "year")
         ext_link = format_arxiv_id(super(NLMParser, self).get_ref_link(ref_node, "arxiv"))

         if ref_type != 'journal':
             plain_text = get_value_in_tag(ref_node, "mixed-citation")
         else:
             plain_text = None

         parsed.append((label, authors, doi, issue, page, page_last,
                        title, volume, year, ext_link, plain_text))
     self.references = parsed
Exemple #5
0
 def get_ref_link(self, xml, name):
     """Return the stripped text of the last ``ext-link`` whose href contains *name*.

     The ``xlink:href`` attribute of every ``ext-link`` element of *xml* is
     searched for the substring *name*.  ``None`` is returned when no link
     matches.
     """
     matches = [
         xml_to_text(link).strip()
         for link in xml.getElementsByTagName('ext-link')
         if name in link.getAttribute("xlink:href").encode('utf-8')
     ]
     # The last matching link wins.
     return matches[-1] if matches else None
Exemple #6
0
 def get_references(self, xml):
     """Return the references of *xml*, one 8-tuple per ``Citation`` element.

     A citation containing a ``BibArticle`` element gives ``(label, authors,
     doi, issue, page, title, volume, year)``, where ``label`` is read from
     ``ArticleTitle`` and ``issue`` is always the empty string.  Any other
     citation gives its ``BibUnstructured`` value followed by seven empty
     strings.
     """
     references = []
     for citation in xml.getElementsByTagName("Citation"):
         if not citation.getElementsByTagName("BibArticle"):
             # Unstructured citation: only the raw text is available.
             raw_text = get_value_in_tag(citation, "BibUnstructured")
             references.append((raw_text,) + ('',) * 7)
             continue

         label = get_value_in_tag(citation, "ArticleTitle")

         authors = []
         for author_node in citation.getElementsByTagName("BibAuthorName"):
             given_name = get_value_in_tag(author_node, "Initials")
             surname = get_value_in_tag(author_node, "FamilyName")
             authors.append("%s, %s" % (surname, given_name) if given_name else surname)

         # When several 'Occurrence' elements are DOIs, the last one is kept.
         doi = ""
         for occurrence in citation.getElementsByTagName("Occurrence"):
             if occurrence.getAttribute("Type") == "DOI":
                 doi = xml_to_text(occurrence)

         # No issue information is extracted for this format.
         issue = ""
         page = get_value_in_tag(citation, "FirstPage")
         title = get_value_in_tag(citation, "JournalTitle")
         volume = get_value_in_tag(citation, "VolumeID")
         year = get_value_in_tag(citation, "Year")
         references.append((label, authors, doi, issue, page, title, volume, year))
     return references
Exemple #7
0
    def get_doi(self, xml):
        """Return the DOI found in the ``article-id`` elements of *xml*.

        The text of the last ``article-id`` element whose ``pub-id-type``
        attribute is ``doi`` is returned.  When no DOI is found a message is
        printed on stdout and the empty value is returned.
        """
        doi = ""
        for id_node in xml.getElementsByTagName('article-id'):
            id_type = id_node.getAttribute('pub-id-type').encode('utf-8')
            if id_type == 'doi':
                doi = xml_to_text(id_node)

        if doi:
            return doi
        print >> sys.stdout, "Can't find DOI."
        return doi
Exemple #8
0
    def get_issn(self, xml):
        """Return the ISSN found in *xml*, preferring the electronic one.

        The text of the last ``issn`` element whose ``pub-type`` attribute
        is ``epub`` is returned.  When there is none (or its text is empty)
        the text of the first ``issn`` element is used instead.  ``None`` is
        returned when *xml* contains no ``issn`` element at all.
        """
        issns = xml.getElementsByTagName('issn')
        ret = None

        for issn in issns:
            if issn.getAttribute("pub-type").encode('utf-8') == 'epub':
                # Return the content of the element, not the value of the
                # attribute that was just tested (which is always 'epub').
                ret = xml_to_text(issn)

        if not ret and issns:
            ret = xml_to_text(issns[0])

        return ret
Exemple #9
0
    def _get_packages(self):
        """Fill and return ``self.retrieved_packages`` (file name -> md5).

        Every path in ``self.path_r_pkg`` is parsed as XML; for each
        ``dataset-package-file`` element, the text of its first ``filename``
        child is mapped to the text of its first ``md5`` child.
        """
        print >> sys.stdout, "\nRetrieving packages names."
        # The bar is sized on self.files_list; one step is written up front
        # and one more after each parsed file.
        bar = progress_bar(len(self.files_list))
        sys.stdout.write(bar.next())
        sys.stdout.flush()

        for package_path in self.path_r_pkg:
            self.logger.info("Retrieved package name: %s" % (package_path,))
            package_xml = parse(package_path)
            for entry in package_xml.getElementsByTagName('dataset-package-file'):
                filename = entry.getElementsByTagName('filename')[0]
                md5_val = entry.getElementsByTagName('md5')[0]
                self.retrieved_packages[xml_to_text(filename)] = xml_to_text(md5_val)

            sys.stdout.write(bar.next())
            sys.stdout.flush()

        return self.retrieved_packages
Exemple #10
0
 def get_authors(self, xml):
     """Return the authors described in *xml* as a list of dictionaries.

     Every ``ce:author`` element yields a dictionary with the optional keys
     ``surname``, ``given_name``, ``initials``, ``orcid``, ``email`` and
     ``cross_ref`` (the ``refid`` of each ``ce:cross-ref`` child).  An
     ``affiliation`` list is added when the cross-references of the author
     match known ``ce:affiliation`` IDs; when no author has such a match,
     every author receives all the affiliations of the document.
     """
     authors = []
     for author_node in xml.getElementsByTagName("ce:author"):
         entry = {}
         surname = get_value_in_tag(author_node, "ce:surname")
         if surname:
             entry["surname"] = surname
         given_name = get_value_in_tag(author_node, "ce:given-name")
         if given_name:
             entry["given_name"] = given_name
         initials = get_value_in_tag(author_node, "ce:initials")
         if initials:
             entry["initials"] = initials
         orcid = author_node.getAttribute('orcid').encode('utf-8')
         if orcid:
             entry["orcid"] = orcid
         # Keep only the first address whose "type" attribute is 'email' or
         # missing/empty.
         for address in author_node.getElementsByTagName("ce:e-address"):
             if address.getAttribute("type").encode('utf-8') in ('email', ''):
                 entry["email"] = xml_to_text(address)
                 break
         cross_refs = author_node.getElementsByTagName("ce:cross-ref")
         if cross_refs:
             entry["cross_ref"] = [
                 cross_ref.getAttribute("refid").encode('utf-8')
                 for cross_ref in cross_refs
             ]
         authors.append(entry)

     # Map affiliation id -> affiliation text with any leading number removed.
     affiliations = {}
     for aff_node in xml.getElementsByTagName("ce:affiliation"):
         key = aff_node.getAttribute("id").encode('utf-8')
         affiliations[key] = re.sub(r'^(\d+\ ?)', "", get_value_in_tag(aff_node, "ce:textfn"))

     explicit_links_found = False
     for entry in authors:
         linked = [affiliations[ref] for ref in entry.get("cross_ref", []) if ref in affiliations]
         if linked:
             explicit_links_found = True
             entry["affiliation"] = linked

     if not explicit_links_found:
         # No author points to a known affiliation: attach all of them to
         # everybody, warning when that is ambiguous.
         if len(affiliations) > 1:
             print >> sys.stderr, "Implicit affiliations are used, but there's more than one affiliation: %s" % affiliations
         if affiliations:
             for entry in authors:
                 entry["affiliation"] = list(affiliations.values())
     return authors
Exemple #11
0
    def _get_issues(self):
        """Collect the paths of the issue files of every package.

        For each name in ``self.files_list`` the ``dataset.xml`` of the
        package is parsed.  The path given by every ``journal-issue``
        element is appended to ``self.found_issues``; when the dataset lists
        no issue, the package directory is walked instead and every
        ``issue.xml`` found is appended.  A package whose ``dataset.xml``
        cannot be parsed is reported and skipped.
        """
        def remember_issue_file(arg, dirname, names):
            # Visitor handed to walk(): record an issue.xml present in dirname.
            if "issue.xml" in names:
                self.found_issues.append(join(dirname, "issue.xml"))

        for name in self.files_list:
            # Everything before the first dot of the file name is used as the
            # directory of the package inside self.path_unpacked.
            package_dir = name.split('.')[0]
            dataset_link = join(self.path_unpacked, package_dir, 'dataset.xml')

            try:
                dataset_xml = parse(dataset_link)
            except Exception:
                register_exception(alert_admin=True, prefix="Elsevier error reading dataset.xml file.")
                self.logger.error("Error reading dataset.xml file: %s" % (dataset_link,))
                print >> sys.stdout, "\nError reading dataset.xml file: %s" % (dataset_link,)
                continue

            journal_issues = dataset_xml.getElementsByTagName('journal-issue')
            if not journal_issues:
                # Nothing declared in the dataset: look for the files on disk.
                walk(join(self.path_unpacked, package_dir), remember_issue_file, None)
                continue

            for issue_node in journal_issues:
                filename = xml_to_text(issue_node.getElementsByTagName('ml')[0].getElementsByTagName('pathname')[0])
                self.logger.info("Found issue %s in %s." % (filename, name))
                self.found_issues.append(join(self.path_unpacked, package_dir, filename))
Exemple #12
0
def convert_record(record, response_date, request):
    """Convert one harvested *record* element and tell whether it is new.

    A 035 field holding the OAI identifier, *request*, the datestamp and
    *response_date* is always added.  The record counts as new when
    ``find_records_from_extoaiid(oai_identifier, 'Hindawi')`` finds nothing.

    Returns a ``(xml, new)`` tuple:

    * ``(None, True)`` for a deleted record that is not known locally;
    * the output for a record flagged with a 980 ``DELETED`` field and
      ``False`` for a deleted record that is known locally;
    * otherwise the output for all the ``datafield`` elements of *record*
      and the "new" flag.
    """
    header = record.getElementsByTagName("header")[0]
    oai_identifier = get_value_in_tag(header, "identifier")
    datestamp = get_value_in_tag(header, "datestamp")
    status = header.getAttribute("status").encode('utf8')

    rec = {}
    provenance = [
        ('a', oai_identifier),
        ('u', request),
        ('9', 'Hindawi'),
        ('d', datestamp),
        ('h', response_date),
        ('m', 'marc21'),
        ('t', 'false'),
    ]
    record_add_field(rec, tag="035", subfields=provenance)

    is_new = not find_records_from_extoaiid(oai_identifier, 'Hindawi')

    if status == 'deleted':
        if is_new:
            # Deletion of a record that was never stored: nothing to output.
            return None, True
        record_add_field(rec, tag="980", subfields=[('a', 'SCOAP3'), ('b', 'Hindawi'), ('c', 'DELETED')])
        return record_xml_output(rec), False

    for datafield in record.getElementsByTagName("datafield"):
        tag = datafield.getAttribute("tag").encode('utf-8')
        # Missing or empty indicators are replaced by a single space.
        ind1 = datafield.getAttribute("ind1").encode('utf-8') or ' '
        ind2 = datafield.getAttribute("ind2").encode('utf-8') or ' '
        subfields = [
            (subfield.getAttribute("code").encode('utf-8'), xml_to_text(subfield))
            for subfield in datafield.getElementsByTagName("subfield")
        ]
        record_add_field(rec, tag=tag, ind1=ind1, ind2=ind2, subfields=subfields)
    return record_xml_output(rec), is_new
Exemple #13
0
    def get_authors(self, xml):
        """Return the authors described in *xml* as a list of dictionaries.

        Every ``contrib`` element yields a dictionary with the optional keys
        ``surname`` and ``given_name`` (newlines replaced by spaces), plus
        ``affiliations_ids`` and ``contact_ids``, filled from the ``rid``
        attribute of its ``xref`` children of ``ref-type`` ``aff`` and
        ``corresp`` respectively.

        ``affiliation`` is added when the author references known ``aff``
        elements; when no author does, every author receives all the
        affiliations of the document.  ``email`` is added when the author
        references a ``corresp`` element that contains an ``email``.
        ORCID is not extracted here.
        """
        authors = []
        for author in xml.getElementsByTagName("contrib"):
            tmp = {}
            surname = get_value_in_tag(author, "surname")
            if surname:
                tmp["surname"] = surname
            given_name = get_value_in_tag(author, "given-names")
            if given_name:
                tmp["given_name"] = given_name.replace('\n', ' ')

            tmp["affiliations_ids"] = []
            tmp["contact_ids"] = []
            for xref in author.getElementsByTagName("xref"):
                ref_type = xref.getAttribute('ref-type').encode('utf-8')
                if ref_type == 'aff':
                    target = tmp["affiliations_ids"]
                elif ref_type == 'corresp':
                    target = tmp["contact_ids"]
                else:
                    continue
                # One xref may point to several ids separated by whitespace.
                target.extend([rid.encode('utf-8') for rid in xref.getAttribute('rid').split()])

            authors.append(tmp)

        # Map affiliation id -> affiliation text with any leading number removed.
        affiliations = {}
        for affiliation in xml.getElementsByTagName("aff"):
            aff_id = affiliation.getAttribute("id").encode('utf-8')
            affiliations[aff_id] = re.sub(r'^(\d+\ ?)', "", xml_to_text(affiliation))

        # Map correspondence id -> e-mail address.
        emails = {}
        for contact in xml.getElementsByTagName("corresp"):
            email_nodes = contact.getElementsByTagName('email')
            if not email_nodes:
                # A correspondence note without e-mail gives nothing to map;
                # skip it instead of failing on the missing element.
                continue
            contact_id = contact.getAttribute("id").encode('utf-8')
            emails[contact_id] = xml_to_text(email_nodes[0])

        implicit_affilations = True
        for author in authors:
            matching_ref = [ref for ref in author.get("affiliations_ids") if ref in affiliations]
            if matching_ref:
                implicit_affilations = False
                author["affiliation"] = [affiliations[ref] for ref in matching_ref]
            matching_contact = [cont for cont in author.get('contact_ids') if cont in emails]
            if matching_contact:
                # Only the first matching contact provides the e-mail.
                author["email"] = emails[matching_contact[0]]

        if implicit_affilations and len(affiliations) > 1:
            print >> sys.stderr, "Implicit affiliations are used, but there's more than one affiliation: %s" % affiliations
        if implicit_affilations and len(affiliations) >= 1:
            # No author points to a known affiliation: attach all of them to
            # everybody.
            for author in authors:
                author["affiliation"] = list(affiliations.values())
        return authors
Exemple #14
0
 def get_keywords(self, xml):
     """Return the text of every ``Keyword`` element of *xml* as a list.

     If anything goes wrong the error is printed on stderr and ``None`` is
     returned.
     """
     try:
         keywords = []
         for keyword_node in xml.getElementsByTagName("Keyword"):
             keywords.append(xml_to_text(keyword_node))
         return keywords
     except Exception as err:
         # Best effort: keyword extraction problems are reported, not raised.
         print >> sys.stderr, "Can't find keywords. %s" % (err,)