def _get_metadata_and_fulltex_dir(self):
    """Collect XML-metadata and PDF-fulltext paths for every journal item.

    For each unpacked package listed in ``self.files_list``, parse its
    ``dataset.xml`` descriptor and append one ``{'xml': ..., 'pdf': ...}``
    dict per <journal-item> to ``self.found_articles``, advancing a console
    progress bar per package.  Unreadable descriptors are logged, reported
    and skipped.  ('fulltex' spelling kept from the original API name.)
    """
    # Console feedback for interactive runs.
    print >> sys.stdout, "\nRetrieving journal items directories."
    # Progress bar sized to the number of downloaded package files.
    p_bar = progress_bar(len(self.files_list))
    sys.stdout.write(p_bar.next())
    sys.stdout.flush()
    for name in self.files_list:
        # Each package unpacks into a directory named after the archive stem.
        dataset_link = join(self.path_unpacked, name.split('.')[0], 'dataset.xml')
        try:
            dataset_xml = parse(dataset_link)
        except Exception, err:
            # Best-effort: report the broken package and move on to the next.
            register_exception(alert_admin=True, prefix="Elsevier error reading dataset.xml file.")
            self.logger.error("Error reading dataset.xml file: %s" % (dataset_link,))
            print >> sys.stdout, "\nError reading dataset.xml file: %s" % (dataset_link,)
            continue
        # created = get_value_in_tag(dataset_xml.getElementsByTagName('dataset-unique-ids')[0], 'timestamp')
        journal_items = dataset_xml.getElementsByTagName('journal-item')
        self.logger.info("Getting metadata and fulltex directories for %i journal items." % (len(journal_items),))
        for journal_item in journal_items:
            # <ml><pathname> points at the article XML; <web-pdf><pathname> at the PDF.
            xml_pathname = join(self.path_unpacked, name.split('.')[0], xml_to_text(journal_item.getElementsByTagName('ml')[0].getElementsByTagName('pathname')[0]))
            pdf_pathname = join(self.path_unpacked, name.split('.')[0], xml_to_text(journal_item.getElementsByTagName('web-pdf')[0].getElementsByTagName('pathname')[0]))
            self.found_articles.append(dict(xml=xml_pathname, pdf=pdf_pathname))
        # Cumulative count: found_articles grows across packages.
        self.logger.info("Got metadata and fulltex directories of %i journals." % (len(self.found_articles),))
        # Advance the bar once per processed package.
        sys.stdout.write(p_bar.next())
        sys.stdout.flush()
def get_authors(self, xml):
    """Extract author records (name, e-mail, affiliations) from a Springer tree.

    Returns a list of dicts; when no explicit affiliation IDs match, every
    author implicitly receives all affiliations found in the document.
    """
    authors = []
    for node in xml.getElementsByTagName("Author"):
        entry = {}
        family = get_value_in_tag(node, "FamilyName")
        if family:
            entry["surname"] = family
        first = get_value_in_tag(node, "GivenName")
        if first:
            entry["given_name"] = first.replace('\n', ' ')
        # Keep only the first usable e-mail address.
        for mail in node.getElementsByTagName("Email"):
            if mail.getAttribute("type").encode('utf-8') in ('email', ''):
                entry["email"] = xml_to_text(mail)
                break
        entry["affiliations_ids"] = [aid.encode('utf-8')
                                     for aid in node.getAttribute("AffiliationIDS").split()]
        authors.append(entry)
    # Map affiliation ID -> readable affiliation text.
    affiliations = {}
    for aff in xml.getElementsByTagName("Affiliation"):
        affiliations[aff.getAttribute("ID").encode('utf-8')] = xml_to_text(aff, delimiter=', ')
    implicit = True
    for entry in authors:
        matched = [ref for ref in entry.get("affiliations_ids") if ref in affiliations]
        if matched:
            implicit = False
            entry["affiliation"] = [affiliations[ref] for ref in matched]
    if implicit and len(affiliations) > 1:
        sys.stderr.write("Implicit affiliations are used, but there's more than one affiliation: %s\n" % affiliations)
    if implicit and len(affiliations) >= 1:
        # No author carried an explicit reference: give everyone everything.
        for entry in authors:
            entry["affiliation"] = list(affiliations.values())
    return authors
def get_keywords(self, xml):
    """Return PACS and free keywords as ``{"pacs": [...], "other": [...]}``.

    :param xml: parsed DOM containing <kwd-group> elements.
    :return: dict with keys ``pacs`` and ``other``; on any parsing problem a
        warning is printed and both lists are empty, so callers always get
        the same dict shape (the original returned ``None`` on error).
    """
    pacs = []
    other = []
    try:
        for kwd_group in xml.getElementsByTagName('kwd-group'):
            keywords = [xml_to_text(keyword)
                        for keyword in kwd_group.getElementsByTagName("kwd")]
            if kwd_group.getAttribute('kwd-group-type').encode('utf-8') == "pacs":
                pacs = keywords
            else:
                other = keywords
    except Exception:
        # Best-effort: keyword extraction failures must not abort harvesting.
        sys.stderr.write("Can't find keywords\n")
    return {"pacs": pacs, "other": other}
def get_references(self, xml):
    """Collect all <ref> citations and store them on ``self.references``.

    Each reference is an 11-tuple: (label, authors, doi, issue, page,
    page_last, title, volume, year, ext_link, plain_text); ``plain_text``
    is filled only for non-journal citations.
    """
    collected = []
    for ref in xml.getElementsByTagName("ref"):
        raw_citation = None
        pub_type = ref.getElementsByTagName('citation')[0].getAttribute('publication-type').encode('utf-8')
        label = get_value_in_tag(ref, "label").strip('.')
        authors = []
        for person in ref.getElementsByTagName("name"):
            first = get_value_in_tag(person, "given-names")
            last = get_value_in_tag(person, "surname")
            full = "%s, %s" % (last, first) if first else last
            if not full.strip():
                # Structured name empty: fall back to the unstructured form.
                full = get_value_in_tag(person, "string-name")
            authors.append(full)
        doi = ""
        for pub_id in ref.getElementsByTagName("pub-id"):
            if pub_id.getAttribute("pub-id-type") == "doi":
                doi = xml_to_text(pub_id)
        issue = get_value_in_tag(ref, "issue")
        page = get_value_in_tag(ref, "fpage")
        page_last = get_value_in_tag(ref, "lpage")
        title = get_value_in_tag(ref, "source")
        volume = get_value_in_tag(ref, "volume")
        year = get_value_in_tag(ref, "year")
        ext_link = format_arxiv_id(super(NLMParser, self).get_ref_link(ref, "arxiv"))
        if pub_type != 'journal':
            raw_citation = get_value_in_tag(ref, "mixed-citation")
        collected.append((label, authors, doi, issue, page, page_last,
                          title, volume, year, ext_link, raw_citation))
    self.references = collected
def get_ref_link(self, xml, name):
    """Return the text of the last <ext-link> whose xlink:href contains
    *name*, stripped of surrounding whitespace, or ``None`` if none match.
    """
    matches = [node for node in xml.getElementsByTagName('ext-link')
               if name in node.getAttribute("xlink:href").encode('utf-8')]
    if not matches:
        return None
    # The original kept overwriting its result, so the last match wins.
    return xml_to_text(matches[-1]).strip()
def get_references(self, xml):
    """Collect Springer <Citation> references as 8-tuples.

    Structured citations yield (label, authors, doi, issue, page, title,
    volume, year); citations without <BibArticle> keep only their raw
    <BibUnstructured> text, padded with empty strings.
    """
    references = []
    for citation in xml.getElementsByTagName("Citation"):
        if not citation.getElementsByTagName("BibArticle"):
            # Unstructured citation: store the raw text, pad the rest.
            references.append((get_value_in_tag(citation, "BibUnstructured"),
                               '', '', '', '', '', '', ''))
            continue
        label = get_value_in_tag(citation, "ArticleTitle")
        authors = []
        for person in citation.getElementsByTagName("BibAuthorName"):
            initials = get_value_in_tag(person, "Initials")
            family = get_value_in_tag(person, "FamilyName")
            authors.append("%s, %s" % (family, initials) if initials else family)
        doi = ""
        for occurrence in citation.getElementsByTagName("Occurrence"):
            if occurrence.getAttribute("Type") == "DOI":
                doi = xml_to_text(occurrence)
        ## What is it exactly?
        # issue = get_value_in_tag(reference, "sb:issue")
        issue = ""
        page = get_value_in_tag(citation, "FirstPage")
        title = get_value_in_tag(citation, "JournalTitle")
        volume = get_value_in_tag(citation, "VolumeID")
        year = get_value_in_tag(citation, "Year")
        references.append((label, authors, doi, issue, page, title, volume, year))
    return references
def get_doi(self, xml):
    """Return the article DOI from <article-id pub-id-type="doi">, or ""
    (with a console warning) when no DOI element is present.
    """
    doi = ""
    for node in xml.getElementsByTagName('article-id'):
        if node.getAttribute('pub-id-type').encode('utf-8') == 'doi':
            doi = xml_to_text(node)
    if not doi:
        sys.stdout.write("Can't find DOI.\n")
    return doi
def get_issn(self, xml):
    """Return the electronic ISSN if present, else the first listed ISSN,
    else ``None``.

    :param xml: parsed DOM containing <issn> elements.
    :return: ISSN text or ``None`` when no <issn> element exists.

    Bug fix: the original assigned ``issn.getAttribute("pub-type")`` — the
    literal string ``'epub'`` — instead of the ISSN's text content.
    """
    issns = xml.getElementsByTagName('issn')
    ret = None
    for issn in issns:
        if issn.getAttribute("pub-type").encode('utf-8') == 'epub':
            # Take the element's text, not its pub-type attribute value.
            ret = xml_to_text(issn)
    if not ret and issns:
        # No electronic ISSN found: fall back to the first one listed.
        ret = xml_to_text(issns[0])
    return ret
def _get_packages(self):
    """Collect expected package filenames and their MD5 checksums.

    Parses each package-list XML referenced by ``self.path_r_pkg`` and fills
    ``self.retrieved_packages`` with ``{filename: md5}`` entries, driving a
    console progress bar while doing so.

    :return: dict mapping package filename to its MD5 checksum.
    """
    # Console feedback for interactive runs.
    print >> sys.stdout, "\nRetrieving packages names."
    # Progress bar sized to the number of package files.
    # NOTE(review): the bar is sized from files_list but advanced once per
    # path_r_pkg entry -- confirm the two collections have the same length.
    p_bar = progress_bar(len(self.files_list))
    sys.stdout.write(p_bar.next())
    sys.stdout.flush()
    for pack in self.path_r_pkg:
        self.logger.info("Retrieved package name: %s" % (pack,))
        pack_xml = parse(pack)
        package_file = pack_xml.getElementsByTagName('dataset-package-file')
        for pf in package_file:
            filename = pf.getElementsByTagName('filename')[0]
            md5_val = pf.getElementsByTagName('md5')[0]
            # Expected checksum per file, presumably used to verify downloads.
            self.retrieved_packages[xml_to_text(filename)] = xml_to_text(md5_val)
        # Advance the bar once per processed package descriptor.
        sys.stdout.write(p_bar.next())
        sys.stdout.flush()
    return self.retrieved_packages
def get_authors(self, xml):
    """Extract authors with names, ORCID, e-mail and resolved affiliations
    from an Elsevier (``ce:``) article tree.

    :param xml: parsed DOM of the article.
    :return: list of author dicts with keys such as ``surname``,
        ``given_name``, ``initials``, ``orcid``, ``email``, ``cross_ref``
        and ``affiliation``.

    Improvements: the ``xrange`` index loop and manual ``append`` loops are
    replaced with direct iteration / comprehensions (same behavior).
    """
    authors = []
    for author in xml.getElementsByTagName("ce:author"):
        tmp = {}
        surname = get_value_in_tag(author, "ce:surname")
        if surname:
            tmp["surname"] = surname
        given_name = get_value_in_tag(author, "ce:given-name")
        if given_name:
            tmp["given_name"] = given_name
        initials = get_value_in_tag(author, "ce:initials")
        if initials:
            tmp["initials"] = initials
        orcid = author.getAttribute('orcid').encode('utf-8')
        if orcid:
            tmp["orcid"] = orcid
        # Keep only the first plain e-mail address.
        for email in author.getElementsByTagName("ce:e-address"):
            if email.getAttribute("type").encode('utf-8') in ('email', ''):
                tmp["email"] = xml_to_text(email)
                break
        cross_refs = author.getElementsByTagName("ce:cross-ref")
        if cross_refs:
            tmp["cross_ref"] = [cross_ref.getAttribute("refid").encode('utf-8')
                                for cross_ref in cross_refs]
        authors.append(tmp)
    # Affiliation id -> affiliation text, with any leading numbering stripped.
    affiliations = {}
    for affiliation in xml.getElementsByTagName("ce:affiliation"):
        aff_id = affiliation.getAttribute("id").encode('utf-8')
        affiliations[aff_id] = re.sub(r'^(\d+\ ?)', "", get_value_in_tag(affiliation, "ce:textfn"))
    implicit_affilations = True
    for author in authors:
        # Resolve explicit affiliation cross-references when present.
        matching_ref = [ref for ref in author.get("cross_ref", []) if ref in affiliations]
        if matching_ref:
            implicit_affilations = False
            author["affiliation"] = [affiliations[ref] for ref in matching_ref]
    if implicit_affilations and len(affiliations) > 1:
        sys.stderr.write("Implicit affiliations are used, but there's more than one affiliation: %s\n" % affiliations)
    if implicit_affilations and len(affiliations) >= 1:
        # No explicit cross-refs anywhere: give every author all affiliations.
        for author in authors:
            author["affiliation"] = list(affiliations.values())
    return authors
def _get_issues(self):
    """Locate every issue descriptor in the unpacked packages.

    For each package, parses its ``dataset.xml``: if it lists
    <journal-issue> entries, their <ml><pathname> values are used; otherwise
    the unpacked directory tree is scanned for files named ``issue.xml``.
    Found paths are appended to ``self.found_issues``.  Unreadable
    descriptors are logged, reported and skipped.
    """
    for name in self.files_list:
        dataset_link = join(self.path_unpacked, name.split('.')[0], 'dataset.xml')
        try:
            dataset_xml = parse(dataset_link)
        except Exception, err:
            # Best-effort: report the broken descriptor and continue.
            register_exception(alert_admin=True, prefix="Elsevier error reading dataset.xml file.")
            self.logger.error("Error reading dataset.xml file: %s" % (dataset_link,))
            print >> sys.stdout, "\nError reading dataset.xml file: %s" % (dataset_link,)
            continue
        journal_issues = dataset_xml.getElementsByTagName('journal-issue')
        if journal_issues:
            for journal_issue in journal_issues:
                filename = xml_to_text(journal_issue.getElementsByTagName('ml')[0].getElementsByTagName('pathname')[0])
                self.logger.info("Found issue %s in %s." % (filename, name))
                pathname = join(self.path_unpacked, name.split('.')[0], filename)
                self.found_issues.append(pathname)
        else:
            # Fallback when no <journal-issue> entries exist: scan the tree.
            # NOTE(review): `walk` takes a callback here, i.e. the Python 2
            # os.path.walk signature -- confirm against the file's imports.
            def visit(arg, dirname, names):
                if "issue.xml" in names:
                    self.found_issues.append(join(dirname, "issue.xml"))
            walk(join(self.path_unpacked, name.split('.')[0]), visit, None)
def convert_record(record, response_date, request):
    """Convert one harvested OAI-PMH <record> (Hindawi) into MARCXML.

    :param record: DOM node of the OAI <record> element.
    :param response_date: responseDate of the OAI response (stored in 035 $h).
    :param request: the OAI request URL (stored in 035 $u).
    :return: tuple ``(marcxml_or_None, is_new)`` where ``is_new`` is True
        when no existing record carries this OAI identifier.  A deletion of
        an unknown record yields ``(None, True)``.
    """
    header = record.getElementsByTagName("header")[0]
    oai_identifier = get_value_in_tag(header, "identifier")
    datestamp = get_value_in_tag(header, "datestamp")
    # An OAI header may carry status="deleted" for withdrawn records.
    status = header.getAttribute("status").encode('utf8')
    rec = {}
    # Provenance field recording where and when the record was harvested.
    record_add_field(rec, tag="035", subfields=[
        ('a', oai_identifier),
        ('u', request),
        ('9', 'Hindawi'),
        ('d', datestamp),
        ('h', response_date),
        ('m', 'marc21'),
        ('t', 'false')
    ])
    new = True
    if find_records_from_extoaiid(oai_identifier, 'Hindawi'):
        new = False
    if status == 'deleted':
        if new:
            ## deleting a record we didn't have? Who cares :-)
            return None, True
        else:
            # Known record reported deleted: flag it rather than dropping it.
            record_add_field(rec, tag="980", subfields=[('a', 'SCOAP3'), ('b', 'Hindawi'), ('c', 'DELETED')])
            return record_xml_output(rec), False
    # Copy every incoming MARC datafield verbatim into the new record.
    for datafield in record.getElementsByTagName("datafield"):
        tag = datafield.getAttribute("tag").encode('utf-8')
        ind1 = datafield.getAttribute("ind1").encode('utf-8') or ' '
        ind2 = datafield.getAttribute("ind2").encode('utf-8') or ' '
        subfields = []
        for subfield in datafield.getElementsByTagName("subfield"):
            code = subfield.getAttribute("code").encode('utf-8')
            value = xml_to_text(subfield)
            subfields.append((code, value))
        record_add_field(rec, tag=tag, ind1=ind1, ind2=ind2, subfields=subfields)
    return record_xml_output(rec), new
def get_authors(self, xml):
    """Extract contributors with names, affiliations and contact e-mails
    from a JATS-style <contrib> tree.

    When no contributor carries an explicit affiliation reference, every
    author implicitly receives all affiliations found in the document.
    """
    authors = []
    for contrib in xml.getElementsByTagName("contrib"):
        entry = {}
        family = get_value_in_tag(contrib, "surname")
        if family:
            entry["surname"] = family
        first = get_value_in_tag(contrib, "given-names")
        if first:
            entry["given_name"] = first.replace('\n', ' ')
        entry["affiliations_ids"] = []
        entry["contact_ids"] = []
        # <xref> elements link the contributor to affiliations and to
        # corresponding-author blocks via their rid attributes.
        for xref in contrib.getElementsByTagName("xref"):
            ref_type = xref.getAttribute('ref-type').encode('utf-8')
            rids = [rid.encode('utf-8') for rid in xref.getAttribute('rid').split()]
            if ref_type == 'aff':
                entry["affiliations_ids"].extend(rids)
            if ref_type == 'corresp':
                entry["contact_ids"].extend(rids)
        authors.append(entry)
    # Affiliation id -> text, with any leading numbering removed.
    affiliations = {}
    for aff in xml.getElementsByTagName("aff"):
        affiliations[aff.getAttribute("id").encode('utf-8')] = re.sub(r'^(\d+\ ?)', "", xml_to_text(aff))
    # Corresponding-author id -> e-mail address.
    emails = {}
    for corresp in xml.getElementsByTagName("corresp"):
        emails[corresp.getAttribute("id").encode('utf-8')] = xml_to_text(corresp.getElementsByTagName('email')[0])
    implicit = True
    for entry in authors:
        matched_affs = [ref for ref in entry.get("affiliations_ids") if ref in affiliations]
        if matched_affs:
            implicit = False
            entry["affiliation"] = [affiliations[ref] for ref in matched_affs]
        matched_contacts = [cid for cid in entry.get('contact_ids') if cid in emails]
        if matched_contacts:
            entry["email"] = emails[matched_contacts[0]]
    if implicit and len(affiliations) > 1:
        sys.stderr.write("Implicit affiliations are used, but there's more than one affiliation: %s\n" % affiliations)
    if implicit and len(affiliations) >= 1:
        for entry in authors:
            entry["affiliation"] = list(affiliations.values())
    return authors
def get_keywords(self, xml):
    """Return the text of every <Keyword> element as a list of strings.

    :param xml: parsed DOM containing <Keyword> elements.
    :return: list of keyword strings; on any parsing problem a warning is
        printed and an empty list is returned, so callers always receive a
        list (the original returned ``None`` on error).
    """
    try:
        return [xml_to_text(keyword) for keyword in xml.getElementsByTagName("Keyword")]
    except Exception as err:
        # Best-effort: keyword extraction failures must not abort harvesting.
        sys.stderr.write("Can't find keywords. %s\n" % (err,))
        return []