def get_references(self, xml_doc):
    """Generate one tuple per reference found in *xml_doc*.

    Every ``ce:bib-reference`` element of the document is visited and its
    ``ce:label`` value is read with ``get_value_in_tag``.

    * When ``self.CONSYN`` is truthy, each nested ``sb:reference`` element
      is handed to ``self._get_ref`` together with the label; an entry
      without nested ``sb:reference`` elements is handed over itself.
      Whatever ``_get_ref`` returns is yielded unchanged.
    * Otherwise a 10-tuple is yielded:
      ``(label, authors, doi, issue, page, title, volume, year, textref,
      ext_link)``.  ``year`` is limited to the first four characters of
      the ``sb:date`` value of the first ``sb:issue`` (``''`` when there
      is no ``sb:issue``); ``textref`` is the text of the first
      ``ce:textref`` element, or the empty node list when none exists.

    :param xml_doc: DOM object providing ``getElementsByTagName``.
    """
    for bib_entry in xml_doc.getElementsByTagName("ce:bib-reference"):
        label = get_value_in_tag(bib_entry, "ce:label")

        if self.CONSYN:
            nested = bib_entry.getElementsByTagName("sb:reference")
            # No nested references: fall back to parsing the entry itself.
            for node in (nested or [bib_entry]):
                yield self._get_ref(node, label)
            continue

        # "Surname, Given" when a given name is available, else the surname.
        authors = []
        for author_node in bib_entry.getElementsByTagName("sb:author"):
            given = get_value_in_tag(author_node, "ce:given-name")
            family = get_value_in_tag(author_node, "ce:surname")
            authors.append("%s, %s" % (family, given) if given else family)

        doi = get_value_in_tag(bib_entry, "ce:doi")
        issue = get_value_in_tag(bib_entry, "sb:issue")
        page = get_value_in_tag(bib_entry, "sb:first-page")
        title = get_value_in_tag(bib_entry, "sb:maintitle")
        volume = get_value_in_tag(bib_entry, "sb:volume-nr")

        issue_nodes = bib_entry.getElementsByTagName('sb:issue')
        year = (get_value_in_tag(issue_nodes[0], "sb:date")[:4]
                if issue_nodes else '')

        # Keeps the (empty) node list itself when there is no ce:textref.
        textref_nodes = bib_entry.getElementsByTagName("ce:textref")
        textref = (xml_to_text(textref_nodes[0])
                   if textref_nodes else textref_nodes)

        ext_link = format_arxiv_id(self.get_ref_link(bib_entry, 'arxiv'))

        yield (label, authors, doi, issue, page, title, volume, year,
               textref, ext_link)
def _get_ref(self, ref, label):
    """Extract the fields of a single reference node.

    :param ref: DOM element to read from; ``get_references`` passes either
        a ``ce:bib-reference`` element or a nested ``sb:reference``.
    :param label: value placed unchanged in the first slot of the result.
    :return: the 16-tuple ``(label, authors, doi, issue, page, title,
        volume, year, textref, ext_link, isjournal, comment, journal,
        publisher, editors, book_title)``.

        * ``authors`` -- list of ``"Surname, Given"`` strings (surname only
          when no given name is present).
        * ``title`` -- text of the first ``sb:maintitle`` inside the first
          ``sb:contribution``; ``''`` when either is missing.
        * ``year`` -- digits only; every non-digit character is removed.
        * ``textref`` -- text of the first ``ce:textref``, or the empty
          node list when there is none.
        * ``isjournal`` -- ``bool``: ``True`` when an ``sb:issue`` exists,
          or when an ``sb:edited-book`` comes with an ``sb:book-series``.
        * ``editors`` -- list of ``"Surname,Given"`` strings, filled only
          for edited books that are neither a series nor a conference.
    """
    doi = get_value_in_tag(ref, "ce:doi")
    page = get_value_in_tag(ref, "sb:first-page")
    issue = get_value_in_tag(ref, "sb:issue")
    volume = get_value_in_tag(ref, "sb:volume-nr")
    comment = get_value_in_tag(ref, "sb:comment")

    # Looked up once and reused for the year, the journal flag and the
    # journal title below.
    issue_nodes = ref.getElementsByTagName("sb:issue")
    if issue_nodes:
        year = get_value_in_tag(issue_nodes[0], "sb:date")
    else:
        year = ''

    textref = ref.getElementsByTagName("ce:textref")
    if textref:
        textref = xml_to_text(textref[0])

    ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))
    if ext_link and ext_link.lower().startswith('arxiv'):
        # The pattern matches as soon as the identifier contains a dot.
        # Without one, the first six characters are dropped (presumably
        # the "arxiv:" prefix -- confirm against format_arxiv_id).
        if not re.search(r'\d*\.\d*', ext_link):
            ext_link = ext_link[6:]

    authors = []
    for author in ref.getElementsByTagName("sb:author"):
        given_name = get_value_in_tag(author, "ce:given-name")
        surname = get_value_in_tag(author, "ce:surname")
        if given_name:
            authors.append("%s, %s" % (surname, given_name))
        else:
            authors.append(surname)

    # The title is taken from the contribution only; a missing
    # contribution or maintitle simply yields an empty title.
    try:
        contribution = ref.getElementsByTagName("sb:contribution")[0]
        title = xml_to_text(
            contribution.getElementsByTagName("sb:maintitle")[0])
    except (IndexError, TypeError):
        title = ''

    # Always a real boolean, so the returned flag has one type regardless
    # of which branch set it.
    isjournal = bool(issue_nodes)
    journal = ""
    if isjournal:
        if not page:
            # No first page available: use the comment in its place.
            page = comment
        journal = get_value_in_tag(issue_nodes[0], "sb:maintitle")

    editors = []
    book_title = ""
    publisher = ""
    edited_books = ref.getElementsByTagName("sb:edited-book")
    book_series = ref.getElementsByTagName("sb:book-series")
    if edited_books:
        if book_series:
            # An edited book that belongs to a series is treated as a
            # journal, with the series title as the journal name.
            journal = get_value_in_tag(book_series[0], "sb:maintitle")
            year = get_value_in_tag(ref, "sb:date")
            isjournal = True
        elif ref.getElementsByTagName("sb:conference"):
            container = edited_books[0]
            maintitle = get_value_in_tag(container, "sb:maintitle")
            conference = get_value_in_tag(container, "sb:conference")
            date = get_value_in_tag(container, "sb:date")
            # Conference details are carried in the publisher slot.
            publisher = maintitle + ", " + conference + ", " + date
        else:
            container = edited_books[0]
            if ref.getElementsByTagName("sb:editors"):
                for editor in ref.getElementsByTagName("sb:editor"):
                    surname = get_value_in_tag(editor, "ce:surname")
                    firstname = get_value_in_tag(editor, "ce:given-name")
                    editors.append("%s,%s" % (surname, firstname))
            # The book's own title becomes the reference title unless a
            # contribution title was already found.
            if title:
                book_title = get_value_in_tag(container, "sb:maintitle")
            else:
                title = get_value_in_tag(container, "sb:maintitle")
            year = get_value_in_tag(container, "sb:date")
            publisher_nodes = ref.getElementsByTagName("sb:publisher")
            if publisher_nodes:
                location = get_value_in_tag(
                    publisher_nodes[0], "sb:location")
                publisher = get_value_in_tag(publisher_nodes[0], "sb:name")
                if location:
                    publisher = location + ": " + publisher

    if ref.getElementsByTagName("sb:book"):
        if book_series:
            title += ", " + get_value_in_tag(book_series[0], "sb:maintitle")
            title += ", " + get_value_in_tag(book_series[0], "sb:volume-nr")
        publisher = get_value_in_tag(ref, "sb:publisher")

    if not year:
        year = get_value_in_tag(ref, "sb:date")
    year = re.sub(r'\D', '', year)

    return (label, authors, doi, issue, page, title, volume, year,
            textref, ext_link, isjournal, comment, journal, publisher,
            editors, book_title)