def __init__(self, zot_csv_row):
    """Create a Zotero publication object from a Zotero CSV entry.

    zot_csv_row is a dict-like row keyed by the CSV column headers.  The
    "Title", "Url", "Manual Tags", "Item Type" and "Publication Title"
    columns, plus the key column, must be present; "DOI", "Author",
    "Publication Year" and "Date Added" are optional.  A missing author
    list or publication year is reported on stderr, not raised.
    """
    super(Pub, self).__init__()

    row = zot_csv_row
    self._zot_csv_row = row
    self.title = row["Title"]
    self.canonical_title = publication.to_canonical(self.title)

    # If the reader does not strip the byte order mark, the BOM stays glued
    # to the first header and the column is named '\ufeff"Key"'.  Accept
    # the clean header too, so the row works however the file was opened.
    bom_key = '\ufeff"Key"'
    self.zotero_id = row[bom_key] if bom_key in row else row["Key"]

    doi = row.get("DOI")
    if doi:
        # should be close to canonical already
        doi = publication.to_canonical_doi(doi)
    self.canonical_doi = doi
    self.url = row["Url"]  # Can be empty

    # Authors is a semicolon separated list of "Last, First I."
    authors = row.get("Author")
    if authors:
        self.set_authors(authors, self.to_canonical_first_author(authors))
    else:
        print("Warning: Zotero Pub '{0}'".format(self.title),
              file=sys.stderr)
        print(" Does not have any authors.\n", file=sys.stderr)

    self.year = row.get("Publication Year")
    if not self.year:
        self.year = "unknown"
        print("Warning: Zotero Pub '{0}'".format(self.title),
              file=sys.stderr)
        print(" Does not have a publication year.\n", file=sys.stderr)

    # Tags are a semicolon separated list
    self.tags = row["Manual Tags"].split("; ")

    if row["Item Type"] == "journalArticle":
        self.journal_name = row["Publication Title"]
        self.canonical_journal = publication.to_canonical(
            self.journal_name)
    else:
        self.canonical_journal = None

    # Entry date in Zotero CSV looks like "date": "2017-09-14 17:48:40"
    # Keep only the date part; a missing column yields "" instead of
    # failing on None[0:10].
    date_added = row.get("Date Added") or ""
    self.entry_date = date_added[0:10]

    # journal_name is only assigned above for journal articles, so read it
    # defensively rather than assuming the attribute exists.
    self.ref = ""
    if not getattr(self, "journal_name", None):
        self.ref = row["Publication Title"]
    year = row.get("Publication Year")
    if year:
        self.ref += " (" + year + ")"

    return None
def handle_data(self, data):
    """Add a chunk of email text to whatever is currently being parsed.

    The text is stripped, then appended to the search string, the current
    pub's title, its ref, or its author list, depending on which parser
    flag is set.  Text seen while no flag applies is ignored.
    """
    data = data.strip()
    if self._parsing and (
            data == WileyEmailAlert2018AndBefore.SEARCH_COMING):
        self._search_coming = True

    elif self._in_search:
        self.search += data

    elif self._in_title:
        self._current_pub.set_title(self._current_pub.title + data)

    elif self._in_journal:
        self._current_pub.ref += data

    elif self._in_authors:
        # Author string also has date in it:
        #  March 2015Pieter-Jan L. Maenhaut, Hend Moens and Filip De Turck
        # strip off anything looking like a year and before.
        authors = re.split(r"\d{4}", data)[-1]
        canonical_first_author = self._current_pub.canonical_first_author
        if not canonical_first_author:
            # extract last name of first author.  The first author ends at
            # the first comma, or at "and" when there are only two authors
            # (a two-author list has no comma at all).
            first_author = re.split(
                r",|\s+and\s+", authors, maxsplit=1)[0].strip()
            # part that follows last period, or first space
            name_parts = first_author.split(". ")
            if len(name_parts) > 1:
                last_name = name_parts[-1]
            else:
                name_parts = first_author.split(" ")
                # A one-word name has nothing after the first space; use
                # the whole name instead of an empty string.
                last_name = " ".join(name_parts[1:]) or first_author
            canonical_first_author = publication.to_canonical(last_name)
        self._current_pub.set_authors(
            self._current_pub.authors + " " + authors,
            canonical_first_author)

    return (None)
def handle_data(self, data):
    """Add a chunk of email text to whatever the current state expects.

    The text is stripped, then used according to self._state: it either
    advances the state when the search marker is seen, or is appended to
    the search string, the current pub's title, its ref, or its author
    list.  Text seen in any other state is ignored.
    """
    data = data.strip()
    if (self._state == WileyEmailAlert.STATE_PARSING_STARTED
            and data == WileyEmailAlert.SEARCH_COMING):
        self._state = WileyEmailAlert.STATE_AWAITING_SEARCH

    elif self._state == WileyEmailAlert.STATE_IN_SEARCH:
        self.search += data

    elif self._state == WileyEmailAlert.STATE_IN_TITLE:
        self._current_pub.set_title(self._current_pub.title + data)

    elif self._state == WileyEmailAlert.STATE_IN_JOURNAL:
        self._current_pub.ref += data

    elif self._state == WileyEmailAlert.STATE_IN_AUTHORS:
        # Author string also has date in it:
        #  March 2015Pieter-Jan L. Maenhaut, Hend Moens and Filip De Turck
        # strip off anything looking like a year and before.
        authors = re.split(r"\d{4}", data)[-1]
        canonical_first_author = self._current_pub.canonical_first_author
        if not canonical_first_author:
            # extract last name of first author.  The first author ends at
            # the first comma, or at "and" when there are only two authors
            # (a two-author list has no comma at all).
            first_author = re.split(
                r",|\s+and\s+", authors, maxsplit=1)[0].strip()
            # part that follows last period, or first space
            name_parts = first_author.split(". ")
            if len(name_parts) > 1:
                last_name = name_parts[-1]
            else:
                name_parts = first_author.split(" ")
                # A one-word name has nothing after the first space; use
                # the whole name instead of an empty string.
                last_name = " ".join(name_parts[1:]) or first_author
            canonical_first_author = publication.to_canonical(last_name)
        self._current_pub.set_authors(
            self._current_pub.authors + " " + authors,
            canonical_first_author)

    return (None)
def __init__(self, pub_library, pub_alerts, known_pubs_db=None,
             ok_dup_titles=None):
    """Build a PubMatch database.

    Starts from every pub in pub_library, then folds in pub_alerts, and
    finally any information from known_pubs_db when one is given.
    ok_dup_titles optionally lists titles whose duplicates should be
    ignored; they are stored in canonical form.
    """
    # Indexes for quick access by DOI and by title.
    self._by_canonical_doi = {}
    self._by_canonical_title = {}
    self.canonical_titles_sorted = []  # use bisect with this.

    # Process duplicate pub titles that should be ignored.
    self._ok_dups_by_canonical_title = set()
    if ok_dup_titles:
        self._ok_dups_by_canonical_title.update(
            publication.to_canonical(dup_title)
            for dup_title in ok_dup_titles)

    # One PubMatch per library entry.
    for library_pub in pub_library.get_pubs():
        library_match = PubMatch(lib_pub=library_pub)
        self.add_pub_match(library_match)

    # Walk through the alerts, adding them to existing PubMatch's or
    # creating new ones when needed.
    self.add_pub_alerts(pub_alerts)

    if known_pubs_db:
        self.add_known_pub_info(known_pubs_db)

    return None
def __init__(self, cul_json):
    """Create a CiteULike publication object from CUL JSON.

    cul_json is the decoded JSON entry for one pub.  "title",
    "article_id", "href" and "tags" must be present; "doi", "type",
    "authors", "published", "journal" and "date" are optional.  A missing
    author list is reported on stderr, not raised.
    """
    super(Pub, self).__init__()

    self._cul_json = cul_json
    self.title = self._cul_json["title"]
    self.canonical_title = publication.to_canonical(self.title)
    self.cul_id = self._cul_json["article_id"]

    doi = self._cul_json.get("doi")
    if doi:
        doi = publication.to_canonical_doi(doi)
    self.canonical_doi = doi
    self.url = self._cul_json["href"]

    # TODO: Type may not be the most useful. It's "JOUR" for
    # Journal Article and "THES" for thesis. May not map to BibTeX.
    self.pub_type = self._cul_json.get("type")

    # Authors is a list of "First I. Last"
    author_list = self._cul_json.get("authors")
    if author_list:
        authors = ", ".join(author_list)
        self.set_authors(
            authors, self.to_canonical_first_author(author_list[0]))
    else:
        print("Warning: CUL Pub '{0}'".format(self.title),
              file=sys.stderr)
        print(" Does not have any authors.\n", file=sys.stderr)

    published = self._cul_json.get("published")
    if published:
        self.year = published[0]
    else:
        self.year = "unknown"

    self.tags = self._cul_json["tags"]  # a list

    journal = self._cul_json.get("journal")
    if journal:
        self.canonical_journal = publication.to_canonical(journal)
    else:
        self.canonical_journal = None

    # Entry date in CUL JSON looks like "date": "2016-12-22 00:18:58"
    # Keep only the date part; a missing "date" yields "" instead of
    # failing on None[0:10].
    entry_date = self._cul_json.get("date") or ""
    self.entry_date = entry_date[0:10]

    return None
def get_canonical_first_author(ncbi_author_list):
    """Extract the first author's last name, in canonical form.

    NCBI author lists look like:
      Wreczycka K, Gosdschan A, Yusuf D, Grüning B, Assenov Y, Akalin A.

    The first author is everything before the first comma, and the last
    name is that text minus its final word (the initials).  When the first
    author is a single word there are no initials to drop, so the whole
    word is used rather than an empty string.
    """
    first_author = ncbi_author_list.split(",")[0].strip()
    name_parts = first_author.split(" ")
    if len(name_parts) > 1:
        last_name = " ".join(name_parts[:-1])
    else:
        last_name = first_author
    return publication.to_canonical(last_name)
def to_canonical_first_author(self, zot_author_string):
    """Return the canonical first author for a Zotero author list.

    A Zotero author list looks like:
      Gloaguen, Yoann; Morton, Fraser; Daly, Rónán; Gurden, Ross

    The first author's last name is the text before the first comma; it
    is returned in canonical form.  An empty or missing author string
    gives None.
    """
    if not zot_author_string:
        return None
    leading_last_name = zot_author_string.partition(",")[0]
    return publication.to_canonical(leading_last_name)
def handle_data(self, data):
    """Interpret one stripped chunk of text from a WoS alert email.

    Field labels ("Title:", "Authors:", "Source:"), record starts and
    query headers set parser flags or start a new pub alert.  Any other
    text is stored as title, authors, search string or ref, depending on
    which flag is currently set.  The checks run in a fixed order and the
    first one that applies wins.
    """
    text = data.strip()
    patterns = WoSEmailAlert2018AndBefore

    if patterns.paper_start_re.match(text):
        # Each paper starts with: "Record m of n. "
        self._current_pub = publication.Pub()
        self._current_pub_alert = pub_alert.PubAlert(
            self._current_pub, self)
        self.pub_alerts.append(self._current_pub_alert)
        return None

    if text == "Title:":
        self._in_title = True
        return None

    if text == "Authors:":
        self._in_authors = True
        return None

    if (patterns.cited_article_re.match(text)
            or patterns.alert_query_re.match(text)):
        self._in_query = True
        return None

    if text == "Source:":
        self._in_ref = True
        self._current_pub.ref = ""
        return None

    if self._in_title_value:
        # Title text can arrive in pieces; join them with a space.
        title_so_far = self._current_pub.title
        if len(title_so_far) > 0:
            self._current_pub.set_title(title_so_far + " " + text)
        else:
            self._current_pub.set_title(text)
        return None

    if self._in_authors:
        # WOS Author lists look like:
        #   Galia, W; Leriche, F; Cruveiller, S; Thevenot-Sergentet, D
        first_last_name = text.split(",")[0]
        self._current_pub.set_authors(
            text, publication.to_canonical(first_last_name))
        self._in_authors = False
        return None

    if self._in_query_value:
        # need to strip "]]>" from anywhere. Bug in WOS, if punctuation
        # in title.
        self.search += text.replace("]]>", "")
        self._in_query_value = False
        return None

    if self._in_ref:
        self._current_pub.ref += text + " "

    return None
def handle_data(self, data):
    """Use one chunk of email text according to the current parser state.

    The text is stripped and empty chunks are skipped.  Depending on
    self._state the text advances the state, or is added to the search
    string, the current pub's title, its authors and ref, or the current
    alert's text from the pub.
    """
    data = data.strip()
    if data == "":
        return (None)

    if self._state == GSEmailAlert.STATE_LOOKING_FOR_HTML_PART:
        if GSEmailAlert.html_part_start_re.search(data):
            # Ignore any parts until we get to text/html.
            # Not ignoring them leads to duplicate entries.
            self._state = GSEmailAlert.STATE_LOOKING_FOR_TITLE_LINK

    elif (self._state == GSEmailAlert.STATE_LOOKING_FOR_TITLE_LINK
          and GSEmailAlert.search_start_re.match(data)):
        self.search += data
        self._state = GSEmailAlert.STATE_IN_SEARCH

    elif self._state == GSEmailAlert.STATE_IN_SEARCH:
        self.search += " " + data

    elif self._state == GSEmailAlert.STATE_IN_TITLE_TEXT:
        # sometimes we lose space between two parts of title.
        pub_title = self._current_pub.title
        if (pub_title and pub_title[-1] != " "):
            pub_title += " "
        pub_title += data
        self._current_pub.set_title(pub_title)

    elif self._state == GSEmailAlert.STATE_IN_AUTHOR_LIST:
        # Author list may also have source at end, after "- ".  Split only
        # at the first "- " so a source that contains "- " itself is kept
        # whole instead of being dropped.
        parts = data.split("- ", 1)
        author_text = parts[0].strip()

        canonical_first_author = self._current_pub.canonical_first_author
        if not canonical_first_author and author_text:
            # Google authors format: EB Alonso, L Cockx, J Swinnen
            # Work from the author part only: with a single author there
            # is no comma, and the raw text would end in the source.
            first_author = author_text.split(",")[0].strip()
            canonical_first_author = publication.to_canonical(
                first_author.split(" ")[-1])

        self._current_pub.set_authors(
            self._current_pub.authors + author_text,
            canonical_first_author)
        if len(parts) == 2:
            self._current_pub.ref = parts[1]

    elif self._state == GSEmailAlert.STATE_IN_TEXT_FROM_PUB:
        self._current_pub_alert.text_from_pub += data + " "

    return (None)
def to_canonical_first_author(self, cul_author_string):
    """Convert a CUL author name to a canonical first author name.

    CUL Author name is First M. Last
    Canonical first author is last name of first author: what follows the
    last period in the name, or, when there is no period (or nothing
    follows it), what follows the last space.  An empty or missing name
    gives None.
    """
    if not cul_author_string:
        return None

    # Split the whole name on periods.  Indexing the string first (as in
    # cul_author_string[0]) would look at one character only and the
    # period rule could never apply.
    by_dots = cul_author_string.split(".")
    after_last_dot = by_dots[-1].strip()
    if len(by_dots) > 1 and after_last_dot:
        # Last name is what follows the last period,
        first_author = after_last_dot
    else:
        # or if there is no period, then what follows the last space.
        first_author = cul_author_string.split()[-1]
    return publication.to_canonical(first_author)
def __init__(self, row=None):
    """Set up one KnownPubDB entry.

    With a row (a dictionary from a DB, keyed by COLUMNS) the entry takes
    its values from that row.  Without one, an empty entry is created and
    every field gets its default.
    """
    self._row = row

    if self._row:
        # Existing row: derive the canonical title and let set_doi put
        # the DOI in canonical form.
        self._canonical_title = publication.to_canonical(self._row[TITLE])
        self.set_doi(self._row[DOI])
        return None

    # No usable row: start from scratch.
    self._row = {}
    self.set_title(None)  # also sets _canonical_title
    self.set_authors(None)
    self.set_doi(None)
    self.set_state(STATE_DONT_KNOW_YET)
    self.set_annotation("")
    self.set_qualifier("")
    return None
def handle_data(self, data):
    """Use one chunk of email text according to the current parser state.

    The stripped text is added to the search string, the current pub's
    authors or ref, stored as its DOI, or handed on to
    handle_title_section_data, depending on self._state.  Text seen in
    any other state is ignored.
    """
    data = data.strip()

    if self._state == WileyEmailCitationAlert.STATE_IN_SEARCH:
        self.search += data

    elif self._state == WileyEmailCitationAlert.STATE_IN_AUTHOR:
        canonical_first_author = self._current_pub.canonical_first_author
        if not canonical_first_author:
            # extract last name of first author.
            last_name = data.split(" ")[-1]
            canonical_first_author = publication.to_canonical(last_name)
        self._current_pub.set_authors(
            self._current_pub.authors + " " + data,
            canonical_first_author)

    elif (self._state
          == WileyEmailCitationAlert.STATE_AWAITING_AUTHOR_OR_TITLE):
        # could be an author list joiner ", " or "and" or start of title
        if data in [",", "and"]:
            # still in author list
            self._current_pub.set_authors(
                self._current_pub.authors + " " + data,
                self._current_pub.canonical_first_author)
        else:
            # Into title
            self._state = WileyEmailCitationAlert.STATE_IN_TITLE_SECTION
            self.handle_title_section_data(data)

    elif self._state == WileyEmailCitationAlert.STATE_IN_JOURNAL:
        self._current_pub.ref += " " + data

    elif self._state == WileyEmailCitationAlert.STATE_IN_DOI:
        # Store the DOI in canonical form, as the attribute name says;
        # empty text is stored as is.
        if data:
            data = publication.to_canonical_doi(data)
        self._current_pub.canonical_doi = data
        self._state = WileyEmailCitationAlert.STATE_IN_TITLE_SECTION

    elif self._state == WileyEmailCitationAlert.STATE_IN_VOLUME:
        self._current_pub.ref += ", " + data

    elif self._state == WileyEmailCitationAlert.STATE_IN_REF_TAIL:
        self._current_pub.ref += ", " + data
        self._state = WileyEmailCitationAlert.STATE_IN_TITLE_SECTION

    return (None)
def to_canonical_first_author(sd_alert_authors_text):
    """Return the canonical first author for an SD email alert author list.

    SD alert authors look like:
      Eugene Matthew P. Almazan, Sydney L. Lesko, Michael P. Markey

    The first author is the text before the first comma.  The last name
    is what follows the last period in it or, when it has no period, its
    last whitespace separated word.  Empty or missing text gives None.
    """
    if not sd_alert_authors_text:
        return None

    leading_author = sd_alert_authors_text.partition(",")[0]
    if "." in leading_author:
        # Last name is what follows the last period
        last_name = leading_author.rpartition(".")[2]
    else:
        # or if there is no period: it's what follows the last space.
        last_name = leading_author.split()[-1]
    return publication.to_canonical(last_name)
def set_title(self, title):
    """Store title in the entry's row and refresh its canonical form."""
    row = self._row
    row[TITLE] = title
    # Canonicalize what the row now holds so both stay in step.
    self._canonical_title = publication.to_canonical(row[TITLE])
    return None
def handle_data(self, data):
    """Advance the WoS email parser with one chunk of text.

    Whitespace in the text is normalized and empty chunks are skipped.
    What happens next depends on self._state: most states consume exactly
    one chunk (search string, cited pub, title, authors, journal, excerpt)
    and then move on to the next state.  Each citing pub found is added
    to self.pub_alerts and counted in self.found_pub_count.
    """
    # eliminate leading, trailing, and multiple embedded spaces
    data = re.sub(r'\s+', ' ', data).strip()
    if data == "":
        return None  # nothing to see here folks.

    if self._state == WoSEmailAlert.State.AWAITING_CONTENT:
        # Nothing is used until the greeting is seen.
        if WoSEmailAlert.greetings_re.match(data):
            self._state = WoSEmailAlert.State.STARTING_CONTENT

    elif self._state == WoSEmailAlert.State.STARTING_CONTENT:
        # Decide which kind of alert this is; other text is skipped.
        if WoSEmailAlert.saved_search_type_next_re.match(data):
            self._state = WoSEmailAlert.State.SAVED_SEARCH_TYPE_NEXT
        elif WoSEmailAlert.is_citation_report_re.match(data):
            self._state = WoSEmailAlert.State.CITED_PUB_NEXT

    # Search alert states
    elif self._state == WoSEmailAlert.State.SAVED_SEARCH_TYPE_NEXT:
        self.search += data + " "
        self._state = (
            WoSEmailAlert.State.SAVED_SEARCH_STRING_AND_COUNT_NEXT)

    elif (self._state ==
          WoSEmailAlert.State.SAVED_SEARCH_STRING_AND_COUNT_NEXT):
        # form: "(Title or search) has 0 new records as of Mon XXth YYYY."
        # We just want the title or search string. Match to "has" in case
        # title has parens in it.
        # NOTE(review): the match result is used without a None check, so
        # text that does not fit the pattern would raise AttributeError
        # here - confirm that cannot happen.
        important_bits = WoSEmailAlert.saved_search_string_re.match(data)
        self.search += important_bits.group(1)
        # Group 2 is compared with "0": no new records means we are done.
        if important_bits.group(2) == "0":
            self._state = WoSEmailAlert.State.DONE
        else:
            self._state = WoSEmailAlert.State.CITING_PUB_NEXT

    # Citation alert states
    elif self._state == WoSEmailAlert.State.CITED_PUB_NEXT:
        self.search += data
        self._state = WoSEmailAlert.State.CITATION_COUNT_NEXT

    elif self._state == WoSEmailAlert.State.CITATION_COUNT_NEXT:
        # ignore count,
        self._state = WoSEmailAlert.State.CITING_PUB_NEXT

    elif self._state == WoSEmailAlert.State.CITING_PUB_NEXT:
        # Either the line that ends the pub list, or the next pub's title.
        counters = WoSEmailAlert.citing_pubs_over_re.match(data)
        if counters and int(counters.group(1)) == self.found_pub_count:
            self._state = WoSEmailAlert.State.DONE
        elif not counters:
            # Create a new pub alert.
            self._current_pub = publication.Pub()
            self._current_pub_alert = pub_alert.PubAlert(
                self._current_pub, self)
            self.pub_alerts.append(self._current_pub_alert)
            self.found_pub_count += 1
            self._current_pub.set_title(data)
            self._state = WoSEmailAlert.State.CITING_PUB_AUTHORS_NEXT
        # A matching line whose number differs from found_pub_count falls
        # through both branches: nothing is stored, the state is unchanged.

    elif self._state == WoSEmailAlert.State.CITING_PUB_AUTHORS_NEXT:
        # WoS author list looks like:
        #   Halbritter, Dale A.; Storer, Caroline G.; Kawahara, Akito Y.
        # The text before the first comma is the first author's last name.
        canonical_first_author = publication.to_canonical(
            data.split(",")[0])
        self._current_pub.set_authors(data, canonical_first_author)
        self._state = WoSEmailAlert.State.CITING_PUB_JOURNAL_NEXT

    elif self._state == WoSEmailAlert.State.CITING_PUB_JOURNAL_NEXT:
        self._current_pub.ref = data
        self._state = WoSEmailAlert.State.CITING_PUB_EXCERPT_NEXT

    elif self._state == WoSEmailAlert.State.CITING_PUB_EXCERPT_NEXT:
        self._current_pub_alert.text_from_pub = data
        self._state = WoSEmailAlert.State.CITING_PUB_NEXT

    # NOTE(review): this branch is only reached when none of the states
    # above matched.  If those plus DONE are all the states there are, the
    # check can never fire - confirm against the State definition.
    elif (data == "Terms of Use"
          and not self._state == WoSEmailAlert.State.DONE):
        print(
            "ERROR: WoS email parsing did not recognize email.",
            file=sys.stderr)
        sys.exit(1)

    return None
def handle_data(self, data): data = data.strip() if data == "": return None # nothing to see here folks. if self._expecting_search: if WoSEmailAlert201808To201911.search_preface_re.match(data): self._expecting_search = False self._in_search_section = True elif self._in_search_text: self.search += data self._in_search_text = False self._expecting_count_section = True elif self._in_count_section: self.expected_pub_count = int( WoSEmailAlert201808To201911.count_re.match(data).group(2)) self._in_count_section = False self._expecting_pub_section = True elif (self._expecting_pub and WoSEmailAlert201808To201911.paper_start_re.match(data)): # Each paper starts with: "Record m of n. " self._current_pub = publication.Pub() self._current_pub_alert = pub_alert.PubAlert( self._current_pub, self) self.pub_alerts.append(self._current_pub_alert) self.found_pub_count += 1 self._expecting_pub = False self._expecting_title = True elif self._in_title: self._current_pub.set_title(data) self._in_title = False self._expecting_authors = True elif self._expecting_authors and data == "Authors:": self._expecting_authors = False self._in_authors = True elif self._in_authors: # WOS Author lists look like: # Galia, W; Leriche, F; Cruveiller, S; Thevenot-Sergentet, D canonical_first_author = publication.to_canonical( data.split(",")[0]) self._current_pub.set_authors(data, canonical_first_author) self._in_authors = False self._expecting_journal = True elif self._in_journal: self._current_pub.ref = data self._in_journal = False self._expecting_citation = True elif self._in_citation: self._current_pub.ref += ", " + data elif self._expecting_doi and data == "DOI:": self._expecting_doi = False self._in_doi_section = True elif self._in_doi: self._current_pub.canonical_doi = publication.to_canonical_doi( data) self._in_doi = False self._expecting_pub = True return None