Ejemplo n.º 1
0
    def __init__(self, zot_csv_row):
        """Build a Zotero publication from one row of a Zotero CSV export.

        zot_csv_row is a mapping keyed by the CSV column headers.  The
        columns read here are Title, Key (whose header still carries a BOM),
        DOI, Url, Author, Publication Year, Manual Tags, Item Type,
        Publication Title and Date Added.

        Warnings are written to stderr when the row has no authors or no
        publication year.
        """
        super(Pub, self).__init__()

        row = zot_csv_row
        self._zot_csv_row = row

        self.title = row["Title"]
        self.canonical_title = publication.to_canonical(self.title)
        # The header of the key column includes the byte order mark and the
        # surrounding quotes, so it has to be spelled exactly like this.
        self.zotero_id = row['\ufeff"Key"']

        # Only a non-empty DOI is normalized; a missing or empty value is
        # stored unchanged.
        raw_doi = row.get("DOI")
        self.canonical_doi = (
            publication.to_canonical_doi(raw_doi) if raw_doi else raw_doi)
        self.url = row["Url"]  # may be empty

        # Author is a semicolon separated list of "Last, First I."
        author_text = row.get("Author")
        if not author_text:
            print("Warning: Zotero Pub '{0}'".format(self.title),
                  file=sys.stderr)
            print("  Does not have any authors.\n", file=sys.stderr)
        else:
            self.set_authors(
                author_text, self.to_canonical_first_author(author_text))

        pub_year = row.get("Publication Year")
        self.year = pub_year or "unknown"
        if not pub_year:
            print("Warning: Zotero Pub '{0}'".format(self.title),
                  file=sys.stderr)
            print("  Does not have a publication year.\n", file=sys.stderr)

        # Manual Tags is a "; " separated list.
        self.tags = row["Manual Tags"].split("; ")

        if row["Item Type"] != "journalArticle":
            self.canonical_journal = None
        else:
            self.journal_name = row["Publication Title"]
            self.canonical_journal = publication.to_canonical(
                self.journal_name)

        # Date Added looks like "2017-09-14 17:48:40"; keep the date part.
        self.entry_date = row.get("Date Added")[:10]

        # NOTE(review): journal_name is only assigned above for journal
        # articles; for other item types the value read here is presumably
        # the one set by the parent initializer -- confirm.
        ref_text = "" if self.journal_name else row["Publication Title"]
        if pub_year:
            ref_text += " (" + pub_year + ")"
        self.ref = ref_text
Ejemplo n.º 2
0
    def handle_data(self, data):
        """Route a chunk of text to the field currently being parsed.

        Depending on the parser flags, the stripped text either marks that
        the search text is coming, or is appended to the search string, the
        current publication's title, its reference, or its author list.
        Always returns None.
        """
        text = data.strip()

        if self._parsing and (
                text == WileyEmailAlert2018AndBefore.SEARCH_COMING):
            self._search_coming = True
            return None

        if self._in_search:
            self.search += text
            return None

        if self._in_title:
            self._current_pub.set_title(self._current_pub.title + text)
            return None

        if self._in_journal:
            self._current_pub.ref += text
            return None

        if not self._in_authors:
            return None

        # The author text is prefixed with the issue date, e.g.
        #   March 2015Pieter-Jan L. Maenhaut, Hend Moens and Filip De Turck
        # so keep only what follows the last run of four digits.
        author_text = re.split(r"\d{4}", text)[-1]
        first_author_key = self._current_pub.canonical_first_author
        if not first_author_key:
            lead_author = author_text.split(",")[0]
            if ". " in lead_author:
                # Last name is whatever follows the final ". "
                surname = lead_author.rpartition(". ")[2]
            else:
                # otherwise it is everything after the first space.
                surname = lead_author.partition(" ")[2]
            first_author_key = publication.to_canonical(surname)
        self._current_pub.set_authors(
            self._current_pub.authors + " " + author_text, first_author_key)
        return None
Ejemplo n.º 3
0
    def handle_data(self, data):
        """Consume one chunk of text according to the current parser state.

        The stripped text either advances the state to "awaiting search", or
        is appended to the search string, the current publication's title,
        its reference, or its author list.  Always returns None.
        """
        text = data.strip()
        state = self._state

        if (state == WileyEmailAlert.STATE_PARSING_STARTED
                and text == WileyEmailAlert.SEARCH_COMING):
            self._state = WileyEmailAlert.STATE_AWAITING_SEARCH

        elif state == WileyEmailAlert.STATE_IN_SEARCH:
            self.search += text

        elif state == WileyEmailAlert.STATE_IN_TITLE:
            self._current_pub.set_title(self._current_pub.title + text)

        elif state == WileyEmailAlert.STATE_IN_JOURNAL:
            self._current_pub.ref += text

        elif state == WileyEmailAlert.STATE_IN_AUTHORS:
            # The author text starts with the issue date:
            #   March 2015Pieter-Jan L. Maenhaut, Hend Moens and Filip De Turck
            # Drop everything up to and including the last 4 digit run.
            author_text = re.split(r"\d{4}", text)[-1]
            first_author_key = self._current_pub.canonical_first_author
            if not first_author_key:
                # Work out the last name of the first listed author: the
                # part after the final ". ", or failing that, everything
                # after the first space.
                lead_author = author_text.split(",")[0]
                if ". " in lead_author:
                    surname = lead_author.rpartition(". ")[2]
                else:
                    surname = lead_author.partition(" ")[2]
                first_author_key = publication.to_canonical(surname)
            self._current_pub.set_authors(
                self._current_pub.authors + " " + author_text,
                first_author_key)

        return None
Ejemplo n.º 4
0
    def __init__(self,
                 pub_library,
                 pub_alerts,
                 known_pubs_db=None,
                 ok_dup_titles=None):
        """Build the match database from a library and a set of alerts.

        pub_library must offer get_pubs(); each publication it yields gets
        its own PubMatch.  pub_alerts is then merged in through
        add_pub_alerts().  When known_pubs_db is given (and truthy) it is
        applied last through add_known_pub_info().  ok_dup_titles is an
        optional iterable of titles; their canonical forms are remembered as
        acceptable duplicates.
        """
        # Lookup tables by DOI and by title, plus a title list meant to be
        # used with bisect.
        self._by_canonical_doi = {}
        self._by_canonical_title = {}
        self.canonical_titles_sorted = []

        # Canonical forms of the duplicate titles that should be ignored.
        self._ok_dups_by_canonical_title = {
            publication.to_canonical(dup_title)
            for dup_title in (ok_dup_titles or ())}

        # Every library entry starts out as its own PubMatch.
        for library_pub in pub_library.get_pubs():
            self.add_pub_match(PubMatch(lib_pub=library_pub))

        # Alerts are added next, and known pub info, if any, after that.
        self.add_pub_alerts(pub_alerts)
        if known_pubs_db:
            self.add_known_pub_info(known_pubs_db)
Ejemplo n.º 5
0
    def __init__(self, cul_json):
        """Build a CiteULike publication from one CUL JSON entry.

        cul_json is the decoded mapping for a single article.  The "title",
        "article_id", "href" and "tags" keys are indexed directly; "doi",
        "type", "authors", "published", "journal" and "date" are read with
        get().  The "date" value is sliced without a check, so a missing
        date raises TypeError.

        A warning is written to stderr when the entry has no authors.
        """
        super(Pub, self).__init__()

        entry = cul_json
        self._cul_json = entry

        self.title = entry["title"]
        self.canonical_title = publication.to_canonical(self.title)
        self.cul_id = entry["article_id"]

        # Only a non-empty DOI is normalized; otherwise the value is stored
        # unchanged.
        raw_doi = entry.get("doi")
        self.canonical_doi = (
            publication.to_canonical_doi(raw_doi) if raw_doi else raw_doi)
        self.url = entry["href"]

        # TODO: Type may not be the most useful. It's "JOUR" for
        # Journal Article and "THES" for thesis.  May not map to BibTeX.
        self.pub_type = entry.get("type")

        # "authors" is a list of "First I. Last" strings.
        names = entry.get("authors")
        if not names:
            print("Warning: CUL Pub '{0}'".format(self.title), file=sys.stderr)
            print("  Does not have any authors.\n", file=sys.stderr)
        else:
            self.set_authors(
                ", ".join(names), self.to_canonical_first_author(names[0]))

        # The year is the first element of "published", when present.
        published = entry.get("published")
        self.year = published[0] if published else "unknown"

        self.tags = entry["tags"]  # a list

        journal = entry.get("journal")
        self.canonical_journal = (
            publication.to_canonical(journal) if journal else None)

        # "date" looks like "2016-12-22 00:18:58"; keep the date part only.
        self.entry_date = entry.get("date")[:10]
Ejemplo n.º 6
0
def get_canonical_first_author(ncbi_author_list):
    """Return the canonical form of the first author's last name.

    NCBI author lists look like:
      Wreczycka K, Gosdschan A, Yusuf D, Grüning B, Assenov Y, Akalin A.
    i.e. comma separated "Last-name Initials" entries.

    The final whitespace-separated token of the first entry is taken to be
    the initials and is dropped.  When the first entry consists of a single
    token (no initials given) that token is used as the last name instead of
    producing an empty name.  Runs of whitespace inside the name are treated
    as a single separator.

    Args:
        ncbi_author_list: the author list as one string.

    Returns:
        Whatever publication.to_canonical() returns for the last name.
    """
    name_tokens = ncbi_author_list.split(",")[0].split()
    if len(name_tokens) > 1:
        # Drop the trailing initials; keep multi-word last names intact.
        name_tokens = name_tokens[:-1]
    return publication.to_canonical(" ".join(name_tokens))
Ejemplo n.º 7
0
    def to_canonical_first_author(self, zot_author_string):
        """Return the canonical first author for a Zotero author list.

        A Zotero author list looks like:
          Gloaguen, Yoann; Morton, Fraser; Daly, Rónán; Gurden, Ross

        The text before the first comma is the first author's last name; its
        canonical form is returned.  An empty or missing author string gives
        None.
        """
        if not zot_author_string:
            return None
        surname = zot_author_string.partition(",")[0]
        return publication.to_canonical(surname)
Ejemplo n.º 8
0
    def handle_data(self, data):
        """Handle one chunk of text from a WoS alert.

        The stripped text is first checked against the markers that open a
        new paper or a new field ("Title:", "Authors:", a cited-article or
        alert-query line, "Source:").  If it is none of those, it is treated
        as the value of whichever field the parser flags say is active.
        Always returns None.
        """
        text = data.strip()
        alert_cls = WoSEmailAlert2018AndBefore

        if alert_cls.paper_start_re.match(text):
            # Each paper starts with: "Record m of n. "
            self._current_pub = publication.Pub()
            self._current_pub_alert = pub_alert.PubAlert(
                self._current_pub, self)
            self.pub_alerts.append(self._current_pub_alert)
            return None

        if text == "Title:":
            self._in_title = True
            return None

        if text == "Authors:":
            self._in_authors = True
            return None

        if (alert_cls.cited_article_re.match(text)
                or alert_cls.alert_query_re.match(text)):
            self._in_query = True
            return None

        if text == "Source:":
            self._in_ref = True
            self._current_pub.ref = ""
            return None

        if self._in_title_value:
            # Title text can arrive in pieces; join them with a space.
            title_so_far = self._current_pub.title
            if len(title_so_far) > 0:
                self._current_pub.set_title(title_so_far + " " + text)
            else:
                self._current_pub.set_title(text)
            return None

        if self._in_authors:
            # WOS Author lists look like:
            #   Galia, W; Leriche, F; Cruveiller, S; Thevenot-Sergentet, D
            # so the first author's last name precedes the first comma.
            first_author_key = publication.to_canonical(text.split(",")[0])
            self._current_pub.set_authors(text, first_author_key)
            self._in_authors = False
            return None

        if self._in_query_value:
            # WOS leaves a stray "]]>" in the text when the title contains
            # punctuation; remove it wherever it appears.
            self.search += text.replace("]]>", "")
            self._in_query_value = False
            return None

        if self._in_ref:
            self._current_pub.ref += text + " "

        return None
Ejemplo n.º 9
0
    def handle_data(self, data):
        """Handle one chunk of text from a Google Scholar alert.

        Empty (whitespace only) chunks are ignored.  Otherwise the stripped
        text is used according to the current parser state: to detect the
        start of the HTML part, to collect the search string, to extend the
        current publication's title, to record its authors (and source), or
        to collect the excerpt text of the current pub alert.

        In the author list state the chunk may end with "- source".  The
        first author's last name is taken from the author portion only, so
        that a single-author line such as
          EB Alonso - Food Policy, 2020
        yields "Alonso" and not the last word before the first comma.

        Always returns None.
        """
        data = data.strip()
        if data == "":
            return None

        if self._state == GSEmailAlert.STATE_LOOKING_FOR_HTML_PART:
            if GSEmailAlert.html_part_start_re.search(data):
                # Ignore any parts until we get to text/html.
                # Not ignoring them leads to duplicate entries.
                self._state = GSEmailAlert.STATE_LOOKING_FOR_TITLE_LINK

        elif (self._state == GSEmailAlert.STATE_LOOKING_FOR_TITLE_LINK
              and GSEmailAlert.search_start_re.match(data)):
            self.search += data
            self._state = GSEmailAlert.STATE_IN_SEARCH

        elif self._state == GSEmailAlert.STATE_IN_SEARCH:
            self.search += " " + data

        elif self._state == GSEmailAlert.STATE_IN_TITLE_TEXT:
            # sometimes we lose space between two parts of title.
            pub_title = self._current_pub.title
            if pub_title and pub_title[-1] != " ":
                pub_title += " "
            pub_title += data
            self._current_pub.set_title(pub_title)

        elif self._state == GSEmailAlert.STATE_IN_AUTHOR_LIST:
            # Author list may also have source at end, after "- ".
            parts = data.split("- ")
            author_text = parts[0].strip()

            canonical_first_author = self._current_pub.canonical_first_author
            if not canonical_first_author:
                # Google authors format: EB Alonso, L Cockx, J Swinnen
                # Use only the author portion, so the source never leaks
                # into the first author's name.
                first_author = author_text.split(",")[0].strip()
                canonical_first_author = publication.to_canonical(
                    first_author.split(" ")[-1])

            self._current_pub.set_authors(
                self._current_pub.authors + author_text,
                canonical_first_author)
            # The source is only recorded when there is exactly one "- ".
            if len(parts) == 2:
                self._current_pub.ref = parts[1]

        elif self._state == GSEmailAlert.STATE_IN_TEXT_FROM_PUB:
            self._current_pub_alert.text_from_pub += data + " "

        return None
Ejemplo n.º 10
0
    def to_canonical_first_author(self, cul_author_string):
        """Convert a CUL author name to a canonical first author name.

        CUL Author name is
          First M. Last

        Canonical first author is the last name of that author: the text
        that follows the last period in the name or, when there is no period
        (or nothing follows it), the last whitespace-separated word.

        The whole name is examined for periods.  Previously only its first
        character was, so the "after the last period" rule never applied.

        Args:
            cul_author_string: a single author name as a string.

        Returns:
            The canonical last name, or None when the name is empty or
            missing.
        """
        if not cul_author_string:
            return None

        # Text after the last period; empty when there is no period or the
        # name ends with one (e.g. a trailing "Jr.").
        _, dot, after_last_dot = cul_author_string.rpartition(".")
        after_last_dot = after_last_dot.strip()
        if dot and after_last_dot:
            last_name = after_last_dot
        else:
            # Fall back to the last word; a whitespace-only name gives an
            # empty last name rather than an IndexError.
            words = cul_author_string.split()
            last_name = words[-1] if words else ""
        return publication.to_canonical(last_name)
Ejemplo n.º 11
0
    def __init__(self, row=None):
        """Create one KnownPubDB entry, optionally from an existing row.

        When a non-empty row (a dictionary keyed by COLUMNS) is supplied, the
        entry wraps that row: the canonical title is derived from its TITLE
        value and its DOI value is passed through set_doi().

        Without a row (or with an empty one) the entry starts from a fresh
        empty dictionary and every field is set to its blank value through
        the corresponding setter.
        """
        if row:
            self._row = row
            self._canonical_title = publication.to_canonical(row[TITLE])
            self.set_doi(row[DOI])  # re-set so the DOI is in canonical form
            return None

        self._row = {}
        self.set_title(None)  # also sets _canonical_title
        self.set_authors(None)
        self.set_doi(None)
        self.set_state(STATE_DONT_KNOW_YET)
        self.set_annotation("")
        self.set_qualifier("")
        return None
Ejemplo n.º 12
0
    def handle_data(self, data):
        """Handle one chunk of text from a Wiley citation alert.

        The stripped text is used according to the current parser state: it
        is appended to the search string, to the current publication's
        author list or reference, stored as its DOI, or handed to
        handle_title_section_data() when it turns out to start the title.

        The DOI is passed through publication.to_canonical_doi() before it
        is stored in canonical_doi, so it is in the same form as DOIs from
        other sources.  An empty chunk is stored unchanged.

        Always returns None.
        """
        data = data.strip()

        if self._state == WileyEmailCitationAlert.STATE_IN_SEARCH:
            self.search += data

        elif self._state == WileyEmailCitationAlert.STATE_IN_AUTHOR:
            canonical_first_author = self._current_pub.canonical_first_author
            if not canonical_first_author:
                # The last word of the first author chunk is the last name.
                last_name = data.split(" ")[-1]
                canonical_first_author = publication.to_canonical(last_name)
            self._current_pub.set_authors(
                self._current_pub.authors + " " + data, canonical_first_author)

        elif (self._state ==
              WileyEmailCitationAlert.STATE_AWAITING_AUTHOR_OR_TITLE):
            # could be an author list joiner ", " or "and" or start of title
            if data in [",", "and"]:
                # still in author list
                self._current_pub.set_authors(
                    self._current_pub.authors + " " + data,
                    self._current_pub.canonical_first_author)
            else:  # Into title
                self._state = WileyEmailCitationAlert.STATE_IN_TITLE_SECTION
                self.handle_title_section_data(data)

        elif self._state == WileyEmailCitationAlert.STATE_IN_JOURNAL:
            self._current_pub.ref += " " + data

        elif self._state == WileyEmailCitationAlert.STATE_IN_DOI:
            # Normalize only non-empty text; an empty chunk is kept as is.
            if data:
                data = publication.to_canonical_doi(data)
            self._current_pub.canonical_doi = data
            self._state = WileyEmailCitationAlert.STATE_IN_TITLE_SECTION

        elif self._state == WileyEmailCitationAlert.STATE_IN_VOLUME:
            self._current_pub.ref += ", " + data

        elif self._state == WileyEmailCitationAlert.STATE_IN_REF_TAIL:
            self._current_pub.ref += ", " + data
            self._state = WileyEmailCitationAlert.STATE_IN_TITLE_SECTION

        return None
Ejemplo n.º 13
0
def to_canonical_first_author(sd_alert_authors_text):
    """Return the canonical first author for an SD email alert author list.

    SD alert authors look like:
      Eugene Matthew P. Almazan, Sydney L. Lesko, Michael P. Markey

    The first author is the text before the first comma.  Their last name is
    what follows the last period in that text or, when it contains no
    period, its last whitespace-separated word.  The canonical form of that
    last name is returned; an empty or missing author list gives None.
    """
    if not sd_alert_authors_text:
        return None

    lead_author = sd_alert_authors_text.partition(",")[0]
    _, dot, after_last_dot = lead_author.rpartition(".")
    if dot:
        # There is at least one period: take what follows the last one.
        surname = after_last_dot
    else:
        # No period at all: take what follows the last space.
        surname = lead_author.split()[-1]
    return publication.to_canonical(surname)
Ejemplo n.º 14
0
 def set_title(self, title):
     """Store title in the row and refresh the cached canonical title."""
     row = self._row
     row[TITLE] = title
     # Derive the canonical form from the value now held in the row.
     self._canonical_title = publication.to_canonical(row[TITLE])
Ejemplo n.º 15
0
    def handle_data(self, data):
        """Advance the WoS alert state machine with one chunk of text.

        Whitespace in the chunk is normalized first and empty chunks are
        ignored.  What happens next depends on self._state: the chunk either
        moves the parser to the next state, contributes to self.search, or
        fills in the title, authors, reference or excerpt of the publication
        currently being read.

        If the text "Terms of Use" is reached in a state that none of the
        earlier branches handle and that is not DONE, an error is printed to
        stderr and the process exits with status 1.

        Always returns None.
        """
        # eliminate leading, trailing, and multiple embedded spaces
        data = re.sub(r'\s+', ' ', data).strip()
        if data == "":
            return None                   # nothing to see here folks.

        if self._state == WoSEmailAlert.State.AWAITING_CONTENT:
            # Nothing is consumed until the greeting line is seen.
            if WoSEmailAlert.greetings_re.match(data):
                self._state = WoSEmailAlert.State.STARTING_CONTENT

        elif self._state == WoSEmailAlert.State.STARTING_CONTENT:
            # Decide whether this is a saved search alert or a citation
            # alert; anything else leaves the state unchanged.
            if WoSEmailAlert.saved_search_type_next_re.match(data):
                self._state = WoSEmailAlert.State.SAVED_SEARCH_TYPE_NEXT
            elif WoSEmailAlert.is_citation_report_re.match(data):
                self._state = WoSEmailAlert.State.CITED_PUB_NEXT

        # Search alert states
        elif self._state == WoSEmailAlert.State.SAVED_SEARCH_TYPE_NEXT:
            self.search += data + " "
            self._state = (
                WoSEmailAlert.State.SAVED_SEARCH_STRING_AND_COUNT_NEXT)

        elif (self._state
              == WoSEmailAlert.State.SAVED_SEARCH_STRING_AND_COUNT_NEXT):
            # form: "(Title or search) has 0 new records as of Mon XXth YYYY."
            # We Just want the title or search string. Match to "has" in case
            # title has parens in it.
            # NOTE(review): the match result is used without a None check, so
            # a chunk that does not match raises AttributeError here.
            important_bits = WoSEmailAlert.saved_search_string_re.match(data)
            self.search += important_bits.group(1)
            # Group 2 is compared with "0"; in that case parsing is finished.
            if important_bits.group(2) == "0":
                self._state = WoSEmailAlert.State.DONE
            else:
                self._state = WoSEmailAlert.State.CITING_PUB_NEXT

        # Citation alert states
        elif self._state == WoSEmailAlert.State.CITED_PUB_NEXT:
            self.search += data
            self._state = WoSEmailAlert.State.CITATION_COUNT_NEXT

        elif self._state == WoSEmailAlert.State.CITATION_COUNT_NEXT:
            # ignore count,
            self._state = WoSEmailAlert.State.CITING_PUB_NEXT

        elif self._state == WoSEmailAlert.State.CITING_PUB_NEXT:
            counters = WoSEmailAlert.citing_pubs_over_re.match(data)
            # A matching line ends parsing only when its first group equals
            # the number of pubs found so far; a matching line with a
            # different number is skipped without changing state.
            if counters and int(counters.group(1)) == self.found_pub_count:
                self._state = WoSEmailAlert.State.DONE
            elif not counters:
                # Create a new pub alert.
                self._current_pub = publication.Pub()
                self._current_pub_alert = pub_alert.PubAlert(
                    self._current_pub, self)
                self.pub_alerts.append(self._current_pub_alert)
                self.found_pub_count += 1
                self._current_pub.set_title(data)
                self._state = WoSEmailAlert.State.CITING_PUB_AUTHORS_NEXT

        elif self._state == WoSEmailAlert.State.CITING_PUB_AUTHORS_NEXT:
            # WoS author list looks like:
            #  Halbritter, Dale A.; Storer, Caroline G.; Kawahara, Akito Y.
            # so the text before the first comma is the first author's last
            # name.
            canonical_first_author = publication.to_canonical(
                data.split(",")[0])
            self._current_pub.set_authors(data, canonical_first_author)
            self._state = WoSEmailAlert.State.CITING_PUB_JOURNAL_NEXT

        elif self._state == WoSEmailAlert.State.CITING_PUB_JOURNAL_NEXT:
            self._current_pub.ref = data
            self._state = WoSEmailAlert.State.CITING_PUB_EXCERPT_NEXT

        elif self._state == WoSEmailAlert.State.CITING_PUB_EXCERPT_NEXT:
            self._current_pub_alert.text_from_pub = data
            self._state = WoSEmailAlert.State.CITING_PUB_NEXT

        # Being the last branch, this is only evaluated when the current
        # state matched none of the branches above.
        elif (data == "Terms of Use"
              and not self._state == WoSEmailAlert.State.DONE):
            print(
                "ERROR: WoS email parsing did not recognize email.",
                file=sys.stderr)
            sys.exit(1)

        return None
Ejemplo n.º 16
0
    def handle_data(self, data):
        """Handle one chunk of text from a WoS alert (2018-08 to 2019-11).

        Empty (whitespace only) chunks are ignored.  Otherwise the stripped
        text is handled by the first branch below whose flag is set, so the
        order of the branches decides which flag wins when several are set.
        A chunk either flips parser flags, or fills in the search string,
        the expected publication count, or the title, authors, reference or
        DOI of the publication currently being read.

        NOTE(review): several flags tested here (_in_search_text,
        _in_count_section, _in_title, _in_journal, _in_citation, _in_doi,
        _expecting_doi) are never set to True in this method, and several
        set here (_in_search_section, _expecting_count_section,
        _expecting_pub_section, _expecting_title, _expecting_journal,
        _expecting_citation, _in_doi_section) are never tested here;
        presumably the tag handlers do that -- confirm.

        Always returns None.
        """
        data = data.strip()
        if data == "":
            return None                   # nothing to see here folks.

        if self._expecting_search:
            # Stay in this state until the search preface text is seen.
            if WoSEmailAlert201808To201911.search_preface_re.match(data):
                self._expecting_search = False
                self._in_search_section = True

        elif self._in_search_text:
            self.search += data
            self._in_search_text = False
            self._expecting_count_section = True

        elif self._in_count_section:
            # The expected count is group 2 of count_re.
            # NOTE(review): the match result is used without a None check,
            # so a chunk that does not match raises AttributeError here.
            self.expected_pub_count = int(
                WoSEmailAlert201808To201911.count_re.match(data).group(2))
            self._in_count_section = False
            self._expecting_pub_section = True

        elif (self._expecting_pub
              and WoSEmailAlert201808To201911.paper_start_re.match(data)):
            # Each paper starts with: "Record m of n. "
            self._current_pub = publication.Pub()
            self._current_pub_alert = pub_alert.PubAlert(
                self._current_pub, self)
            self.pub_alerts.append(self._current_pub_alert)
            self.found_pub_count += 1
            self._expecting_pub = False
            self._expecting_title = True

        elif self._in_title:
            self._current_pub.set_title(data)
            self._in_title = False
            self._expecting_authors = True

        elif self._expecting_authors and data == "Authors:":
            self._expecting_authors = False
            self._in_authors = True

        elif self._in_authors:
            # WOS Author lists look like:
            #   Galia, W; Leriche, F; Cruveiller, S; Thevenot-Sergentet, D
            # so the text before the first comma is the first author's last
            # name.
            canonical_first_author = publication.to_canonical(
                data.split(",")[0])
            self._current_pub.set_authors(data, canonical_first_author)
            self._in_authors = False
            self._expecting_journal = True

        elif self._in_journal:
            self._current_pub.ref = data
            self._in_journal = False
            self._expecting_citation = True

        elif self._in_citation:
            # Citation text is appended to the reference; the flag is not
            # cleared here.
            self._current_pub.ref += ", " + data

        elif self._expecting_doi and data == "DOI:":
            self._expecting_doi = False
            self._in_doi_section = True

        elif self._in_doi:
            # The DOI closes a record; the next paper may start afterwards.
            self._current_pub.canonical_doi = publication.to_canonical_doi(
                data)
            self._in_doi = False
            self._expecting_pub = True

        return None