Example #1
0
    def handle_starttag(self, tag, attrs):
        """
        React to an opening tag while scanning a Google Scholar alert.

        Three tag/state combinations matter; every other tag is ignored:
          - "h3" while looking for a title link: start a new pub and alert.
          - "a" while in a title link: record the pub's URL.
          - "font" or "div" when text from the pub is next: start
            collecting that text.

        attrs is indexed as a sequence of (name, value) pairs; the first
        pair of a title "a" tag is taken to hold the link.
        Always returns None.
        """
        if tag == "h3":
            if self._state == GSEmailAlert.STATE_LOOKING_FOR_TITLE_LINK:
                # link to paper is shown in h3.
                self._state = GSEmailAlert.STATE_IN_TITLE_LINK
                self._current_pub = publication.Pub()
                self._current_pub_alert = pub_alert.PubAlert(
                    self._current_pub, self)
                self.pub_alerts.append(self._current_pub_alert)
            return None

        if tag == "a":
            if self._state == GSEmailAlert.STATE_IN_TITLE_LINK:
                link = attrs[0][1]
                # Arguments are whatever follows the first "?"; when there
                # is no "?" the whole link is scanned.
                _, question_mark, query = link.partition("?")
                if not question_mark:
                    query = link

                # First "q=" or "url=" argument wins; its value is still
                # URL encoded.
                encoded = next(
                    (arg.partition("=")[2] for arg in query.split("&")
                     if arg.startswith(("q=", "url="))),
                    None)
                if encoded is not None:
                    self._current_pub.url = urllib.parse.unquote(encoded)

                if not self._current_pub.url:
                    # Some URLs link directly to Google Scholar.
                    self._current_pub.url = link
                self._state = GSEmailAlert.STATE_IN_TITLE_TEXT
            return None

        if tag in ("font", "div"):
            if self._state == GSEmailAlert.STATE_TEXT_FROM_PUB_NEXT:
                self._state = GSEmailAlert.STATE_IN_TEXT_FROM_PUB
                self._current_pub_alert.text_from_pub = ""

        return None
Example #2
0
    def handle_starttag(self, tag, attrs):
        """
        Advance the parser state on the start tags that matter.

        "h5" always switches to the search state. "p", "span", "em",
        "strong" and "hr" each act only when the parser is in one
        specific state; any other tag, or a tag seen in another state,
        leaves everything untouched. attrs is not used.
        Always returns None.
        """
        if tag == "h5":
            # only 1 h5; wraps pub being cited.
            self._state = WileyEmailCitationAlert.STATE_IN_SEARCH
            return None

        if tag == "p":
            if self._state == WileyEmailCitationAlert.STATE_IN_PUB_LIST:
                # A paragraph in the pub list opens a new pub entry.
                self._state = (
                    WileyEmailCitationAlert.STATE_AWAITING_AUTHOR_OR_TITLE)
                self._current_pub = publication.Pub()
                new_alert = pub_alert.PubAlert(self._current_pub, self)
                self.pub_alerts.append(new_alert)
            return None

        if tag == "span":
            if (self._state
                    == WileyEmailCitationAlert.STATE_AWAITING_AUTHOR_OR_TITLE):
                # Just entered an author.
                self._state = WileyEmailCitationAlert.STATE_IN_AUTHOR
            return None

        if tag == "em":
            if (self._state
                    == WileyEmailCitationAlert.STATE_IN_TITLE_SECTION):
                # em here means journal, I sure hope.
                self._state = WileyEmailCitationAlert.STATE_IN_JOURNAL
            return None

        if tag == "strong":
            if (self._state
                    == WileyEmailCitationAlert.STATE_IN_TITLE_SECTION):
                self._state = WileyEmailCitationAlert.STATE_IN_VOLUME
            return None

        if tag == "hr":
            if self._state == WileyEmailCitationAlert.STATE_IN_PUB_LIST:
                self._state = WileyEmailCitationAlert.STATE_DONE

        return None
Example #3
0
    def handle_starttag(self, tag, attrs):
        """
        Update parser flags for the start tags that carry alert content.

        Handled cases, checked in this order:
          - "b" while in the search section: the search text follows.
          - "a" whose second attribute is "ref" and whose first attribute
            value lacks "linkname=pubmed_pubmed": a new pub starts; the
            first attribute value becomes its URL.
          - "td" while expecting authors (two steps): the second such
            "td" marks the authors text.
          - "span" whose first attribute is class="jrnl" and that has a
            second attribute: the second attribute's value is stored as
            the pub's ref.

        attrs is indexed as a sequence of (name, value) pairs. Tags with
        too few attributes for a case are ignored rather than raising.
        Always returns None.
        """
        if self._in_search and tag == "b":
            self._in_search_text = True
            self._in_search = False
        elif (tag == "a" and len(attrs) > 1 and attrs[1][0] == "ref"
              and "linkname=pubmed_pubmed" not in attrs[0][1]):
            self._current_pub = publication.Pub()
            self._current_pub_alert = pub_alert.PubAlert(
                self._current_pub, self)
            self.pub_alerts.append(self._current_pub_alert)
            self._current_pub.url = attrs[0][1]
            self._in_title = True
        elif tag == "td" and self._expecting_authors:
            # This case actually handled by handle_startendtag
            self._expecting_authors = False
            self._really_expecting_authors = True
        elif tag == "td" and self._really_expecting_authors:
            self._really_expecting_authors = False
            self._in_authors = True
        elif (tag == "span" and len(attrs) > 1
              and attrs[0][0] == "class" and attrs[0][1] == "jrnl"):
            # The length guard keeps attribute-less spans (and jrnl spans
            # without a second attribute) from raising IndexError.
            # Title tag has better jrnl name than display
            self._current_pub.ref = attrs[1][1]
            self._in_ref = True

        return None
Example #4
0
    def handle_data(self, data):
        """
        Consume one run of text from the alert.

        The stripped text is first checked against the section markers
        (record start, "Title:", "Authors:", the query patterns,
        "Source:"). If it is none of those, it is treated as the value of
        whichever section is currently open, tested in this order: title
        value, authors, query value, ref.
        Always returns None.
        """
        text = data.strip()

        if WoSEmailAlert2018AndBefore.paper_start_re.match(text):
            # Each paper starts with: "Record m of n. "
            self._current_pub = publication.Pub()
            self._current_pub_alert = pub_alert.PubAlert(
                self._current_pub, self)
            self.pub_alerts.append(self._current_pub_alert)
            return None

        if text == "Title:":
            self._in_title = True
            return None

        if text == "Authors:":
            self._in_authors = True
            return None

        if (WoSEmailAlert2018AndBefore.cited_article_re.match(text)
                or WoSEmailAlert2018AndBefore.alert_query_re.match(text)):
            self._in_query = True
            return None

        if text == "Source:":
            self._in_ref = True
            self._current_pub.ref = ""
            return None

        if self._in_title_value:
            # Title text can arrive in several pieces; join with a space.
            so_far = self._current_pub.title
            if len(so_far) > 0:
                text = so_far + " " + text
            self._current_pub.set_title(text)
            return None

        if self._in_authors:
            # WOS Author lists look like:
            #   Galia, W; Leriche, F; Cruveiller, S; Thevenot-Sergentet, D
            # so everything before the first comma is the first author's
            # last name.
            first_author_last_name = text.partition(",")[0]
            self._current_pub.set_authors(
                text, publication.to_canonical(first_author_last_name))
            self._in_authors = False
            return None

        if self._in_query_value:
            # need to strip "]]>" from anywhere. Bug in WOS, if punctuation
            # in title.
            self.search += text.replace("]]>", "")
            self._in_query_value = False
            return None

        if self._in_ref:
            self._current_pub.ref += text + " "

        return None
Example #5
0
    def handle_starttag(self, tag, attrs):
        """
        Update parser flags for the start tags that carry alert content.

        Handled cases, checked in this order:
          - "html": parsing starts.
          - "strong" when the search is coming: the search text follows.
          - "a" with more than two attributes whose third attribute value
            is "http://journalshelp.wiley.com": parsing is over.
          - "a" while awaiting a title: a new pub starts; its URL (and
            DOI, when the URL contains one) come from the href attribute.
          - "span" while awaiting a journal: the journal text follows.

        attrs is indexed as a sequence of (name, value) pairs. A title
        link without a usable href still starts a pub, but no URL or DOI
        is recorded for it.
        Always returns None.
        """
        if tag == "html":
            self._parsing = True
        elif self._search_coming and tag == "strong":  # 2018
            self._search_coming = False
            self._in_search = True
        elif (self._parsing and tag == "a" and len(attrs) > 2
              and attrs[2][1] == "http://journalshelp.wiley.com"):
            self._parsing = False  # Done looking at input.
            self._awaiting_title = False
        elif self._parsing and self._awaiting_title and tag == "a":
            self._awaiting_title = False
            self._in_title = True

            self._current_pub = publication.Pub()
            self.pub_alerts.append(pub_alert.PubAlert(self._current_pub, self))

            # URL looks like
            # http://onlinelibrary.wiley.com/doi/10.1002/spe.2320/abstract?
            #  campaign=wolsavedsearch
            # http://onlinelibrary.wiley.com/doi/10.1002/cpe.3533/abstract
            # Take the first href; None when the tag has no href at all.
            base_url = next(
                (value for name, value in attrs if name == "href"), None)
            if base_url:
                if not base_url.startswith("http"):
                    # Wiley sometimes forgets leading http://
                    base_url = "http://" + base_url
                self._current_pub.url = base_url

                # The DOI, when present, is the two path segments after
                # "/doi/". Short URLs have no such segment to inspect.
                url_parts = base_url.split("/")
                if len(url_parts) > 3 and url_parts[3] == "doi":
                    doi_bits = "/".join(url_parts[4:6])
                    self._current_pub.canonical_doi = (
                        publication.to_canonical_doi(doi_bits))
        elif self._awaiting_journal and tag == "span":
            self._in_journal = True
            self._awaiting_journal = False
            self._current_pub.ref = ""

        return None
Example #6
0
    def handle_starttag(self, tag, attrs):
        """
        Advance the parser state on the start tags that matter.

        Handled cases, checked in this order:
          - "html": parsing starts.
          - "a" with more than two attributes whose third attribute value
            is "http://journalshelp.wiley.com" (and parsing not already
            done): parsing is over.
          - "a" while awaiting a title: a new pub starts; its URL (and
            DOI, when the URL contains one) come from the href attribute.
          - "span" while awaiting a journal: the journal text follows.

        attrs is indexed as a sequence of (name, value) pairs. A title
        link without a usable href still starts a pub, but no URL or DOI
        is recorded for it.
        Always returns None.
        """
        if tag == "html":
            self._state = WileyEmailAlert.STATE_PARSING_STARTED
        elif (self._state != WileyEmailAlert.STATE_DONE and tag == "a"
              and len(attrs) > 2
              and attrs[2][1] == "http://journalshelp.wiley.com"):
            self._state = WileyEmailAlert.STATE_DONE  # Done looking at input.
        elif (self._state == WileyEmailAlert.STATE_AWAITING_TITLE
              and tag == "a"):
            self._state = WileyEmailAlert.STATE_IN_TITLE
            self._current_pub = publication.Pub()
            self.pub_alerts.append(pub_alert.PubAlert(self._current_pub, self))

            # URL looks like
            # http://el.wiley.com/wf/click?upn=-2F4d0Y8aR13lVHu481a...
            # however, that redirects to
            # https://onlinelibrary.wiley.com/doi/10.15252/embr.201847227
            # EXCEPT IT DOES NOT. FROM THIS PROGRAM IT REDIRECTS TO
            # https://onlinelibrary.wiley.com/action/cookieAbsent
            #   Hmm. Works for CURL.  Updated publication.py to use CURL
            #   Nope, still doesn't work, still get cookieAbsent.
            # So the link is stored as-is, without following the redirect.
            # Take the first href; None when the tag has no href at all.
            base_url = next(
                (value for name, value in attrs if name == "href"), None)
            if base_url:
                self._current_pub.url = base_url

                # The DOI, when present, is the two path segments after
                # "/doi/". Short URLs have no such segment to inspect.
                url_parts = base_url.split("/")
                if len(url_parts) > 3 and url_parts[3] == "doi":
                    doi_bits = "/".join(url_parts[4:6])
                    self._current_pub.canonical_doi = (
                        publication.to_canonical_doi(doi_bits))
        elif (self._state == WileyEmailAlert.STATE_AWAITING_JOURNAL
              and tag == "span"):
            self._state = WileyEmailAlert.STATE_IN_JOURNAL
            self._current_pub.ref = ""

        return None
Example #7
0
    def handle_starttag(self, tag, attrs):
        """
        Track the start tags that delimit a citing pub in the alert.

        Handled cases, checked in this order:
          - "h2": a citing pub starts; create its Pub and PubAlert (with
            empty authors) and add the alert to self.pub_alerts.
          - "a" while inside that h2: derive the pub URL from the link
            held in the tag's first attribute.
          - "span" whose first attribute value is "color:#848484" while a
            citing journal is expected: the journal text follows.

        attrs is indexed as a sequence of (name, value) pairs. A span
        without attributes is ignored rather than raising.
        Always returns None.
        """
        if tag == "h2":
            # citing pub has started
            pub = publication.Pub()
            self._current_pub_alert = pub_alert.PubAlert(pub, self)
            self._current_pub_alert.pub.set_authors("", "")
            self.pub_alerts.append(self._current_pub_alert)
            self._state = SDEmailAlert.STATE_IN_H2

        elif tag == "a" and self._state == SDEmailAlert.STATE_IN_H2:
            # First "a" inside H2 is link to citing pub at SD
            full_url = urllib.parse.unquote(attrs[0][1])

            # Current email links look like Either
            #  https://cwhib9vv.r.us-east-1.awstrack.me/L0/
            #   https:%2F%2Fwww.sciencedirect.com%2Fscience%2F
            #   article%2Fpii%2FB9780128156094000108
            #   %3Fdgcid=raven_sd_search_email/1/
            #   01000164f4ef81a4-8297928b-681a-463a-86c6-30f8eaf2bd7e-
            #   000000/_ewE29jTmNGAovSLl4HHgzWfTRQ=68
            #
            #  We want the second HTTPS up to the first number after pii
            #  Proxy links won't work with full redirect URL
            # OR
            #  https://www.sciencedirect.com/science/article/pii/
            #  S0262407919306967
            try:
                # Either lookup raises IndexError when the link is not a
                # redirect of the first form; then keep the link as-is.
                minus_redirect = "https" + full_url.split("https")[2]
                pii_num_only = minus_redirect.split("/")[6]
                self._current_pub_alert.pub.url = gen_pub_url(pii_num_only)
            except IndexError:
                self._current_pub_alert.pub.url = full_url

            self._current_pub_alert.pub.set_title("")
            self._state = SDEmailAlert.STATE_IN_CITING_PUB_TITLE

        elif (tag == "span"
              and self._state == SDEmailAlert.STATE_EXPECTING_CITING_JOURNAL
              and attrs and attrs[0][1] == "color:#848484"):
            # "attrs and" keeps an attribute-less span from raising
            # IndexError.
            self._state = SDEmailAlert.STATE_IN_CITING_JOURNAL

        return None
Example #8
0
    def handle_starttag(self, tag, attrs):
        """
        Update parser flags for the start tags that carry alert content.

        Handled cases, checked in this order:
          - "td" with first attribute class="txtcontent": a paper starts.
          - "a" while in the title link: take the pub URL from the
            "_piikey=" argument of the first attribute's value.
          - "span" with first attribute class="artTitle": title text
            starts, span depth is reset to 1.
          - any other "span" while in title text: one level deeper.
          - "i" after the title and before the ref: the ref follows.
          - "span" with first attribute class="authorTxt": authors follow.

        attrs is indexed as a sequence of (name, value) pairs. Only the
        first attribute is ever inspected; tags without attributes are
        handled without raising.
        Always returns None.
        """
        # Pull out the first attribute once, tolerating an empty list.
        first_name = attrs[0][0] if attrs else None
        first_value = attrs[0][1] if attrs else None
        first_is_class = first_name == "class"

        if tag == "td" and first_is_class and first_value == "txtcontent":
            # Paper has started; next tag is an anchor, and it has paper URL
            # We now have a long URL that points to a public HTML version of
            # the paper.  We don't have a doi. But we will have a title
            # shortly.
            # ScienceDirect has an API we could use to extract the DOI, or we
            # could pull it from the HTML page.
            # TODO: For now, go with title only match
            self._in_title_link = True
            pub = publication.Pub()
            self._current_pub_alert = pub_alert.PubAlert(pub, self)
            self.pub_alerts.append(self._current_pub_alert)

        elif tag == "a" and self._in_title_link:
            # A link with no usable first attribute simply yields no URL.
            for url_arg in (first_value or "").split("&"):
                if url_arg.startswith("_piikey="):
                    self._current_pub_alert.pub.url = gen_pub_url(url_arg[8:])
                    break
            self._in_title_link = False

        elif tag == "span" and first_is_class and first_value == "artTitle":
            self._in_title_text = True
            self._in_title_text_span_depth = 1
        elif self._in_title_text and tag == "span":
            # Nested span inside the title, with or without attributes.
            self._in_title_text_span_depth += 1
        elif tag == "i" and self._after_title_before_ref:
            self._in_ref = True
            self._after_title_before_ref = False

        elif tag == "span" and first_is_class and first_value == "authorTxt":
            self._in_authors = True

        return None
Example #9
0
    def handle_starttag(self, tag, attrs):
        """
        The search is wrapped in an H1:
          <h1 style="color:#505050;font-size:27px;line-height:40px;\
font-family:Arial,Helvetica">
            Showing top results for search alert:<br/>GalaxyProject.org
          </h1>
        There are other H1's so need to also match on data text.

        Everything of interest about a matched pub is in a TD followed by
        an H2.
        There are many TD's but only paper alerts are have H2's
          <td align="left" valign="top">
            <h2 style="color:#505050;font-size:23px;line-height:32px;\
font-family:Georgia,Arial,Helvetica">
              <a href="https://www.sciencedirect.com/science/article/pii/\
S0025619618304026?dgcid=raven_sd_search_email"
                 style="word-wrap:break-word;color:#007398;font-weight:none;\
text-decoration:none">
                C3 Glomerulopathy: Ten Years' Experience at Mayo Clinic
              </a>
            </h2>
            <p align="left" style="color:#505050;font-size:15px;\
line-height:24px;font-family:Arial,Helvetica;margin-bottom:2px">
              <span style="font-style:italic">
              </span>Research article
            </p>
            <p align="left" style="color:#848484;font-size:15px;\
line-height:24px;font-family:Arial,Helvetica;margin-bottom:2px">
              <span style="color:#848484">
                <span>Mayo Clinic Proceedings, Volume 93, Issue 8, \
Pages 991-1008,
                </span>
              </span>
            </p>
            <p align="left" style="color:#505050;font-size:15px;\
line-height:24px;font-family:Arial,Helvetica;margin-bottom:2px">
              Aishwarya Ravindran, Fernando C. Fervenza, ... Sanjeev Sethi
            </p>
          </td>
        """
        if not self._state == SDEmailAlert2018To2019.STATE_DONE:
            if tag == "td":
                self._in_td_depth += 1
            elif tag == "h1":
                self._state = SDEmailAlert2018To2019.STATE_IN_H1
            elif tag == "h2" and self._in_td_depth:
                # everything in this TD is about the publication.
                # The H2 is the first element in the TD
                self._state = SDEmailAlert2018To2019.STATE_IN_PUB_TITLE
                # paper has started
                pub = publication.Pub()
                self._current_pub_alert = pub_alert.PubAlert(pub, self)
                self.pub_alerts.append(self._current_pub_alert)

            elif (tag == "a" and self._state
                  == SDEmailAlert2018To2019.STATE_IN_PUB_TITLE):
                # pub title is the content of the a tag.
                # pub URL is where the a tag points to.
                full_url = urllib.parse.unquote(attrs[0][1])

                # Current email links look like Either
                #  https://cwhib9vv.r.us-east-1.awstrack.me/L0/
                #   https:%2F%2Fwww.sciencedirect.com%2Fscience
                #   %2Farticle%2Fpii%2FB9780128156094000108
                #   %3Fdgcid=raven_sd_search_email/1/
                #   01000164f4ef81a4-8297928b-681a-463a-86c6-30f8eaf2bd7e-
                #   000000/_ewE29jTmNGAovSLl4HHgzWfTRQ=68
                #
                #  We want the middle part, the second HTTPS.
                #  Proxy links won't work with full redirect URL
                # OR
                #  https://www.sciencedirect.com/science/article/pii/
                #  S0262407919306967?dgcid=raven_sd_search_email
                try:
                    minus_redirect = "https" + full_url.split("https")[2]
                    self._current_pub_alert.pub.url = minus_redirect.split(
                        "?")[0]
                except IndexError:
                    self._current_pub_alert.pub.url = full_url
                self._current_pub_alert.pub.title = ""

            elif (tag == "p" and self._state
                  == SDEmailAlert2018To2019.STATE_EXPECTING_PUB_TYPE):
                self._state = SDEmailAlert2018To2019.STATE_EXPECTING_REF

            elif (tag == "p" and self._state
                  == SDEmailAlert2018To2019.STATE_EXPECTING_REF):
                self._state = SDEmailAlert2018To2019.STATE_IN_REF

            elif (tag == "p" and self._state
                  == SDEmailAlert2018To2019.STATE_EXPECTING_AUTHORS):
                self._state = SDEmailAlert2018To2019.STATE_IN_AUTHORS

        return (None)
Example #10
0
    def handle_data(self, data):
        """
        Consume one run of text from the alert, driven by self._state.

        Whitespace in the text is normalized first and empty text is
        skipped. Each state handled below interprets the text it is
        given (search string, counts, title, authors, journal, excerpt)
        and selects the next state.
        Always returns None, except that an unrecognized email ends the
        program through sys.exit(1).
        """
        # eliminate leading, trailing, and multiple embedded spaces
        text = re.sub(r'\s+', ' ', data).strip()
        if text == "":
            return None                   # nothing to see here folks.

        states = WoSEmailAlert.State
        current = self._state

        if current == states.AWAITING_CONTENT:
            if WoSEmailAlert.greetings_re.match(text):
                self._state = states.STARTING_CONTENT
            return None

        if current == states.STARTING_CONTENT:
            if WoSEmailAlert.saved_search_type_next_re.match(text):
                self._state = states.SAVED_SEARCH_TYPE_NEXT
            elif WoSEmailAlert.is_citation_report_re.match(text):
                self._state = states.CITED_PUB_NEXT
            return None

        # Search alert states
        if current == states.SAVED_SEARCH_TYPE_NEXT:
            self.search += text + " "
            self._state = states.SAVED_SEARCH_STRING_AND_COUNT_NEXT
            return None

        if current == states.SAVED_SEARCH_STRING_AND_COUNT_NEXT:
            # form: "(Title or search) has 0 new records as of Mon XXth YYYY."
            # We Just want the title or search string. Match to "has" in case
            # title has parens in it.
            parsed = WoSEmailAlert.saved_search_string_re.match(text)
            self.search += parsed.group(1)
            no_new_records = parsed.group(2) == "0"
            self._state = (
                states.DONE if no_new_records else states.CITING_PUB_NEXT)
            return None

        # Citation alert states
        if current == states.CITED_PUB_NEXT:
            self.search += text
            self._state = states.CITATION_COUNT_NEXT
            return None

        if current == states.CITATION_COUNT_NEXT:
            # ignore count,
            self._state = states.CITING_PUB_NEXT
            return None

        if current == states.CITING_PUB_NEXT:
            tally = WoSEmailAlert.citing_pubs_over_re.match(text)
            if tally:
                # End-of-list line; done only if its count matches the
                # number of pubs found so far.
                if int(tally.group(1)) == self.found_pub_count:
                    self._state = states.DONE
            else:
                # Create a new pub alert.
                self._current_pub = publication.Pub()
                self._current_pub_alert = pub_alert.PubAlert(
                    self._current_pub, self)
                self.pub_alerts.append(self._current_pub_alert)
                self.found_pub_count += 1
                self._current_pub.set_title(text)
                self._state = states.CITING_PUB_AUTHORS_NEXT
            return None

        if current == states.CITING_PUB_AUTHORS_NEXT:
            # WoS author list looks like:
            #  Halbritter, Dale A.; Storer, Caroline G.; Kawahara, Akito Y.
            # so everything before the first comma is the first author's
            # last name.
            first_author_last_name = text.partition(",")[0]
            self._current_pub.set_authors(
                text, publication.to_canonical(first_author_last_name))
            self._state = states.CITING_PUB_JOURNAL_NEXT
            return None

        if current == states.CITING_PUB_JOURNAL_NEXT:
            self._current_pub.ref = text
            self._state = states.CITING_PUB_EXCERPT_NEXT
            return None

        if current == states.CITING_PUB_EXCERPT_NEXT:
            self._current_pub_alert.text_from_pub = text
            self._state = states.CITING_PUB_NEXT
            return None

        # NOTE(review): this check is only reached when the state matched
        # none of those handled above - confirm that is the intent.
        if text == "Terms of Use" and not current == states.DONE:
            print(
                "ERROR: WoS email parsing did not recognize email.",
                file=sys.stderr)
            sys.exit(1)

        return None
Example #11
0
    def handle_data(self, data):
        """
        Consume one run of text from the alert, driven by parser flags.

        The text is stripped and empty text is skipped. The flags are
        then tested in a fixed order (search, count, record start, title,
        authors, journal, citation, DOI); the first one that applies
        consumes the text and sets the flag for what is expected next.
        Always returns None.
        """
        text = data.strip()
        if text == "":
            return None                   # nothing to see here folks.

        if self._expecting_search:
            if WoSEmailAlert201808To201911.search_preface_re.match(text):
                self._expecting_search = False
                self._in_search_section = True
            return None

        if self._in_search_text:
            self.search += text
            self._in_search_text = False
            self._expecting_count_section = True
            return None

        if self._in_count_section:
            count_match = WoSEmailAlert201808To201911.count_re.match(text)
            self.expected_pub_count = int(count_match.group(2))
            self._in_count_section = False
            self._expecting_pub_section = True
            return None

        if (self._expecting_pub
                and WoSEmailAlert201808To201911.paper_start_re.match(text)):
            # Each paper starts with: "Record m of n. "
            self._current_pub = publication.Pub()
            self._current_pub_alert = pub_alert.PubAlert(
                self._current_pub, self)
            self.pub_alerts.append(self._current_pub_alert)
            self.found_pub_count += 1
            self._expecting_pub = False
            self._expecting_title = True
            return None

        if self._in_title:
            self._current_pub.set_title(text)
            self._in_title = False
            self._expecting_authors = True
            return None

        if self._expecting_authors and text == "Authors:":
            self._expecting_authors = False
            self._in_authors = True
            return None

        if self._in_authors:
            # WOS Author lists look like:
            #   Galia, W; Leriche, F; Cruveiller, S; Thevenot-Sergentet, D
            # so everything before the first comma is the first author's
            # last name.
            first_author_last_name = text.partition(",")[0]
            self._current_pub.set_authors(
                text, publication.to_canonical(first_author_last_name))
            self._in_authors = False
            self._expecting_journal = True
            return None

        if self._in_journal:
            self._current_pub.ref = text
            self._in_journal = False
            self._expecting_citation = True
            return None

        if self._in_citation:
            # Citation details are appended to the journal name.
            self._current_pub.ref += ", " + text
            return None

        if self._expecting_doi and text == "DOI:":
            self._expecting_doi = False
            self._in_doi_section = True
            return None

        if self._in_doi:
            self._current_pub.canonical_doi = publication.to_canonical_doi(
                text)
            self._in_doi = False
            self._expecting_pub = True

        return None