Example #1
def pdf2xml(pdffile, xmlfile, keep_tempfiles=False, ocr_ranges=None):
    """
    converts pdf to xml using pdftohtml or, if that fails, ocr2xml;
    returns 'pdftohtml' or 'ocr2xml' depending on which process was
    used. ocr_ranges (optional) is a list of pairs such as
    [(1,3),(7,10)] which would specify that only pages 1-3 and 7-10
    should get ocr'ed.
            
    TODO: check quality to see if ocr is needed?
    """
    if not exists(pdffile):
        raise FileNotFoundError("{} not found".format(pdffile))
    # first try pdftohtml
    try:
        pdftohtml(pdffile, xmlfile)
        return "pdftohtml"
    except NoTextInPDFException:
        debug(2, "no text in xml produced by pdftohtml")
    except Exception as e:
        debug(2, "pdftohtml failed: %s -- %s", pdffile, str(e))
    # then try ocr2xml (not catching exceptions here)
    if ocr_ranges:
        shortened_pdf = pdffile.rsplit(".", 1)[0] + "-short.pdf"
        pdfcut(pdffile, shortened_pdf, ocr_ranges)
        pdffile = shortened_pdf
    ocr2xml(pdffile, xmlfile, keep_tempfiles=keep_tempfiles)
    if not keep_tempfiles and ocr_ranges:
        try:
            os.remove(shortened_pdf)
        except OSError:
            pass
    return "ocr2xml"
Example #2
    def doc2text(doc):
        if len(doc.content) < 100000:
            text = doc.content
        else:
            text = doc.content[:50000] + doc.content[-50000:]
        # Simple hack to add authors etc. to document features:
        if len(text):
            if len(text) < 4000:
                text += " XLEN_TINY" * 2
            elif len(text) < 8000:
                text += " XLEN_VSHORT" * 2
            elif len(text) < 15000:
                text += " XLEN_SHORT" * 2
            elif len(text) < 40000:
                text += " XLEN_MEDIUM" * 2
            elif len(text) < 80000:
                text += " XLEN_LONG" * 2
            else:
                text += " XLEN_VLONG {}" * 2
        if doc.title:
            text += (" " + doc.title) * 2
        if doc.authors:
            for au in doc.authors.split(","):
                text += " " + re.sub(r' (\w+)\s*', r' XAU_\1', au)
        m = doc.url and re.match(r'(.+)/[^/]*', doc.url) # url path
        if m:
            text += " XPATH_" + re.sub(r'\W', '_', m.group(1))
        if doc.filetype:
            text += " XTYPE_" + doc.filetype

        debug(5, "doc text for classification:\n%s\n", text)
        return text
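
A self-contained sketch of the url-path feature above; the sample url is made up:

import re

url = "https://example.com/papers/draft.pdf"       # made-up url
m = re.match(r'(.+)/[^/]*', url)                    # greedy: group(1) is everything before the last '/'
feature = " XPATH_" + re.sub(r'\W', '_', m.group(1))
print(feature)                                      # ' XPATH_https___example_com_papers'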
Example #3
    def old_link(self, url):
        """
        If a link to (a session variant of) url is already known on this
        page (as stored in the database), returns the stored Link,
        otherwise returns None.
        """
        if not hasattr(self, '_links'):
            cur = db.dict_cursor()
            query = "SELECT * FROM links WHERE source_id = %s"
            cur.execute(query, (self.source_id,))
            debug(5, cur._last_executed)
            self._links = [ Link(source=self, **li) for li in cur.fetchall() ]
            #debug(2, 'xxx old links:\n%s', '\n'.join([li.url for li in self._links]))

        for li in self._links:
            if li.url == url:
                return li

        s_url = self.strip_session_variables(url)
        if s_url != url:
            for li in self._links:
                if s_url == self.strip_session_variables(li.url):
                    return li

        return None
Example #4
 def update_db(self, **kwargs):
     """
     update self.**kwargs and write present state to db, also set
     'last_checked'
     """
     for k,v in kwargs.items():
         setattr(self, k, v)
     cur = db.cursor()
     self.last_checked = datetime.now()
     fields = [f for f in self.db_fields.keys()
               if f != 'link_id' and getattr(self, f) is not None]
     values = [getattr(self, f) for f in fields]
     if self.link_id:
         query = "UPDATE links SET {},urlhash=MD5(url) WHERE link_id = %s".format(
             ",".join(k+"=%s" for k in fields))
         cur.execute(query, values + [self.link_id])
     else:
         query = "INSERT INTO links ({},urlhash) VALUES ({},MD5(url))".format(
             ",".join(fields), ",".join(("%s",)*len(fields)))
         try:
             cur.execute(query, values)
         except:
             debug(1, "oops, %s: %s", query, ','.join(map(str, values)))
             raise
         self.link_id = cur.lastrowid
     debug(4, cur._last_executed)
     db.commit()
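
A standalone sketch of the SQL templates built above, with a hypothetical fields list:

fields = ['url', 'status', 'last_checked']          # hypothetical db_fields subset
update = "UPDATE links SET {},urlhash=MD5(url) WHERE link_id = %s".format(
    ",".join(k+"=%s" for k in fields))
insert = "INSERT INTO links ({},urlhash) VALUES ({},MD5(url))".format(
    ",".join(fields), ",".join(("%s",)*len(fields)))
print(update)   # UPDATE links SET url=%s,status=%s,last_checked=%s,urlhash=MD5(url) WHERE link_id = %s
print(insert)   # INSERT INTO links (url,status,last_checked,urlhash) VALUES (%s,%s,%s,MD5(url))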
Example #5
def scholarquery(author, title):
    """ TODO: check if we're locked out of google scholar."""
    from . import scholar
    time.sleep(1)
    scholar.ScholarConf.COOKIE_JAR_FILE = os.path.join(tempdir(), 'scholar.cookie')
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    querier.apply_settings(settings)
    query = scholar.SearchScholarQuery()
    query.set_author(author)
    query.set_phrase(title)
    #before_year = 2016
    #query.set_timeframe(options.after, options.before)
    query.set_include_patents(False)
    querier.send_query(query)
    debug(4, 'google scholar query %s', query) 
    articles = querier.articles
    for a in articles:
        debug(4, 'result: %s (%s)', a['title'], a['year'])
        # Testing for exact equality of titles means that false
        # negatives are likely. On the other hand, we don't want to
        # treat "Desire as Belief II" as old just because there has
        # been "Desire as Belief". We err on the side of false
        # negatives:
        if a['title'].lower() == title.lower():
            return a
    return None
Example #6
def test_debug(caplog):
    debuglevel(4)
    debug(4, 'hi there')
    assert 'hi there' in caplog.text
    debug(5, 'secret')
    assert 'secret' not in caplog.text
    debuglevel(5)
Example #7
def evaluate(*docs):
    prob = clf.classify(*docs)
    if len(docs) > 1:
        debug(4, 'probability that documents are about philosophy: %s',
              ','.join(str(p) for p in prob))
        return prob
    else:
        debug(4, 'probability that document is about philosophy: %s', prob)
        return prob
Example #8
    def default_author(self):
        """
        returns doc.source.default_author if that is defined (i.e., if
        doc.source is a personal page), otherwise tries to extract an
        author candidate from doc.link.context.

        The metadata extractor (docparser.paperparser) uses this
        property as default author if no author string can be found in
        the document, and to evaluate the plausibility of candidate
        author strings.

        Unfortunately, journal pages tend to put the author name in
        unpredictable places, often outside what is recognized as the
        link context. On the other hand, journal publications reliably
        contain the author name(s) in the document. So here we don't
        bother setting default_author at the moment. On repository
        pages, people do sometimes upload papers that don't contain
        any author names. 

        The metadata extractor assumes that default_author is a
        single author, because personal homepages only have a single
        default author. People also usually don't forget to put their
        names in the paper if there are co-authors. So we return the
        first author only.

        On philsci-archive, the format is 

        Teller, Paul (2016) Role-Player Realism.
        Livengood, Jonathan and Sytsma, Justin and Rose, David (2016) Following...

        On philpapers, it is 

        Stefan Dragulinescu, Mechanisms and Difference-Making.
        Michael Baumgartner & Lorenzo Casini, An Abductive Theory of Constitution.

        How do we know "Stefan Dragulinescu, Mechanisms" isn't the
        name of a person called "Mechanisms Stefan Dragulinescu" in
        last-comma-first format? Ultimately, we should use some clever
        general heuristics here. For now we simply split at /,| &|
        and|\(/; if the first element contains a whitespace, we return
        that element, otherwise we concatenate the first two elements
        in reverse order. This will only retrieve the surname on
        philsci-archive for authors with a double surname. 

        TODO: improve.
        """
        try:
            if self.source.sourcetype != 'repo':
                return self.source.default_author
            re_split = re.compile(r',| & | and|\(')
            au, rest = re_split.split(self.link.context.strip(), 1)
            if len(au.split()) == 1:
                au2, rest2 = re_split.split(rest, 1)
                au = au2 + ' ' + au
            debug(3, 'setting "%s" as default_author', au)
            return au
        except Exception as e:
            return ''
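
A self-contained sketch of the split heuristic above, run on the two sample contexts quoted in the docstring:

import re

re_split = re.compile(r',| & | and|\(')

def first_author(context):
    au, rest = re_split.split(context.strip(), 1)
    if len(au.split()) == 1:                 # last-comma-first format (philsci-archive style)
        au2, rest2 = re_split.split(rest, 1)
        au = au2 + ' ' + au                  # reverse to "First Last"
    return au

print(first_author("Teller, Paul (2016) Role-Player Realism."))
# -> ' Paul  Teller' (i.e. Paul Teller, whitespace untrimmed)
print(first_author("Stefan Dragulinescu, Mechanisms and Difference-Making."))
# -> 'Stefan Dragulinescu'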
Example #9
 def update_db(self, **kwargs):
     """write **kwargs to db, also update 'last_checked'"""
     if self.source_id:
         cur = db.cursor()
         kwargs['last_checked'] = time.strftime('%Y-%m-%d %H:%M:%S') 
         query = "UPDATE sources SET {},urlhash=MD5(url) WHERE source_id = %s".format(
             ",".join(k+"=%s" for k in kwargs.keys()))
         cur.execute(query, tuple(kwargs.values()) + (self.source_id,))
         debug(3, cur._last_executed)
         db.commit()
Example #10
 def load(self):
     if os.path.isfile(self.picklefile):
         debug(4, "loading classifier model from disk")
         with open(self.picklefile, 'rb') as f:
             (vect,clf) = pickle.load(f)
         self.vectorizer = vect
         self.classifier = clf
         self.ready = True
     else:
         self.reset()
Example #11
 def assign_category(self, cat_id, strength):
     """inserts or updates a docs2cats entry in the db"""
     if not self.doc_id:
         raise Exception("cannot assign category: document has no id")
     cur = db.cursor()
     query = ("INSERT INTO docs2cats (cat_id, doc_id, strength) VALUES (%s,%s,%s)"
              " ON DUPLICATE KEY UPDATE strength=%s")
     cur.execute(query, (cat_id, self.doc_id, strength, strength))
     debug(4, cur._last_executed)
     db.commit()
Example #12
def convert_to_pdf(tempfile):
    outfile = tempfile.rsplit('.',1)[0]+'.pdf'
    try:
        cmd = ['/usr/bin/python3', '/usr/bin/unoconv', 
               '-f', 'pdf', '-o', outfile, tempfile]
        debug(2, ' '.join(cmd))
        subprocess.check_call(cmd, timeout=20)
    except Exception as e:
        debug(1, "cannot convert %s to pdf: %s", tempfile, str(e))
        raise
    return outfile
Example #13
 def save_to_db(self):
     """write object to db"""
     cur = db.cursor()
     fields = [f for f in self.db_fields.keys()
               if f != 'link_id' and getattr(self, f) is not None]
     values = [getattr(self, f) for f in fields]
     query = "INSERT INTO sources ({}, urlhash) VALUES ({}, MD5(url))".format(
         ",".join(fields), ",".join(("%s",)*len(fields)))
     cur.execute(query, values)
     debug(3, cur._last_executed)
     db.commit()
     self.source_id = cur.lastrowid
Example #14
def parse(doc):
    """
    tries to enrich doc by metadata (authors, title, abstract,
    numwords, doctype, content); returns True if successful, False if
    doc.page doesn't look like an article.
    """
    page = doc.page
    debug(2, "parsing page %s", page.url)

    if "stanford.edu/entries" not in page.url:
        debug(2, "page is not a Stanford Encyclopedia entry")
        return False

    # title:
    h1s = page.xpath("//h1/text()")
    if not h1s:
        debug(2, "page is not a Stanford Encyclopedia entry")
        return False
    doc.title = h1s[0]

    # abstract:
    preamble_divs = page.xpath("//div[@id='preamble']")
    if not preamble_divs:
        debug(2, "page is not a Stanford Encyclopedia entry")
        return False
    preamble_html = etree.tostring(preamble_divs[0], encoding="unicode")
    doc.abstract = get_abstract(preamble_html)

    # authors:
    copyright_divs = page.xpath("//div[@id='article-copyright']")
    if not copyright_divs:
        debug(2, "page is not a Stanford Encyclopedia entry")
        return False
    copyright_html = etree.tostring(copyright_divs[0], encoding="unicode")
    copyright_html = re.sub("<a.+Copyright.+", "", copyright_html)
    copyright_html = re.sub("&lt;.+?&gt;", "", copyright_html)
    authors = [strip_tags(frag).strip() for frag in copyright_html.split("<br/>")]
    doc.authors = ", ".join([a for a in authors if a])

    # text content:
    # textnodes = page.xpath("//div[@id='article-content']//text()")
    # if not textnodes:
    #    debug(2, "page is not a Stanford Encyclopedia entry")
    #    return False
    # doc.content = ' '.join([n.strip() for n in textnodes if n.strip()])

    doc.content = page.text()
    doc.numwords = len(doc.content.split())
    doc.numpages = int(doc.numwords / 300)  # rough guess, just for classifiers
    doc.doctype = "article"
    doc.meta_confidence = 90

    return True
Example #15
def evaluate(doc):
    debug(4, 'trying to guess document type')
    probs = {
        'book': bookfilter.test(doc, debug=debuglevel()>3, smooth=False),
        'chapter': chapterfilter.test(doc, debug=debuglevel()>3, smooth=True),
        'thesis': thesisfilter.test(doc, debug=debuglevel()>3, smooth=False),
        'review': reviewfilter.test(doc, debug=debuglevel()>3, smooth=True)
    }
    debug(2, 'doctyper: %s', ', '.join(['{} {}'.format(k,v) for k,v in probs.items()]))
    if max(probs.values()) > 0.5:
        return max(probs, key=probs.get)
    else:
        return 'article'
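
A standalone sketch of the thresholding above, with made-up filter outputs:

probs = {'book': 0.12, 'chapter': 0.67, 'thesis': 0.05, 'review': 0.21}
doctype = max(probs, key=probs.get) if max(probs.values()) > 0.5 else 'article'
print(doctype)   # 'chapter'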
Example #16
def parse(doc):
    """
    main method: fixes title and content of blogpost <doc> and adds
    authors, abstract, numwords
    """
    debug(3, "fetching blog post %s", doc.url)
    html = requests.get(doc.url).content.decode('utf-8', 'ignore')
    doc.content = extract_content(html, doc) or strip_tags(doc.content)
    doc.numwords = len(doc.content.split())
    doc.abstract = get_abstract(doc.content)
    if doc.title.isupper():
        doc.title = doc.title.capitalize()
    debug(2, "\npost abstract: %s\n", doc.abstract)
Example #17
def pdftohtml(pdffile, xmlfile):
    cmd = [
        PDFTOHTML,
        "-i",  # ignore images
        "-xml",  # xml output
        "-enc",
        "UTF-8",
        "-nodrm",  # ignore copy protection
        pdffile,
        xmlfile,
    ]
    debug(2, " ".join(cmd))
    try:
        stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=10)
    except subprocess.CalledProcessError as e:
        debug(1, e.output)
        raise
    if not exists(xmlfile):
        raise PdftohtmlFailedException(stdout)
    xml = readfile(xmlfile)
    if not xml_ok(xml):
        debug(4, "No text in pdf: %s", xml)
        raise NoTextInPDFException
    else:
        debug(3, "pdftohtml output ok")
    writefile(xmlfile, fix_pdftohtml(xml))
    doctidy(xmlfile)
Example #18
 def load_from_db(self, url=''):
     url = url or self.url
     if not url:
         raise TypeError("need source url to load Source from db")
     cur = db.dict_cursor()
     query = "SELECT * FROM sources WHERE urlhash = MD5(%s)"
     cur.execute(query, (url,))
     debug(5, cur._last_executed)
     sources = cur.fetchall()
     if sources:
         for k,v in sources[0].items():
             setattr(self, k, v)
     else:
         debug(4, "%s not in sources table", url)
Example #19
def ocr2xml(pdffile, xmlfile, keep_tempfiles=False, write_hocr=False):
    """ocr pdffile and write pdftohtml-type parsing to xmlfile"""

    start_time = timer()
    debug(2, "ocr2xml %s %s", pdffile, xmlfile)

    try:
        numpages = int(pdfinfo(pdffile)['Pages'])
    except Exception as e:
        raise MalformedPDFError('pdfinfo failed') from e
    debug(2, '%s pages to process', numpages)
    
    xml = init_xml()
    hocr = b''
    for p in range(numpages):
        page_hocr = ocr_page(pdffile, p+1)
        xml_add_page(xml, page_hocr)
        hocr += page_hocr

    xmlstr = lxml.etree.tostring(xml, encoding='utf-8', pretty_print=True,
                                 xml_declaration=True)
    if write_hocr:
        with open(xmlfile, 'wb') as f:
            f.write(hocr)
    else:
        with open(xmlfile, 'wb') as f:
            f.write(xmlstr)
        doctidy(xmlfile)

    end_time = timer()
    if not keep_tempfiles:
        debug(3, 'cleaning up')
        remove_tempdir()

    debug(2, 'Time: %s seconds', str(end_time - start_time))
Example #20
def context_suggests_published(context):
    """
    returns True if the link context makes it fairly certain that the
    linked document has already been published before this year.
    """
    
    # uncomment to test paper processing:
    # return False

    if re.search('forthcoming|unpublished', context, re.I):
        debug(4, 'forthcoming/unpublished in context suggests not yet published')
        return False
    for m in re.finditer(r'\b\d{4}\b', context):
        if 1950 < int(m.group(0)) <= datetime.today().year:
            break
    else:
        debug(4, 'no suitable year in context suggests not yet published')
        return False

    # See https://github.com/wo/opp-tools/issues/54
    pubterms = [r'\beds?\b', r'edit(?:ed|ors?)', r'\d-+\d\d', r'\d:\s*\d', 'journal', r'philosophical\b']
    for t in pubterms:
        if re.search(t, context, re.I):
            debug(1, "ignoring published paper ('%s' in context)", t)
            return True
    debug(4, 'no publication keywords, assuming not yet published')
    return False
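
A condensed, self-contained sketch of the checks above, run on made-up link contexts:

import re
from datetime import datetime

def looks_published(context):
    if re.search('forthcoming|unpublished', context, re.I):
        return False
    if not any(1950 < int(m.group(0)) <= datetime.today().year
               for m in re.finditer(r'\b\d{4}\b', context)):
        return False
    pubterms = [r'\beds?\b', r'edit(?:ed|ors?)', r'\d-+\d\d', r'\d:\s*\d',
                'journal', r'philosophical\b']
    return any(re.search(t, context, re.I) for t in pubterms)

print(looks_published("Noûs 2014, 48(3): 415-432"))           # True: plausible year plus page range
print(looks_published("Draft, unpublished manuscript 2015"))  # False: 'unpublished' in context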
Example #21
def save_local(r):
    # use recognizable tempfile name:
    m = re.search(r'/([^/]+?)(?:\.\w+)?(?:[\?\#].+)*$', r.url)
    fname = m.group(1) if m else r.url
    fname = re.sub(r'\W', '_', fname) + '.' + r.filetype
    tempfile = os.path.join(tempdir(), fname)
    debug(2, "saving %s to %s", r.url, tempfile)
    try:
        with open(tempfile, 'wb') as f:
            for block in r.iter_content(1024):
                f.write(block)
    except EnvironmentError as e:
        debug(1, "cannot save %s to %s: %s", r.url, tempfile, str(e))
        raise
    return tempfile
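
A standalone sketch of the tempfile naming above, on a made-up url:

import re

url = "https://example.com/papers/draft-v2.pdf?download=1"    # made-up url
m = re.search(r'/([^/]+?)(?:\.\w+)?(?:[\?\#].+)*$', url)
fname = re.sub(r'\W', '_', m.group(1)) + '.' + 'pdf'          # 'pdf' stands in for r.filetype
print(fname)   # 'draft_v2.pdf'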
Example #22
 def load_from_db(self, url='', source_id=0):
     url = url or self.url
     source_id = source_id or self.source_id
     if not url or not source_id:
         raise TypeError("need url and source_id to load Link from db")
     
     cur = db.dict_cursor()
     query = "SELECT * FROM links WHERE urlhash = MD5(%s) AND source_id = %s LIMIT 1"
     cur.execute(query, (url, source_id))
     debug(5, cur._last_executed)
     links = cur.fetchall()
     if links:
         for k,v in links[0].items():
             setattr(self, k, v)
     else:
         debug(4, "link to %s not in database", url)
Example #23
def pdfinfo(filename):
    """returns dictionary of pdfinfo (poppler) data"""
    cmd = [PDFINFO, filename]
    debug(3, " ".join(cmd))
    try:
        output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=2)
        output = output.decode("utf-8")
    except subprocess.CalledProcessError as e:
        logger.warning(e.output)
        raise
    res = {}
    for line in output.split("\n"):
        if ":" in line:
            k, v = line.split(":", 1)
            res[k] = v.strip()
    return res
Example #24
def is_bad_url(url):
    if len(url) > 512:
        debug(1, 'url %s is too long', url)
        return True
    re_bad_url = re.compile("""
                ^\#|
                ^mailto|
                ^data|
                ^javascript|
                ^.+//[^/]+/?$|          # TLD
                twitter\.com|
                fonts\.googleapis\.com|
                philpapers\.org/asearch|
                \.(?:css|mp3|avi|mov|jpg|gif|ppt|png|ico|mso|xml)(?:\?.+)?$   # .css?version=12
                """, re.I | re.X)
    return re_bad_url.search(url) is not None
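
A quick check of the filter above (assumes is_bad_url from above is in scope; the urls are made up):

for url in ("mailto:someone@example.com",              # True: ^mailto
            "https://example.com/",                    # True: bare TLD
            "https://example.com/style.css?v=12",      # True: stylesheet
            "https://example.com/papers/draft.pdf"):   # False: looks like a paper link
    print(url, is_bad_url(url))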
Example #25
def run():
    """
    retrieve and process new blog posts that have been put in the db
    by opp-web:feedhandler
    """
    cur = db.cursor()
    query = "SELECT doc_id FROM docs WHERE doctype = 'blogpost' AND status = 0"
    cur.execute(query)
    debug(4, cur._last_executed)
    posts = cur.fetchall()
    if not posts:
        return debug(3, "no new blog posts")
    for (doc_id,) in posts:
        post = Doc(doc_id=doc_id)
        post.load_from_db()
        process_blogpost(post)
Example #26
def next_source():
    """return the next source from db that's due to be checked"""
    min_age = datetime.now() - timedelta(hours=16)
    min_age = min_age.strftime('%Y-%m-%d %H:%M:%S')
    cur = db.dict_cursor()
    query = ("SELECT * FROM sources WHERE"
             " sourcetype != 'blog'" # ignore rss feeds
             " AND (last_checked IS NULL OR last_checked < %s)"
             " ORDER BY last_checked LIMIT 1")
    cur.execute(query, (min_age,))
    debug(4, cur._last_executed)
    sources = cur.fetchall()
    if sources:
        return Source(**sources[0])
    else:
        debug(1, "all pages recently checked")
        return None
Example #27
def paper_is_old(doc):
    """
    checks online if document has been published earlier than this
    year
    """
    debug(4, "checking if paper is old")
    title = re.sub(r'<[\S]+?>', '', doc.title) # strip tags
    match = scholarquery(doc.authors, title)
    if (match and match['year'] 
        and 1950 < int(match['year']) < datetime.today().year-2):
        # Unfortunately, Google Scholar gives publication dates even
        # for unpublished manuscripts e.g. if they were cited with a
        # certain date once; so we only ignore papers if the given
        # date is at least two years old. TODO: improve! (If I finally
        # upload my "Generalizing Kripke Semantics" paper, I don't
        # want it to be treated as published in 2011!)
        debug(1, "paper already published in %s", match['year'])
        return True
    return False
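
A standalone sketch of the year cutoff above, with a made-up Scholar result:

from datetime import datetime

match = {'title': 'Some Old Paper', 'year': '2012'}             # made-up result
print(1950 < int(match['year']) < datetime.today().year - 2)    # True: counts as already published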
Example #28
 def update_db(self, **kwargs):
     """update self.**kwargs and write present state to db"""
     for k, v in kwargs.items():
         setattr(self, k, v)
     cur = db.cursor()
     fields = [f for f in self.db_fields.keys()
               if f != 'doc_id' and getattr(self, f) is not None]
     values = [getattr(self, f) for f in fields]
     if self.doc_id:
         query = "UPDATE docs SET {},urlhash=MD5(url) WHERE doc_id = %s".format(
             ",".join(k+"=%s" for k in fields))
         cur.execute(query, values + [self.doc_id])
     else:
         query = "INSERT INTO docs ({},urlhash) VALUES ({},MD5(url))".format(
             ",".join(fields), ",".join(("%s",)*len(fields)))
         cur.execute(query, values)
         self.doc_id = cur.lastrowid
     debug(4, cur._last_executed)
     db.commit()
Example #29
    def extract_links(self, browser):
        """
        extracts links from source page; sets self.new_links and
        self.old_links, both lists of Link objects.
        """
        self.new_links = []
        self.old_links = []
        new_links = {} # url => Link
        old_links = {} # url => Link
        
        # lots of try/except because selenium easily crashes:
        try:
            els = browser.find_elements_by_tag_name("a")
        except:
            debug(1, "cannot retrieve links from page %s", self.url)
            return [],[]
        for el in els:
            try:
                if not el.is_displayed():
                    continue
                href = el.get_attribute('href')
                anchortext = el.text
                if not href:
                    continue
            except:
                continue
            if is_bad_url(href):
                debug(3, 'ignoring link to %s (bad url)', href)
                continue
            if href in old_links or href in new_links:
                debug(3, 'ignoring repeated link to %s', href)
                continue
            old_link = self.old_link(href)
            if old_link:
                debug(3, 'link to %s is old', href)
                old_links[href] = old_link
                old_links[href].element = el
            else:
                debug(1, 'new link: "%s" %s', anchortext, href)
                new_links[href] = Link(url=href, source=self, element=el)

        self.new_links = list(new_links.values())
        self.old_links = list(old_links.values())
Example #30
def get_authors(full_html, post_html, post_text):
    # look for 'by (Foo Bar)' near the start of the post
    post_start = full_html.find(post_html)
    tagsoup = r'(?:<[^>]+>|\s)*'
    by = r'[Bb]y\b'+tagsoup
    name = r'[\w\.\-]+(?: (?!and)[\w\.\-]+){0,3}'
    separator = tagsoup+r'(?: and |, )'+tagsoup
    re_str = r'{}({})(?:{}({}))*'.format(by,name,separator,name)
    regex = re.compile(re_str)
    best_match = None
    for m in regex.finditer(full_html):
        if post_text.find(m.group(1)) > 20:
            debug(2, 'ignoring author candidate "%s": too far into the text', m.group(1))
            continue
        if not best_match or abs(m.start()-post_start) < abs(best_match.start()-post_start):
            best_match = m
    if best_match:
        names = [n for n in best_match.groups() if n]
        return ', '.join(names)
    return ''
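
A standalone sketch of the author regex above, on a made-up blog snippet:

import re

tagsoup = r'(?:<[^>]+>|\s)*'
by = r'[Bb]y\b' + tagsoup
name = r'[\w\.\-]+(?: (?!and)[\w\.\-]+){0,3}'
separator = tagsoup + r'(?: and |, )' + tagsoup
regex = re.compile(r'{}({})(?:{}({}))*'.format(by, name, separator, name))

html = '<p>Posted by <em>Jane Doe</em> and <em>John Public</em> on Monday</p>'
m = regex.search(html)
print(m.groups())   # ('Jane Doe', 'John Public')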