def set_version_and_license(self, r=None):
    """Decide self.scrape_version and self.scrape_license for this page.

    Defaults to "submittedVersion", then upgrades to "publishedVersion"
    when publisher fingerprints are found in the fetched PDF.  `r` is
    presumably a fetched-PDF response object exposing content_big() --
    TODO confirm against the caller.  With no `r`, only local info is used.
    """
    # timestamp this decision
    self.updated = datetime.datetime.utcnow().isoformat()

    if self.is_pmc:
        # PMC pages have their own dedicated version/license logic
        self.set_info_for_pmc_page()
        return

    # set as default
    self.scrape_version = "submittedVersion"
    # NOTE(review): return value is never read after this call
    is_updated = self.update_with_local_info()

    # now try to see what we can get out of the pdf itself
    if not r:
        logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
        return

    try:
        # a Crossmark widget URL in the raw bytes marks a publisher-produced PDF, e.g.
        # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
        if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
            self.scrape_version = "publishedVersion"

        text = convert_pdf_to_txt(r, max_pages=25)
        # logger.info(text)
        if text and self.scrape_version == "submittedVersion":
            # phrases that typically appear only in the publisher's final copy
            patterns = [
                re.compile(ur"©.?\d{4}", re.UNICODE),
                re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"all rights reserved", re.IGNORECASE),
                re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL)
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                    self.scrape_version = "publishedVersion"

        # only hunt for a license when none is already recorded
        if not self.scrape_license:
            open_license = find_normalized_license(text)
            if open_license:
                logger.info(u'found license in PDF: {}'.format(open_license))
                self.scrape_license = open_license
    except Exception as e:
        # best-effort: a broken PDF must not abort the scrape; keep the default version
        logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
        self.error += u"Exception doing convert_pdf_to_txt!"
        logger.info(self.error)

    logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
def pdf_to_qa_result(abs_file_path):
    """Convert a QA-report PDF into a CSV row tagged with its filename.

    Returns the MapcheckResult CSV string suffixed with "," + basename of
    the file when the PDF is an SNC mapcheck report, False when the PDF
    cannot be read/converted, and None for readable non-mapcheck files.
    """
    try:
        text = convert_pdf_to_txt(abs_file_path).split('\n')
    except Exception:
        # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any conversion failure means "not a QA result"
        return False
    if is_file_snc_mapcheck(text):
        return MapcheckResult(text).data_to_csv() + ',' + basename(abs_file_path)
    # readable PDF but not an SNC mapcheck report (preserves the original
    # implicit-None behavior for callers that distinguish None from False)
    return None
def main():
    """Build an OJS-style issue XML (output.xml) from the PDFs named in sys.argv.

    Builds the <issue> skeleton from getIssueData(), then appends one
    article element per PDF argument.  output.xml is written even when no
    PDFs are given (preserves original behavior).
    """
    baseEl = etree.Element('issue', published="true", current="false")
    issueData = getIssueData()
    etree.SubElement(baseEl, 'title').text = '0'
    etree.SubElement(baseEl, 'volume').text = issueData["volume"]
    etree.SubElement(baseEl, 'number').text = issueData["number"]
    etree.SubElement(baseEl, 'year').text = issueData["year"]
    etree.SubElement(baseEl, 'date_published').text = issueData["date_published"]
    etree.SubElement(baseEl, 'access_date').text = issueData["access_date"]

    articlesEl = etree.SubElement(baseEl, 'section')
    etree.SubElement(articlesEl, 'title', locale="en_US").text = 'Articles'
    etree.SubElement(articlesEl, 'abbr', locale="en_US").text = 'ART'

    if (len(sys.argv) > 1):
        for pdf_path in sys.argv[1:]:  # renamed from `file` (shadowed a builtin)
            # add an article tag for each file
            fileText = convert_pdf_to_txt(pdf_path)
            # BUG FIX: close the file handle (was open(...).read() with no close)
            with open(pdf_path, "rb") as pdf_file:
                fileBinary = pdf_file.read().encode("base64")
            fileXml = articleToXml(fileText, pdf_path, fileBinary, issueData["date_published"])
            articlesEl.insert(-1, fileXml)

    tree = etree.ElementTree(baseEl)
    tree.write('output.xml', pretty_print=True)  # write end result to 'output.xml'
def set_version_and_license(self, r=None):
    """Decide self.scrape_version and self.scrape_license for this page.

    Defaults to "submittedVersion", then upgrades to "publishedVersion"
    when publisher fingerprints are found in the fetched PDF.  `r` is
    presumably a fetched-PDF response object exposing content_big() --
    TODO confirm against the caller.  With no `r`, only local info is used.
    """
    # timestamp this decision
    self.updated = datetime.datetime.utcnow().isoformat()

    if self.is_pmc:
        # PMC pages have their own dedicated version/license logic
        self.set_info_for_pmc_page()
        return

    # set as default
    self.scrape_version = "submittedVersion"
    # NOTE(review): return value is never read after this call
    is_updated = self.update_with_local_info()

    # now try to see what we can get out of the pdf itself
    if not r:
        logger.info(
            u"before scrape returning {} with scrape_version: {}, license {}"
            .format(self.url, self.scrape_version, self.scrape_license))
        return

    try:
        # a Crossmark widget URL in the raw bytes marks a publisher-produced PDF, e.g.
        # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
        if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
            self.scrape_version = "publishedVersion"

        text = convert_pdf_to_txt(r, max_pages=25)
        # logger.info(text)
        # skip the pattern scan if the crossmark check already decided "published"
        if text and self.scrape_version != "publishedVersion":
            # phrases that typically appear only in the publisher's final copy
            patterns = [
                re.compile(ur"©.?\d{4}", re.UNICODE),
                re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                re.compile(
                    ur"received.{0,100}revised.{0,100}accepted.{0,100}publication",
                    re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"all rights reserved", re.IGNORECASE),
                re.compile(
                    ur"This article is distributed under the terms of the Creative Commons",
                    re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(
                    ur"This article is licensed under a Creative Commons",
                    re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"this is an open access article",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(
                    ur"This article is brought to you for free and open access by Works.",
                    re.IGNORECASE | re.MULTILINE | re.DOTALL),
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    logger.info(
                        u'found {}, decided PDF is published version'.
                        format(pattern.pattern))
                    self.scrape_version = "publishedVersion"

        # only hunt for a license when none is already recorded
        if not self.scrape_license:
            open_license = find_normalized_license(text)
            if open_license:
                logger.info(
                    u'found license in PDF: {}'.format(open_license))
                self.scrape_license = open_license
    except Exception as e:
        # best-effort: a broken PDF must not abort the scrape; keep the default version
        logger.exception(u"exception in convert_pdf_to_txt for {}".format(
            self.url))
        self.error += u"Exception doing convert_pdf_to_txt!"
        logger.info(self.error)

    logger.info(
        u"scrape returning {} with scrape_version: {}, license {}".format(
            self.url, self.scrape_version, self.scrape_license))
def set_version_and_license(self, r=None):
    """Decide self.scrape_version and self.scrape_license for this page.

    Starts from self.default_version(); a strict <dc:type> match in the
    OAI-PMH metadata makes that answer final.  Otherwise heuristics on the
    fetched PDF (`r`, presumably a response object exposing content_big()
    and url -- TODO confirm) may upgrade to "publishedVersion" or
    "acceptedVersion", and per-record overrides are applied last.
    """
    # timestamp this decision
    self.updated = datetime.datetime.utcnow().isoformat()

    if self.is_pmc:
        # PMC pages have their own dedicated version/license logic
        self.set_info_for_pmc_page()
        return

    # set as default
    self.scrape_version = self.default_version()
    # NOTE(review): return value is never read after this call
    is_updated = self.update_with_local_info()

    # now try to see what we can get out of the pdf itself
    # a <dc:type> exactly matching the default version counts as strict
    # metadata: trust it and skip the PDF heuristics below
    version_is_from_strict_metadata = self.pmh_record and self.pmh_record.api_raw and re.compile(
        ur"<dc:type>{}</dc:type>".format(self.scrape_version),
        re.IGNORECASE | re.MULTILINE | re.DOTALL
    ).findall(self.pmh_record.api_raw)

    if version_is_from_strict_metadata or not r:
        logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
        return

    try:
        # a Crossmark widget URL in the raw bytes marks a publisher-produced PDF, e.g.
        # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
        if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
            self.scrape_version = "publishedVersion"

        text = convert_pdf_to_txt(r, max_pages=25)
        # logger.info(text)
        if text and self.scrape_version != "publishedVersion" and not version_is_from_strict_metadata:
            # phrases that typically appear only in the publisher's final copy
            patterns = [
                re.compile(ur"©.?\d{4}", re.UNICODE),
                re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"all rights reserved", re.IGNORECASE),
                re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"This article is brought to you for free and open access by Works.", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                    self.scrape_version = "publishedVersion"

        if text and self.scrape_version != 'acceptedVersion':
            # boilerplate phrases repositories attach to accepted manuscripts
            patterns = [
                re.compile(ur'This is a post-peer-review, pre-copyedit version', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur'This is the peer reviewed version of the following article', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur'The present manuscript as of \d\d \w+ \d\d\d\d has been accepted', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur'Post-peer-review, pre-copyedit version of accepted manuscript', re.IGNORECASE | re.MULTILINE | re.DOTALL),
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    logger.info(u'found {}, decided PDF is accepted version'.format(pattern.pattern))
                    self.scrape_version = "acceptedVersion"

            # repository-specific marker (presumably RMIT's Alma/Primo
            # instance, judging by the institution code -- TODO confirm)
            if r and r.url and '61RMIT_INST' in r.url:
                if 'Version: Accepted' in text:
                    logger.info(u'found Version: Accepted, decided PDF is accepted version')
                    self.scrape_version = "acceptedVersion"

            # an accepted-manuscript heading in the first 50 characters also counts
            heading_text = text[0:50].lower()
            accepted_headings = [
                "final accepted version",
                "accepted manuscript",
            ]
            for heading in accepted_headings:
                if heading in heading_text:
                    logger.info(u'found {} in heading, decided PDF is accepted version'.format(heading))
                    self.scrape_version = "acceptedVersion"
                    break

        # only hunt for a license when none is already recorded
        if not self.scrape_license:
            open_license = find_normalized_license(text)
            if open_license:
                logger.info(u'found license in PDF: {}'.format(open_license))
                self.scrape_license = open_license
    except Exception as e:
        # best-effort: a broken PDF must not abort the scrape; keep the default version
        logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
        self.error += u"Exception doing convert_pdf_to_txt!"
        logger.info(self.error)

    # manual per-record overrides win over every heuristic above
    if self.pmh_record:
        self.scrape_version = _scrape_version_override().get(self.pmh_record.bare_pmh_id, self.scrape_version)

    logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
def main():
    """Interactively split the issue PDF (sys.argv[1]) into articles and
    book reviews, building an OJS-style issue XML written to output.xml.

    For each article the user supplies a name and a page range; the pages
    are extracted with split_article(), their text with convert_pdf_to_txt(),
    and both are folded into the XML via articleToXml()/bookReviewToXml().
    """
    if (len(sys.argv) > 1):
        baseEl = etree.Element('issue', published="true", current="false")
        issueData = getIssueData()
        etree.SubElement(baseEl, 'title').text = '0'
        etree.SubElement(baseEl, 'volume').text = issueData["volume"]
        etree.SubElement(baseEl, 'number').text = issueData["number"]
        etree.SubElement(baseEl, 'year').text = issueData["year"]
        etree.SubElement(baseEl, 'date_published').text = issueData["date_published"]
        etree.SubElement(baseEl, 'access_date').text = issueData["access_date"]

        articlesEl = etree.SubElement(baseEl, 'section')
        etree.SubElement(articlesEl, 'title', locale="en_US").text = 'Articles'
        etree.SubElement(articlesEl, 'abbr', locale="en_US").text = 'ART'

        while (True):
            print("Article information ####################\n")
            articleName = raw_input(
                "What is the article name? You can enter a temporary filename if you want."
            )
            articleStart = raw_input(
                "What is the start page of this article? ")
            while (not articleStart.isdigit()):
                articleStart = raw_input(
                    "Please enter an integer. What is the start page of this article? "
                )
            articleEnd = raw_input("What is the end page of this article? ")
            while (not articleEnd.isdigit()):
                articleEnd = raw_input(
                    "Please enter an integer. What is the end page of this article? "
                )
            articleText = convert_pdf_to_txt(sys.argv[1], int(articleStart),
                                             int(articleEnd))
            split_article(sys.argv[1], articleName, int(articleStart),
                          int(articleEnd))
            # BUG FIX: close the split-off file promptly (was an unclosed handle)
            with open(articleName, "rb") as articleFile:
                articleBinary = articleFile.read().encode("base64")
            articleXml = articleToXml(articleText, articleName, articleBinary,
                                      issueData["date_published"])
            articlesEl.insert(-1, articleXml)
            if (raw_input(
                    "Parse another article? Type yes to continue or any other character to quit. "
            ) != "yes"):
                break

        # BUG FIX: require an explicit "yes" as the prompt promises
        # (previously any non-empty answer entered the book-review flow)
        if (raw_input(
                "Are there book reviews to parse? Type yes to continue. "
        ) == "yes"):
            bookReviewsElement = etree.SubElement(baseEl, 'section')
            etree.SubElement(bookReviewsElement, 'title',
                             locale="en_US").text = 'Book Reviews'
            etree.SubElement(bookReviewsElement, 'abbr',
                             locale="en_US").text = 'BKRV'
            while (True):
                articleName = raw_input(
                    "What is the article name? You can enter a temporary filename if you want."
                )
                articleStart = raw_input(
                    "What is the start page of this article? ")
                while (not articleStart.isdigit()):
                    articleStart = raw_input(
                        "Please enter an integer. What is the start page of this article? "
                    )
                articleEnd = raw_input(
                    "What is the end page of this article? ")
                while (not articleEnd.isdigit()):
                    articleEnd = raw_input(
                        "Please enter an integer. What is the end page of this article? "
                    )
                # BUG FIX: extract this review's own text; previously articleText
                # kept the stale value from the last article parsed above
                articleText = convert_pdf_to_txt(sys.argv[1],
                                                 int(articleStart),
                                                 int(articleEnd))
                split_article(sys.argv[1], articleName, int(articleStart),
                              int(articleEnd))
                # BUG FIX: close the split-off file promptly (was an unclosed handle)
                with open(articleName, "rb") as articleFile:
                    articleBinary = articleFile.read().encode("base64")
                articleXml = bookReviewToXml(articleText, articleName,
                                             articleBinary,
                                             issueData["date_published"])
                bookReviewsElement.insert(-1, articleXml)
                if (raw_input(
                        "Parse another article? Type yes to continue or any other character to quit. "
                ) != "yes"):
                    break

        tree = etree.ElementTree(baseEl)
        tree.write('output.xml', pretty_print=True)
    else:
        print("Not enough input entered")