def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries
    content has to include normal newlines, no \a or #N# replacers
    returns None, None on error
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl!=None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)

    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure = True
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"]=="":
        artDict["title"] = title

    if artDict["abstract"]=="":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract
    logging.debug("abstract: %s" % artDict["abstract"])

    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
def convertOneChunk(inIndexFile, outFile):
    """ get files from inIndexFile, parse Xml,
    write everything to outfile in ascii format
    """
    store = pubStore.PubWriterFile(outFile)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    doi2pmid = None
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    for row in inRows:
        # read line
        i+=1
        articleId, baseDir = row.articleId, row.baseDir
        zipFilename, filename = row.zipFilename, row.filename
        articleId=int(articleId)

        # open file from zipfile
        fullZipPath = join(baseDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows)-i))
        if doi2pmid==None:
            doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        xmlTree = pubXml.etreeFromXml(xmlString)

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"]="consyn://"+zipFilename+"/"+filename
        if articleData["doi"] in doi2pmid:
            articleData["pmid"] = doi2pmid[articleData["doi"]]

        pii = splitext(basename(filename))[0]
        articleData["externalId"]="PII"+pii
        articleData["fulltextUrl"]="http://www.sciencedirect.com/science/svapps/pii/"+pii

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString==None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files" % convCount)
    store.close()
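# Worked example of the fileId scheme used in store.writeFile above (and in the other
# convertOneChunk variants below): the first file of an article gets articleId*1000 + 1,
# apparently leaving room for up to 999 file rows per article. The articleId value here is
# a hypothetical placeholder, not taken from any real index file.
exampleArticleId = 2000000001
exampleFileId = (1000 * exampleArticleId) + 1
assert exampleFileId == 2000000001001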
def minimalHtmlToDicts(url, content):
    " a minimalistic article dict filler, does not try to parse the html "
    logging.debug("Falling back to minimal html to text")
    fileDict = pubStore.createEmptyFileDict(url=url, content=content, mimeType="text/html")
    fileDict = pubGeneric.toAsciiEscape(fileDict, mimeType="text/html")
    if fileDict==None or not "content" in fileDict:
        return None, None
    text = fileDict["content"]
    title = unidecode.unidecode(content[:100])
    abstract = unidecode.unidecode(content[100:1000])
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url, \
        title=title, abstract=abstract, externalId=url)
    #if fileDict==None:
        #continue
    return artDict, fileDict
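# Hedged usage sketch (not part of the original source): how the two converters above can be
# chained, falling back to the minimal converter when full HTML parsing fails. The wrapper
# name htmlToDicts and the url/html variables are assumptions for illustration only.
def htmlToDicts(url, html):
    # convertHtmlToDicts returns None, None on parse errors or non-HTML content
    artDict, fileDict = convertHtmlToDicts(url, html)
    if artDict is None:
        artDict, fileDict = minimalHtmlToDicts(url, html)
    return artDict, fileDict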
def parseMedline(xmlParser):
    """ fill article data dict with pubmed xml data
    >>> xml = PubmedTestDoc()
    >>> data = parseMedline(maxXml.XmlParser(string=xml))
    >>> del data["time"]
    >>> repr(data)
    "OrderedDict([('articleId', ''), ('externalId', 'PMID20430833'), ('source', ''), ('origFile', ''), ('journal', 'Brain : a journal of neurology'), ('printIssn', '0006-8950'), ('eIssn', '0006-8950'), ('journalUniqueId', '0372537'), ('year', '2010'), ('articleType', 'research-article'), ('articleSection', ''), ('authors', u'Willemsen, Mich\\\\xe9l A; Verbeek, Marcel M'), ('authorEmails', ''), ('authorAffiliations', 'Radboud University Nijmegen Medical Centre, Donders Institute for Brain, Cognition and Behaviour, Department of Paediatric Neurology (820 IKNC), PO Box 9101, 6500 HB Nijmegen, The Netherlands. [email protected]'), ('keywords', 'Age of Onset/Useless Research'), ('title', 'Tyrosine hydroxylase deficiency: a treatable disorder of brain catecholamine biosynthesis.'), ('abstract', 'An infantile onset, progressive, hypokinetic-rigid syndrome with dystonia (type A), and a complex encephalopathy with neonatal onset (type B). Decreased cerebrospinal fluid concentrations of homovanillic acid and c.698G>A and c.707T>C mutations. Carriership of at least one promotor mutation, however, apparently predicts type A tyrosine hydroxylase deficiency. Most patients with tyrosine hydroxylase deficiency can be successfully treated with l-dopa.'), ('vol', '133'), ('issue', 'Pt 6'), ('page', '1810-22'), ('pmid', '20430833'), ('pmcId', ''), ('doi', ''), ('fulltextUrl', 'https://www.ncbi.nlm.nih.gov/pubmed/20430833')])"
    """
    data = pubStore.createEmptyArticleDict()
    #medlineData = xmlParser.getXmlFirst("MedlineCitation")
    medlineData = xmlParser
    data["pmid"] = medlineData.getTextFirst("PMID")
    el = medlineData.getElFirst("PMID", None)
    data["pmidVersion"] = el.attrib.get("Version", "") if el is not None else ""
    data["externalId"] = "PMID"+data["pmid"]
    data["fulltextUrl"] = "https://www.ncbi.nlm.nih.gov/pubmed/%s" % data["pmid"]
    logging.log(5, "PMID %s" % data["pmid"])
    data["medlineCreatedDate"] = getMedlineDate(medlineData, "DateCreated")
    data["medlineCompletedDate"] = getMedlineDate(medlineData, "DateCompleted")
    data["medlineRevisedDate"] = getMedlineDate(medlineData, "DateRevised")

    otherIds = medlineData.getTextAll("OtherID", reqAttrDict={"Source":"NLM"})
    pmcIds = [i for i in otherIds if i.startswith("PMC")]
    if len(pmcIds) > 0:
        data["pmcId"] = pmcIds[0].split()[0].replace("PMC","")

    artTree = medlineData.getXmlFirst("Article")
    data["title"] = getMedlineText(artTree.getXmlAll("ArticleTitle"))

    # handle structured abstracts
    data["abstract"] = getMedlineText(artTree.getXmlAll("Abstract/AbstractText"))
    if data["abstract"]=="":
        data["abstract"] = getMedlineText(artTree.getXmlAll("OtherAbstract/AbstractText"))

    data["authorAffiliations"] = artTree.getTextFirst("Affiliation", default="")
    data["doi"] = artTree.getTextFirst("ELocationID", default="", reqAttrDict={"EIdType":"doi"})
    data["lang"] = artTree.getTextFirst("Language", default="")
    data["journalUniqueId"] = medlineData.getTextFirst("MedlineJournalInfo/NlmUniqueID")
    linkingIssn = medlineData.getTextFirst("MedlineJournalInfo/ISSNLinking", default="")

    journalTree = artTree.getXmlFirst("Journal")
    data["eIssn"] = journalTree.getTextFirst("ISSN", reqAttrDict={"IssnType": 'Electronic'}, default="")
    data["printIssn"] = journalTree.getTextFirst("ISSN", reqAttrDict={"IssnType": 'Print'}, default="")

    # fall back to the linking ISSN when the print or electronic ISSN is missing,
    # e.g. PNAS is not storing the print ISSN anymore, only as linking ISSN
    if data["printIssn"]=="" and linkingIssn!="":
        data["printIssn"] = linkingIssn
    if data["eIssn"]=="" and linkingIssn!="":
        data["eIssn"] = linkingIssn

    data["vol"] = journalTree.getTextFirst("JournalIssue/Volume", default="")
    data["issue"] = journalTree.getTextFirst("JournalIssue/Issue", default="")
    data["year"] = journalTree.getTextFirst("JournalIssue/PubDate/Year", default="")
    if data["year"]=="":
        year = journalTree.getTextFirst("JournalIssue/PubDate/MedlineDate", default="").split()[0]
        if not year.isdigit():
            year = ""
        data["year"] = year
    data["journal"] = journalTree.getTextFirst("Title", default="")
    data["page"] = artTree.getTextFirst("Pagination/MedlinePgn", default="")

    authorList = artTree.getXmlFirst("AuthorList")
    lastNames = []
    initialList = []
    if authorList!=None:
        authorTrees = authorList.getXmlAll("Author")
        for authorTree in authorTrees:
            lastName = authorTree.getTextFirst("LastName", default="")
            if lastName=="":
                lastName = authorTree.getTextFirst("CollectiveName", default="")
            lastNames.append(lastName)
            initials = authorTree.getTextFirst("ForeName", default="")
            if initials=="":
                initials = authorTree.getTextFirst("Initials", default="")
            initialList.append(initials)
    authors = [lastNames[i]+", "+initialList[i] for i in range(0, min(len(lastNames), len(initialList)))]
    data["authors"]="; ".join(authors)

    articleTypeList = set(artTree.getTextAll("PublicationTypeList/PublicationType"))
    articleTypesString = ",".join(articleTypeList)

    articleType="research-article"
    noResearchArticleTags = ["Bibliography", "Biography", "Case Reports", "Webcasts",
        "Dictionary", "Directory", "Editorial", "Festschrift",
        "Patient Education Handout", "Periodical Index", "Portraits",
        "Published Erratum", "Scientific Integrity Review", "Congresses"]
    if "Review" in articleTypeList:
        articleType = "review"
    elif "Letter" in articleTypeList:
        articleType = "research-article"
    else:
        for noResearchArticleTag in noResearchArticleTags:
            if noResearchArticleTag in articleTypeList:
                articleType = "other"
                break
    data["articleType"] = articleType
    #data["pubmedArticleTypes"] = articleTypesString
    logging.log(5, "pubmedArticleTypes %s, articleType %s" % (articleTypesString, articleType))

    meshDescriptors = []
    meshHeadingList = medlineData.getXmlFirst("MeshHeadingList", default="")
    if meshHeadingList:
        #for meshHeadingDescriptor in meshHeadingList.getTextAll("MeshHeading/DescriptorName", reqAttrDict={"MajorTopicYN":"Y"}):
        for meshHeadingDescriptor in meshHeadingList.getTextAll("MeshHeading/DescriptorName"):
            meshDescriptors.append(meshHeadingDescriptor.strip())
    data["keywords"] = "/".join(meshDescriptors)

    # remove these annoying linebreaks!
    filtData = {}
    for key, val in data.iteritems():
        filtData[key] = val.replace(u'\u2028', ' ')
    return filtData
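# Hedged usage sketch, derived directly from the doctest above: parse a single MedlineCitation
# record from the bundled test document. Only PubmedTestDoc, maxXml.XmlParser and parseMedline
# are taken from the code above; the assertions restate values shown in the doctest output.
xml = PubmedTestDoc()
data = parseMedline(maxXml.XmlParser(string=xml))
assert data["pmid"] == "20430833"
assert data["articleType"] == "research-article"
assert data["keywords"] == "Age of Onset/Useless Research"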
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ get files from inIndexFile, parse Xml,
    write everything to outfile in ascii format
    """
    store = pubStore.PubWriterFile(outFile)
    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    #doi2pmid = None
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read line
        i+=1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId=int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows)-i))
        #if doi2pmid==None:
            #doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"]=zipFilename+":"+filename
        #if articleData["doi"] in doi2pmid:
            #articleData["pmid"] = doi2pmid[articleData["doi"]]
        articleData["externalId"]=pii
        articleData["fulltextUrl"]="http://www.sciencedirect.com/science/svapps/pii/"+pii
        #articleData["pmid"] = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString==None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], articleData["externalId"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
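# Illustration of one row of the per-chunk .ids.tab file written above. All field values are
# hypothetical placeholders; only the column order (#articleId, doi, externalId, pmid) comes
# from the header string written by the code.
exampleIdRow = ["1000000001", "10.1234/example.doi", "S0000000000000000", "12345678"]
exampleIdLine = "\t".join(exampleIdRow) + "\n"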
def parseMedline(xmlParser):
    """ fill article data dict with pubmed xml data
    >>> xml = PubmedTestDoc()
    >>> data = parseMedline(maxXml.XmlParser(string=xml))
    >>> del data["time"]
    >>> repr(data)
    "OrderedDict([('articleId', ''), ('externalId', 'PMID20430833'), ('source', ''), ('origFile', ''), ('journal', 'Brain : a journal of neurology'), ('printIssn', '0006-8950'), ('eIssn', '0006-8950'), ('journalUniqueId', '0372537'), ('year', '2010'), ('articleType', 'research-article'), ('articleSection', ''), ('authors', u'Willemsen, Mich\\\\xe9l A; Verbeek, Marcel M'), ('authorEmails', ''), ('authorAffiliations', 'Radboud University Nijmegen Medical Centre, Donders Institute for Brain, Cognition and Behaviour, Department of Paediatric Neurology (820 IKNC), PO Box 9101, 6500 HB Nijmegen, The Netherlands. [email protected]'), ('keywords', 'Age of Onset/Useless Research'), ('title', 'Tyrosine hydroxylase deficiency: a treatable disorder of brain catecholamine biosynthesis.'), ('abstract', 'An infantile onset, progressive, hypokinetic-rigid syndrome with dystonia (type A), and a complex encephalopathy with neonatal onset (type B). Decreased cerebrospinal fluid concentrations of homovanillic acid and c.698G>A and c.707T>C mutations. Carriership of at least one promotor mutation, however, apparently predicts type A tyrosine hydroxylase deficiency. Most patients with tyrosine hydroxylase deficiency can be successfully treated with l-dopa.'), ('vol', '133'), ('issue', 'Pt 6'), ('page', '1810-22'), ('pmid', '20430833'), ('pmcId', ''), ('doi', ''), ('fulltextUrl', 'http://www.ncbi.nlm.nih.gov/pubmed/20430833')])"
    """
    data = pubStore.createEmptyArticleDict()
    # medlineData = xmlParser.getXmlFirst("MedlineCitation")
    medlineData = xmlParser
    data["pmid"] = medlineData.getTextFirst("PMID")
    data["externalId"] = "PMID" + data["pmid"]
    data["fulltextUrl"] = "http://www.ncbi.nlm.nih.gov/pubmed/%s" % data["pmid"]
    logging.log(5, "PMID %s" % data["pmid"])
    # data["year-pubmed"] = medlineData.getTextFirst("DateCreated/Year")
    # data["month-pubmed"] = medlineData.getTextFirst("DateCreated/Month")
    # data["day-pubmed"] = medlineData.getTextFirst("DateCreated/Day")

    otherIds = medlineData.getTextAll("OtherID", reqAttrDict={"Source": "NLM"})
    pmcIds = [i for i in otherIds if i.startswith("PMC")]
    if len(pmcIds) > 0:
        data["pmcId"] = pmcIds[0].split()[0].replace("PMC", "")

    artTree = medlineData.getXmlFirst("Article")
    data["title"] = artTree.getTextFirst("ArticleTitle", default="")

    # handle structured abstracts
    abstractParts = []
    abstractTrees = artTree.getXmlAll("Abstract/AbstractText")
    for aEl in abstractTrees:
        label = aEl.getAttr("NlmCategory")
        abstract = ""
        if label != None:
            abstract = "<p>%s</p> " % label
        abstract += aEl.getText()
        abstractParts.append(abstract)
    data["abstract"] = "".join(abstractParts)

    if data["abstract"] == "":
        data["abstract"] = artTree.getTextFirst("OtherAbstract/AbstractText", default="")

    data["authorAffiliations"] = artTree.getTextFirst("Affiliation", default="")
    data["doi"] = artTree.getTextFirst("ELocationID", default="", reqAttrDict={"EIdType": "doi"})
    data["journalUniqueId"] = medlineData.getTextFirst("MedlineJournalInfo/NlmUniqueID")
    linkingIssn = medlineData.getTextFirst("MedlineJournalInfo/ISSNLinking")

    journalTree = artTree.getXmlFirst("Journal")
    data["eIssn"] = journalTree.getTextFirst("ISSN", reqAttrDict={"IssnType": "Electronic"}, default="")
    data["printIssn"] = journalTree.getTextFirst("ISSN", reqAttrDict={"IssnType": "Print"}, default="")
    if linkingIssn != None:
        data["eIssn"] = linkingIssn
        data["printIssn"] = linkingIssn

    data["vol"] = journalTree.getTextFirst("JournalIssue/Volume", default="")
    data["issue"] = journalTree.getTextFirst("JournalIssue/Issue", default="")
    data["year"] = journalTree.getTextFirst("JournalIssue/PubDate/Year", default="")
    if data["year"] == "":
        year = journalTree.getTextFirst("JournalIssue/PubDate/MedlineDate", default="").split()[0]
        if not year.isdigit():
            year = ""
        data["year"] = year
    data["journal"] = journalTree.getTextFirst("Title", default="")
    data["page"] = artTree.getTextFirst("Pagination/MedlinePgn", default="")

    authorList = artTree.getXmlFirst("AuthorList")
    lastNames = []
    initialList = []
    if authorList != None:
        authorTrees = authorList.getXmlAll("Author")
        for authorTree in authorTrees:
            lastName = authorTree.getTextFirst("LastName", default="")
            if lastName == "":
                lastName = authorTree.getTextFirst("CollectiveName", default="")
            lastNames.append(lastName)
            initials = authorTree.getTextFirst("ForeName", default="")
            if initials == "":
                initials = authorTree.getTextFirst("Initials", default="")
            initialList.append(initials)
    authors = [lastNames[i] + ", " + initialList[i]
               for i in range(0, min(len(lastNames), len(initialList)))]
    data["authors"] = "; ".join(authors)

    articleTypeList = artTree.getTextAll("PublicationTypeList/PublicationType")
    articleTypesString = ",".join(articleTypeList)

    articleType = "research-article"
    if "Review" in articleTypeList:
        articleType = "review"
    if "Letter" in articleTypeList:
        articleType = "research-article"

    noResearchArticleTags = [
        "Bibliography", "Biography", "Case Reports", "Webcasts",
        "Dictionary", "Directory", "Editorial", "Festschrift",
        "Patient Education Handout", "Periodical Index", "Portraits",
        "Published Erratum", "Scientific Integrity Review", "Congresses",
    ]
    for noResearchArticleTag in noResearchArticleTags:
        if noResearchArticleTag in articleTypeList:
            articleType = "other"

    data["articleType"] = articleType
    # data["pubmedArticleTypes"] = articleTypesString
    logging.log(5, "pubmedArticleTypes %s, articleType %s" % (articleTypesString, articleType))

    meshDescriptors = []
    meshHeadingList = medlineData.getXmlFirst("MeshHeadingList", default="")
    if meshHeadingList:
        # for meshHeadingDescriptor in meshHeadingList.getTextAll("MeshHeading/DescriptorName", reqAttrDict={"MajorTopicYN":"Y"}):
        for meshHeadingDescriptor in meshHeadingList.getTextAll("MeshHeading/DescriptorName"):
            meshDescriptors.append(meshHeadingDescriptor.strip())
    data["keywords"] = "/".join(meshDescriptors)

    # remove these annoying linebreaks!
    filtData = {}
    for key, val in data.iteritems():
        filtData[key] = val.replace(u"\u2028", " ")
    return filtData
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ get files from inIndexFile, parse Xml,
    write everything to outfile in ascii format
    """
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u"\xbf" in filename:
            logging.info("Found weird character, skipping file")
            continue

        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename == "":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename + ":" + filename

        if pdfString == None:
            pdfNotFound += 1
            logging.error("Could not open pdf or xml file")
            continue

        articleId = int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)
        if articleData == None:
            logging.warn("Parser got no data for %s" % filename)
            continue

        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"] = zipFilename + "/" + filename
        articleData["externalId"] = articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000 * (articleId)) + 1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])
        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
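# Hedged illustration of the two input paths handled above, inferred from the code; the exact
# directory layout on disk is an assumption:
#   zipFilename == ""  -> article read from the "disk" dump via getDiskData(<zipDir>/../disk, filename)
#   zipFilename != ""  -> article read from an update zip via getUpdateData(unzipTmp, zipDir, zipFilename, filename)
# In both cases a PDF is required; rows without a PDF are counted in pdfNotFound and skipped.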