def convertOneChunk(gzDir, idFname, inIndexFile, outFile):
    """Convert one index chunk of crawled, gzipped pages to a pubStore file.

    For each row of the index, seek to the article's byte offset in the
    gzipped tsv file named by the row, read one line, convert its HTML
    payload to text dictionaries and write article + file records to outFile.

    gzDir       -- directory containing the gzipped tsv files named in the index
    idFname     -- table of already-converted article ids (parsed, see note below)
    inIndexFile -- tsv index with columns articleId, tsvFile, url, offset
    outFile     -- output store file; log and id filenames are derived from it
    """
    store = pubStore.PubWriterFile(outFile)
    # NOTE(review): donePiis is parsed but never consulted below — the
    # de-duplication against already-done ids appears to be unfinished.
    donePiis = pubGeneric.parseDoneIds(idFname)

    # log to a file placed next to the output file
    outBase = join(dirname(outFile), basename(outFile).split(".")[0])
    logFname = outBase + ".log"
    pubGeneric.setupLogging(__file__, None, logFileName=logFname)

    # idFname is deliberately rebound: from here on it names the id table we write
    idFname = outBase + "_ids.tab"
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\texternalId\n")

    lastTsvFname = None
    tsvFile = None
    pmidFinder = pubCompare.PmidFinder()
    try:
        for row in maxCommon.iterTsvRows(inIndexFile, encoding=None):
            # open file and seek, if necessary: rows for the same tsv file are
            # expected to arrive grouped, so we only reopen on a filename change
            if tsvFile is None or lastTsvFname != row.tsvFile:
                logging.debug("Seeking to %s in tsvfile %s" % (row.offset, row.tsvFile))
                tsvFile = gzip.open(join(gzDir, row.tsvFile))
                tsvFile.seek(int(row.offset))
                lastTsvFname = row.tsvFile
            line = tsvFile.readline()

            # urls prefixed with "!" are flagged as duplicates upstream
            if row.url.startswith("!"):
                logging.info("Ignoring %s, marked as duplicated" % row.url)
                continue

            # fields are: ["articleId", "tsvFile", "url", "offset"]
            fields = line.split("\t")
            url = fields[0]
            logging.debug("Replacing weird bing chars")
            content = fields[-1]
            # the line found at the stored offset must match the index row
            assert url == row.url
            assert len(content) != 0
            url = url.decode("utf8")  # Python 2: byte string -> unicode

            logging.debug("Converting to text")
            content = convertMicrosoft(content)
            artDict, fileDict = convertHtmlToDicts(url, content)
            if artDict is None:
                # full conversion failed: fall back to a minimal conversion
                artDict, fileDict = minimalHtmlToDicts(url, content)
            if artDict is None:
                continue
            artDict["pmid"] = pmidFinder.lookupPmid(artDict)

            # write file and article records; convention: fileId = articleId * 1000
            articleId = int(row.articleId)
            fileId = articleId * 1000
            store.writeFile(articleId, fileId, fileDict)
            store.writeArticle(articleId, artDict)
    finally:
        # fix: close the id file handle (previously leaked) and make sure the
        # store is closed even when an exception escapes the loop
        idFh.close()
        store.close()
def convertOneChunk(gzDir, idFname, inIndexFile, outFile):
    # Convert one index chunk of crawled, gzipped pages to a pubStore file:
    # for each row in index, seek to the article's byte offset in the gzipped
    # tsv named by the row, read one line, convert its HTML payload to text
    # and write article/file records to outFile.
    #
    # gzDir       - directory with the gzipped tsv files referenced by the index
    # inIndexFile - tsv index with columns articleId, tsvFile, url, offset
    # outFile     - output store; log and id filenames are derived from it
    store = pubStore.PubWriterFile(outFile)
    # NOTE(review): donePiis is parsed but never used afterwards — de-dup
    # against already-done ids looks intended but not wired up.
    donePiis = pubGeneric.parseDoneIds(idFname)
    # log to file
    outBase = join(dirname(outFile), basename(outFile).split(".")[0])
    logFname = outBase+".log"
    pubGeneric.setupLogging(__file__, None, logFileName=logFname)
    # idFname is rebound here: it now names the id table we write, not the input
    idFname = outBase+"_ids.tab"
    logging.debug("Writing ids to %s" % idFname)
    # NOTE(review): idFh is never closed and only this header is ever written
    idFh = open(idFname, "w")
    idFh.write("#articleId\texternalId\n")
    lastTsvFname = None
    tsvFile = None
    pmidFinder = pubCompare.PmidFinder()
    for row in maxCommon.iterTsvRows(inIndexFile, encoding=None):
        # open file and seek, if necessry
        # (rows for the same tsv file arrive grouped, so reopen only on change)
        if tsvFile==None or lastTsvFname!=row.tsvFile:
            logging.debug("Seeking to %s in tsvfile %s" % (row.offset, row.tsvFile))
            tsvFile = gzip.open(join(gzDir, row.tsvFile))
            tsvFile.seek(int(row.offset))
            lastTsvFname = row.tsvFile
        line = tsvFile.readline()
        # urls prefixed with "!" are flagged as duplicates upstream
        if row.url.startswith("!"):
            logging.info("Ignoring %s, marked as duplicated" % row.url)
            continue
        #fields are: ["articleId", "tsvFile", "url", "offset"]
        fields = line.split("\t")
        url = fields[0]
        logging.debug("Replacing weird bing chars")
        content = fields[-1]
        # the line found at the stored offset must match the index row
        assert(url==row.url)
        assert(len(content)!=0)
        url = url.decode("utf8")  # Python 2: byte string -> unicode
        logging.debug("Converting to text")
        content = convertMicrosoft(content)
        artDict, fileDict = convertHtmlToDicts(url, content)
        if artDict==None:
            # full conversion failed: fall back to a minimal conversion
            artDict, fileDict = minimalHtmlToDicts(url, content)
        if artDict==None:
            continue
        artDict["pmid"] = pmidFinder.lookupPmid(artDict)
        # write file
        articleId = int(row.articleId)
        fileId = articleId*1000  # convention: fileId = articleId * 1000
        store.writeFile(articleId, fileId, fileDict)
        store.writeArticle(articleId, artDict)
    store.close()
syntax: pubAlg.py <algName> map|reduce <inFile> <outFile> <paramPickleFile>
""")
# NOTE(review): this chunk begins inside a usage docstring opened before the
# visible region; the line above is string content, not code.
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
parser.add_option("-t", "--test", dest="test", action="store_true", help="run tests")
(options, args) = parser.parse_args()

# -t: run the module doctests and exit
if options.test:
    import doctest
    doctest.testmod()
    sys.exit(0)
if options.debug:
    pubConf.debugMode = True
pubGeneric.setupLogging(__file__, options)

# no positional arguments: run the doctests as a smoke test and exit
if len(args)==0:
    doctest.testmod()
    sys.exit(0)

algName, algMethod, inName, outName, paramFile = args

# job parameters arrive as a gzipped, marshal-serialized dict
binData = gzip.open(paramFile, "rb").read()
paramDict = marshal.loads(binData)
for key, val in paramDict.iteritems():  # Python 2 dict iteration
    logging.log(5, "parameter %s = %s" % (key, str(val)))

# resolve the algorithm class; default class name is the capitalized method,
# e.g. "map" -> "Map"
alg = getAlg(algName, defClass=string.capitalize(algMethod))
# NOTE(review): chunk is truncated here — the suite of this "if" lies outside
# the visible region.
if algMethod in ["combine", "processRow"]:
# --- tail of the cluster-submission function (its "def" line lies above the
# visible region): split the index, submit jobs, then finalize the output dir
indexSplitDir = indexFilename + ".tmp.split"
chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)
idFname = pubGeneric.concatIdentifiers(finalOutDir, indexSplitDir, "doneArticles.tab")
submitJobs(runner, inDir, chunkIds, indexSplitDir, idFname, buildDir)
# merge the per-chunk id tables and log files produced by the jobs
pubGeneric.concatDelIdFiles(buildDir, finalOutDir, "%d_ids.tab" % updateId)
pubGeneric.concatDelLogs(buildDir, finalOutDir, "%d.log" % updateId)
if isdir(indexSplitDir):  # necessary? how could it not be there?
    logging.info("Deleting directory %s" % indexSplitDir)
    shutil.rmtree(indexSplitDir)  # got sometimes exception here...
pubStore.moveFiles(buildDir, finalOutDir)
shutil.rmtree(buildDir)
# record this update (id range + processed files) in updates.txt
pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)

# this is a job script, so it is calling itself via parasol/bsub/qsub
if __name__=="__main__":
    parser = optparse.OptionParser("""usage: %prog [options] <inIndexFile> <outFile> - job script to convert a Elsevier fulltext file (given using an index file) from consyn format to ascii""")
    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    (options, args) = parser.parse_args()
    if args==[]:
        parser.print_help()
        exit(1)
    # four positional arguments despite the two-argument usage string above
    zipDir, inIdFile, inIndexFile, outFile = args
    logFname = join(dirname(outFile), basename(outFile).split(".")[0]+".log")
    pubGeneric.setupLogging(__file__, options, logFileName=logFname)
    # fix: pass the arguments in the order declared by
    # convertOneChunk(gzDir, idFname, inIndexFile, outFile) — the previous
    # call swapped inIndexFile and inIdFile, so done-ids would have been
    # parsed from the index file and the index read from the id file
    convertOneChunk(zipDir, inIdFile, inIndexFile, outFile)
# --- tail of the cluster-submission function (its "def" line lies above the
# visible region): clean up temp dirs and finalize the output directory
if isdir(indexSplitDir):  # necessary? how could it not be there?
    logging.info("Deleting directory %s" % indexSplitDir)
    shutil.rmtree(indexSplitDir)  # got sometimes exception here...
pubStore.moveFiles(buildDir, finalOutDir)
shutil.rmtree(buildDir)
# record this update (id range + processed files) in updates.txt
pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)

# this is a job script, so it is calling itself via parasol/bsub/qsub
if __name__ == "__main__":
    parser = optparse.OptionParser(
        """usage: %prog [options] <inIndexFile> <outFile> - job script to convert a Elsevier fulltext file (given using an index file) from consyn format to ascii"""
    )
    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    (options, args) = parser.parse_args()
    if args == []:
        parser.print_help()
        exit(1)
    # four positional arguments despite the two-argument usage string above
    zipDir, inIdFile, inIndexFile, outFile = args
    logFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".log")
    pubGeneric.setupLogging(__file__, options, logFileName=logFname)
    # NOTE(review): argument order looks swapped — convertOneChunk is declared
    # as (gzDir, idFname, inIndexFile, outFile), yet inIndexFile is passed in
    # the idFname position and inIdFile in the inIndexFile position; confirm
    # against the command line built by submitJobs.
    convertOneChunk(zipDir, inIndexFile, inIdFile, outFile)
# === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = optparse.OptionParser("""usage: %prog [options] publisher dir - add publisher field to article tables """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages") (options, args) = parser.parse_args() # ==== FUNCTIONs ===== # ----------- MAIN -------------- if args==[]: parser.print_help() exit(1) pubGeneric.setupLogging(progFile, options) publisher, inDir= args[:3] if isfile(inDir): inFnames = [inDir] else: inFnames = glob.glob(join(inDir, "*.articles.gz")) for inFname in inFnames: logging.info("Reading %s" % inFname) headerLine = gzip.open(inFname).readline() if "publisher" in headerLine: logging.info("%s is OK" % inFname) continue
# NOTE(review): chunk begins mid-call — the next line holds the trailing
# keyword arguments of a parser.add_option("-d", "--debug", ...) call that
# was opened before the visible region.
action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
(options, args) = parser.parse_args()

# ==== FUNCTIONs =====

# ----------- MAIN --------------
if args == []:
    parser.print_help()
    exit(1)

pubGeneric.setupLogging(progFile, options)

# NOTE(review): two names unpacked from a three-element slice — this works
# only while at most two positional arguments are given; probably args[:2].
publisher, inDir = args[:3]

# accept either a single .articles.gz file or a directory of them
if isfile(inDir):
    inFnames = [inDir]
else:
    inFnames = glob.glob(join(inDir, "*.articles.gz"))

for inFname in inFnames:
    logging.info("Reading %s" % inFname)
    # peek at the gzipped header line to see if the publisher column exists
    headerLine = gzip.open(inFname).readline()
    if "publisher" in headerLine:
        logging.info("%s is OK" % inFname)
        continue
# NOTE(review): chunk truncated here — the rest of the loop body lies outside
# the visible region.