Example 1
# stdlib imports needed by this snippet; pubStore, pubGeneric, pubCompare
# and maxCommon are pubMunch project modules
import gzip, logging
from os.path import join, dirname, basename
import pubStore, pubGeneric, pubCompare, maxCommon

def convertOneChunk(gzDir, idFname, inIndexFile, outFile):
    # for each row in index:
    store = pubStore.PubWriterFile(outFile)
    donePiis = pubGeneric.parseDoneIds(idFname)

    # log to file
    outBase = join(dirname(outFile), basename(outFile).split(".")[0])
    logFname = outBase + ".log"
    pubGeneric.setupLogging(__file__, None, logFileName=logFname)

    idFname = outBase + "_ids.tab"
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\texternalId\n")

    lastTsvFname = None
    tsvFile = None
    pmidFinder = pubCompare.PmidFinder()
    for row in maxCommon.iterTsvRows(inIndexFile, encoding=None):
        # open the file and seek, if necessary
        if tsvFile is None or lastTsvFname != row.tsvFile:
            logging.debug("Seeking to %s in tsvfile %s" %
                          (row.offset, row.tsvFile))
            tsvFile = gzip.open(join(gzDir, row.tsvFile))
            tsvFile.seek(int(row.offset))
        lastTsvFname = row.tsvFile

        line = tsvFile.readline()

        if row.url.startswith("!"):
            logging.info("Ignoring %s, marked as duplicated" % row.url)
            continue
        # index row fields are ["articleId", "tsvFile", "url", "offset"];
        # in the data line itself, the url is the first field and the
        # content is the last
        fields = line.split("\t")
        url = fields[0]
        logging.debug("Replacing weird bing chars")
        content = fields[-1]
        assert (url == row.url)
        assert (len(content) != 0)
        url = url.decode("utf8")

        logging.debug("Converting to text")
        content = convertMicrosoft(content)
        artDict, fileDict = convertHtmlToDicts(url, content)
        if artDict is None:
            artDict, fileDict = minimalHtmlToDicts(url, content)
        if artDict is None:
            continue
        artDict["pmid"] = pmidFinder.lookupPmid(artDict)
        # write file
        articleId = int(row.articleId)
        fileId = articleId * 1000
        store.writeFile(articleId, fileId, fileDict)
        store.writeArticle(articleId, artDict)
    store.close()
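
The core trick in this example is the index-plus-offset pattern: the index
file stores a byte offset into the *decompressed* stream of the gzipped TSV,
so the converter can seek straight to a record instead of scanning the whole
file. A minimal self-contained sketch of that pattern (Python 2; the file
name and data are made up for illustration):

import gzip

# build a tiny gzipped TSV, remembering the offset of each line
offsets = {}
fh = gzip.open("demo.tsv.gz", "w")
pos = 0
for artId, url in [("1", "http://a.example"), ("2", "http://b.example")]:
    line = artId + "\t" + url + "\n"
    offsets[artId] = pos
    fh.write(line)
    pos += len(line)
fh.close()

# later: jump straight to article 2 without reading article 1
fh = gzip.open("demo.tsv.gz")
fh.seek(offsets["2"])
print fh.readline().rstrip()   # -> 2<TAB>http://b.example
fh.close()
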
Example 2
    syntax: pubAlg.py <algName> map|reduce <inFile> <outFile> <paramPickleFile>
    """)
    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
    parser.add_option("-t", "--test", dest="test", action="store_true", help="run tests")

    (options, args) = parser.parse_args()
    if options.test:
        import doctest
        doctest.testmod()
        sys.exit(0)

    if options.debug:
        pubConf.debugMode = True

    pubGeneric.setupLogging(__file__, options)

    if len(args) == 0:
        import doctest
        doctest.testmod()
        sys.exit(0)

    algName, algMethod, inName, outName, paramFile = args

    binData = gzip.open(paramFile, "rb").read()
    paramDict = marshal.loads(binData)
    for key, val in paramDict.iteritems():
        logging.log(5, "parameter %s = %s" % (key, str(val)))

    alg = getAlg(algName, defClass=string.capitalize(algMethod))

    if algMethod in ["combine", "processRow"]:
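
For context, the parameter file read above is just a marshal dump wrapped in
gzip. A minimal sketch of both sides of that round trip (the file name and
keys are invented for illustration; pubMunch's actual writer may differ):

import gzip, marshal

paramDict = {"publisher": "elsevier", "minId": 5000000000}
gzip.open("params.marshal.gz", "wb").write(marshal.dumps(paramDict))

# reading it back mirrors the snippet above
binData = gzip.open("params.marshal.gz", "rb").read()
assert marshal.loads(binData) == paramDict
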
Example 3
    indexSplitDir = indexFilename+".tmp.split"
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)
    idFname = pubGeneric.concatIdentifiers(finalOutDir, indexSplitDir, "doneArticles.tab")

    submitJobs(runner, inDir, chunkIds, indexSplitDir, idFname, buildDir)

    pubGeneric.concatDelIdFiles(buildDir, finalOutDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(buildDir, finalOutDir, "%d.log" % updateId)

    if isdir(indexSplitDir): # necessary? how could it not be there?
        logging.info("Deleting directory %s" % indexSplitDir)
        shutil.rmtree(indexSplitDir) # sometimes got an exception here...
    pubStore.moveFiles(buildDir, finalOutDir)
    shutil.rmtree(buildDir)

    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)

# this is a job script, so it is calling itself via parasol/bsub/qsub
if __name__=="__main__":
    parser = optparse.OptionParser("""usage: %prog [options] <zipDir> <inIdFile> <inIndexFile> <outFile> - job script to convert an Elsevier fulltext file (given using an index file) from consyn format to ascii""")
    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    (options, args) = parser.parse_args()
    if args == []:
        parser.print_help()
        exit(1)

    zipDir, inIdFile, inIndexFile, outFile = args
    logFname = join(dirname(outFile), basename(outFile).split(".")[0]+".log")
    pubGeneric.setupLogging(__file__, options, logFileName=logFname)
    convertOneChunk(zipDir, inIdFile, inIndexFile, outFile)
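
The driver above follows a split/submit/merge pattern: split the index file
on chunk id, run one job per chunk, then concatenate the per-chunk id tables
and logs. A self-contained sketch of what the split step could look like,
standing in for pubStore.splitTabFileOnChunkId (assumptions: plain TSV input,
key in the first column; the real implementation may differ):

import os

def splitTabFileOnColumn(inFname, outDir, colIdx=0):
    # distribute the rows of a TSV into one file per value of a key
    # column, repeating the header line in every chunk file
    if not os.path.isdir(outDir):
        os.makedirs(outDir)
    fh = open(inFname)
    headerLine = fh.readline()
    outFhs = {}
    for line in fh:
        key = line.split("\t")[colIdx].strip()
        if key not in outFhs:
            outFhs[key] = open(os.path.join(outDir, key + ".tab"), "w")
            outFhs[key].write(headerLine)
        outFhs[key].write(line)
    for ofh in outFhs.values():
        ofh.close()
    return sorted(outFhs.keys())
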
Example 4
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] publisher dir - add publisher field to article tables

""")

parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") 
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages") 
(options, args) = parser.parse_args()

# ==== FUNCTIONS =====
# ----------- MAIN --------------
if args == []:
    parser.print_help()
    exit(1)

pubGeneric.setupLogging(progFile, options)

publisher, inDir = args[:2]

if isfile(inDir):
    inFnames = [inDir]
else:
    inFnames = glob.glob(join(inDir, "*.articles.gz"))

for inFname in inFnames:
    logging.info("Reading %s" % inFname)
    headerLine = gzip.open(inFname).readline()
    if "publisher" in headerLine:
        logging.info("%s is OK" % inFname)
        continue
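
The example stops where the actual rewrite would begin. One plausible
completion (not the original code): stream each gzipped table, append a
publisher column, and replace the original file. In the loop above this
would run right after the "publisher" check decides a file needs fixing.

import gzip, os

def addPublisherColumn(inFname, publisher):
    # rewrite a gzipped tab-separated table with an extra publisher column
    tmpFname = inFname + ".tmp.gz"
    inFh = gzip.open(inFname)
    outFh = gzip.open(tmpFname, "w")
    headerLine = inFh.readline()
    outFh.write(headerLine.rstrip("\n") + "\tpublisher\n")
    for line in inFh:
        outFh.write(line.rstrip("\n") + "\t" + publisher + "\n")
    inFh.close()
    outFh.close()
    os.rename(tmpFname, inFname)
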