Example #1
def loadTsvSqlite(dbFname, tableName, tsvFnames, headers=None, intFields=[], \
    primKey=None, idxFields=[], dropTable=True):
    " load tabsep file into sqlLite db table "
    # if first parameter is string, make it to a list
    if len(tsvFnames) == 0:
        logging.debug("No filenames to load")
        return
    if isinstance(tsvFnames, basestring):
        tsvFnames = [tsvFnames]

    if os.path.isfile(dbFname):
        lockDb = False
        finalDbFname = None
    else:
        lockDb = True
        finalDbFname = dbFname
    con, cur = openSqlite(dbFname, lockDb=lockDb)

    # drop old table
    if dropTable:
        logging.debug("dropping old sqlite table")
        cur.execute('DROP TABLE IF EXISTS %s;' % tableName)
        con.commit()

    # create table
    createSql, idxSqls = makeTableCreateStatement(tableName, headers, \
        intFields=intFields, idxFields=idxFields, primKey=primKey)
    logging.log(5, "creating table with %s" % createSql)
    cur.execute(createSql)
    con.commit()

    logging.info("Loading data into table")
    tp = maxCommon.ProgressMeter(len(tsvFnames))
    sql = "INSERT INTO %s (%s) VALUES (%s)" % (tableName, ", ".join(headers),
                                               ", ".join(["?"] * len(headers)))
    for tsvName in tsvFnames:
        logging.debug("Importing %s" % tsvName)
        if os.path.getsize(tsvName) == 0:
            logging.debug("Skipping %s, zero size" % tsvName)
            continue
        rows = list(maxCommon.iterTsvRows(tsvName))
        logging.log(5, "Running Sql %s against %d rows" % (sql, len(rows)))
        cur.executemany(sql, rows)
        con.commit()
        tp.taskCompleted()

    logging.info("Adding indexes to table")
    for idxSql in idxSqls:
        cur.execute(idxSql)
        con.commit()

    con.close()

    if finalDbFname != None:
        logging.info("moving over ramdisk db to %s" % dbFname)
        shutil.move(dbFname, finalDbFname)
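A hypothetical call, purely for illustration (the database name, table name, headers and file names below are made up):

# hypothetical usage sketch for loadTsvSqlite; all names are illustrative only
loadTsvSqlite("articles.db", "articles", ["chunk_000.tab", "chunk_001.tab"],
    headers=["articleId", "pmid", "title"], intFields=["articleId"],
    primKey="articleId", idxFields=["pmid"])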
Example #2
def lftpGet(remoteUrl, locDir, fileNames, connCount):
    " use lftp to download files in parallel "
    locDir = os.path.abspath(locDir)
    scriptPath = join(locDir, "lftp.cmd")
    logging.debug("Writing filenames to %s" % scriptPath)
    lFile = open(scriptPath, "w")
    lFile.write("set net:socket-buffer 32000000\n")
    lFile.write("set cmd:parallel %d\n" % int(connCount))
    lFile.write("set xfer:use-temp-file yes\n")  # atomic download
    lFile.write("set xfer:clobber yes\n")
    lFile.write("open %s\n" % remoteUrl)
    lFile.write("set xfer:log true\n")
    lFile.write("lcd %s\n" % locDir)
    pm = maxCommon.ProgressMeter(len(fileNames))
    existDirs = set()
    locNames = []
    for f in fileNames:
        locName = join(locDir, f)
        # make sure that target dir exists
        locFileDir = dirname(locName)
        if locFileDir not in existDirs and not isdir(locFileDir):
            logging.info("Creating directory %s" % locFileDir)
            os.makedirs(locFileDir)
        existDirs.add(locFileDir)

        logging.debug("filename %s" % locName)
        if isfile(locName):
            logging.debug("Already exists: %s, skipping" % locName)
        else:
            lFile.write("get %s -o %s\n" % (f, locName))
            locNames.append(locName)
        pm.taskCompleted()
    lFile.close()

    if find_executable("lftp") is None:
        raise Exception(
            "the command lftp is not in your PATH. Install it wit apt-get install lftp or yum install lftp"
        )

    cmd = ["lftp", "-f", scriptPath]
    logging.debug("Launching lftp for download, cmd %s" % " ".join(cmd))
    ret = subprocess.call(cmd)

    if ret != 0:
        logging.error("error during transfer")
        sys.exit(1)

    logging.debug("Updating downloads.log file in %s" % locDir)
    for f in fileNames:
        appendLog(locDir, "add", f)
    logging.info("Downloaded %d files: %s" %
                 (len(locNames), str(",".join(locNames))))
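A minimal usage sketch, assuming a reachable mirror; the URL and file paths are placeholders:

# hypothetical call: fetch two files with four parallel lftp connections
lftpGet("ftp://ftp.example.org/pub/data/", "/tmp/mirror",
        ["sub/file1.xml.gz", "sub/file2.xml.gz"], 4)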
Example #3
def addDictsToDbms(mapList, dbmList, articleIds):
    " write fingerprints to tab sep file as fingerprint -> pmid "
    assert (len(mapList) == len(dbmList))
    fprintType = 0
    for fprints, dbm in zip(mapList, dbmList):
        logging.info(
            "Writing fingerprint %d (0=doi, 1=issn/vol/page, 2=author/year/title)"
            % fprintType)
        pm = maxCommon.ProgressMeter(len(fprints))
        for fprint, artId in fprints.iteritems():
            artData = articleIds[int(artId)]
            pmid = str(artData[-1])
            dbm[str(fprint)] = pmid
            pm.taskCompleted()
        fprintType += 1
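A hedged usage sketch, assuming the dbm files are opened with Python 2's standard anydbm module; the fingerprint and ids are made up:

# hypothetical call: one dbm per fingerprint type, here only a doi map
import anydbm
doiDbm = anydbm.open("doi.dbm", "c")
addDictsToDbms([{"10.1000/xyz123": 1}], [doiDbm], {1: ("internalArtId", "12345678")})
doiDbm.close()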
Example #4
def concatFiles(inDir, outFname):
    " concat all files in outDir and write to outFname. "
    logging.info("Looking for tab.gz files in %s" % inDir)
    inFnames = pubGeneric.findFiles(inDir, ".tab.gz")
    ofh = open(outFname, "w")
    pm = maxCommon.ProgressMeter(len(inFnames))
    logging.info("Concatting...")
    fno = 0
    for relDir, fn in inFnames:
        lno = 0
        for line in gzip.open(fn):
            # write the header line only once, from the very first file
            if lno==0 and fno==0:
                ofh.write(line)
            # write all data lines, skipping the headers of later files
            if lno!=0:
                ofh.write(line)
            lno += 1
        pm.taskCompleted()
        fno += 1
    ofh.close()
Example #5
def writeDicts(mapList, outFname, articleIds):
    """ mapList is a list of dictionaries fingerprint -> id and articleIds is a
    dict id -> pmid. Write/Append a table fingerprint -> pmid to outFname and return offset
    where we started to write. """
    # can't use gzip as its "a" mode doesn't support tell()
    if isfile(outFname):
        logging.info("Appending to %s" % outFname)
        #ofh = gzip.open(outFname, "a")
        ofh = open(outFname, "a")
        offset = ofh.tell()
    else:
        logging.info("Creating new file %s" % outFname)
        ofh = open(outFname, "w")
        ofh.write("#fingerprint\tpmid\n")
        offset = 0

    fprintType = 0
    typeDesc = {
        0: "doi",
        1: "issn|vol|issue|page",
        2: "author|year|titlewords"
    }
    for fprints in mapList:

        logging.info("Writing %d fingerprints, type %s" %
                     (len(fprints), typeDesc[fprintType]))
        pm = maxCommon.ProgressMeter(len(fprints))
        for fprint, artId in fprints.iteritems():
            artData = articleIds[int(artId)]
            pmid = str(artData[-1])
            ofh.write("%s\t%s\n" % (fprint, pmid))
            pm.taskCompleted()
        fprintType += 1
    logging.info("Wrote %s" % outFname)
    ofh.close()
    return offset
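A hedged usage sketch with made-up fingerprints and ids, only to show the expected shape of the arguments:

# hypothetical call: mapList must follow the order doi, issn|vol|issue|page, author|year|titlewords
doiMap = {"10.1000/xyz123": 1}
issnMap = {"0000-0000|12|3|45": 1}
articleIds = {1: ("internalArtId", "12345678")}   # the last element is the pmid
offset = writeDicts([doiMap, issnMap], "fingerprints.tab", articleIds)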
Example #6
def addPmids(datasetString):
    " for a given dataset, add the pmids from the pubFingerprint output file to the article files "
    #datasetString = args[0]

    textDir = pubConf.resolveTextDir(datasetString)
    logging.info("Changing article files in %s" % textDir)
    aToPfn = join(textDir, pubConf.idFname)
    logging.info("Reading art -> pmid mapping from %s" % aToPfn)
    artToPmid = parseIdFname(aToPfn)
    fnames = glob.glob(join(textDir, "*.articles.gz"))
    logging.info("Running on %d article files" % len(fnames))
    pm = maxCommon.ProgressMeter(len(fnames), stepCount=100)
    updateSqliteIds(textDir, artToPmid.items())
    #sys.exit(0)

    logging.info("Updating tab sep files")
    for fname in fnames:
        # write headers
        newFname = join(pubConf.TEMPDIR, basename(fname))
        logging.debug("reading %s, writing %s" % (fname, newFname))
        newF = gzip.open(newFname, "w")
        newF.write(gzip.open(fname).readline())

        # write rows, replacing pmids on the way
        for row in maxCommon.iterTsvRows(fname):
            artId = int(row.articleId)
            if artId in artToPmid:
                row = row._replace(pmid=artToPmid[artId])
            newF.write((u'\t'.join(row)).encode("utf8"))
            newF.write("\n")
        newF.close()

        # rename old, move over the new one
        shutil.move(fname, fname + ".bak")
        shutil.move(newFname, fname)
        pm.taskCompleted()
Example #7
def runReduce(algName, paramDict, path, outFilename, quiet=False, inFnames=None):
    """ parse pickled dicts from path, run through reduce function of alg and
    write output to one file """

    if outFilename!=None and isfile(outFilename):
        logging.info("deleting existing file %s" % outFilename)
        os.remove(outFilename)

    if isinstance(algName, basestring):
        alg = getAlg(algName, defClass="Map")
    else:
        alg = algName

    if "map" not in dir(alg):
        logging.error("There is not map() function in %s" % algName)
        sys.exit(1)

    if "startup" in dir(alg):
        alg.startup(paramDict, {})

    if inFnames!=None:
        infiles = inFnames
    elif isfile(path):
        logging.debug("Filename specified, running only on a single file (debugging)")
        infiles = [(dirname(path), path)]
    else:
        infiles = pubGeneric.findFiles(path, [MAPREDUCEEXT])

    if len(infiles)==0:
        logging.error("Could not find any %s files in %s" % (MAPREDUCEEXT, path))
        sys.exit(1)

    # read pickle files into data dict
    data = {}
    fileCount = 0
    logging.info("Reading map output")
    meter = maxCommon.ProgressMeter(len(infiles), quiet=quiet, stepCount=100)
    for relDir, fileName in infiles:
        binData = gzip.open(fileName, "rb").read()
        nodeData = marshal.loads(binData)
        del binData
        for key, values in nodeData.iteritems():
            if not hasattr(values, "__iter__"):
                values = [values]
            # major change: append instead of extend
            # will break existing mr-scripts
            data.setdefault(key, []).append(values)
        fileCount += 1
        logging.debug("Reading "+fileName)
        meter.taskCompleted()

    logging.info("Writing to %s" % outFilename)
    if outFilename==None:
        ofh = None
    elif outFilename=="stdout":
        ofh = sys.stdout
    else:
        ofh = open(outFilename, "w")

    if "headers" in dir(alg) and ofh!=None:
        ofh.write("\t".join(alg.headers))
        ofh.write("\n")

    if "reduceStartup" in dir(alg):
        logging.info("Running reduceStartup")
        alg.reduceStartup(data, paramDict, ofh)

    logging.info("Running data through reducer")
    meter = maxCommon.ProgressMeter(len(data))
    for key, valList in data.iteritems():
        tupleIterator = alg.reduce(key, valList)
        for tuple in tupleIterator:
            if tuple==None:
                logging.debug("Got None, not writing anything")
                continue
            if type(tuple)==bytes: # make sure that returned value is a list
                tuple = [tuple]
            if type(tuple)==int: # make sure that it's a string
                tuple = [str(tuple)]
            tuple = [unicode(x).encode("utf8") for x in tuple] # convert to utf8
            if ofh!=None:
                ofh.write("\t".join(tuple))
                ofh.write("\n")
        meter.taskCompleted()
    if ofh!=None:
        ofh.close()

    if "reduceEnd" in dir(alg):
        logging.info("Running reduceEnd")
        alg.reduceEnd(data)
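runReduce only duck-types the alg object; a minimal, hypothetical reducer that satisfies the checks above could look like this (the map() signature is a guess, it is only checked for presence here):

class CountValuesAlg:
    # column names written as the output header
    headers = ["key", "valueCount"]

    def startup(self, paramDict, results):
        pass

    def map(self, *args):
        # not called by runReduce itself, but its presence is required by the dir() check
        pass

    def reduce(self, key, valList):
        # valList is a list of value-lists, one per map output file
        yield [key, sum(len(v) for v in valList)]

# hypothetical invocation; the input path and output file are placeholders
runReduce(CountValuesAlg(), {}, "/path/to/mapOutput", "counts.tab")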
Example #8
def createIndexFile(inDir, inFnames, indexFilename, updateId, minId, chunkSize):
    """ 
    write xml.Meta-filenames in inFnames in inDir to indexFilename in format
    (numId, chunkId, zipName, fileName), starting id is minId 

    returns the last articleId that was assigned
    """
    logging.info("Writing to %s" % indexFilename)
    indexFile = open(indexFilename, "w")
    headers = ["articleId", "chunkId", "zipFilename", "filename"]
    indexFile.write("\t".join(headers)+"\n")

    #logging.debug("Processing these files in %s: %s" % (inDir, inFnames))
    if len(inFnames)==0:
        logging.info("Nothing to convert, all files are already marked done in updates.tab")
        sys.exit(0)

    numId = minId
    xmlCount = 0
    i = 0
    plainXmlCount = 0
    pm = maxCommon.ProgressMeter(len(inFnames))

    for fname in inFnames:
        inPath = join(inDir, fname)
        i+=1
        chunkId = ((numId-minId) / chunkSize)
        chunkString = "%d_%05d" % (updateId, chunkId)

        if inPath.lower().endswith(".zip"):
            logging.debug("Indexing %s" % (inPath))
            zipFilename = inPath
            # get all relevant names from zipfile
            try:
                zipNames = zipfile.ZipFile(zipFilename).namelist()
            except zipfile.BadZipfile:
                logging.error("Bad zipfile: %s" % zipFilename)
                continue
            zipRelName = basename(zipFilename)
            for fileName in zipNames:
                if not fileName.endswith(".xml.Meta"):
                    continue
                xmlCount += 1
                data = [str(numId), chunkString, zipRelName, fileName]
                data = [d.encode("utf8") for d in data]
                indexFile.write("\t".join(data)+"\n")
                numId+=1
        else:
            # just append the filename to the index file
            assert(fname.lower().endswith(".meta"))
            data = [str(numId), chunkString, "", fname]
            indexFile.write("\t".join(data)+"\n")
            numId+=1
            plainXmlCount += 1
        pm.taskCompleted()


    indexFile.close()
    logging.info("Processed %d zip files, with %d xml files in them, and %d plain xml files" % \
        (i, xmlCount, plainXmlCount))
    return numId
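A small worked example of the chunk naming above, with made-up numbers:

# hypothetical numbers, Python 2 integer division as in the code above
minId, chunkSize, updateId, numId = 1000, 500, 3, 2200
chunkId = (numId - minId) / chunkSize        # (2200-1000)/500 = 2
print "%d_%05d" % (updateId, chunkId)        # prints "3_00002"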