def loadTsvSqlite(dbFname, tableName, tsvFnames, headers=None, intFields=[], \
        primKey=None, idxFields=[], dropTable=True):
    " load tab-separated files into an sqlite db table "
    if len(tsvFnames) == 0:
        logging.debug("No filenames to load")
        return
    # if tsvFnames is a single string, make it a list
    if isinstance(tsvFnames, basestring):
        tsvFnames = [tsvFnames]

    # lock the db only while we are building it from scratch
    if os.path.isfile(dbFname):
        lockDb = False
        finalDbFname = None
    else:
        lockDb = True
        finalDbFname = dbFname
    con, cur = openSqlite(dbFname, lockDb=lockDb)

    # drop old table
    if dropTable:
        logging.debug("dropping old sqlite table")
        cur.execute('DROP TABLE IF EXISTS %s;' % tableName)
        con.commit()

    # create table
    createSql, idxSqls = makeTableCreateStatement(tableName, headers, \
        intFields=intFields, idxFields=idxFields, primKey=primKey)
    logging.log(5, "creating table with %s" % createSql)
    cur.execute(createSql)
    con.commit()

    logging.info("Loading data into table")
    tp = maxCommon.ProgressMeter(len(tsvFnames))
    sql = "INSERT INTO %s (%s) VALUES (%s)" % \
        (tableName, ", ".join(headers), ", ".join(["?"] * len(headers)))
    for tsvName in tsvFnames:
        logging.debug("Importing %s" % tsvName)
        if os.path.getsize(tsvName) == 0:
            logging.debug("Skipping %s, zero size" % tsvName)
            continue
        rows = list(maxCommon.iterTsvRows(tsvName))
        logging.log(5, "Running SQL %s against %d rows" % (sql, len(rows)))
        cur.executemany(sql, rows)
        con.commit()
        tp.taskCompleted()

    logging.info("Adding indexes to table")
    for idxSql in idxSqls:
        cur.execute(idxSql)
    con.commit()
    con.close()

    if finalDbFname is not None:
        logging.info("moving over ramdisk db to %s" % dbFname)
        shutil.move(dbFname, finalDbFname)
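# Usage sketch (not part of the original module): load two tab-separated
# chunk files into a single sqlite table. The db name, table name, file
# names and column names below are made up for illustration; the tab files
# must exist and carry the listed columns.
def _exampleLoadTsvSqlite():
    loadTsvSqlite("articles.db", "articles", ["chunk1.tab", "chunk2.tab"], \
        headers=["articleId", "pmid", "title"], intFields=["articleId", "pmid"], \
        primKey="articleId", idxFields=["pmid"])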
def lftpGet(remoteUrl, locDir, fileNames, connCount):
    " use lftp to download files in parallel "
    locDir = os.path.abspath(locDir)
    scriptPath = join(locDir, "lftp.cmd")
    logging.debug("Writing filenames to %s" % scriptPath)
    lFile = open(scriptPath, "w")
    lFile.write("set net:socket-buffer 32000000\n")
    lFile.write("set cmd:parallel %d\n" % int(connCount))
    lFile.write("set xfer:use-temp-file yes\n") # atomic download
    lFile.write("set xfer:clobber yes\n")
    lFile.write("open %s\n" % remoteUrl)
    lFile.write("set xfer:log true\n")
    lFile.write("lcd %s\n" % locDir)

    pm = maxCommon.ProgressMeter(len(fileNames))
    existDirs = set()
    locNames = []
    for f in fileNames:
        locName = join(locDir, f)
        # make sure that the target dir exists
        locFileDir = dirname(locName)
        if locFileDir not in existDirs and not isdir(locFileDir):
            logging.info("Creating directory %s" % locFileDir)
            os.makedirs(locFileDir)
        existDirs.add(locFileDir)
        logging.debug("filename %s" % locName)
        if isfile(locName):
            logging.debug("Already exists: %s, skipping" % locName)
        else:
            lFile.write("get %s -o %s\n" % (f, locName))
            locNames.append(locName)
        pm.taskCompleted()
    lFile.close()

    if find_executable("lftp") is None:
        raise Exception("the command lftp is not in your PATH. "
            "Install it with 'apt-get install lftp' or 'yum install lftp'")

    cmd = ["lftp", "-f", scriptPath]
    logging.debug("Launching lftp for download, cmd %s" % " ".join(cmd))
    ret = subprocess.call(cmd)
    if ret != 0:
        logging.error("error during transfer")
        sys.exit(1)

    logging.debug("Updating downloads.log file in %s" % locDir)
    for f in fileNames:
        appendLog(locDir, "add", f)
    logging.info("Downloaded %d files: %s" % (len(locNames), ",".join(locNames)))
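# Usage sketch (not part of the original module): mirror two files from an
# FTP server with four parallel connections. The URL and file names are
# hypothetical; lftp must be installed and in the PATH.
def _exampleLftpGet():
    lftpGet("ftp://ftp.example.org/pub/data/", "/tmp/mirror", \
        ["readme.txt", "subdir/data1.tab.gz"], 4)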
def addDictsToDbms(mapList, dbmList, articleIds):
    " write fingerprints to dbm files, mapping fingerprint -> pmid "
    assert len(mapList) == len(dbmList)
    fprintType = 0
    for fprints, dbm in zip(mapList, dbmList):
        logging.info("Writing fingerprint %d (0=doi, 1=issn/vol/page, 2=author/year/title)" % \
            fprintType)
        pm = maxCommon.ProgressMeter(len(fprints))
        for fprint, artId in fprints.iteritems():
            artData = articleIds[int(artId)]
            pmid = str(artData[-1]) # the pmid is the last field of the article row
            dbm[str(fprint)] = pmid
            pm.taskCompleted()
        fprintType += 1
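# Usage sketch (not part of the original module): write two small fingerprint
# dicts to dbm files. All fingerprints, ids and file names are made up;
# articleIds maps an articleId to an article row whose last field is the pmid,
# and the dbm objects only need to support string item assignment.
def _exampleAddDictsToDbms():
    import anydbm
    doiMap = {"10.1000/1": "1"}
    issnMap = {"1234-5678|12|100": "1"}
    articleIds = {1: ("someArticleData", "12345")}
    dbms = [anydbm.open("doi.dbm", "c"), anydbm.open("issn.dbm", "c")]
    addDictsToDbms([doiMap, issnMap], dbms, articleIds)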
def concatFiles(inDir, outFname):
    " concatenate all .tab.gz files in inDir and write them to outFname, keeping only the first header line "
    logging.info("Looking for tab.gz files in %s" % inDir)
    inFnames = pubGeneric.findFiles(inDir, ".tab.gz")
    ofh = open(outFname, "w")
    pm = maxCommon.ProgressMeter(len(inFnames))
    logging.info("Concatenating...")
    fno = 0
    for relDir, fn in inFnames:
        lno = 0
        for line in gzip.open(fn):
            # write the header line only once, from the first file
            if lno == 0 and fno == 0:
                ofh.write(line)
            if lno != 0:
                ofh.write(line)
            lno += 1
        pm.taskCompleted()
        fno += 1
    ofh.close()
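# Usage sketch (not part of the original module): merge all .tab.gz chunk
# files under a hypothetical dataset directory into one uncompressed table.
def _exampleConcatFiles():
    concatFiles("/data/text/someDataset", "/tmp/allChunks.tab")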
def writeDicts(mapList, outFname, articleIds):
    """ mapList is a list of dictionaries fingerprint -> id and articleIds is
    a dict id -> article row, where the last field of the row is the pmid.
    Write/append a table fingerprint -> pmid to outFname and return the
    offset where we started to write.
    """
    # can't use gzip, as its "a" mode doesn't support tell()
    if isfile(outFname):
        logging.info("Appending to %s" % outFname)
        #ofh = gzip.open(outFname, "a")
        ofh = open(outFname, "a")
        offset = ofh.tell()
    else:
        logging.info("Creating new file %s" % outFname)
        ofh = open(outFname, "w")
        ofh.write("#fingerprint\tpmid\n")
        offset = 0

    fprintType = 0
    typeDesc = {0: "doi", 1: "issn|vol|issue|page", 2: "author|year|titlewords"}
    for fprints in mapList:
        logging.info("Writing %d fingerprints, type %s" % (len(fprints), typeDesc[fprintType]))
        pm = maxCommon.ProgressMeter(len(fprints))
        for fprint, artId in fprints.iteritems():
            artData = articleIds[int(artId)]
            pmid = str(artData[-1])
            ofh.write("%s\t%s\n" % (fprint, pmid))
            pm.taskCompleted()
        fprintType += 1
    logging.info("Wrote %s" % outFname)
    ofh.close()
    return offset
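# Usage sketch (not part of the original module): append three fingerprint
# dicts (doi, issn|vol|issue|page, author|year|titlewords) to a table file.
# All values and the output file name are made up for illustration.
def _exampleWriteDicts():
    doiMap = {"10.1000/1": "1"}
    issnMap = {"1234-5678|12|3|100": "1"}
    authorMap = {"smith|2005|analysis": "1"}
    articleIds = {1: ("ignored", "12345")}
    offset = writeDicts([doiMap, issnMap, authorMap], "fingerprints.tab", articleIds)
    print "started writing at offset %d" % offset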
def addPmids(datasetString):
    " for a given dataset, add the pmids from the pubFingerprint output file to the article files "
    textDir = pubConf.resolveTextDir(datasetString)
    logging.info("Changing article files in %s" % textDir)
    aToPfn = join(textDir, pubConf.idFname)
    logging.info("Reading art -> pmid mapping from %s" % aToPfn)
    artToPmid = parseIdFname(aToPfn)
    fnames = glob.glob(join(textDir, "*.articles.gz"))
    logging.info("Running on %d article files" % len(fnames))
    pm = maxCommon.ProgressMeter(len(fnames), stepCount=100)
    updateSqliteIds(textDir, artToPmid.items())

    logging.info("Updating tab sep files")
    for fname in fnames:
        # write the header line
        newFname = join(pubConf.TEMPDIR, basename(fname))
        logging.debug("reading %s, writing %s" % (fname, newFname))
        newF = gzip.open(newFname, "w")
        newF.write(gzip.open(fname).readline())

        # write rows, replacing pmids on the way
        for row in maxCommon.iterTsvRows(fname):
            artId = int(row.articleId)
            if artId in artToPmid:
                row = row._replace(pmid=artToPmid[artId])
            newF.write((u'\t'.join(row)).encode("utf8"))
            newF.write("\n")
        newF.close()

        # keep the old file as .bak and move the new one over
        shutil.move(fname, fname + ".bak")
        shutil.move(newFname, fname)
        pm.taskCompleted()
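# Usage sketch (not part of the original module): rewrite the pmid column of
# all article files of one dataset. "crawler" is a hypothetical dataset name
# that pubConf.resolveTextDir() has to be able to resolve to a text directory.
def _exampleAddPmids():
    addPmids("crawler")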
def runReduce(algName, paramDict, path, outFilename, quiet=False, inFnames=None):
    """ parse marshalled dicts from path, run them through the reduce function
    of alg and write the output to one file """
    if outFilename is not None and isfile(outFilename):
        logging.info("deleting existing file %s" % outFilename)
        os.remove(outFilename)

    if isinstance(algName, basestring):
        alg = getAlg(algName, defClass="Map")
    else:
        alg = algName

    if "reduce" not in dir(alg):
        logging.error("There is no reduce() function in %s" % algName)
        sys.exit(1)

    if "startup" in dir(alg):
        alg.startup(paramDict, {})

    if inFnames is not None:
        infiles = inFnames
    elif isfile(path):
        logging.debug("Filename specified, running only on a single file (debugging)")
        infiles = [(dirname(path), path)]
    else:
        infiles = pubGeneric.findFiles(path, [MAPREDUCEEXT])
    if len(infiles) == 0:
        logging.error("Could not find any %s files in %s" % (MAPREDUCEEXT, path))
        sys.exit(1)

    # read marshalled files into the data dict
    data = {}
    fileCount = 0
    logging.info("Reading map output")
    meter = maxCommon.ProgressMeter(len(infiles), quiet=quiet, stepCount=100)
    for relDir, fileName in infiles:
        binData = gzip.open(fileName, "rb").read()
        nodeData = marshal.loads(binData)
        del binData
        for key, values in nodeData.iteritems():
            if not hasattr(values, "__iter__"):
                values = [values]
            # major change: append instead of extend
            # will break existing mr-scripts
            data.setdefault(key, []).append(values)
        fileCount += 1
        logging.debug("Read %s" % fileName)
        meter.taskCompleted()

    logging.info("Writing to %s" % outFilename)
    if outFilename is None:
        ofh = None
    elif outFilename == "stdout":
        ofh = sys.stdout
    else:
        ofh = open(outFilename, "w")

    if "headers" in dir(alg) and ofh is not None:
        ofh.write("\t".join(alg.headers))
        ofh.write("\n")

    if "reduceStartup" in dir(alg):
        logging.info("Running reduceStartup")
        alg.reduceStartup(data, paramDict, ofh)

    logging.info("Running data through reducer")
    meter = maxCommon.ProgressMeter(len(data))
    for key, valList in data.iteritems():
        tupleIterator = alg.reduce(key, valList)
        for tup in tupleIterator:
            if tup is None:
                logging.debug("Got None, not writing anything")
                continue
            if type(tup) == str: # make sure that the returned value is a list
                tup = [tup]
            if type(tup) == int: # make sure that it's a string
                tup = [str(tup)]
            tup = [unicode(x).encode("utf8") for x in tup] # convert to utf8
            if ofh is not None:
                ofh.write("\t".join(tup))
                ofh.write("\n")
        meter.taskCompleted()
    if ofh is not None:
        ofh.close()

    if "reduceEnd" in dir(alg):
        logging.info("Running reduceEnd")
        alg.reduceEnd(data)
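# Usage sketch (not part of the original module): run the reduce step of a
# hypothetical map/reduce script over a directory of map output files and
# write the reduced table to a tab-sep file. "wordCounter" is a made-up
# algorithm name that getAlg() would have to resolve.
def _exampleRunReduce():
    runReduce("wordCounter", {}, "/tmp/mapOutput", "/tmp/wordCounts.tab")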
def createIndexFile(inDir, inFnames, indexFilename, updateId, minId, chunkSize):
    """ write the xml.Meta filenames from inFnames in inDir to indexFilename
    in the format (numId, chunkId, zipName, fileName), starting at id minId.
    Returns the next free articleId after the assigned ones.
    """
    logging.info("Writing to %s" % indexFilename)
    indexFile = open(indexFilename, "w")
    headers = ["articleId", "chunkId", "zipFilename", "filename"]
    indexFile.write("\t".join(headers) + "\n")

    if len(inFnames) == 0:
        logging.info("Nothing to convert, all files are already marked done in updates.tab")
        sys.exit(0)

    numId = minId
    xmlCount = 0
    i = 0
    plainXmlCount = 0
    pm = maxCommon.ProgressMeter(len(inFnames))
    for fname in inFnames:
        inPath = join(inDir, fname)
        i += 1
        chunkId = (numId - minId) / chunkSize
        chunkString = "%d_%05d" % (updateId, chunkId)
        if inPath.lower().endswith(".zip"):
            logging.debug("Indexing %s" % inPath)
            zipFilename = inPath
            # get all relevant names from the zipfile
            try:
                zipNames = zipfile.ZipFile(zipFilename).namelist()
            except zipfile.BadZipfile:
                logging.error("Bad zipfile: %s" % zipFilename)
                continue
            zipRelName = basename(zipFilename)
            for fileName in zipNames:
                if not fileName.endswith(".xml.Meta"):
                    continue
                xmlCount += 1
                data = [str(numId), chunkString, zipRelName, fileName]
                data = [d.encode("utf8") for d in data]
                indexFile.write("\t".join(data) + "\n")
                numId += 1
        else:
            # not a zip file: just append the plain filename to the index file
            assert fname.lower().endswith(".meta")
            data = [str(numId), chunkString, "", fname]
            indexFile.write("\t".join(data) + "\n")
            numId += 1
            plainXmlCount += 1
        pm.taskCompleted()
    indexFile.close()
    logging.info("Processed %d input files: %d xml files inside zip files and %d plain xml files" % \
        (i, xmlCount, plainXmlCount))
    return numId
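# Usage sketch (not part of the original module): index two hypothetical
# download files for update 3, starting at articleId 5000000000 with 2000
# articles per chunk. The input directory and file names are made up; the
# zip file must exist and contain .xml.Meta members.
def _exampleCreateIndexFile():
    nextId = createIndexFile("/data/download", ["batch1.zip", "article2.xml.Meta"], \
        "/tmp/index.tab", 3, 5000000000, 2000)
    print "next free articleId: %d" % nextId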