def updateSqliteIds(datasetString, artToPmid):
    " update the sqlite db given a list of (articleId, pmid) tuples "
    logging.info("Updating the sqlite DB %s" % datasetString)
    con, cur = pubStore.openArticleDb(datasetString)
    pmidArtIds = [(y, x) for x, y in artToPmid]
    cur.executemany("UPDATE articles SET pmid=? WHERE articleId=?", pmidArtIds)
    con.commit()
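# Example usage (sketch, not part of the original module; the dataset name and
# the articleId/pmid values are made up for illustration). updateSqliteIds()
# takes (articleId, pmid) tuples and swaps them internally to match the
# "SET pmid=? WHERE articleId=?" parameter order:
#
#   artToPmid = [("1000000001", 12345678), ("1000000002", 23456789)]
#   updateSqliteIds("elsevier", artToPmid)
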
def updatePmids(medlineDir, crawlDir, updateIds, minYear=None):
    """ go over subdirs of crawlDir, for each: read the ISSNs, and add new
    PMIDs we have in medlineDir to subdir/pmids.txt
    We never remove a PMID from pmids.txt.
    """
    logging.info("Now updating crawler directories with the new PMIDs")
    eIssnToPIssn = getEIssnToPIssn(pubConf.publisherIssnTable)
    subDirs = getSubdirs(crawlDir)
    con, cur = pubStore.openArticleDb("medline", mustOpen=True, useRamdisk=True)
    for subdir in subDirs:
        if subdir.endswith(".tmp"):
            continue
        subPath = join(crawlDir, subdir)
        logging.info("Processing subdirectory %s" % subPath)
        if isfile(pubCrawlLib.getLockFname(subPath)):
            logging.warn("Found lockfile, looks like a crawl is going on in %s, skipping" % subPath)
            continue

        pmidFname = join(crawlDir, subdir, "pmids.txt")
        issnFname = join(crawlDir, subdir, "issns.tab")
        if not isfile(issnFname) or not isfile(pmidFname):
            logging.info("Skipping %s, ISSN or docId file not found" % subPath)
            continue
        logging.debug("reading subdir %s: %s and %s" % (subdir, pmidFname, issnFname))
        issns = [row.issn.strip() for row in maxCommon.iterTsvRows(issnFname)]
        logging.debug("ISSNs: %s" % ",".join(issns))

        # read old pmids
        oldPmids = set([int(line.rstrip()) for line in open(pmidFname)])
        # add new pmids, for each issn
        newPmids = getPmidsForIssns(con, cur, issns, minYear)

        logging.debug("%d PMIDs" % (len(newPmids)))
        oldCount = len(oldPmids)
        updateCount = len(newPmids)
        oldPmids.update(newPmids)  # faster to add new to old set than old to new set
        pmids = oldPmids
        newCount = len(pmids)
        addCount = newCount - oldCount
        logging.info("crawl dir %s: old PMID count %d, update has %d, new total %d, added %d" % \
            (subdir, oldCount, updateCount, newCount, addCount))

        # write new pmids
        pmids = [str(x) for x in pmids]
        # randomize order, to distribute errors
        random.shuffle(pmids)

        # write all pmids to a tmp file
        pmidTmpFname = pmidFname + ".new"
        pmidFh = open(pmidTmpFname, "w")
        pmidFh.write("\n".join(pmids))
        pmidFh.close()

        # keep a copy of the original pmid file
        shutil.copy(pmidFname, pmidFname + ".bak")
        # atomic rename the tmp file to the original file,
        # to make sure that an intact pmid file always exists
        os.rename(pmidTmpFname, pmidFname)
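# Example invocation (sketch, not part of the original module; the paths and
# minYear value are assumptions for illustration only). updateIds is accepted
# but appears unused in the function body shown here:
#
#   updatePmids("/data/medline", "/data/crawlDir", updateIds=None, minYear=1990)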