Beispiel #1
0
def updateSqliteIds(datasetString, artToPmid):
    " update the sqlite db given a list of (articleId, pmid) tuples "
    logging.info("Updating the sqlite DB %s" % datasetString)
    con, cur = pubStore.openArticleDb(datasetString)
    pmidArtIds = [(y,x) for x,y in artToPmid]
    cur.executemany("UPDATE articles SET pmid=? WHERE articleId=?", pmidArtIds)
    con.commit()
Beispiel #2
0
def updateSqliteIds(datasetString, artToPmid):
    " update the sqlite db given a list of (articleId, pmid) tuples "
    logging.info("Updating the sqlite DB %s" % datasetString)
    con, cur = pubStore.openArticleDb(datasetString)
    pmidArtIds = [(y, x) for x, y in artToPmid]
    cur.executemany("UPDATE articles SET pmid=? WHERE articleId=?", pmidArtIds)
    con.commit()
Beispiel #3
0
def updatePmids(medlineDir, crawlDir, updateIds, minYear=None):
    """ go over subdirs of crawlDir, for each: read the ISSNs, and add new
    PMIDs we have in medlineDir to subdir/pmids.txt
    We never remove a PMID from pmids.txt.
    """ 
    logging.info("Now updating crawler directories with the new PMIDs")
    eIssnToPIssn = getEIssnToPIssn(pubConf.publisherIssnTable)
    subDirs = getSubdirs(crawlDir)
    con, cur = pubStore.openArticleDb("medline", mustOpen=True, useRamdisk=True)
    for subdir in subDirs:
        if subdir.endswith(".tmp"):
            continue
        subPath = join(crawlDir, subdir)
        logging.info("Processing subdirectory %s" % subPath)
        if isfile(pubCrawlLib.getLockFname(subPath)):
            logging.warn("Found lockfile, looks like a crawl is going on in %s, skipping" % subPath)
            continue

        pmidFname = join(crawlDir, subdir, "pmids.txt")
        issnFname = join(crawlDir, subdir, "issns.tab")
        if not isfile(issnFname) or not isfile(pmidFname):
            logging.info("Skipping %s, ISSN or docId file not found" % subPath)
            continue
        logging.debug("reading subdir %s: %s and %s" % (subdir, pmidFname, issnFname))
        issns = [row.issn.strip() for row in maxCommon.iterTsvRows(issnFname)]
        logging.debug("ISSNs: %s" % ",".join(issns))
        # read old pmids
        oldPmids = set([int(line.rstrip()) for line in open(pmidFname)])
        #newPmids = set()
        # add new pmids, for each issn
        newPmids = getPmidsForIssns(con, cur, issns, minYear)

        logging.debug("%d PMIDs" % (len(newPmids)))
        oldCount = len(oldPmids)
        updateCount = len(newPmids)
        oldPmids.update(newPmids) # faster to add new to old set than old to new set

        pmids = oldPmids
        newCount = len(pmids)
        addCount = newCount - oldCount
        logging.info("crawl dir %s: old PMID count %d, update has %d, new total %d, added %d" % \
            (subdir, oldCount, updateCount, newCount, addCount))

        # write new pmids
        pmids = [str(x) for x in pmids]
        # randomize order, to distribute errors
        random.shuffle(pmids)

        # write all pmids to a tmp file
        pmidTmpFname = pmidFname+".new"
        pmidFh = open(pmidTmpFname, "w")
        pmidFh.write("\n".join(pmids))
        pmidFh.close()

        # keep a copy of the original pmid file
        shutil.copy(pmidFname, pmidFname+".bak")
        # atomic rename  the tmp file to the original file
        # to make sure that an intact pmid file always exists
        os.rename(pmidTmpFname, pmidFname)
Beispiel #4
0
def updatePmids(medlineDir, crawlDir, updateIds, minYear=None):
    """ go over subdirs of crawlDir, for each: read the ISSNs, and add new
    PMIDs we have in medlineDir to subdir/pmids.txt
    We never remove a PMID from pmids.txt.
    """
    logging.info("Now updating crawler directories with the new PMIDs")
    eIssnToPIssn = getEIssnToPIssn(pubConf.publisherIssnTable)
    subDirs = getSubdirs(crawlDir)
    con, cur = pubStore.openArticleDb("medline",
                                      mustOpen=True,
                                      useRamdisk=True)
    for subdir in subDirs:
        if subdir.endswith(".tmp"):
            continue
        subPath = join(crawlDir, subdir)
        logging.info("Processing subdirectory %s" % subPath)
        if isfile(pubCrawlLib.getLockFname(subPath)):
            logging.warn(
                "Found lockfile, looks like a crawl is going on in %s, skipping"
                % subPath)
            continue

        pmidFname = join(crawlDir, subdir, "pmids.txt")
        issnFname = join(crawlDir, subdir, "issns.tab")
        if not isfile(issnFname) or not isfile(pmidFname):
            logging.info("Skipping %s, ISSN or docId file not found" % subPath)
            continue
        logging.debug("reading subdir %s: %s and %s" %
                      (subdir, pmidFname, issnFname))
        issns = [row.issn.strip() for row in maxCommon.iterTsvRows(issnFname)]
        logging.debug("ISSNs: %s" % ",".join(issns))
        # read old pmids
        oldPmids = set([int(line.rstrip()) for line in open(pmidFname)])
        #newPmids = set()
        # add new pmids, for each issn
        newPmids = getPmidsForIssns(con, cur, issns, minYear)

        logging.debug("%d PMIDs" % (len(newPmids)))
        oldCount = len(oldPmids)
        updateCount = len(newPmids)
        oldPmids.update(
            newPmids)  # faster to add new to old set than old to new set

        pmids = oldPmids
        newCount = len(pmids)
        addCount = newCount - oldCount
        logging.info("crawl dir %s: old PMID count %d, update has %d, new total %d, added %d" % \
            (subdir, oldCount, updateCount, newCount, addCount))

        # write new pmids
        pmids = [str(x) for x in pmids]
        # randomize order, to distribute errors
        random.shuffle(pmids)

        # write all pmids to a tmp file
        pmidTmpFname = pmidFname + ".new"
        pmidFh = open(pmidTmpFname, "w")
        pmidFh.write("\n".join(pmids))
        pmidFh.close()

        # keep a copy of the original pmid file
        shutil.copy(pmidFname, pmidFname + ".bak")
        # atomic rename  the tmp file to the original file
        # to make sure that an intact pmid file always exists
        os.rename(pmidTmpFname, pmidFname)