Exemple #1
0
def getArtDbPath(datasetName):
    """ given a dataset name, return the path of its sqlite article meta-info database,
    or None if the dataset cannot be resolved to a directory """
    dataDir = pubConf.resolveTextDir(datasetName, mustFind=False)
    if dataDir is None:
        return None
    return join(dataDir, "articles.db")
Exemple #2
0
def resolveDatasetDesc(descs):
    """ map a comma-separated list of dataset identifiers (like pmc or elsevier)
    to a list of text directories; raise if any identifier is unknown """
    resolvedDirs = []
    for name in descs.split(","):
        nameDir = pubConf.resolveTextDir(name)
        if nameDir is None:
            raise Exception("Unknown dataset: %s" % name)
        resolvedDirs.append(nameDir)
    return resolvedDirs
Exemple #3
0
def resolveDatasetDesc(descs):
    """ turn a comma-separated string of dataset identifiers (pmc, elsevier, ...)
    into the list of their text directories """
    dirList = []
    for token in descs.split(','):
        tokenDir = pubConf.resolveTextDir(token)
        # an unresolvable identifier is a hard error
        if tokenDir is None:
            raise Exception("Unknown dataset: %s" % token)
        dirList.append(tokenDir)
    return dirList
Exemple #4
0
    def __init__(self, dataset):
        """ store the dataset name, resolve its text directory and make sure the
        pipeline output base directory exists, then set up the batch directories """
        self.dataset = dataset
        textDir = pubConf.resolveTextDir(dataset)
        if textDir is None:
            raise Exception("dataset %s can not be resolved to a directory" % dataset)
        self.textDir = textDir

        # pipeline output goes below the configured base dir; create it if needed
        self.pubMapBaseDir = pubConf.pubMapBaseDir
        maxCommon.mustExistDir(self.pubMapBaseDir, makeDir=True)

        self._defineBatchDirectories()
Exemple #5
0
def findFiles(dataset):
    """ return the full paths of all *.articles.gz files of a dataset.

    dataset is a single dataset identifier that pubConf.resolveTextDir can
    map to a text directory.
    Raises an Exception if the dataset cannot be resolved or the directory
    contains no *.articles.gz files.
    """
    dataDir = pubConf.resolveTextDir(dataset)
    if dataDir is None:
        raise Exception("error in input data spec")
    fnames = glob.glob(join(dataDir, "*.articles.gz"))
    if len(fnames)==0:
        raise Exception("Could not find any *.articles.gz files in %s"% dataDir)

    return fnames
Exemple #6
0
    def __init__(self, dataset):
        """ set up the marker and pipeline base directories; if the dataset spec
        contains several datasets (comma-separated), defer the text dir setup """
        self.markerCountsBase   = MARKERCOUNTSBASE
        self.markerDirBase      = MARKERDIRBASE
        self.pubMapBaseDir = pubConf.pubMapBaseDir
        maxCommon.mustExistDir(pubConf.pubMapBaseDir, makeDir=True)

        self.dataset = dataset
        # a comma means several datasets: full configuration happens later
        if "," in dataset:
            logging.debug("comma in dataset description, deferring config")
            return

        textDir = pubConf.resolveTextDir(dataset)
        if textDir is None:
            raise Exception("dataset %s can not be resolved to a directory" % dataset)
        self.textDir = textDir

        self._defineBatchDirectories()
Exemple #7
0
def getAllUpdateIds(datasets):
    """ collect all available text dataset updateIds for all datasets.

    Returns a dict dataset name -> list of updateId strings, read from the
    updates.tab file in each dataset's text directory.
    """
    # NOTE: an unreachable block after the return used to try caching the result
    # to a JSON file; it referenced an undefined name (batchDir) and passed a file
    # object to json.dumps (which takes no file argument), so it was removed.
    textUpdateIds = {}
    for dataset in datasets:
        textDir = pubConf.resolveTextDir(dataset)
        updateFname = join(textDir, "updates.tab")
        logging.debug("Reading %s" % updateFname)
        # one row per update; keep only the updateId column
        textUpdateIds[dataset] = [row.updateId for row in maxCommon.iterTsvRows(updateFname)]
    return textUpdateIds
Exemple #8
0
def getAllUpdateIds(datasets):
    """ collect the available text updateIds of every dataset.

    For each dataset, reads updates.tab from its text directory and gathers
    the updateId column. Returns {datasetName : [updateId, ...]}.
    """
    # The dead code that previously followed the return (a JSON cache write)
    # was removed: it could never run, referenced the undefined name batchDir,
    # and misused json.dumps where json.dump would be needed.
    textUpdateIds = {}
    for dataset in datasets:
        textDir = pubConf.resolveTextDir(dataset)
        updateFname = join(textDir, "updates.tab")
        logging.debug("Reading %s" % updateFname)
        updateIds = [row.updateId for row in maxCommon.iterTsvRows(updateFname)]
        textUpdateIds[dataset] = updateIds
    return textUpdateIds
Exemple #9
0
    def __init__(self, dataset, outDir):
        """ configure the pipeline directories for a dataset, writing all output
        below outDir.

        If dataset is a comma-separated list of datasets, setup is deferred:
        only self.dataset is stored and the method returns early.
        Raises an Exception if a single dataset name cannot be resolved to a
        text directory.
        """
        self.markerCountsBase   = MARKERCOUNTSBASE
        self.markerDirBase      = MARKERDIRBASE
        # outDir is mandatory; NOTE(review): assert is stripped under -O, so this
        # relies on callers always passing a non-empty outDir
        assert(outDir!=None and outDir!="")
        self.pubMapBaseDir = outDir
        maxCommon.mustExistDir(self.pubMapBaseDir, makeDir=True)
        logging.debug("Main pipeline outdir is %s" % outDir)

        self.dataset = dataset
        # a comma means several datasets: full configuration happens later
        if "," in dataset:
            logging.debug("comma in dataset description, deferring config")
            return

        self.textDir = pubConf.resolveTextDir(dataset)
        if self.textDir==None:
            raise Exception("dataset %s can not be resolved to a directory" % dataset)

        # base dir for dataset
        self.baseDir = join(self.pubMapBaseDir, self.dataset)

        # current batch: self.baseDirBatches is presumably set by a helper or
        # class attribute outside this view -- verify before refactoring
        self.batchId = self._findCurrentBatchDir()
        self.batchDir = join(self.baseDirBatches, str(self.batchId))
        self._defineBatchDirectories()
Exemple #10
0
def addPmids(datasetString):
    """ for a given dataset, add the pmids from the pubFingerprint output file to the article files.

    Rewrites every *.articles.gz file in the dataset's text directory,
    replacing the pmid field using the articleId -> pmid mapping, and also
    updates the sqlite article database. Old files are kept as <name>.bak.
    """
    textDir = pubConf.resolveTextDir(datasetString)
    logging.info("Changing article files in %s" % textDir)
    aToPfn = join(textDir, pubConf.idFname)
    logging.info("Reading art -> pmid mapping from %s" % aToPfn)
    artToPmid = parseIdFname(aToPfn)
    fnames = glob.glob(join(textDir, "*.articles.gz"))
    logging.info("Running on %d article files" % len(fnames))
    pm = maxCommon.ProgressMeter(len(fnames), stepCount=100)
    updateSqliteIds(textDir, artToPmid.items())

    logging.info("Updating tab sep files")
    for fname in fnames:
        # write to a temp copy first, then swap it in below
        newFname = join(pubConf.TEMPDIR, basename(fname))
        logging.debug("reading %s, writing %s" % (fname, newFname))
        newF = gzip.open(newFname, "w")

        # copy the header line; close the reader to avoid leaking a file handle
        inF = gzip.open(fname)
        newF.write(inF.readline())
        inF.close()

        # write rows, replacing pmids on the way
        for row in maxCommon.iterTsvRows(fname):
            artId = int(row.articleId)
            if artId in artToPmid:
                row = row._replace(pmid=artToPmid[artId])
            newF.write((u'\t'.join(row)).encode("utf8"))
            newF.write("\n")
        newF.close()

        # rename old, move over the new one
        shutil.move(fname, fname+".bak")
        shutil.move(newFname, fname)
        pm.taskCompleted()
Exemple #11
0
def addPmids(datasetString):
    """ for a given dataset, add the pmids from the pubFingerprint output file to the article files.

    Every *.articles.gz file of the dataset is rewritten with the pmid field
    filled in from the articleId -> pmid mapping; the sqlite article database
    is updated as well. The original files are preserved as <name>.bak.
    """
    textDir = pubConf.resolveTextDir(datasetString)
    logging.info("Changing article files in %s" % textDir)
    aToPfn = join(textDir, pubConf.idFname)
    logging.info("Reading art -> pmid mapping from %s" % aToPfn)
    artToPmid = parseIdFname(aToPfn)
    fnames = glob.glob(join(textDir, "*.articles.gz"))
    logging.info("Running on %d article files" % len(fnames))
    pm = maxCommon.ProgressMeter(len(fnames), stepCount=100)
    updateSqliteIds(textDir, artToPmid.items())

    logging.info("Updating tab sep files")
    for fname in fnames:
        # build the replacement file in the temp dir, moved into place below
        newFname = join(pubConf.TEMPDIR, basename(fname))
        logging.debug("reading %s, writing %s" % (fname, newFname))
        newF = gzip.open(newFname, "w")

        # carry over the header line; explicitly close the input handle
        # (the original leaked it by never keeping a reference)
        headerFh = gzip.open(fname)
        newF.write(headerFh.readline())
        headerFh.close()

        # write rows, replacing pmids on the way
        for row in maxCommon.iterTsvRows(fname):
            artId = int(row.articleId)
            if artId in artToPmid:
                row = row._replace(pmid=artToPmid[artId])
            newF.write((u'\t'.join(row)).encode("utf8"))
            newF.write("\n")
        newF.close()

        # rename old, move over the new one
        shutil.move(fname, fname + ".bak")
        shutil.move(newFname, fname)
        pm.taskCompleted()
Exemple #12
0
def getArtDbPath(datasetName):
    """ return the path of the sqlite database that holds a dataset's article meta info """
    # resolveTextDir is called without mustFind=False here, so failure handling
    # happens inside pubConf
    textDir = pubConf.resolveTextDir(datasetName)
    return join(textDir, "articles.db")
Exemple #13
0
 def __init__(self):
     """ open the medline fingerprint key-value database """
     textDir = pubConf.resolveTextDir("medline")
     fname = join(textDir, FINGERPRINTFNAME)
     self.db = pubGeneric.openKeyValDb(fname)
     # presumably accumulate articles without a fingerprint / without a match
     # during lookups -- verify against the methods that append to these
     self.noPrints = []
     self.noMatches = []
Exemple #14
0
def filterCmd(inSpec, searchSpec, outSpec, options):
    """ resolve outSpec to a directory, require it to be empty, then submit the filter jobs """
    outDir = pubConf.resolveTextDir(outSpec)
    assert outDir is not None
    maxCommon.mustBeEmptyDir(outDir)
    return submitJobs(inSpec, searchSpec, outDir)
Exemple #15
0
def filterCmd(inSpec, searchSpec, outSpec, options):
    """ map outSpec to an (empty) output directory and hand the filter work off to submitJobs """
    targetDir = pubConf.resolveTextDir(outSpec)
    assert targetDir is not None
    maxCommon.mustBeEmptyDir(targetDir)
    return submitJobs(inSpec, searchSpec, targetDir)
Exemple #16
0
 def __init__(self):
     """ open the key-value database of medline fingerprints """
     textDir = pubConf.resolveTextDir("medline")
     fname = join(textDir, FINGERPRINTFNAME)
     self.db = pubGeneric.openKeyValDb(fname)
     # NOTE(review): these look like collectors for articles that produced no
     # fingerprint / no match -- confirm at the sites that fill them
     self.noPrints = []
     self.noMatches = []