Beispiel #1
0
def createFingerprints(inDir, updateIds=None):
    """Build the fingerprint lookup tables for every article below inDir.

    Each row yielded by pubStore.iterArticleDataDir is fingerprinted three
    times (getFingerprint0/1/2) and each fingerprint is registered with
    addFprint in its own table.

    Args:
        inDir: directory handed to pubStore.iterArticleDataDir.
        updateIds: optional update-id filter, passed through unchanged.

    Returns:
        Tuple (artIds, map0, map1, map2, noIssn, noIssuePage):
        artIds maps articleId -> (externalId, doi, pmid); map0, map1 and
        map2 are the tables filled by addFprint; noIssn and noIssuePage are
        the module-level counters as they stand after the run.

    Raises:
        ValueError: if a row's articleId or pmid cannot be parsed by int().
    """
    map0 = {}  # doi -> articleId
    map1 = {}  # issn/vol/page -> articleId
    map2 = {}  # author, title, year -> articleId
    artIds = {}  # articleId -> (extId, doi, pmid)

    # Reset the module-level counters before the run.
    # NOTE(review): presumably incremented by the getFingerprint* helpers;
    # confirm against their definitions.
    global noIssuePage, noIssn
    noIssuePage = 0
    noIssn = 0

    # Pass the argument to logging instead of pre-formatting the message.
    logging.info("Fingerprinting %s", inDir)
    for row in pubStore.iterArticleDataDir(inDir,
                                           type="articles",
                                           updateIds=updateIds):
        articleId = int(row.articleId)

        addFprint(map0, getFingerprint0(row), articleId)
        addFprint(map1, getFingerprint1(row), articleId)
        addFprint(map2, getFingerprint2(row), articleId)

        # NOTE(review): int() raises ValueError on an empty or non-numeric
        # pmid; confirm the store always provides a numeric value here.
        artIds[articleId] = (row.externalId, row.doi, int(row.pmid))
    return artIds, map0, map1, map2, noIssn, noIssuePage
Beispiel #2
0
def getIssnPmidDict(medlineDir, updateIds, minYear):
    """Collect ISSN -> PMIDs and ISSN -> journal name from Medline articles.

    The print ISSN is used when present, otherwise the electronic ISSN.
    Articles with neither are skipped. When minYear is given, articles whose
    year is all digits and lies before minYear are skipped as well; articles
    with a non-numeric year are kept.

    Args:
        medlineDir: directory handed to pubStore.iterArticleDataDir.
        updateIds: update-id filter, passed through unchanged.
        minYear: earliest publication year to accept, or None for no limit.

    Returns:
        Two dicts: issn -> set of PMIDs (as int) and issn -> journal name
        (transliterated with unidecode). For an ISSN seen several times the
        journal name of the last article wins.
    """
    issnToPmid = defaultdict(set)
    issnToJournal = {}
    pmidCount = 0
    noIssnPmidCount = 0
    noMinYearCount = 0
    logging.info("Reading ISSN/PMID assignment from directory %s", medlineDir)
    for artData in pubStore.iterArticleDataDir(medlineDir, updateIds=updateIds):
        # Prefer the print ISSN, fall back to the electronic one.
        issn = artData.printIssn
        if issn == "":
            issn = artData.eIssn
        if issn == "":
            # Logging arguments are only formatted when debug is enabled.
            logging.debug("PMID %s has not Issn", artData.pmid)
            noIssnPmidCount += 1
            continue

        year = artData.year
        if minYear is not None and year.isdigit() and int(year) < minYear:
            logging.debug("PMID %s is too early", artData.pmid)
            noMinYearCount += 1
            continue

        issnToPmid[issn].add(int(artData.pmid))
        issnToJournal[issn] = unidecode.unidecode(artData.journal)
        pmidCount += 1
    logging.info("Got %d PMIDs for %d ISSNs", pmidCount, len(issnToPmid))
    logging.info("No info for %d PMIDs, %d PMIDs did not fulfill the minYear",
                 noIssnPmidCount, noMinYearCount)
    return issnToPmid, issnToJournal
Beispiel #3
0
def createFingerprints(inDir, updateIds=None):
    """Fingerprint all articles in inDir and return the lookup tables.

    For every row from pubStore.iterArticleDataDir the three fingerprints
    from getFingerprint0, getFingerprint1 and getFingerprint2 are added to
    their respective table through addFprint.

    Args:
        inDir: directory handed to pubStore.iterArticleDataDir.
        updateIds: optional update-id filter, passed through unchanged.

    Returns:
        Tuple (artIds, map0, map1, map2, noIssn, noIssuePage):
        artIds maps articleId -> (externalId, doi, pmid); map0, map1 and
        map2 are the tables filled by addFprint; noIssn and noIssuePage are
        the module-level counters as they stand after the run.

    Raises:
        ValueError: if a row's articleId or pmid cannot be parsed by int().
    """
    map0 = {}   # doi -> articleId
    map1 = {}   # issn/vol/page -> articleId
    map2 = {}   # author, title, year -> articleId
    artIds = {} # articleId -> (extId, doi, pmid)

    # Module-level counters start from zero on every call.
    # NOTE(review): presumably updated by the getFingerprint* helpers;
    # verify against their definitions.
    global noIssuePage, noIssn
    noIssuePage = 0
    noIssn = 0

    # Let logging do the formatting rather than building the string here.
    logging.info("Fingerprinting %s", inDir)
    for row in pubStore.iterArticleDataDir(inDir, type="articles", updateIds=updateIds):
        articleId = int(row.articleId)

        addFprint(map0, getFingerprint0(row), articleId)
        addFprint(map1, getFingerprint1(row), articleId)
        addFprint(map2, getFingerprint2(row), articleId)

        # NOTE(review): int() raises ValueError when pmid is empty or not
        # numeric; confirm the input always carries a numeric pmid.
        artIds[articleId] = (row.externalId, row.doi, int(row.pmid))
    return artIds, map0, map1, map2, noIssn, noIssuePage
Beispiel #4
0
def getIssnPmidDict(medlineDir, updateIds, minYear):
    """Go over Medline articles and map each ISSN to its PMIDs and journal.

    An article's print ISSN is used if it is non-empty, otherwise its
    electronic ISSN. Articles without any ISSN are skipped. If minYear is
    not None, articles with an all-digit year earlier than minYear are
    skipped too; a non-numeric year never causes an article to be dropped.

    Args:
        medlineDir: directory handed to pubStore.iterArticleDataDir.
        updateIds: update-id filter, passed through unchanged.
        minYear: earliest publication year to accept, or None for no limit.

    Returns:
        Two dicts: issn -> set of PMIDs (as int) and issn -> journal name
        (transliterated with unidecode). When an ISSN occurs more than once,
        the journal name of the last article seen is kept.
    """
    issnToPmid = defaultdict(set)
    issnToJournal = {}
    pmidCount = 0
    noIssnPmidCount = 0
    noMinYearCount = 0
    logging.info("Reading ISSN/PMID assignment from directory %s", medlineDir)
    for artData in pubStore.iterArticleDataDir(medlineDir,
                                               updateIds=updateIds):
        # Print ISSN first, electronic ISSN as fallback.
        issn = artData.printIssn
        if issn == "":
            issn = artData.eIssn
        if issn == "":
            # Deferred formatting: no string is built unless debug is on.
            logging.debug("PMID %s has not Issn", artData.pmid)
            noIssnPmidCount += 1
            continue

        year = artData.year
        tooEarly = (minYear is not None and year.isdigit()
                    and int(year) < minYear)
        if tooEarly:
            logging.debug("PMID %s is too early", artData.pmid)
            noMinYearCount += 1
            continue

        issnToPmid[issn].add(int(artData.pmid))
        issnToJournal[issn] = unidecode.unidecode(artData.journal)
        pmidCount += 1
    logging.info("Got %d PMIDs for %d ISSNs", pmidCount, len(issnToPmid))
    logging.info("No info for %d PMIDs, %d PMIDs did not fulfill the minYear",
                 noIssnPmidCount, noMinYearCount)
    return issnToPmid, issnToJournal