def loadDicts(prefix = 'cmj'):
    """Load document-id and token-id mappings for the given corpus prefix.

    Populates the module-level globals:
      docids   -- dict mapping int -> internal document id
      rdocids  -- inverse dict mapping internal document id -> int
      tokenids -- dict mapping int -> token string
    prefix -- filename prefix of the '*_docids.txt' / '*_tokenids.txt' files.
    """
    global docids, rdocids, tokenids
    # use 'with' so the data files are closed instead of leaking the handles;
    # avoid shadowing the builtin 'id' in the comprehensions
    with open(common.dataFile(prefix + '_docids.txt')) as fin:
        pairs = [line.strip().split('\t') for line in fin]
    docids = dict([(int(num), val) for num, val in pairs]) # map int -> int_id
    rdocids = dict([(v, k) for k, v in docids.iteritems()]) # map int_id -> int
    with codecs.open(common.dataFile(prefix + '_tokenids.txt'), 'r', 'utf8') as fin:
        pairs = [line.strip().split('\t') for line in fin]
    tokenids = dict([(int(num), val) for num, val in pairs])
def create_refsdb(baseDir, downloadMSC = False):
    """Read references from baseDir files and merge them together into an ArticleDB database.

    Also checks for consistency, prints some stats and stores the resulting
    database on disk as 'refs_baseDir.pdl'.

    baseDir -- input directory scanned for references.xml files
    downloadMSC -- if True, also attempt to download MSC codes per reference
                   and checkpoint the partial database after every file
    """
    import cPickle
    reffiles = utils_iddb.getRefFiles(common.inputPath(baseDir))
    # store filesystem paths to references.xml files; binary mode ('wb') is
    # required for pickle protocol -1, and 'with' closes the handle promptly
    with open(common.dataFile('reffiles.pkl'), 'wb') as fout:
        cPickle.dump(reffiles, fout, protocol = -1)
    refdb = {}
    for reffile, refid in reffiles.iteritems():
        refs = utils_iddb.parseReferences(reffile, downloadMSC = downloadMSC)
        refdb[refid] = refs
        logging.info('id=%s: retrieved %i references' % (refid, len(refs)))
        if downloadMSC:
            # downloads are slow/flaky -- dump database after each iteration
            with open(common.dataFile('refs_partial.pkl'), 'wb') as fout:
                cPickle.dump(refdb, fout, protocol = -1)
    # final, complete dump of the reference database
    with open(common.dataFile('refs.pkl'), 'wb') as fout:
        cPickle.dump(refdb, fout, protocol = -1)
    # print some statistics
    logging.info("%i MSC download attempts (%i ok, %i failed)" % (utils_iddb.attempts_all, utils_iddb.attempts_success, utils_iddb.attempts_failed))
    logging.info('reference database size: %i references in %i articles' % (sum(len(refs) for refs in refdb.itervalues()), len(refdb)))

    db = ArticleDB.ArticleDB(common.dbFile('refs', baseDir), mode = 'override')
    insert_errors = 0
    for refid, reflist in refdb.iteritems():
        for num, ref in enumerate(reflist):
            # references.xml reference counting starts with '1'
            ref.id_int = refid + ':' + str(num + 1)
            if not db.insertArticle(ref):
                insert_errors += 1
    db.commit()
    logging.info('resulting database has %i records (originally %i)' % (len(db), sum(len(refs) for refs in refdb.itervalues())))
    logging.info('detected %i inconsistency collisions' % insert_errors)
# Beispiel #3
# 0
def makeDefs():
    """Parse MSC definitions from mscdefs.txt into a dict keyed by the
    2-digit top-level MSC code, pickle the dict to mscdefs.pkl and return it."""
    result = {}
    with open(common.dataFile('mscdefs.txt')) as deffile:
        for mscdef in deffile:
            # first two characters of each line are the top-level MSC code
            result[mscdef[:2]] = mscdef
    import cPickle
    # binary mode for pickle output; 'with' guarantees the handle is closed
    with open(common.dataFile('mscdefs.pkl'), 'wb') as fout:
        cPickle.dump(result, fout)
    return result
def loadDicts(prefix='cmj'):
    """Load document-id and token-id mappings for the given corpus prefix.

    NOTE(review): duplicate of an earlier loadDicts definition in this file.

    Populates the module-level globals:
      docids   -- dict mapping int -> internal document id
      rdocids  -- inverse dict mapping internal document id -> int
      tokenids -- dict mapping int -> token string
    """
    global docids, rdocids, tokenids
    # close the data files deterministically instead of leaking the handles;
    # avoid shadowing the builtin 'id' in the comprehensions
    with open(common.dataFile(prefix + '_docids.txt')) as fin:
        pairs = [line.strip().split('\t') for line in fin]
    docids = dict([(int(num), val) for num, val in pairs])  # map int -> int_id
    rdocids = dict([(v, k)
                    for k, v in docids.iteritems()])  # map int_id -> int
    with codecs.open(common.dataFile(prefix + '_tokenids.txt'), 'r',
                     'utf8') as fin:
        pairs = [line.strip().split('\t') for line in fin]
    tokenids = dict([(int(num), val) for num, val in pairs])
def getArts(fname, minFreq):
    """Load English articles from an ArticleDB file and keep only those with
    exactly one (sufficiently frequent) MSC class and a non-empty body.

    fname -- database filename (resolved via common.dataFile)
    minFreq -- minimum article count an MSC class needs to be kept
    Returns the filtered list of Article objects.
    """
    db = ArticleDB.ArticleDB(common.dataFile(fname), mode = 'open')
    import ipyutils
    language = 'eng'
    ipyutils.loadDicts(prefix = 'gensim_' + language)
    arts = [Article.Article(rec) for rec in db.db if rec['language'] == language]
    for art in arts:
        # normalize MSC codes to their unique 2-digit top-level prefixes
        art.msc = list(set(mscs.niceMSC(msc, prefix = 2)[0] for msc in art.msc))
    logging.info('loaded %i articles from %s' % (len(arts), fname))
    arts = [art for art in arts if len(art.msc)==1 and art.body and art.id_int in ipyutils.rdocids]
    logging.info('extracted %i articles with exactly one MSC and non-empty body' % (len(arts)))
    okmsc = mscs.getFreqMSCs(arts, minFreq, useMains = True)
    mscs.printStats(arts)
    for art in arts:
        # clean text: decode utf8 -> drop/remap chars via BAD_MAP -> re-encode
        art.body = art.body.decode('utf8')
#        art.body = art.body.decode('ascii', 'ignore')
        art.body = art.body.translate(BAD_MAP).encode('utf8')
        if art.title:
            art.title = art.title.decode('utf8')
    #        art.title = art.title.decode('ascii', 'ignore')
            art.title = art.title.translate(BAD_MAP).encode('utf8')
        art.msc = [mscs.niceMSC(art.msc[0], prefix = 2)[0]]
    # keep only articles whose single MSC class is frequent enough
    arts = [art for art in arts if art.msc[0] in okmsc]
    allmsc, mainmsc = mscs.getMSCCnts(arts)
    for msc, mscarts in allmsc.iteritems():
        logging.info("class %s: %i articles" % (msc, len(mscarts)))
    logging.info("======================")
    logging.info("sum: %i articles" % sum(len(mscarts) for mscarts in allmsc.itervalues()))

    logging.debug('using %i articles from all %s msc classes that are covered by at least %i articles' % (len(arts), sorted(list(okmsc)), minFreq))
    return arts
def loadArts(dbFile = 'main_cmj.pdl'):
    """Open the article database and keep only the articles whose internal id
    appears in the rdocids mapping; results go into the globals db and arts."""
    import ArticleDB
    import Article
    import common
    global db, arts
    db = ArticleDB.ArticleDB(common.dataFile(dbFile), mode = 'open')
    arts = []
    for record in db.db:
        if record['id_int'] in rdocids:
            arts.append(Article.Article(record))
def saveAsCorpus(arts, fname, fnc = lambda c: c):
    """Serialize articles into a simple XML corpus file.

    arts -- iterable of Article objects
    fname -- output filename (resolved via common.dataFile)
    fnc -- optional transformation applied to title/body text before writing
    """
    logging.info('saving corpus as %s' % fname)
    f = open(common.dataFile(fname), 'w')
    f.write('<?xml version="1.0" encoding="utf-8" ?>\n')
    f.write('<articles>\n')
    for art in arts:
        f.write('<article id="%s" lang="%s">\n' % (art.id_int, art.language))
        if art.msc:
            f.write('<category>\n')
#            assert len(art.msc) == 1
            # only the first MSC code is emitted
            f.write(art.msc[0])
            f.write('\n</category>\n')
        if art.title:
            f.write('<title>\n')
            f.write(fnc(art.title))
            f.write('\n</title>\n')
        if art.body:
            f.write('<text>\n')
            f.write(fnc(art.body))
            f.write('\n</text>\n')
        if art.references:
            f.write('<references>\n')
            # NOTE(review): writes art.body here, not art.references -- looks
            # like a copy-paste bug; confirm the intended element content
            f.write(art.body)
            f.write('\n</references>\n')
        f.write('</article>\n')
    f.write('</articles>\n')
    f.close()
def loadArts(dbFile='main_cmj.pdl'):
    """Open the article database and keep only articles whose internal id is
    known to the rdocids mapping; stores the results in the globals db, arts.

    NOTE(review): duplicate of an earlier loadArts definition in this file.
    """
    import ArticleDB
    import Article
    import common
    global db, arts
    db = ArticleDB.ArticleDB(common.dataFile(dbFile), mode='open')
    selected = []
    for rec in db.db:
        if rec['id_int'] in rdocids:
            selected.append(Article.Article(rec))
    arts = selected
# Beispiel #9
# 0
def saveAsCorpus(arts, fname, fnc=lambda c: c):
    """Serialize articles into a simple XML corpus file.

    NOTE(review): duplicate of an earlier saveAsCorpus definition in this file.

    arts -- iterable of Article objects
    fname -- output filename (resolved via common.dataFile)
    fnc -- optional transformation applied to title/body text before writing
    """
    logging.info('saving corpus as %s' % fname)
    f = open(common.dataFile(fname), 'w')
    f.write('<?xml version="1.0" encoding="utf-8" ?>\n')
    f.write('<articles>\n')
    for art in arts:
        f.write('<article id="%s" lang="%s">\n' % (art.id_int, art.language))
        if art.msc:
            f.write('<category>\n')
            #            assert len(art.msc) == 1
            # only the first MSC code is emitted
            f.write(art.msc[0])
            f.write('\n</category>\n')
        if art.title:
            f.write('<title>\n')
            f.write(fnc(art.title))
            f.write('\n</title>\n')
        if art.body:
            f.write('<text>\n')
            f.write(fnc(art.body))
            f.write('\n</text>\n')
        if art.references:
            f.write('<references>\n')
            # NOTE(review): writes art.body here, not art.references -- looks
            # like a copy-paste bug; confirm the intended element content
            f.write(art.body)
            f.write('\n</references>\n')
        f.write('</article>\n')
    f.write('</articles>\n')
    f.close()
# Beispiel #10
# 0
def loadMsc2Id(lang):
    """Load the serialized MSC-id mapping for the given language.

    Returns (cats, rcats): cats maps int -> MSC id, rcats is the inverse
    mapping MSC id -> int.
    """
    # 'with' closes the file handle instead of leaking it; avoid shadowing
    # the builtin 'id' in the comprehension
    with open(common.dataFile('serial_mscids_%s.txt' % lang)) as fin:
        pairs = [line.strip().split('\t') for line in fin]
    cats = dict([(int(num), val) for num, val in pairs])  # map int -> id
    rcats = dict([(v, k) for k, v in cats.iteritems()])  # map id -> int
    return cats, rcats
# Beispiel #11
# 0
def create_refsdb(baseDir, downloadMSC=False):
    """Read references from baseDir files and merge them together into an ArticleDB database.

    NOTE(review): duplicate of an earlier create_refsdb definition in this file.

    Also checks for consistency, prints some stats and stores the resulting
    database on disk as 'refs_baseDir.pdl'.

    baseDir -- input directory scanned for references.xml files
    downloadMSC -- if True, also attempt to download MSC codes per reference
                   and checkpoint the partial database after every file
    """
    import cPickle
    reffiles = utils_iddb.getRefFiles(common.inputPath(baseDir))
    # store filesystem paths to references.xml files; binary mode ('wb') is
    # required for pickle protocol -1, and 'with' closes the handle promptly
    with open(common.dataFile('reffiles.pkl'), 'wb') as fout:
        cPickle.dump(reffiles, fout, protocol=-1)
    refdb = {}
    for reffile, refid in reffiles.iteritems():
        refs = utils_iddb.parseReferences(reffile, downloadMSC=downloadMSC)
        refdb[refid] = refs
        logging.info('id=%s: retrieved %i references' % (refid, len(refs)))
        if downloadMSC:
            # downloads are slow/flaky -- dump database after each iteration
            with open(common.dataFile('refs_partial.pkl'), 'wb') as fout:
                cPickle.dump(refdb, fout, protocol=-1)
    # final, complete dump of the reference database
    with open(common.dataFile('refs.pkl'), 'wb') as fout:
        cPickle.dump(refdb, fout, protocol=-1)
    # print some statistics
    logging.info("%i MSC download attempts (%i ok, %i failed)" %
                 (utils_iddb.attempts_all, utils_iddb.attempts_success,
                  utils_iddb.attempts_failed))
    logging.info('reference database size: %i references in %i articles' %
                 (sum(len(refs) for refs in refdb.itervalues()), len(refdb)))

    db = ArticleDB.ArticleDB(common.dbFile('refs', baseDir), mode='override')
    insert_errors = 0
    for refid, reflist in refdb.iteritems():
        for num, ref in enumerate(reflist):
            # references.xml reference counting starts with '1'
            ref.id_int = refid + ':' + str(num + 1)
            if not db.insertArticle(ref):
                insert_errors += 1
    db.commit()
    logging.info('resulting database has %i records (originally %i)' %
                 (len(db), sum(len(refs) for refs in refdb.itervalues())))
    logging.info('detected %i inconsistency collisions' % insert_errors)
# Beispiel #12
# 0
def printStats(arts):
    """Print MSC statistics for an article database (ArticleDB) and pickle the
    per-category article lists to mscs_all.pkl / mscs_primary.pkl.

    arts -- iterable of Article objects with an .msc attribute
    """
    mscs, mains = getMSCCnts(arts)
    uniq_mscs = set(mscs.keys())
    logging.info("#categories present in the db = %i" % len(uniq_mscs))
    mscdefs = set(makeDefs().keys())
    if not uniq_mscs.issubset(mscdefs):
        logging.warning(
            "unrecognized MSC 2-digit code(s) present in the database: %s" %
            sorted(uniq_mscs - mscdefs))
    logging.info('id\ttotal\tprimary')
    logging.info('==============================')
    for msc in sorted(uniq_mscs):
        # setdefault also inserts empty lists so the averages below divide
        # over the full set of observed codes
        logging.info("%s\t%i\t%i" % (msc, len(mscs.setdefault(
            msc, [])), len(mains.setdefault(msc, []))))
    logging.info('==============================')
    len_mscs = [len(val) for val in mscs.itervalues()]
    len_mains = [len(val) for val in mains.itervalues()]
    logging.info('avg\t%i\t%i' %
                 (sum(len_mscs) / len(mscs), sum(len_mains) / len(mscs)))
    logging.info(
        'median\t%i\t%i' %
        (sorted(len_mscs)[len(mscs) / 2], sorted(len_mains)[len(mains) / 2]))
    lens = [len(art.msc) for art in arts if art.msc != None]
    logging.info('average MSC codes per article = %.2f' %
                 (1.0 * sum(lens) / len(arts)))
    import cPickle
    # binary mode is required for pickle protocol -1; 'with' closes the handles
    with open(common.dataFile('mscs_all.pkl'), 'wb') as fout:
        cPickle.dump(mscs, fout, protocol=-1)
    with open(common.dataFile('mscs_primary.pkl'), 'wb') as fout:
        cPickle.dump(mains, fout, protocol=-1)
# Beispiel #13
# 0
def getArts(fname, minFreq):
    """Load English articles from an ArticleDB file and keep only those with
    exactly one (sufficiently frequent) MSC class and a non-empty body.

    NOTE(review): duplicate of an earlier getArts definition in this file.

    fname -- database filename (resolved via common.dataFile)
    minFreq -- minimum article count an MSC class needs to be kept
    Returns the filtered list of Article objects.
    """
    db = ArticleDB.ArticleDB(common.dataFile(fname), mode='open')
    import ipyutils
    language = 'eng'
    ipyutils.loadDicts(prefix='gensim_' + language)
    arts = [
        Article.Article(rec) for rec in db.db if rec['language'] == language
    ]
    for art in arts:
        # normalize MSC codes to their unique 2-digit top-level prefixes
        art.msc = list(set(mscs.niceMSC(msc, prefix=2)[0] for msc in art.msc))
    logging.info('loaded %i articles from %s' % (len(arts), fname))
    arts = [
        art for art in arts
        if len(art.msc) == 1 and art.body and art.id_int in ipyutils.rdocids
    ]
    logging.info(
        'extracted %i articles with exactly one MSC and non-empty body' %
        (len(arts)))
    okmsc = mscs.getFreqMSCs(arts, minFreq, useMains=True)
    mscs.printStats(arts)
    for art in arts:
        # clean text: decode utf8 -> drop/remap chars via BAD_MAP -> re-encode
        art.body = art.body.decode('utf8')
        #        art.body = art.body.decode('ascii', 'ignore')
        art.body = art.body.translate(BAD_MAP).encode('utf8')
        if art.title:
            art.title = art.title.decode('utf8')
            #        art.title = art.title.decode('ascii', 'ignore')
            art.title = art.title.translate(BAD_MAP).encode('utf8')
        art.msc = [mscs.niceMSC(art.msc[0], prefix=2)[0]]
    # keep only articles whose single MSC class is frequent enough
    arts = [art for art in arts if art.msc[0] in okmsc]
    allmsc, mainmsc = mscs.getMSCCnts(arts)
    for msc, mscarts in allmsc.iteritems():
        logging.info("class %s: %i articles" % (msc, len(mscarts)))
    logging.info("======================")
    logging.info("sum: %i articles" %
                 sum(len(mscarts) for mscarts in allmsc.itervalues()))

    logging.debug(
        'using %i articles from all %s msc classes that are covered by at least %i articles'
        % (len(arts), sorted(list(okmsc)), minFreq))
    return arts
# Beispiel #14
# 0
def saveMsc2Id(cats, lang):
    """Persist the MSC-id mapping for the given language via DocumentCollection.saveLex."""
    outfile = common.dataFile('serial_mscids_%s.txt' % lang)
    DocumentCollection.saveLex(cats, outfile)
# Beispiel #15
# 0
def loadMSCdict(fname):
    """Unpickle and return the MSC dictionary stored in the given data file."""
    import cPickle
    # binary mode ('rb') for pickled data; 'with' closes the file handle
    with open(common.dataFile(fname), 'rb') as fin:
        return cPickle.load(fin)
# Beispiel #16
# 0
                    old.append(art)
                    mscs[top] = old
                    if isMain:
                        old = mains.get(top, [])
                        old.append(art)
                        mains[top] = old
    return mscs, mains


def getFreqMSCs(arts, minFreq=50, useMains=False):
    """Return the set of MSC codes assigned to at least minFreq articles.

    arts -- iterable of Article objects
    minFreq -- minimum number of articles per code
    useMains -- if True, count only primary (main) MSC assignments
    """
    counts, primary = getMSCCnts(arts)
    if useMains:
        counts = primary
    frequent = set()
    for code in counts:
        if len(counts[code]) >= minFreq:
            frequent.add(code)
    return frequent


if __name__ == "__main__":
    # script entry point: gather MSC statistics over the tex_casopis database
    logging.basicConfig(level=logging.DEBUG)
    import ArticleDB
    #    print sorted(makeDefs().keys())

    db = ArticleDB.ArticleDB(common.dataFile('tex_casopis.pdl'), mode='open')
    # keep only records that have at least one MSC code assigned
    arts = [Article.Article(rec) for rec in db.db if rec['msc']]
    #    print len(db)
    logging.info('gathering MSC stats from %i articles' % len(arts))
    printStats(arts)
def loadMSCdict(fname):
    """Unpickle and return the MSC dictionary stored in the given data file.

    NOTE(review): duplicate of an earlier loadMSCdict definition in this file.
    """
    import cPickle
    # binary mode ('rb') for pickled data; 'with' closes the file handle
    with open(common.dataFile(fname), 'rb') as fin:
        return cPickle.load(fin)
def saveMsc2Id(cats, lang):
    """Persist the MSC-id mapping for the given language.

    NOTE(review): duplicate of an earlier saveMsc2Id definition in this file.
    """
    target = common.dataFile("serial_mscids_%s.txt" % lang)
    DocumentCollection.saveLex(cats, target)
def loadMsc2Id(lang):
    """Load the serialized MSC-id mapping for the given language.

    NOTE(review): duplicate of an earlier loadMsc2Id definition in this file.

    Returns (cats, rcats): cats maps int -> MSC id, rcats is the inverse
    mapping MSC id -> int.
    """
    # 'with' closes the file handle instead of leaking it; avoid shadowing
    # the builtin 'id' in the comprehension
    with open(common.dataFile("serial_mscids_%s.txt" % lang)) as fin:
        pairs = [line.strip().split("\t") for line in fin]
    cats = dict([(int(num), val) for num, val in pairs])  # map int -> id
    rcats = dict([(v, k) for k, v in cats.iteritems()])  # map id -> int
    return cats, rcats