def loadDicts(prefix='cmj'):
    global docids, rdocids, tokenids
    docids = [item.strip().split('\t')
              for item in open(common.dataFile(prefix + '_docids.txt')).readlines()]
    docids = dict([(int(id), val) for id, val in docids])  # map int -> int_id
    rdocids = dict([(v, k) for k, v in docids.iteritems()])  # map int_id -> int
    tokenids = [item.strip().split('\t')
                for item in codecs.open(common.dataFile(prefix + '_tokenids.txt'), 'r', 'utf8').readlines()]
    tokenids = dict([(int(id), val) for id, val in tokenids])
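# Illustrative sketch (an assumption, not part of the original pipeline): how the
# global dictionaries filled in by loadDicts() relate to each other. The numeric
# id 123 is a made-up example value.
def _example_loadDicts():
    loadDicts(prefix='cmj')
    int_id = docids[123]           # corpus position -> internal document id
    assert rdocids[int_id] == 123  # rdocids is the inverse mapping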
def create_refsdb(baseDir, downloadMSC=False):
    """Read references from baseDir files and merge them together into an
    ArticleDB database; also check for consistency and print some stats.
    Store the resulting database on disk as 'refs_baseDir.pdl'.
    """
    import cPickle
    reffiles = utils_iddb.getRefFiles(common.inputPath(baseDir))
    # store filesystem paths to the references.xml files
    cPickle.dump(reffiles, open(common.dataFile('reffiles.pkl'), 'wb'), protocol=-1)
    refdb = {}
    for reffile, refid in reffiles.iteritems():
        refs = utils_iddb.parseReferences(reffile, downloadMSC=downloadMSC)
        refdb[refid] = refs
        logging.info('id=%s: retrieved %i references' % (refid, len(refs)))
        if downloadMSC:
            # checkpoint the database after each file, so an interrupted
            # download run loses as little work as possible
            cPickle.dump(refdb, open(common.dataFile('refs_partial.pkl'), 'wb'), protocol=-1)
    f = open(common.dataFile('refs.pkl'), 'wb')
    cPickle.dump(refdb, f, protocol=-1)
    f.close()

    # print some statistics
    logging.info("%i MSC download attempts (%i ok, %i failed)" %
                 (utils_iddb.attempts_all, utils_iddb.attempts_success, utils_iddb.attempts_failed))
    logging.info('reference database size: %i references in %i articles' %
                 (sum([len(refs) for refs in refdb.itervalues()]), len(refdb)))

    db = ArticleDB.ArticleDB(common.dbFile('refs', baseDir), mode='override')
    insert_errors = 0
    for id, reflist in refdb.iteritems():
        for num, ref in enumerate(reflist):
            ref.id_int = id + ':' + str(num + 1)  # reference numbering in references.xml starts at 1
            if not db.insertArticle(ref):
                insert_errors += 1
    db.commit()
    logging.info('resulting database has %i records (originally %i)' %
                 (len(db), sum([len(refs) for refs in refdb.itervalues()])))
    logging.info('detected %i inconsistency collisions' % insert_errors)
def makeDefs():
    """Parse mscdefs.txt into a map of 2-digit MSC code -> full definition line
    and pickle the result to mscdefs.pkl."""
    deffile = common.dataFile('mscdefs.txt')
    result = {}
    for mscdef in open(deffile):
        result[mscdef[:2]] = mscdef
    import cPickle
    f = open(common.dataFile('mscdefs.pkl'), 'wb')
    cPickle.dump(result, f)
    f.close()
    return result
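# Illustrative sketch: looking up the definition line for a top-level MSC class
# in the dictionary returned by makeDefs(). The code '05' (combinatorics) is
# just an example value; any 2-digit class works the same way.
def _example_makeDefs():
    defs = makeDefs()
    if '05' in defs:
        print defs['05'].strip()  # full definition line for class 05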
def getArts(fname, minFreq):
    """Load articles from an ArticleDB database, keeping only English articles
    with a non-empty body and exactly one MSC code, and only from MSC classes
    covered by at least minFreq articles."""
    db = ArticleDB.ArticleDB(common.dataFile(fname), mode='open')
    import ipyutils
    language = 'eng'
    ipyutils.loadDicts(prefix='gensim_' + language)
    arts = [Article.Article(rec) for rec in db.db if rec['language'] == language]
    for art in arts:
        art.msc = list(set(mscs.niceMSC(msc, prefix=2)[0] for msc in art.msc))
    logging.info('loaded %i articles from %s' % (len(arts), fname))
    arts = [art for art in arts
            if len(art.msc) == 1 and art.body and art.id_int in ipyutils.rdocids]
    logging.info('extracted %i articles with exactly one MSC and non-empty body' % len(arts))
    okmsc = mscs.getFreqMSCs(arts, minFreq, useMains=True)
    mscs.printStats(arts)
    for art in arts:
        # remove characters that would break the XML corpus export
        art.body = art.body.decode('utf8')
        art.body = art.body.translate(BAD_MAP).encode('utf8')
        if art.title:
            art.title = art.title.decode('utf8')
            art.title = art.title.translate(BAD_MAP).encode('utf8')
        art.msc = [mscs.niceMSC(art.msc[0], prefix=2)[0]]
    arts = [art for art in arts if art.msc[0] in okmsc]
    allmsc, mainmsc = mscs.getMSCCnts(arts)
    for msc, mscarts in allmsc.iteritems():
        logging.info("class %s: %i articles" % (msc, len(mscarts)))
    logging.info("======================")
    logging.info("sum: %i articles" % sum(len(mscarts) for mscarts in allmsc.itervalues()))
    logging.debug('using %i articles from all %s msc classes that are covered by at least %i articles' %
                  (len(arts), sorted(list(okmsc)), minFreq))
    return arts
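# Hedged end-to-end sketch: load the filtered single-MSC English articles and
# export them as an XML corpus via saveAsCorpus() below. The database file name
# and the frequency threshold are illustrative values.
def _example_export_corpus():
    arts = getArts('main_cmj.pdl', minFreq=50)
    saveAsCorpus(arts, 'corpus_eng.xml')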
def loadArts(dbFile='main_cmj.pdl'):
    import ArticleDB
    import Article
    import common
    global db, arts
    db = ArticleDB.ArticleDB(common.dataFile(dbFile), mode='open')
    arts = [Article.Article(rec) for rec in db.db if rec['id_int'] in rdocids]
def saveAsCorpus(arts, fname, fnc=lambda c: c):
    """Serialize articles into a simple XML corpus file; fnc is applied to the
    text fields before writing (e.g. for extra escaping)."""
    logging.info('saving corpus as %s' % fname)
    f = open(common.dataFile(fname), 'w')
    f.write('<?xml version="1.0" encoding="utf-8" ?>\n')
    f.write('<articles>\n')
    for art in arts:
        f.write('<article id="%s" lang="%s">\n' % (art.id_int, art.language))
        if art.msc:
            f.write('<category>\n')
            # assert len(art.msc) == 1
            f.write(art.msc[0])
            f.write('\n</category>\n')
        if art.title:
            f.write('<title>\n')
            f.write(fnc(art.title))
            f.write('\n</title>\n')
        if art.body:
            f.write('<text>\n')
            f.write(fnc(art.body))
            f.write('\n</text>\n')
        if art.references:
            f.write('<references>\n')
            # assumes art.references is serialized text, mirroring title/body
            f.write(fnc(art.references))
            f.write('\n</references>\n')
        f.write('</article>\n')
    f.write('</articles>\n')
    f.close()
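# Hedged usage sketch for saveAsCorpus(): the fnc hook is applied to the text
# fields before writing, e.g. to escape XML-special characters. The escaping
# helper below is an illustrative choice, not part of the original module.
def _example_saveAsCorpus(arts):
    from xml.sax.saxutils import escape
    saveAsCorpus(arts, 'corpus_cmj.xml', fnc=lambda text: escape(text))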
def loadMsc2Id(lang):
    cats = [item.strip().split('\t')
            for item in open(common.dataFile('serial_mscids_%s.txt' % lang)).readlines()]
    cats = dict([(int(id), val) for id, val in cats])  # map int -> id
    rcats = dict([(v, k) for k, v in cats.iteritems()])  # map id -> int
    return cats, rcats
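# Hedged round-trip sketch: saveMsc2Id() (defined below) and loadMsc2Id()
# persist the mapping of numeric class ids to MSC codes for one language; the
# two returned dictionaries are mutually inverse. 'eng' matches the language
# used in getArts().
def _example_msc2id_roundtrip(cats):
    saveMsc2Id(cats, 'eng')
    cats2, rcats = loadMsc2Id('eng')
    for intId, mscCode in cats2.iteritems():
        assert rcats[mscCode] == intId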
def printStats(arts):
    """Print some MSC stats for an article database (ArticleDB)."""
    mscs, mains = getMSCCnts(arts)
    uniq_mscs = set(mscs.keys())
    logging.info("#categories present in the db = %i" % len(uniq_mscs))
    mscdefs = set(makeDefs().keys())
    if not uniq_mscs.issubset(mscdefs):
        logging.warning("unrecognized MSC 2-digit code(s) present in the database: %s" %
                        sorted(uniq_mscs - mscdefs))
    logging.info('id\ttotal\tprimary')
    logging.info('==============================')
    for msc in sorted(list(uniq_mscs)):
        logging.info("%s\t%i\t%i" %
                     (msc, len(mscs.setdefault(msc, [])), len(mains.setdefault(msc, []))))
    logging.info('==============================')
    len_mscs = [len(val) for val in mscs.itervalues()]
    len_mains = [len(val) for val in mains.itervalues()]
    logging.info('avg\t%i\t%i' % (sum(len_mscs) / len(mscs), sum(len_mains) / len(mscs)))
    logging.info('median\t%i\t%i' %
                 (sorted(len_mscs)[len(mscs) / 2], sorted(len_mains)[len(mains) / 2]))
    lens = [len(art.msc) for art in arts if art.msc is not None]
    logging.info('average MSC codes per article = %.2f' % (1.0 * sum(lens) / len(arts)))
    import cPickle
    cPickle.dump(mscs, open(common.dataFile('mscs_all.pkl'), 'wb'), protocol=-1)
    cPickle.dump(mains, open(common.dataFile('mscs_primary.pkl'), 'wb'), protocol=-1)
def saveMsc2Id(cats, lang):
    DocumentCollection.saveLex(cats, common.dataFile('serial_mscids_%s.txt' % lang))
def loadMSCdict(fname):
    import cPickle
    # pickles are written with a binary protocol, so open in binary mode
    mscs = cPickle.load(open(common.dataFile(fname), 'rb'))
    return mscs
def getMSCCnts(arts):
    """Return two maps of 2-digit MSC code -> list of articles: one over all
    MSC codes of each article, one over primary (main) codes only.

    NOTE: the head of this function was missing from the source; the loop
    below is a reconstruction that assumes niceMSC() returns a (code, isMain)
    pair, as suggested by its use elsewhere in this module.
    """
    mscs, mains = {}, {}
    for art in arts:
        if not art.msc:
            continue
        for msc in art.msc:
            top, isMain = niceMSC(msc, prefix=2)
            old = mscs.get(top, [])
            old.append(art)
            mscs[top] = old
            if isMain:
                old = mains.get(top, [])
                old.append(art)
                mains[top] = old
    return mscs, mains


def getFreqMSCs(arts, minFreq=50, useMains=False):
    """Return the set of MSC codes covered by at least minFreq articles."""
    mscs, mains = getMSCCnts(arts)
    if useMains:
        mscs = mains
    uniqMSCs = set(mscs.keys())
    result = set([msc for msc in sorted(list(uniqMSCs)) if len(mscs[msc]) >= minFreq])
    return result


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    import ArticleDB
    db = ArticleDB.ArticleDB(common.dataFile('tex_casopis.pdl'), mode='open')
    arts = [Article.Article(rec) for rec in db.db if rec['msc']]
    logging.info('gathering MSC stats from %i articles' % len(arts))
    printStats(arts)
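# Hedged sketch: filtering an article list down to its frequent MSC classes
# with getFreqMSCs(), mirroring what getArts() does internally. The threshold
# of 50 articles per class is an illustrative value.
def _example_filter_frequent(arts):
    okmsc = getFreqMSCs(arts, minFreq=50, useMains=True)
    return [art for art in arts
            if art.msc and niceMSC(art.msc[0], prefix=2)[0] in okmsc]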
def saveMsc2Id(cats, lang): DocumentCollection.saveLex(cats, common.dataFile("serial_mscids_%s.txt" % lang))