def postprocess(): print "make paper table: paperid, year, journal, title, authors, authorids" # load existing author table AuthTblFile ='pdb/pm_author_count.csv' atbl = fif.loadCSV(AuthTblFile) FILE_PPRMETA_CSV = "pdb/pm_paper_meta.csv" fif.resetFile(FILE_PPRMETA_CSV) fif.addLineToFile(FILE_PPRMETA_CSV, "paperid,year,journal,title,authors") #TODO-put journal table generation from compiler to importer jnl_name2id = fif.loadCSV('pdb/pm_jnl.csv') # load and process each paper for pt in papermeta(): _id, _year, _jnl, authors, ttl, abstract = pt if not _id: continue if not ttl: ttl='-' _idstr=str(_id).strip('L') jnlid=jnl_name2id.get(_jnl,0) _linemeta = '%s,%s,%s,"%s",'%(_idstr, _year, jnlid, ttl) if not authors: authortext='"-;0"' else: authortext='"%s"'%'|'.join(['%s;%s'%(a, atbl.get(a,0)) for a in authors]) _linemeta += authortext fif.addLineToFile(FILE_PPRMETA_CSV, _linemeta)
def genTblPaperTerm(TermTbl): print "generate paper->term table" FILE_PT_CSV = 'pdb/SqlPaperTerm.csv' fif.resetFile(FILE_PT_CSV) sig_t_abs = fif.load_json('pdb/pm_df_t_abs_3.json') print "dump records to", FILE_PT_CSV for pt in readin.paperterms(): pid, t_ttl, t_abs = pt[0], pt[1], pt[2] # remove some terms of abstract t_abs_good = [ t for t in t_abs if sig_t_abs.get(t, 0) > 5 and sig_t_abs[t] < 2000 ] # rank the 1gram by df (lower is better) t_abs_ngram = [t for t in t_abs_good if len(t.split()) > 1] t_abs_1gram = [t for t in t_abs_good if not t in t_abs_ngram] t_abs_1gram = sorted(t_abs_1gram, key=lambda k: sig_t_abs[k])[:2] #TODO: better to check abbreviation, if offen in title or frequent terms, etc def __termCount(term): return '%s:%s' % (term, TermTbl.get(term, 0)) ttl_term_str = ';'.join([__termCount(t) for t in t_ttl]) abs_term_str = ';'.join( [__termCount(t) for t in t_abs_ngram + t_abs_1gram]) line = '%s,%s,%s' % (pid, ttl_term_str, abs_term_str) fif.addLineToFile(FILE_PT_CSV, line)
def genRelTermTbl_sem(TermTbl): print "generate related-terms table" cdf_ttl2ttl = fif.load_json('pdb/pm_cdf_ttl2ttl.json') cdf_ttl2abs = fif.load_json('pdb/pm_cdf_ttl2abs.json') FILE_REL_CSV = 'pdb/pm_relatedterms_semantic.csv' fif.resetFile(FILE_REL_CSV) for ref_term, df in cdf_ttl2ttl.items(): # related terms by CDF of TITLE relTtl = sorted(df.items(), key=lambda (k, v): v, reverse=True)[:8] relTtl = [t for (t, cnt) in relTtl] relTtl_str = '|'.join( ['%s;%s' % (t, TermTbl.get(t, 0)) for t in relTtl]) # related terms by CDF of ABSTRACT dfa = cdf_ttl2abs.get(t, None) if not dfa: continue relAbs = sorted(dfa.items(), key=lambda (k, v): v, reverse=True)[:15] relAbs = [t for (t, cnt) in relAbs] relAbs_str = '|'.join( ['%s;%s' % (t, TermTbl.get(t, 0)) for t in relAbs]) line = '%s,%d,%s,%s' % (ref_term, TermTbl.get( ref_term, 0), relTtl_str, relAbs_str) fif.addLineToFile(FILE_REL_CSV, line)
def __dumpId2IdList(filename, id2IdList): fif.resetFile(filename) print "dump each record line ..." for iid, idlist in id2IdList.items(): if not idlist: continue # could be empty for author->org for iiid in idlist: line = '%s,%s' % (iid, iiid) fif.addToFile(filename, line, isline=1)
def genRelTermTbl_lex(TermTbl):
    """Dump lexically related terms for every reference term known to TermTbl."""
    rel_lex = fif.load_json('pdb/pm_rel_t_ttl.json')
    FILE_REL_LEX_CSV = 'pdb/pm_relatedterms_lexical.csv'
    fif.resetFile(FILE_REL_LEX_CSV)
    for ref_term, terms in rel_lex.items():
        ref_id = TermTbl.get(ref_term, 0)
        # skip reference terms that never made it into the term table
        if ref_id == 0:
            continue
        term_str = '|'.join(['%s;%s' % (t, TermTbl.get(t, 0)) for t in terms])
        fif.addLineToFile(FILE_REL_LEX_CSV,
                          '%s,%d,%s' % (ref_term, ref_id, term_str))
def extractPapers(pubmedFile='pmed/pubmed_1996.txt'): pubmedFile = 'pubmed_result.txt' pubmedFile = 'pmed/pubmed_neurosci.txt' print "extract papers from", pubmedFile fif.resetFile(FileTblPaper) fif.resetFile(FileTblAuthor) fif.resetFile(FileTblDepart) fif.resetFile(FileTblPub) papertext = '' print "crunching", pubmedFile papers = [] with open(pubmedFile, 'r') as fh: for i, line in enumerate(fh): if line[0:5] == 'PMID-': p = parse(papertext) papers.append(p) papertext = '' papertext += line if Publisher_Name2Id.len() > 5e10: break print "crunching done" with open(FilePaperPkl, 'w') as fh: pickle.dump(papers[1:], fh) print "saved to", FilePaperPkl #dump2file() dump2pickle() print "extract papers - ALL DONE\n"
def genTerm2Paper(termtbl): print "generate term2paper table" # load exiting term->paper_id t2p = fif.load_json('pdb/pm_idx_t2p_ttl.json') FILE_T2P_CSV = 'pdb/pm_index_t2p.csv' fif.resetFile(FILE_T2P_CSV) for t, plist in t2p.items(): if not t: continue if not len(plist) > 1: continue if len(plist) > 2000: continue # TODO: think about this tid = termtbl[t] plist_str = ','.join(plist) line = '%s,%d,"%s"' % (t, tid, plist_str) fif.addLineToFile(FILE_T2P_CSV, line)
def overwrite():
    """Rewrite the pid -> terms file, resolving paper hashes to real ids.

    Hashes missing from the hash->id map are collected into BadHash2Pid.txt.
    """
    hash2pid = readPaperHash2Id()
    badRecord = '!NO HASH IN HASH2PID!'
    fif.resetFile(FilePid2Term)
    for record in psr.paperterms():
        hashid, termTtl, termAbs = record[0], record[1], record[2]
        if hashid not in hash2pid:
            # remember the unresolved hash for the report file below
            badRecord += "%s\n" % hashid
            continue
        text = '%s!%s!%s' % (hash2pid[hashid],
                             ';'.join(termTtl), ';'.join(termAbs))
        fif.addToFile(FilePid2Term, text, isline=1)
    with open('BadHash2Pid.txt', 'w') as f:
        f.write(badRecord)
def makeTable(): dftt = fif.load_json(FILE_TERM_DFT) dfta = fif.load_json(FILE_TERM_DFA) terms = dftt.keys() + dfta.keys() tid, this_id = {}, 1 for t in terms: if not t in tid: tid[t] = this_id this_id += 1 print "termtbl size", len(tid) fif.resetFile(FILE_TERM_TBL_CSV) fif.addLineToFile(FILE_TERM_TBL_CSV, "termid, term") for t, _id in tid.items(): fif.addLineToFile(FILE_TERM_TBL_CSV, '%d,"%s"' % (_id, t)) fif.save_json(FILE_TERM_TBL_JSON, tid)
def makeSqlTblPaper(): print "\nmake === PAPER TABLE ===" # paperid, year, publisher, title, abstract, authors fif.resetFile(FileSqlPaper) print "dump each record line ..." for meta in readin.papermeta(): iid = meta[1] year = meta[2] pub = sqlization(meta[3]) pubid = sqlization(meta[4]) ttl = sqlization(meta[5]) abstr = sqlization(meta[6]) authors = sqlization(meta[7]) # save them to the csv line = ','.join([iid, year, pub, pubid, ttl, abstr, authors]) fif.addToFile(FileSqlPaper, line, isline=1)
def makeSqlTblOrg(): fif.resetFile(FileSqlOrg) # id, org-department, orgnization, city, country, georawtext for data in readin.getOrgId2Data(): try: iid, depart, org, city, country, geo = data #print "orgline", iid, depart #print " > org= ", org #print " > city=", city #print " > co= ", country #print " > geo= ", geo line = '%s,%s,%s,%s,%s,%s' % (iid, sqlization(depart), sqlization(org), city, country, sqlization(geo)) line = '%s,%s,%s,%s,%s' % (iid, sqlization(depart), sqlization(org), city, country) fif.addToFile(FileSqlOrg, line, isline=1) except: print("BADLINE when making sql for organization", line) print "- DONE"
def _dumpId2Name(id2name, filename): fif.resetFile(filename) print "dump each record line ..." for iid, name in id2name.items(): line = '%s,%s' % (iid, sqlization(name)) fif.addToFile(filename, line, isline=1)