def relatedTermByLexicon(TermJson, Word2TermJson, Save2Json):
    """Find lexically related terms for every multi-word term and save them.

    A candidate is related to a term when the two share at least two whole
    words.  For each term the ten candidates with the highest relatedness
    score are kept, re-ranked by their value in the term table, and the best
    five are stored.

    Args:
        TermJson: JSON file loaded as a dict keyed by term; its values are
            used as the secondary ranking key (the DF of the term).
        Word2TermJson: JSON file loaded as a mapping from a word to the
            terms containing that word.
        Save2Json: output path; receives a dict term -> [related terms].
    """
    dfTermOfTtl = fif.load_json(TermJson)
    idxW2TTtl = fif.load_json(Word2TermJson)
    relatedTerms = {}  # TERM -> [related TERM]

    print("find related terms")
    for t in dfTermOfTtl:
        words = t.split()
        if len(words) < 2:
            continue
        wordSet = set(words)

        # Collect other terms that contain the same words as this term.
        related = {}
        for w in words:
            containingTerms = idxW2TTtl.get(w)
            if not containingTerms:
                continue
            for ct in containingTerms:
                # The candidate must share at least two *whole* words with
                # the term.  Comparing against the candidate's word list
                # (rather than `word in candidate_string`) avoids substring
                # hits such as "in" matching "protein binding".
                if len(wordSet.intersection(ct.split())) >= 2:
                    counter.inc(related, ct)
        related.pop(t, None)  # a term is not related to itself

        # First cut: the ten candidates with the highest relatedness score.
        top = sorted(related.items(), key=lambda kv: kv[1], reverse=True)[:10]
        # Second cut: re-rank those by DF and keep the best five.
        top_s = sorted(
            ((k, dfTermOfTtl.get(k, 0)) for k, _ in top),
            key=lambda kv: kv[1],
            reverse=True,
        )[:5]
        rel = [k for k, _ in top_s]
        if rel:
            relatedTerms[t] = rel

    fif.save_json(Save2Json, relatedTerms)
def hashcode2PubmedId():
    """Map the hash codes of the old data set to paper ids and save the map.

    Every paper returned by getPaperIdYearPubTtl() is hashed with
    makeHash(title, year).  Hashes that are absent from the old hash-keyed
    data are collected as bad records ("pid!year!title"); hashes that are
    present are recorded as hash -> pid.  The mapping is written to
    FileHash2Id and the bad records to 'TblHash2IdBad.txt'.
    """
    print("make dict: id --> year, pub, ttl")
    papersById = getPaperIdYearPubTtl()
    print("make dict: hash --> year, ttl")
    papersByHash = getPaperHashYearTTl()  # old data with hash-id

    hash2Pid = {}
    badRecords = []
    for pid, fields in papersById.items():
        year = fields[0]
        ttl = fields[2]
        h = makeHash(ttl, year)

        # The hash must point at a paper of the old data set.
        if h not in papersByHash:
            badRecords.append('%s!%s!%s' % (pid, year, ttl))
            continue

        oldFields = papersByHash[h]
        oldyear = str(oldFields[0])
        oldttl = oldFields[1]
        sameYear = oldyear == year
        # Title-prefix comparison is evaluated but not acted upon.
        sameTitleStart = oldttl[:5] == ttl[:5]
        if not sameYear:
            print("different year")
            print("old year: %s type= %s" % (oldyear, type(oldyear)))
            print("year: %s type= %s" % (year, type(year)))
            raw_input()  # wait for Enter before carrying on
        hash2Pid[h] = pid

    # Persist both results.
    fif.save_json(FileHash2Id, hash2Pid)
    fif.saveIterable('TblHash2IdBad.txt', badRecords)
def cdf(term_getter, jsonfile, threshold_ppr=0, threshold_occ=2, debug=0):
    """Compute the conditional DF table of multi-word terms and save it.

    For every multi-word term of the term-to-paper index that has more than
    `threshold_ppr` papers, the multi-word terms of each of those papers are
    tallied under that term with counter.count_2d.  The table is then
    filtered with `threshold_occ`, emptied entries are removed, a sample of
    at most ten entries is printed and the table is written to `jsonfile`.

    Args:
        term_getter: passed to impt.loadP2T to select the paper-to-terms map.
        jsonfile: output path for the resulting table.
        threshold_ppr: a term needs more papers than this to be considered.
        threshold_occ: threshold handed to util.dictfilter2d_inplace.
        debug: accepted but not used by this function.
    """
    print("find cdf: using term getter %s ppr> %s coocr> %s"
          % (term_getter, threshold_ppr, threshold_occ))
    conditionalDF = {}
    termToPapers = cmpl.loadIndexT2P()  # TODO - load only the ">2" terms
    paperToTerms = impt.loadP2T(term_getter)

    print("find cdf: loop all terms")
    for term in termToPapers:
        # Only non-empty, multi-word terms are of interest.
        if not term or len(term.split()) <= 1:
            continue
        papers = termToPapers[term]
        if len(papers) <= threshold_ppr:
            continue
        # Tally the multi-word terms of every paper listed for this term.
        for pid in papers:
            coTerms = [c for c in paperToTerms[pid] if len(c.split()) > 1]
            counter.count_2d(term, coTerms, conditionalDF)

    # Drop co-terms below the occurrence threshold, then emptied entries.
    util.dictfilter2d_inplace(conditionalDF, threshold_occ)
    util.dictfilter2d_nonempty_inplace(conditionalDF)

    # Show a small sample of the result.
    for key in list(conditionalDF)[:10]:
        print("Conditional DF - %s => %s" % (key, conditionalDF[key]))
    print("Conditional DF: %s" % util.dictsize(conditionalDF))
    fif.save_json(jsonfile, conditionalDF)
def headwording(TermJson, Save2Json):
    """Index multi-word terms by their first word and save the index.

    Args:
        TermJson: JSON file loaded as a dict term -> count.
        Save2Json: output path; receives a dict mapping each first word to
            the list of (term, count) pairs starting with that word, sorted
            by count in descending order.
    """
    print("\nfind index from headword to terms")
    dfTermOfTtl = fif.load_json(TermJson)

    # Group every multi-word term under its first word.  All terms sharing a
    # headword are collected; storing only the first one seen would make the
    # sort below pointless.
    indexHeading_pre = {}
    for t, count in dfTermOfTtl.items():
        words = t.split()
        if len(words) < 2:
            continue
        indexHeading_pre.setdefault(words[0], []).append((t, count))

    # Order each group so the most frequent terms come first.
    indexHeading = {}
    for hw, entries in indexHeading_pre.items():
        indexHeading[hw] = sorted(entries, key=lambda entry: entry[1],
                                  reverse=True)

    fif.save_json(Save2Json, indexHeading)
def makeTable():
    """Assign a numeric id to every term and save the table as CSV and JSON.

    Terms are read from FILE_TERM_DFT and FILE_TERM_DFA (in that order); each
    distinct term gets the next id, starting at 1.  The table is written to
    FILE_TERM_TBL_CSV as `id,"term"` rows under a "termid, term" header and
    to FILE_TERM_TBL_JSON as a dict term -> id.
    """
    dftt = fif.load_json(FILE_TERM_DFT)
    dfta = fif.load_json(FILE_TERM_DFA)

    tid = {}
    for source in (dftt, dfta):
        for t in source:
            if t not in tid:
                tid[t] = len(tid) + 1  # ids are consecutive, starting at 1
    print("termtbl size %s" % len(tid))

    fif.resetFile(FILE_TERM_TBL_CSV)
    fif.addLineToFile(FILE_TERM_TBL_CSV, "termid, term")
    for t, _id in tid.items():
        # Double any embedded quote so the quoted field stays well-formed
        # when a term itself contains a '"' character.
        quoted = t.replace('"', '""')
        fif.addLineToFile(FILE_TERM_TBL_CSV, '%d,"%s"' % (_id, quoted))
    fif.save_json(FILE_TERM_TBL_JSON, tid)
def indexing():
    """Build and save the title-term indexes from psr.PM_TERM_FILE.

    Two indexes are produced from the title terms of every parsed line that
    has a paper id:
      * term -> list of paper ids, saved to __IDX_TERM2PAPER_FILE;
      * word -> terms, tallied with counter.count_2d, saved to
        __IDX_WORD2TERM_FILE.
    """
    termToPapers = {}
    wordToTerms = {}

    print("indexing using data from %s" % (psr.PM_TERM_FILE,))
    with open(psr.PM_TERM_FILE, 'r') as source:
        for record in source:
            # Only the paper id and the title terms are used here.
            pid, titleTerms, _absTerms, _titleWords, _absWords = \
                psr.parseTermLine(record)
            if not pid:
                continue
            for term in titleTerms:
                termToPapers.setdefault(term, []).append(pid)
                for word in term.split():
                    counter.count_2d(word, [term], wordToTerms)

    # Report and persist both indexes.
    print("Size of Index TermTtl2Paper %s" % (util.dictsize(termToPapers),))
    print("Size of Index Word2TermOfTtl %s" % (util.dictsize(wordToTerms),))
    fif.save_json(__IDX_TERM2PAPER_FILE, termToPapers)
    fif.save_json(__IDX_WORD2TERM_FILE, wordToTerms)
def df():
    """Tally DF tables from psr.PM_TERM_FILE and save them at three levels.

    Four tables are tallied with counter.count: title terms, abstract terms,
    title words and abstract words.  Their sizes are printed and they are
    saved under pdb/ unfiltered, then after util.dictfilter(..., 2), then
    after a further util.dictfilter(..., 3) applied to the level-2 tables.
    """
    # (label used in the report, stem used in the file name), in output order
    kinds = (
        ("TermTtl", "t_ttl"),
        ("TermAbs", "t_abs"),
        ("WordTtl", "w_ttl"),
        ("WordAbs", "w_abs"),
    )
    tables = [{} for _ in kinds]

    print("df calculation using data from %s" % (psr.PM_TERM_FILE,))
    with open(psr.PM_TERM_FILE, 'r') as source:
        for record in source:
            _pid, titleTerms, absTerms, titleWords, absWords = \
                psr.parseTermLine(record)
            parsed = (titleTerms, absTerms, titleWords, absWords)
            for items, table in zip(parsed, tables):
                counter.count(items, table)

    # (filter threshold or None, report-label suffix, file-name suffix)
    stages = (
        (None, "", ""),
        (2, " above 2", "_2"),
        (3, " above 3", "_3"),
    )
    for threshold, labelSuffix, fileSuffix in stages:
        if threshold is not None:
            # Each level is derived from the tables of the previous level.
            tables = [util.dictfilter(table, threshold) for table in tables]
        for (label, _stem), table in zip(kinds, tables):
            print("Size of DF %s%s %s"
                  % (label, labelSuffix, util.dictsize(table)))
        for (_label, stem), table in zip(kinds, tables):
            fif.save_json("pdb/pm_df_%s%s.json" % (stem, fileSuffix), table)