Esempio n. 1
0
def relatedTermByLexicon(TermJson, Word2TermJson, Save2Json):
    """Find, for every multi-word term, up to five lexically related terms.

    A candidate is related to a term when the two share at least two whole
    words. Candidates are ranked first by how many times they were reached
    through the word index (top 10 kept) and then re-ranked by their
    document frequency (top 5 kept).

    Args:
        TermJson: path passed to ``fif.load_json``; the loaded object is
            used as a mapping term -> document frequency (assumed from how
            it is iterated and queried here).
        Word2TermJson: path passed to ``fif.load_json``; the loaded object
            is used as a mapping word -> collection of terms (assumed to be
            the terms containing that word -- confirm against the index
            builder).
        Save2Json: path passed to ``fif.save_json`` together with the
            resulting term -> [related terms] mapping.
    """
    relatedTerms = {}  # TERM -> [related TERM]

    dfTermOfTtl = fif.load_json(TermJson)
    idxW2TTtl = fif.load_json(Word2TermJson)
    print("find related terms")
    for t in dfTermOfTtl:
        words = t.split()
        if len(words) < 2:
            continue
        wordSet = set(words)

        # find other terms that also contain the same $words
        related = {}
        for w in words:
            containingTerms = idxW2TTtl.get(w, None)
            if not containingTerms:
                continue
            for ct in containingTerms:
                # The candidate must share at least two *whole* words with
                # the term. Testing ``word in ct`` on the raw string would
                # be a substring match ("t" would match "stem cell").
                if len(wordSet.intersection(ct.split())) >= 2:
                    counter.inc(related, ct)
        if t in related:
            del related[t]  # delete self relatedness

        # first cut: the 10 candidates reached most often
        top = sorted(related.items(), key=lambda kv: kv[1], reverse=True)[:10]
        # sort again by DF and keep the 5 most frequent
        top = [(k, dfTermOfTtl.get(k, 0)) for (k, _) in top]
        top_s = sorted(top, key=lambda kv: kv[1], reverse=True)[:5]
        rel = [k for (k, _) in top_s]
        if not rel:
            continue
        relatedTerms[t] = rel
    fif.save_json(Save2Json, relatedTerms)
Esempio n. 2
0
def genRelTermTbl_sem(TermTbl):
    """Write the semantic related-terms CSV.

    For each reference term found in ``pdb/pm_cdf_ttl2ttl.json`` one line is
    appended to ``pdb/pm_relatedterms_semantic.csv`` with the layout
    ``ref_term,ref_id,title_related,abstract_related``. Each related list is
    ``term;id`` pairs joined by ``|``. The title list holds the 8 entries
    with the highest count, the abstract list the 15 highest. Reference
    terms without an entry in ``pdb/pm_cdf_ttl2abs.json`` are skipped.

    Args:
        TermTbl: mapping term -> id; terms missing from it are written with
            id 0.
    """
    print("generate related-terms table")
    cdf_ttl2ttl = fif.load_json('pdb/pm_cdf_ttl2ttl.json')
    cdf_ttl2abs = fif.load_json('pdb/pm_cdf_ttl2abs.json')
    FILE_REL_CSV = 'pdb/pm_relatedterms_semantic.csv'
    fif.resetFile(FILE_REL_CSV)

    def _top_terms_str(cooc, limit):
        # Rank co-occurring terms by count (descending), keep `limit` of
        # them and render them as "term;id|term;id|...".
        ranked = sorted(cooc.items(), key=lambda kv: kv[1], reverse=True)
        return '|'.join(
            ['%s;%s' % (term, TermTbl.get(term, 0))
             for (term, _) in ranked[:limit]])

    for ref_term, df in cdf_ttl2ttl.items():
        # related terms by CDF of ABSTRACT
        # Look up the reference term itself. The previous code used a
        # variable leaked from a list comprehension, i.e. the last related
        # title term, and so read the abstract data of the wrong term.
        dfa = cdf_ttl2abs.get(ref_term, None)
        if not dfa:
            continue

        # related terms by CDF of TITLE
        relTtl_str = _top_terms_str(df, 8)
        relAbs_str = _top_terms_str(dfa, 15)

        line = '%s,%d,%s,%s' % (ref_term, TermTbl.get(
            ref_term, 0), relTtl_str, relAbs_str)
        fif.addLineToFile(FILE_REL_CSV, line)
Esempio n. 3
0
def genTblPaperTerm(TermTbl):
    """Write the paper -> terms CSV (``pdb/SqlPaperTerm.csv``).

    One line ``pid,title_terms,abstract_terms`` is appended per record
    yielded by ``readin.paperterms()``. Each term list is rendered as
    ``term:id`` pairs joined by ``;``. Abstract terms are kept only when
    their value in ``pdb/pm_df_t_abs_3.json`` lies strictly between 5 and
    2000. All kept multi-word terms are written, followed by at most two
    single-word terms (those with the lowest value in that file).

    Args:
        TermTbl: mapping term -> id; terms missing from it are written with
            id 0.
    """
    print("generate paper->term table")

    FILE_PT_CSV = 'pdb/SqlPaperTerm.csv'
    fif.resetFile(FILE_PT_CSV)

    sig_t_abs = fif.load_json('pdb/pm_df_t_abs_3.json')

    def _render(term_list):
        # "term:id" pairs separated by ';'
        return ';'.join(
            ['%s:%s' % (term, TermTbl.get(term, 0)) for term in term_list])

    print("dump records to %s" % FILE_PT_CSV)
    for record in readin.paperterms():
        pid, t_ttl, t_abs = record[0], record[1], record[2]

        # remove some terms of abstract
        kept = [term for term in t_abs if 5 < sig_t_abs.get(term, 0) < 2000]

        multi_word = []
        single_word = []
        for term in kept:
            if len(term.split()) > 1:
                multi_word.append(term)
            else:
                single_word.append(term)

        # rank the 1gram by df (lower is better) and keep the best two
        single_word.sort(key=lambda term: sig_t_abs[term])
        single_word = single_word[:2]

        #TODO: better to check abbreviation, if often in title or frequent terms, etc

        row = '%s,%s,%s' % (pid, _render(t_ttl),
                            _render(multi_word + single_word))
        fif.addLineToFile(FILE_PT_CSV, row)
Esempio n. 4
0
def makeTable():
    """Assign a numeric id to every term and store the table as CSV and JSON.

    Terms are taken from the keys of the objects loaded from
    ``FILE_TERM_DFT`` and ``FILE_TERM_DFA`` (in that order). Ids start at 1
    and grow by one for each term not seen before. The table is written to
    ``FILE_TERM_TBL_CSV`` (header ``termid, term``) and to
    ``FILE_TERM_TBL_JSON`` as a term -> id mapping.
    """
    title_df = fif.load_json(FILE_TERM_DFT)
    abstract_df = fif.load_json(FILE_TERM_DFA)

    term_ids = {}
    for source in (title_df, abstract_df):
        for term in source:
            if term not in term_ids:
                # next free id == number of ids handed out so far + 1
                term_ids[term] = len(term_ids) + 1
    print("termtbl size %d" % len(term_ids))

    fif.resetFile(FILE_TERM_TBL_CSV)
    fif.addLineToFile(FILE_TERM_TBL_CSV, "termid, term")
    for term, term_id in term_ids.items():
        row = '%d,"%s"' % (term_id, term)
        fif.addLineToFile(FILE_TERM_TBL_CSV, row)
    fif.save_json(FILE_TERM_TBL_JSON, term_ids)
Esempio n. 5
0
def genRelTermTbl_lex(TermTbl):
    """Write the lexical related-terms CSV.

    For each reference term in ``pdb/pm_rel_t_ttl.json`` whose id in
    ``TermTbl`` is not 0, one line ``ref_term,ref_id,related`` is appended
    to ``pdb/pm_relatedterms_lexical.csv``. ``related`` is a ``|``-joined
    list of ``term;id`` pairs.

    Args:
        TermTbl: mapping term -> id; related terms missing from it are
            written with id 0.
    """
    rel_lex = fif.load_json('pdb/pm_rel_t_ttl.json')
    FILE_REL_LEX_CSV = 'pdb/pm_relatedterms_lexical.csv'
    fif.resetFile(FILE_REL_LEX_CSV)
    for ref_term, terms in rel_lex.items():
        ref_id = TermTbl.get(ref_term, 0)
        if ref_id == 0:
            # reference term has no id in the term table
            continue
        pairs = []
        for term in terms:
            pairs.append('%s;%s' % (term, TermTbl.get(term, 0)))
        row = '%s,%d,%s' % (ref_term, ref_id, '|'.join(pairs))
        fif.addLineToFile(FILE_REL_LEX_CSV, row)
Esempio n. 6
0
def genTerm2Paper(termtbl):
    """Write the term -> paper-list CSV (``pdb/pm_index_t2p.csv``).

    Reads the term -> paper-id-list index from ``pdb/pm_idx_t2p_ttl.json``
    and appends one line ``term,term_id,"id1,id2,..."`` for every non-empty
    term whose list holds more than 1 and at most 2000 entries.

    Args:
        termtbl: mapping term -> id; indexed directly, so a term missing
            from it raises ``KeyError``.
    """
    print("generate term2paper table")
    # load existing term->paper_id
    t2p = fif.load_json('pdb/pm_idx_t2p_ttl.json')
    FILE_T2P_CSV = 'pdb/pm_index_t2p.csv'
    fif.resetFile(FILE_T2P_CSV)
    for term, papers in t2p.items():
        if not term:
            continue
        # keep terms linked to 2..2000 papers
        # TODO: think about this (upper bound)
        if not 1 < len(papers) <= 2000:
            continue
        row = '%s,%d,"%s"' % (term, termtbl[term], ','.join(papers))
        fif.addLineToFile(FILE_T2P_CSV, row)
Esempio n. 7
0
def headwording(TermJson, Save2Json):
    """Build and save an index from headword to the terms starting with it.

    The headword of a term is its first whitespace-separated word.
    Single-word terms are ignored. For each headword the saved value is the
    list of ``(term, count)`` pairs sorted by count, highest first.

    Args:
        TermJson: path passed to ``fif.load_json``; the loaded object is
            used as a mapping term -> count.
        Save2Json: path passed to ``fif.save_json`` together with the
            headword index.
    """
    print("\nfind index from headword to terms")
    dfTermOfTtl = fif.load_json(TermJson)

    indexHeading_pre = {}
    for t, count in dfTermOfTtl.items():
        words = t.split()
        if len(words) < 2:
            continue
        # Collect *every* term under its headword. Previously only the
        # first term seen for a headword was stored and later ones were
        # silently dropped.
        indexHeading_pre.setdefault(words[0], []).append((t, count))

    indexHeading = {}
    for hw, entries in indexHeading_pre.items():
        indexHeading[hw] = sorted(entries,
                                  key=lambda entry: entry[1],
                                  reverse=True)

    fif.save_json(Save2Json, indexHeading)
Esempio n. 8
0
def wordchain():
    """Build next-/previous-word chains from the term list and explore them.

    Loads the terms from ``cfg.FILE_TERM_DF_ABOVE2_JSON`` and registers every
    pair of adjacent words in two tables via ``counter.inc_2d``. It then
    prints some statistics and a five-entry sample of each table, and
    finally enters an interactive prompt that shows the entries of both
    tables for each typed word. The prompt ends at end-of-input.

    Words absent from a table are shown as an empty dict instead of raising
    ``KeyError``. This matters for ordinary words too: a word that only
    ever appears first in a term has no previous-word entry.
    """
    print("find wordchain")
    terms = fif.load_json(cfg.FILE_TERM_DF_ABOVE2_JSON)
    # presumably counter.inc_2d(a, b, d) increments d[a][b] -- confirm
    chain_nex = {}  # this_word -> next_word -> count
    chain_pre = {}  # this_word -> previous_word -> count
    for t in terms:
        words = t.split()
        for w, following in zip(words, words[1:]):
            counter.inc_2d(w, following, chain_nex)
            counter.inc_2d(following, w, chain_pre)
    print(" - total term size %d" % len(terms))
    print(" - total nex_wordchain size %s" % (util.dictsize(chain_nex),))
    print(" - total pre_wordchain size %s" % (util.dictsize(chain_pre),))

    # small sample of each table
    for w in list(chain_nex)[:5]:
        print("%s --> %s" % (w, chain_nex[w]))
    for w in list(chain_pre)[:5]:
        print("%s <-- %s" % (w, chain_pre[w]))

    while True:
        try:
            w = raw_input('type type type')
        except EOFError:
            # end of input (Ctrl-D / closed stdin): leave the prompt cleanly
            break
        print("")
        print("   -->  %s" % (chain_nex.get(w, {}),))
        print("   <--  %s" % (chain_pre.get(w, {}),))
Esempio n. 9
0
def readPaperHash2Id():
    """Return the object ``fif.load_json`` yields for ``FileHash2Id``."""
    hash2id = fif.load_json(FileHash2Id)
    return hash2id
Esempio n. 10
0
def loadDFTermTtl():
    """Return the object loaded from ``pdb/pm_df_t_ttl_2.json``."""
    df_term_ttl = fif.load_json("pdb/pm_df_t_ttl_2.json")
    return df_term_ttl
Esempio n. 11
0
def loadDFTermAbs():
    """Return the object loaded from ``pdb/pm_df_t_abs_2.json``."""
    df_term_abs = fif.load_json("pdb/pm_df_t_abs_2.json")
    return df_term_abs
Esempio n. 12
0
def loadIndexW2T():
    """Return the object ``fif.load_json`` yields for ``__IDX_WORD2TERM_FILE``."""
    word2term = fif.load_json(__IDX_WORD2TERM_FILE)
    return word2term
Esempio n. 13
0
def loadIndexT2P():
    """Return the object ``fif.load_json`` yields for ``__IDX_TERM2PAPER_FILE``."""
    term2paper = fif.load_json(__IDX_TERM2PAPER_FILE)
    return term2paper
Esempio n. 14
0
def generateTables():
    """Load ``pdb/pm_term_tbl.json`` and pass it to ``genTblPaperTerm``."""
    genTblPaperTerm(fif.load_json('pdb/pm_term_tbl.json'))