def Init(self):
    """Reset the weighter to its default, unconfigured state."""
    # initialize the base bag-of-entities LM state first
    BoeLmC.Init(self)
    # caches populated later by SetConf()/LoadDocText()
    self.hDocText = {}        # DocNo -> raw document text
    self.DocTextDir = ""      # directory scanned for doc text files
    self.ObjCenter = FbObjCacheCenterC()   # object metadata cache
    self.CtfCenter = TermCtfC()            # object ctf/idf statistics
    # default mixture weights: [tf, idf, text-sim] -> tf only
    self.lInferenceWeight = [1, 0, 0]
def Init(self):
    """Initialize the text-similarity object-object edge feature extractor."""
    ObjObjEdgeFeatureExtractorC.Init(self)
    self.FeatureName += 'TextSim'
    # full option sets would be ['name','desp','alias'] x ['coor','js','cosine'];
    # only description + cosine is active for now
    self.lObjField = ['desp']
    self.lFieldSimMetric = ['cosine']
    self.TermCtfIn = ""          # path of the term-ctf dump (set via conf)
    self.CtfCenter = TermCtfC()  # term statistics backing the similarity
def Process(PairCorrCntDictInName, CtfInName, OutName, SimMetric='tfidf'):
    """Compute correlation scores for every co-occurring object pair.

    PairCorrCntDictInName: pickled dict {'ObjA ObjB': co-occurrence count}
    CtfInName: dump loadable by TermCtfC (object collection frequencies)
    OutName: output path; receives a pickled dict {'ObjA\\tObjB': score}
    SimMetric: similarity measure name forwarded to CalcSimilarity
    """
    # fix: close the pickle files deterministically (originals leaked the handles)
    with open(PairCorrCntDictInName) as PairIn:
        hPairCnt = pickle.load(PairIn)
    logging.info('pair cnt loaded')
    ObjCtfCenter = TermCtfC()
    ObjCtfCenter.Load(CtfInName)
    hPairCorr = {}
    logging.info('start to calc obj corpus ana similarity')
    cnt = 0
    for key, tf in hPairCnt.items():
        ObjA, ObjB = key.split()
        # score both directions: the metric may be asymmetric
        hPairCorr[ObjA + '\t' + ObjB] = CalcSimilarity(ObjA, ObjB, tf, ObjCtfCenter, SimMetric)
        hPairCorr[ObjB + '\t' + ObjA] = CalcSimilarity(ObjB, ObjA, tf, ObjCtfCenter, SimMetric)
        cnt += 1
        if 0 == (cnt % 1000):
            logging.info('processed [%d] pair', cnt)
    with open(OutName, 'w') as Out:
        pickle.dump(hPairCorr, Out)
    logging.info('corr score dumped to [%s]', OutName)
    return
def CalcDocObjDistribution(cls, doc, lDocObj):
    """Return a normalized similarity distribution of doc over lDocObj.

    Each object's description LM is compared with the document LM via
    cosine similarity; scores are divided by their sum when non-zero.
    """
    DocLm = LmBaseC(doc)
    lScore = [
        LmBaseC.Similarity(LmBaseC(obj.GetDesp()), DocLm, TermCtfC(), 'cosine')
        for obj in lDocObj
    ]
    Norm = float(sum(lScore))
    if Norm == 0:
        # keep the raw (all-zero) scores, just record the anomaly
        logging.warn('sum of doc obj scores is 0. raw scores:\n%s',
                     json.dumps(lScore))
        return lScore
    return [score / Norm for score in lScore]
def CalcObjDistributionOnQuery(cls, lQObj, lDocObj):
    """Return a normalized distribution of doc-object similarity to the query objects."""
    lQLm = [LmBaseC(obj.GetDesp()) for obj in lQObj]
    lDLm = [LmBaseC(obj.GetDesp()) for obj in lDocObj]
    # one score per doc object: total cosine similarity against all query objects
    lSim = [
        sum(LmBaseC.Similarity(QLm, DLm, TermCtfC(), 'cosine') for QLm in lQLm)
        for DLm in lDLm
    ]
    Norm = float(sum(lSim))
    if 0 == Norm:
        # no overlap at all: return the raw zero scores unchanged
        logging.warn('doc obj has no similarity with q obj')
        return lSim
    return [score / Norm for score in lSim]
def Init(self):
    """Set up empty sub-centers; SetConf() later fills them from the conf file."""
    self.Word2VecFile = ""                 # path of the word2vec dump
    self.CtfCenter = TermCtfC()            # term ctf statistics
    self.FbObjCacheCenter = FbObjCacheCenterC()        # Freebase object cache
    self.CateDenseCenter = CateAttCntDensityCenterC()  # now is null
class ObjVecMakerC(cxBaseC):
    """Builds vector representations for Freebase objects.

    Three representations are produced:
      - description LM vectors: tf-prob * log(1/ctf-prob), a TF-IDF style weight
      - word2vec vectors: embedding rows looked up by object id
      - category-attribute-count vectors: per-category density (CDF) scores
    """

    def Init(self):
        # term collection-frequency statistics (loaded from conf 'termctf')
        self.CtfCenter = TermCtfC()
        # path of the word2vec file traversed by MakeWord2Vec
        self.Word2VecFile = ""
        self.CateDenseCenter = CateAttCntDensityCenterC()#now is null
        self.FbObjCacheCenter = FbObjCacheCenterC()

    def SetConf(self,ConfIn):
        # load every sub-center from the configuration file ConfIn
        conf = cxConf(ConfIn)
        self.CtfCenter.Load(conf.GetConf('termctf'))
        self.Word2VecFile = conf.GetConf('word2vec')
        self.CateDenseCenter.load(conf.GetConf('cateattdense'))
        self.FbObjCacheCenter.SetConf(ConfIn)
        print "inited"

    @staticmethod
    def ShowConf():
        # print the conf keys this class (and its object cache) expects
        print "termctf\nword2vec\ncateattdense"
        FbObjCacheCenterC.ShowConf()

    def MakeLmVec(self,lFbObj):
        # One sparse vector per object: for every description term,
        # weight = LM tf-probability * log(1 / collection-tf-probability).
        lVector = []
        print "start make lm vec"
        for FbObj in lFbObj:
            desp = FbObj.GetDesp()
            Lm = LmBaseC()
            Lm.SetFromRawText(desp)
            Vector = VectorC()
            for term in Lm.hTermTF:
                score = Lm.GetTFProb(term) * math.log(1.0/self.CtfCenter.GetCtfProb(term))
                Vector.hDim[term] = score
            Vector.Key = FbObj.GetId()
            lVector.append(Vector)
        return lVector

    def MakeWord2Vec(self,lFbObjId):
        # One vector per requested object id, filled by a single pass over the
        # word2vec file; ids not present in the file keep an empty hDim.
        print "start make word2vec [%s]" %(self.Word2VecFile)
        lObjId = lFbObjId
        # id -> output position, so file order doesn't matter
        hObjP = dict(zip(lObjId,range(len(lObjId))))
        lVector = []
        for i in range(len(lObjId)):
            Vector = VectorC()
            Vector.Key = lObjId[i]
            lVector.append(Vector)
        reader = Word2VecReaderC()
        reader.open(self.Word2VecFile)
        print "start tarverse word2vec file [%s]" %(self.Word2VecFile)
        for word2vec in reader:
            if not word2vec.word in hObjP:
                continue
            p = hObjP[word2vec.word]
            lVector[p].hDim = word2vec.hDim
            print "get [%s]" %(lVector[p].Key)
        reader.close()
        return lVector

    def IsStopCate(self,cate):
        # True when the category starts with a stop-listed prefix
        lStop = ['/common']
        for item in lStop:
            if item == cate[:len(item)]:
                return True
        return False

    def MakeCateAttCntVec(self,lFbObj):
        #require the cate att cnt in APIBase
        #and the cate att distribution (empirical) center
        # One normalized vector per object: category -> empirical density of
        # its attribute count (stop categories skipped).
        lVector = []
        print "start make cate att cnt vec"
        for FbObj in lFbObj:
            Vector = VectorC()
            Vector.Key = FbObj.GetId()
            hCate = FbObj.FormCategoryAttCnt()
            print "cate for [%s]: \n%s" %(Vector.Key,json.dumps(hCate))
            for cate in hCate:
                if self.IsStopCate(cate):
                    continue
                cnt = hCate[cate]
                cdf = self.CateDenseCenter.GetProb(cate, cnt)
                print "cate [%s] prob[%f]" %(cate,cdf)
                Vector.hDim[cate] = cdf
            Vector.Normalize()
            lVector.append(Vector)
        return lVector

    def ProcessQObjFile(self,InName,OutName):
        #in: qid query objid
        #out: OutName_desp,OutName_cate,OutName_word2vec
        # For each (qid, query, objid) row: fetch the object, write its desp-LM
        # and category vectors immediately, and batch the word2vec lookups into
        # one pass over the embedding file at the end.
        OutDesp = open(OutName + "_desp",'w')
        OutCate = open(OutName + "_cate",'w')
        OutWord2Vec = open(OutName + "_word2vec","w")
        lQidQuery = []
        lFbObjId = []
        lFbObjName = []
        #read objid
        for line in open(InName):
            vCol = line.strip().split('\t')
            lQidQuery.append([vCol[0],vCol[1]])
            FbObj = self.FbObjCacheCenter.FetchObj(vCol[2])
            lFbObjId.append(FbObj.GetId())
            lFbObjName.append(FbObj.GetName())
            lDespVec = self.MakeLmVec([FbObj])
            lCateVec = self.MakeCateAttCntVec([FbObj])
            try:
                print >> OutDesp,vCol[0] + "\t" + vCol[1] + '\t' + FbObj.GetId() + '\t' + FbObj.GetName() + '\t' + lDespVec[0].dumps()
                print >> OutCate,vCol[0] + "\t" + vCol[1] + '\t'+ FbObj.GetId() + '\t' + FbObj.GetName() + '\t' + lCateVec[0].dumps()
            except UnicodeEncodeError:
                # best-effort dump: skip rows that can't be encoded
                print "unicode encode error, discard"
            # release the cached object's payload before the next row
            FbObj.clear()
        print "fetched, lm and cate vecs made, start make vecs from word2vec"
        #extract and dump
        lWord2Vec = self.MakeWord2Vec(lFbObjId)
        print "dumping"
        for i in range(len(lQidQuery)):
            try:
                print >> OutWord2Vec,lQidQuery[i][0] + "\t" + lQidQuery[i][1] + '\t'+ lFbObjId[i] + '\t' + lFbObjName[i] + '\t' + lWord2Vec[i].dumps()
            except UnicodeEncodeError:
                print "unicode encode error, discard"
        OutDesp.close()
        OutCate.close()
        OutWord2Vec.close()
        print 'done'
        return True
def SetConf(self, ConfIn):
    """Apply the base configuration, then load the term-ctf center from 'termctf'."""
    ObjObjEdgeFeatureExtractorC.SetConf(self, ConfIn)
    CtfInName = self.conf.GetConf('termctf')
    self.TermCtfIn = CtfInName
    self.CtfCenter = TermCtfC(CtfInName)
# Build and dump the top correlated neighbors for a list of target objects.
# NOTE(review): relies on names bound outside this view (conf, TargetObjIn,
# PairDictIn, PairRawIn, IdfDictIn, OutName, NumOfNeighbor) — verify at caller.
CorrType = conf.GetConf('correlationmeasure')
# target objects: one id per line; map id -> line position
lLine = open(TargetObjIn).read().splitlines()
hTargetObj = dict(zip(lLine,range(len(lLine))))
print "[%d] target obj load" %(len(hTargetObj))
hPair = {}
if PairDictIn != "":
    # pre-counted pair dictionary (pickled)
    hPair = pickle.load(open(PairDictIn))
    print "[%d] pair cnt load" %(len(hPair))
else:
    if PairRawIn != "":
        # fall back to building pair counts from the raw co-occurrence file
        hPair = FormPairDictFromRaw(PairRawIn)
CtfCenter = TermCtfC()
CtfCenter.Load(IdfDictIn)
print "df load"
print "forming neighbors..."
hTargetObjNeighbor = FormTargetObjNeighbors(hTargetObj, hPair, CtfCenter,CorrType)
print "dumpping results..."
DumpTargetObjTopNeighbor(hTargetObjNeighbor, OutName, NumOfNeighbor)
print "finished"
class BoeLmWeighterC(BoeLmC): def __init__(self,ConfIn = ""): self.Init() if "" != ConfIn: self.SetConf(ConfIn) def Init(self): BoeLmC.Init(self) self.DocTextDir = "" self.ObjCenter = FbObjCacheCenterC() self.CtfCenter = TermCtfC() self.lInferenceWeight = [1,0,0] self.hDocText = {} def SetConf(self,ConfIn): conf = cxConfC(ConfIn) self.DocTextDir = conf.GetConf('doctextdir') self.LoadDocText() self.ObjCenter.SetConf(ConfIn) CtfInName = conf.GetConf('objctf') self.CtfCenter.Load(CtfInName) @classmethod def ShowConf(): print 'doctextdir\nobjctf' FbObjCacheCenterC.ShowConf() def LoadDocText(self): for fname in WalkDir(self.DocTextDir): for line in open(fname): DocNo,text = line.strip().split('\t') self.hDocText[DocNo] = text logging.info('doc text loaded') def GetAllIdf(self,DocKg): lItem = DocKg.hNodeId.items() lItem.sort(key=lambda item:item[1]) lObjId = [item[0] for item in lItem] lRes = [] for ObjId in lObjId: idf = self.CtfCenter.GetLogIdf(ObjId) lRes.append(idf) return lRes def GetAllTf(self,DocKg): return list(DocKg.vNodeWeight) def GetAllTextCosine(self,DocKg): DocText = "" if DocKg.DocNo in self.hDocText: DocText = self.hDocText[DocKg.DocNo] lCos = [] if "" == DocText: return [0] * len(DocKg) DocLm = LmBaseC(DocText) lItem = DocKg.hNodeId.items() lItem.sort(key=lambda item:item[1]) lObjId = [item[0] for item in lItem] for ObjId in lObjId: desp = self.ObjCenter.FetchObjDesp(ObjId) lm = LmBaseC(desp) lCos.append(LmBaseC.Cosine(lm, DocLm)) return lCos def GetTextCosine(self,ObjId,DocKg): DocText = "" if DocKg.DocNo in self.hDocText: DocText = self.hDocText[DocKg.DocNo] DocLm = LmBaseC(DocText) desp = self.ObjCenter.FetchObjDesp(ObjId) lm = LmBaseC(desp) score = LmBaseC.Cosine(lm, DocLm) if 0 == score: return self.MinLogProb return math.log(score) def LinearWeightTfIdfTextSim(self,ObjId,DocKg,TfScore = 1,IdfScore = 0, TextSimScore = 0): if not ObjId in DocKg: return self.MinLogProb lTf = np.zeros(len(DocKg)) lIdf = np.zeros(len(DocKg)) if TfScore != 0: lTf = 
np.array(self.GetAllTf(DocKg)) if IdfScore != 0: lIdf = np.array(self.GetAllIdf(DocKg)) # lCos = np.array(self.GetAllTextCosine(DocKg)) TextSim = 0 if TextSimScore != 0: TextSim = self.GetTextCosine(ObjId,DocKg) W = np.array([TfScore,IdfScore,TextSimScore]) W = W / float(sum(W)) lScore = lTf * W[0] + lIdf * W[1] res = self.MinLogProb * (W[0] + W[1]) if ObjId in DocKg: p = DocKg.hNodeId[ObjId] res = lScore[p] res = res + TextSim * TextSimScore return res def inference(self, ObjId, DocKg): return self.LinearWeightTfIdfTextSim(ObjId, DocKg, self.lInferenceWeight[0], self.lInferenceWeight[1], self.lInferenceWeight[2])
what I do: I get the idf of mesh term from the doc->MeSH ana dict what's my input: DocMeSHDict what's my output: termctf.dump ''' import site site.addsitedir('/bos/usr0/cx/PyCode/cxPyLib') from IndriRelate.CtfLoader import TermCtfC import sys import pickle if 3 != len(sys.argv): print "DocMeSHDict + outname" sys.exit() hDocMeSH = pickle.load(open(sys.argv[1])) CtfCenter = TermCtfC() for DocNo,lAna in hDocMeSH.items(): for UI,term in lAna: CtfCenter.insert(UI) CtfCenter.dump(sys.argv[2]) print "done"