def write_doc_doc(self, doc_top_mat, top_k=30):
    """
    Write the doc-doc relationships to the db

    Every pair of documents is scored with the inverse of
    hellinger_distance() applied to the square-rooted rows of doc_top_mat.
    For each document the first top_k column indices returned by
    self._get_rev_srt_ind() are kept, and each unordered pair is inserted
    into the doc_doc table at most once.

    @param doc_top_mat: a document x topic matrix (one row per document)
    @param top_k: number of related documents kept per document; this is a
        lower bound because a document can also be stored as the partner of
        another document. Defaults to 30.
    """
    ndocs = doc_top_mat.shape[0]
    scores = np.zeros([ndocs, ndocs])
    # Only the strict upper triangle is computed; the diagonal stays 0.
    for i in range(ndocs):
        scores[i, i + 1:] = 1 / hellinger_distance(
            doc_top_mat[i, :]**0.5, doc_top_mat[i + 1:, :]**0.5)
    # A zero distance makes the inverse infinite; such pairs are stored as -1.
    scores[np.where(np.isinf(scores))] = -1
    # Mirror the upper triangle so every row holds all of its pair scores.
    scores = scores + scores.T
    # NOTE(review): assumes _get_rev_srt_ind returns, per row, the column
    # indices ordered from highest to lowest score -- confirm.
    score_inds = self._get_rev_srt_ind(scores)[:, :top_k]

    db_list = []
    seen_pairs = set()  # so we don't have duplicates in the database
    for i in range(scores.shape[0]):
        for j in score_inds[i, :]:
            j = int(j)
            pair = (min(i, j), max(i, j))
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)
            db_list.append((pair[0], pair[1], round(scores[pair[0], pair[1]], 3)))
    self.dbase.executemany(
        "INSERT INTO doc_doc('id', 'doc_a', 'doc_b', 'score') VALUES(NULL, ?, ?, ?)",
        db_list)
def write_topic_topic(self, top_term_mat):
    """
    Write the topic x topic matrix to the database

    Each topic i is scored against every topic after it (i+1 .. end) using
    the inverse of hellinger_distance() on the square-rooted rows, so each
    unordered topic pair is handled once. The rows are inserted into the
    topic_topic table.

    @param top_term_mat: topics x terms matrix, should represent
        log-likelihood for accurate calculations
    """
    # NOTE(review): the entries are square-rooted below, which needs
    # non-negative values; that seems at odds with "log-likelihood" -- confirm.
    # TODO make distance metric a user option
    execution_str = 'INSERT INTO topic_topic (id, topic_a, topic_b, score) VALUES(NULL, ?, ?, ?)'
    for i in range(top_term_mat.shape[0]):
        scores = 1 / hellinger_distance(top_term_mat[i, :]**0.5,
                                        top_term_mat[i + 1:, :]**0.5)
        # A zero distance makes the inverse infinite; such pairs are stored as -1.
        scores[np.where(np.isinf(scores))] = -1
        n_pairs = len(scores)
        # presumably generic_generator yields one (topic_a, topic_b, score)
        # row per position of its arguments -- verify against its definition
        res = generic_generator((i,) * n_pairs,
                                range(i + 1, i + 1 + n_pairs),
                                scores)
        self.dbase.executemany(execution_str, res)
def write_topic_topic(self, top_term_mat):
    """
    Write the topic x topic matrix to the database

    Each topic i is scored against every topic after it (i+1 .. end) using
    the inverse of hellinger_distance() on the square-rooted rows, so each
    unordered topic pair is handled once. The rows are inserted into the
    topic_topic table.

    @param top_term_mat: topics x terms matrix, should represent
        log-likelihood for accurate calculations
    """
    # NOTE(review): the entries are square-rooted below, which needs
    # non-negative values; that seems at odds with "log-likelihood" -- confirm.
    # TODO make distance metric a user option
    execution_str = 'INSERT INTO topic_topic (id, topic_a, topic_b, score) VALUES(NULL, ?, ?, ?)'
    for i in range(top_term_mat.shape[0]):
        scores = 1 / hellinger_distance(top_term_mat[i, :]**0.5,
                                        top_term_mat[i + 1:, :]**0.5)
        # A zero distance makes the inverse infinite; such pairs are stored as -1.
        scores[np.where(np.isinf(scores))] = -1
        n_pairs = len(scores)
        # presumably generic_generator yields one (topic_a, topic_b, score)
        # row per position of its arguments -- verify against its definition
        res = generic_generator((i,) * n_pairs,
                                range(i + 1, i + 1 + n_pairs),
                                scores)
        self.dbase.executemany(execution_str, res)
def get_top_related_terms(self, term, top_n=10):
    """
    Get the top_n terms related to the given term

    @param term: term object; its ``id`` attribute selects the row of the
        pickled matrix used as the query
    @param top_n: maximum number of related terms to return
    @return: list of the objects returned by self.get_term(), ordered from
        highest to lowest inverse distance
    """
    term_id = term.id
    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # if the file at term_topic_obj_loc is produced by this project -- confirm.
    # The context manager closes the file handle once the matrix is loaded.
    with open(self.term_topic_obj_loc, 'rb') as mat_file:
        top_term_mat = pickle.load(mat_file)
    max_score = 100000000

    # Distance from the query term's row to every row of the matrix (a lower
    # distance means more related); it is inverted below so that a higher
    # score means more related.
    # NOTE(review): unlike the write_* methods in this file, the rows are not
    # square-rooted before calling hellinger_distance -- confirm intended.
    term_row = top_term_mat[term_id, :]
    scores = hellinger_distance(term_row, top_term_mat)
    # Give the query term itself a huge distance so its inverse is close to
    # zero and it sorts behind the other terms.
    scores[term_id] = max_score
    scores = 1 / scores
    top_term_ids = np.argsort(scores)[::-1][:top_n]
    return [self.get_term(int(ttid)) for ttid in top_term_ids]
def get_top_related_terms(self, term, top_n=10):
    """
    Get the top_n terms related to the given term

    @param term: term object; its ``id`` attribute selects the row of the
        pickled matrix used as the query
    @param top_n: maximum number of related terms to return
    @return: list of the objects returned by self.get_term(), ordered from
        highest to lowest inverse distance
    """
    term_id = term.id
    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # if the file at term_topic_obj_loc is produced by this project -- confirm.
    # The context manager closes the file handle once the matrix is loaded.
    with open(self.term_topic_obj_loc, 'rb') as mat_file:
        top_term_mat = pickle.load(mat_file)
    max_score = 100000000

    # Distance from the query term's row to every row of the matrix (a lower
    # distance means more related); it is inverted below so that a higher
    # score means more related.
    # NOTE(review): unlike the write_* methods in this file, the rows are not
    # square-rooted before calling hellinger_distance -- confirm intended.
    term_row = top_term_mat[term_id, :]
    scores = hellinger_distance(term_row, top_term_mat)
    # Give the query term itself a huge distance so its inverse is close to
    # zero and it sorts behind the other terms.
    scores[term_id] = max_score
    scores = 1 / scores
    top_term_ids = np.argsort(scores)[::-1][:top_n]
    return [self.get_term(int(ttid)) for ttid in top_term_ids]
def write_doc_doc(self, doc_top_mat, top_k=30):
    """
    Write the doc-doc relationships to the db

    Every pair of documents is scored with the inverse of
    hellinger_distance() applied to the square-rooted rows of doc_top_mat.
    For each document the first top_k column indices returned by
    self._get_rev_srt_ind() are kept, and each unordered pair is inserted
    into the doc_doc table at most once.

    @param doc_top_mat: a document x topic matrix (one row per document)
    @param top_k: number of related documents kept per document; this is a
        lower bound because a document can also be stored as the partner of
        another document. Defaults to 30.
    """
    ndocs = doc_top_mat.shape[0]
    scores = np.zeros([ndocs, ndocs])
    # Only the strict upper triangle is computed; the diagonal stays 0.
    for i in range(ndocs):
        scores[i, i + 1:] = 1 / hellinger_distance(
            doc_top_mat[i, :]**0.5, doc_top_mat[i + 1:, :]**0.5)
    # A zero distance makes the inverse infinite; such pairs are stored as -1.
    scores[np.where(np.isinf(scores))] = -1
    # Mirror the upper triangle so every row holds all of its pair scores.
    scores = scores + scores.T
    # NOTE(review): assumes _get_rev_srt_ind returns, per row, the column
    # indices ordered from highest to lowest score -- confirm.
    score_inds = self._get_rev_srt_ind(scores)[:, :top_k]

    db_list = []
    seen_pairs = set()  # so we don't have duplicates in the database
    for i in range(scores.shape[0]):
        for j in score_inds[i, :]:
            j = int(j)
            pair = (min(i, j), max(i, j))
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)
            db_list.append((pair[0], pair[1], round(scores[pair[0], pair[1]], 3)))
    self.dbase.executemany(
        "INSERT INTO doc_doc('id', 'doc_a', 'doc_b', 'score') VALUES(NULL, ?, ?, ?)",
        db_list)