Ejemplo n.º 1
0
 def write_doc_doc(self, doc_top_mat, top_k=30):
     """
     Write the doc-doc relationships to the db

     Pairwise scores are the inverse of ``hellinger_distance`` between the
     element-wise square roots of the document rows.  For every document the
     first ``top_k`` entries of its ranked row are taken, and each resulting
     unordered pair is inserted into ``doc_doc`` exactly once as
     ``(doc_a, doc_b, score)`` with ``doc_a <= doc_b``.

     @param doc_top_mat: a document x topic matrix where higher scores indicate greater similarity
     @param top_k: number of related documents taken per document (a lower
         bound, since a document can also show up in other documents'
         lists); defaults to the previously hard-coded value of 30
     """
     ndocs = doc_top_mat.shape[0]
     scores = np.zeros([ndocs, ndocs])
     # Fill only the strict upper triangle: document i against every later one.
     for i in range(ndocs):
         scores[i, i + 1:] = 1 / hellinger_distance(
             doc_top_mat[i, :] ** 0.5, doc_top_mat[i + 1:, :] ** 0.5)
     # A zero distance produces an infinite score; store -1 as a sentinel.
     scores[np.isinf(scores)] = -1
     # Mirror onto the lower triangle so every row holds all of its pairings
     # (needed for an accurate per-document top-k).
     scores = scores + scores.T
     # NOTE(review): _get_rev_srt_ind presumably returns, per row, the column
     # indices ordered by descending score -- confirm against its definition.
     score_inds = self._get_rev_srt_ind(scores)[:, :top_k]
     db_list = []
     seen_pairs = set()  # so we don't have duplicates in the database
     for i in range(ndocs):
         for j in score_inds[i, :]:
             j = int(j)
             pair = (min(i, j), max(i, j))
             if pair in seen_pairs:
                 continue
             seen_pairs.add(pair)
             db_list.append((pair[0], pair[1], round(scores[pair], 3)))
     self.dbase.executemany(
         "INSERT INTO  doc_doc('id', 'doc_a', 'doc_b', 'score') VALUES(NULL, ?, ?, ?)",
         db_list)
Ejemplo n.º 2
0
 def write_topic_topic(self, top_term_mat):
     """
     Write the topic x topic matrix to the database

     For each topic ``i`` the scores against the later topics ``i+1, ...``
     are computed as the inverse of ``hellinger_distance`` on the
     element-wise square roots of the rows, then handed to ``executemany``
     together with the matching ``topic_a`` / ``topic_b`` indices.

     @param top_term_mat: topics x terms matrix, should represent log-likelihood for accurate calculations
     """
     # TODO make distance metric a user option
     execution_str = 'INSERT INTO topic_topic (id, topic_a, topic_b, score) VALUES(NULL, ?, ?, ?)'
     for i in range(top_term_mat.shape[0]):
         scores = 1 / hellinger_distance(top_term_mat[i, :] ** 0.5,
                                         top_term_mat[i + 1:, :] ** 0.5)
         # A zero distance produces an infinite score; store -1 as a sentinel.
         scores[np.isinf(scores)] = -1
         n_later = len(scores)
         # NOTE(review): generic_generator presumably yields its arguments in
         # lockstep as (topic_a, topic_b, score) rows -- confirm.
         res = generic_generator((i,) * n_later,
                                 range(i + 1, i + 1 + n_later), scores)
         self.dbase.executemany(execution_str, res)
Ejemplo n.º 3
0
 def write_topic_topic(self, top_term_mat):
     """
     Write the topic x topic matrix to the database

     For each topic ``i`` the scores against the later topics ``i+1, ...``
     are computed as the inverse of ``hellinger_distance`` on the
     element-wise square roots of the rows, then handed to ``executemany``
     together with the matching ``topic_a`` / ``topic_b`` indices.

     @param top_term_mat: topics x terms matrix, should represent log-likelihood for accurate calculations
     """
     # TODO make distance metric a user option
     execution_str = 'INSERT INTO topic_topic (id, topic_a, topic_b, score) VALUES(NULL, ?, ?, ?)'
     for i in range(top_term_mat.shape[0]):
         scores = 1 / hellinger_distance(top_term_mat[i, :] ** 0.5,
                                         top_term_mat[i + 1:, :] ** 0.5)
         # A zero distance produces an infinite score; store -1 as a sentinel.
         scores[np.isinf(scores)] = -1
         n_later = len(scores)
         # NOTE(review): generic_generator presumably yields its arguments in
         # lockstep as (topic_a, topic_b, score) rows -- confirm.
         res = generic_generator((i,) * n_later,
                                 range(i + 1, i + 1 + n_later), scores)
         self.dbase.executemany(execution_str, res)
Ejemplo n.º 4
0
    def get_top_related_terms(self, term, top_n=10):
        """
        Get the top_n terms related to the given term

        Relatedness is the inverse of ``hellinger_distance`` between the row
        of the query term and every row of the matrix unpickled from
        ``self.term_topic_obj_loc``; results are ordered from highest to
        lowest inverse distance.

        @param term: term object; only its ``id`` attribute is read and used
            as the row index into the unpickled matrix
        @param top_n: maximum number of related terms to return
        @return: list of the objects returned by ``self.get_term`` for the
            selected row indices, best match first
        """
        term_id = term.id
        # Use a context manager so the file handle is closed after loading.
        # NOTE(review): assumes the pickle was written by this application;
        # unpickling untrusted data is unsafe.
        with open(self.term_topic_obj_loc, 'rb') as mat_file:
            top_term_mat = pickle.load(mat_file)
        max_score = 100000000

        # compute the inverse Hellinger distance using the topic distributions
        # for each term (a higher inverse distance means more closely related)
        term_row = top_term_mat[term_id, :]
        scores = hellinger_distance(term_row, top_term_mat)
        # Give the query term a huge distance so its inverse score is tiny and
        # it is ranked behind the other terms instead of matching itself.
        scores[term_id] = max_score
        scores = 1 / scores
        top_term_ids = np.argsort(scores)[::-1][:top_n]
        return [self.get_term(int(ttid)) for ttid in top_term_ids]
Ejemplo n.º 5
0
    def get_top_related_terms(self, term, top_n=10):
        """
        Get the top_n terms related to the given term

        Relatedness is the inverse of ``hellinger_distance`` between the row
        of the query term and every row of the matrix unpickled from
        ``self.term_topic_obj_loc``; results are ordered from highest to
        lowest inverse distance.

        @param term: term object; only its ``id`` attribute is read and used
            as the row index into the unpickled matrix
        @param top_n: maximum number of related terms to return
        @return: list of the objects returned by ``self.get_term`` for the
            selected row indices, best match first
        """
        term_id = term.id
        # Use a context manager so the file handle is closed after loading.
        # NOTE(review): assumes the pickle was written by this application;
        # unpickling untrusted data is unsafe.
        with open(self.term_topic_obj_loc, 'rb') as mat_file:
            top_term_mat = pickle.load(mat_file)
        max_score = 100000000

        # compute the inverse Hellinger distance using the topic distributions
        # for each term (a higher inverse distance means more closely related)
        term_row = top_term_mat[term_id, :]
        scores = hellinger_distance(term_row, top_term_mat)
        # Give the query term a huge distance so its inverse score is tiny and
        # it is ranked behind the other terms instead of matching itself.
        scores[term_id] = max_score
        scores = 1 / scores
        top_term_ids = np.argsort(scores)[::-1][:top_n]
        return [self.get_term(int(ttid)) for ttid in top_term_ids]
Ejemplo n.º 6
0
 def write_doc_doc(self, doc_top_mat, top_k=30):
     """
     Write the doc-doc relationships to the db

     Pairwise scores are the inverse of ``hellinger_distance`` between the
     element-wise square roots of the document rows.  For every document the
     first ``top_k`` entries of its ranked row are taken, and each resulting
     unordered pair is inserted into ``doc_doc`` exactly once as
     ``(doc_a, doc_b, score)`` with ``doc_a <= doc_b``.

     @param doc_top_mat: a document x topic matrix where higher scores indicate greater similarity
     @param top_k: number of related documents taken per document (a lower
         bound, since a document can also show up in other documents'
         lists); defaults to the previously hard-coded value of 30
     """
     ndocs = doc_top_mat.shape[0]
     scores = np.zeros([ndocs, ndocs])
     # Fill only the strict upper triangle: document i against every later one.
     for i in range(ndocs):
         scores[i, i + 1:] = 1 / hellinger_distance(
             doc_top_mat[i, :] ** 0.5, doc_top_mat[i + 1:, :] ** 0.5)
     # A zero distance produces an infinite score; store -1 as a sentinel.
     scores[np.isinf(scores)] = -1
     # Mirror onto the lower triangle so every row holds all of its pairings
     # (needed for an accurate per-document top-k).
     scores = scores + scores.T
     # NOTE(review): _get_rev_srt_ind presumably returns, per row, the column
     # indices ordered by descending score -- confirm against its definition.
     score_inds = self._get_rev_srt_ind(scores)[:, :top_k]
     db_list = []
     seen_pairs = set()  # so we don't have duplicates in the database
     for i in range(ndocs):
         for j in score_inds[i, :]:
             j = int(j)
             pair = (min(i, j), max(i, j))
             if pair in seen_pairs:
                 continue
             seen_pairs.add(pair)
             db_list.append((pair[0], pair[1], round(scores[pair], 3)))
     self.dbase.executemany(
         "INSERT INTO  doc_doc('id', 'doc_a', 'doc_b', 'score') VALUES(NULL, ?, ?, ?)",
         db_list)