Ejemplo n.º 1
0
    def get_analogy_blend(self):
        """Blend the concept-by-document matrix with the other matrices.

        Concepts whose ``col_op(len)`` count in the documents matrix is at
        least 3 are kept.  The documents matrix is restricted to those
        columns, transposed and squished, and the result is blended with
        every matrix in ``self.other_matrices``.

        Returns:
            A ``(theblend, study_concepts)`` tuple.  ``study_concepts`` is
            the set of row labels of the document-derived matrix, or of the
            blend itself when that matrix is ``None``.
        """
        # Every matrix in self.other_matrices takes part in the blend.  A
        # '.smat'-only filter used to be computed here and was immediately
        # overwritten, so it was dead code and has been removed.  The values
        # are materialised as a list so that the concatenation below also
        # works when values() returns a view instead of a list.
        other_matrices = list(self.other_matrices.values())

        # NOTE(review): the documents matrix is fetched once and reused for
        # both the counting and the slicing step; this assumes repeated
        # get_documents_matrix() calls return equivalent data -- confirm.
        orig_doc_matrix = self.get_documents_matrix()

        # Find concepts used at least three times.
        concept_counts = orig_doc_matrix.col_op(len)
        valid_concepts = set(
            concept
            for concept, count in concept_counts.to_sparse().named_items()
            if count >= 3
        )

        # Extract relevant concepts from the doc matrix;
        # transpose it so it's concepts vs. documents.
        concept_indices = [orig_doc_matrix.col_index(c)
                           for c in valid_concepts]

        # NOTE: canonical documents can affect the stats this way.
        # Is there a clean way to fix this?

        doc_matrix = orig_doc_matrix[:, concept_indices].T.squish()
        if doc_matrix is None:
            theblend = blend(other_matrices)
            study_concepts = set(theblend.row_labels)
        else:
            theblend = blend([doc_matrix] + other_matrices)
            study_concepts = set(doc_matrix.row_labels)
        return theblend, study_concepts
Ejemplo n.º 2
0
    def get_analogy_blend(self):
        """Blend the concept-by-document matrix with the other matrices.

        Concepts whose ``col_op(len)`` count in the documents matrix is at
        least 3 are kept.  The documents matrix is restricted to those
        columns, transposed and squished, and the result is blended with
        every matrix in ``self.other_matrices``.

        Returns:
            A ``(theblend, study_concepts)`` tuple.  ``study_concepts`` is
            the set of row labels of the document-derived matrix, or of the
            blend itself when that matrix is ``None``.
        """
        # Every matrix in self.other_matrices takes part in the blend.  A
        # '.smat'-only filter used to be computed here and was immediately
        # overwritten, so it was dead code and has been removed.  The values
        # are materialised as a list so that the concatenation below also
        # works when values() returns a view instead of a list.
        other_matrices = list(self.other_matrices.values())

        # NOTE(review): the documents matrix is fetched once and reused for
        # both the counting and the slicing step; this assumes repeated
        # get_documents_matrix() calls return equivalent data -- confirm.
        orig_doc_matrix = self.get_documents_matrix()

        # Find concepts used at least three times.
        concept_counts = orig_doc_matrix.col_op(len)
        valid_concepts = set(
            concept
            for concept, count in concept_counts.to_sparse().named_items()
            if count >= 3
        )

        # Extract relevant concepts from the doc matrix;
        # transpose it so it's concepts vs. documents.
        concept_indices = [
            orig_doc_matrix.col_index(c) for c in valid_concepts
        ]

        # NOTE: canonical documents can affect the stats this way.
        # Is there a clean way to fix this?

        doc_matrix = orig_doc_matrix[:, concept_indices].T.squish()
        if doc_matrix is None:
            theblend = blend(other_matrices)
            study_concepts = set(theblend.row_labels)
        else:
            theblend = blend([doc_matrix] + other_matrices)
            study_concepts = set(doc_matrix.row_labels)
        return theblend, study_concepts
Ejemplo n.º 3
0
    def get_assoc_blend(self):
        """Blend the documents' association matrix with the stored ones.

        Only entries of ``self.other_matrices`` whose name ends in
        ``'.assoc.smat'`` are blended, and each of them must be square.

        Returns:
            A ``(theblend, study_concepts)`` tuple.  ``study_concepts`` is
            the set of row labels of the documents' association matrix, or
            of the blend itself when that matrix is ``None``.

        Raises:
            ValueError: if a selected matrix has differing first and second
                dimensions.
        """
        documents_assoc = self.get_documents_assoc()
        self._step('Blending...')

        # Use association matrices only
        # (unless we figure out how to do both kinds of blending).
        assoc_matrices = []
        for label, candidate in self.other_matrices.items():
            if not label.endswith('.assoc.smat'):
                continue
            n_rows, n_cols = candidate.shape[0], candidate.shape[1]
            if n_rows != n_cols:
                raise ValueError("The matrix %s is not square" % label)
            assoc_matrices.append(candidate)

        if documents_assoc is not None:
            theblend = blend([documents_assoc] + assoc_matrices)
            return theblend, set(documents_assoc.row_labels)

        # No document matrix: the blend itself defines the study concepts.
        theblend = blend(assoc_matrices)
        return theblend, set(theblend.row_labels)
Ejemplo n.º 4
0
    def get_assoc_blend(self):
        """Blend the documents' association matrix with the stored ones.

        Only entries of ``self.other_matrices`` whose name ends in
        ``'.assoc.smat'`` are blended, and each of them must be square.

        Returns:
            A ``(theblend, study_concepts)`` tuple.  ``study_concepts`` is
            the set of row labels of the documents' association matrix, or
            of the blend itself when that matrix is ``None``.

        Raises:
            ValueError: if a selected matrix has differing first and second
                dimensions.
        """
        documents_assoc = self.get_documents_assoc()
        self._step('Blending...')

        # Use association matrices only
        # (unless we figure out how to do both kinds of blending).
        assoc_matrices = []
        for label, candidate in self.other_matrices.items():
            if not label.endswith('.assoc.smat'):
                continue
            n_rows, n_cols = candidate.shape[0], candidate.shape[1]
            if n_rows != n_cols:
                raise ValueError("The matrix %s is not square" % label)
            assoc_matrices.append(candidate)

        if documents_assoc is not None:
            theblend = blend([documents_assoc] + assoc_matrices)
            return theblend, set(documents_assoc.row_labels)

        # No document matrix: the blend itself defines the study concepts.
        theblend = blend(assoc_matrices)
        return theblend, set(theblend.row_labels)
Ejemplo n.º 5
0
def make_blend(thefile):
    """Blend ConceptNet with the matrix stored in *thefile*.

    The English ConceptNet matrix and the matrix loaded from *thefile* are
    both passed through ``normalize_all()`` and blended with weights 0.9
    and 0.1.  A similarity operator is reconstructed from the blend's SVD
    and stored under ``pd[<name>]['blend']``, where ``<name>`` is the part
    of *thefile* before its first ``'.'``.  ``pd`` is not defined in this
    block and must be provided by the surrounding file.

    Returns:
        The reconstructed similarity object.
    """
    conceptnet_norm = divisi2.network.conceptnet_matrix('en').normalize_all()
    game_norm = divisi2.load(thefile).normalize_all()

    weights = [0.9, 0.1]
    combined = blend([conceptnet_norm, game_norm], weights)
    left_axes, singular_values, _right_axes = combined.svd()

    similarity = divisi2.reconstruct_similarity(left_axes, singular_values)

    # Everything before the first dot of the file name is the storage key.
    store_key = thefile.split('.')[0]
    pd.mkdir(store_key)
    pd[store_key]['blend'] = similarity
    return similarity
	obj_list.append(obj)
 		# weighted_triple = (rel_triple, weight)
	weighted_relations.append(rel_triple)

# Build a sparse matrix from the collected relations, blend it with
# ConceptNet, and save the embeddings of the objects both have in common.
# Single-argument print(...) calls behave the same on Python 2 and 3.
print(len(weighted_relations))
obj_list = set(obj_list)  # de-duplicate the collected objects
print(len(obj_list))
matrix = divisi2.make_sparse(weighted_relations)

# ConceptNet matrix
A = divisi2.network.conceptnet_matrix('en')
A_concept_axes, A_axis_weights, A_feature_axes = A.svd(k=100)

blended_matrix = blend([matrix, A])
concept_axes, axis_weights, feature_axes = blended_matrix.svd(k=100)

# Sorted so that row i of both saved arrays refers to the same object in a
# reproducible order; iterating the raw set intersection gives an arbitrary
# order that cannot be recovered from the .npy files afterwards.
common_objects = sorted(obj_list.intersection(A.row_labels))
print(len(A.row_labels))

# Save embeddings for ConceptNet
cnet_object_embeddings = np.array(
    [A_concept_axes.row_named(obj) for obj in common_objects])
np.save('cnet_object_embeddings.npy', cnet_object_embeddings)

# Save embeddings for Blended Matrix
blended_object_embeddings = np.array(
    [concept_axes.row_named(obj) for obj in common_objects])
np.save('blended_object_embeddings.npy', blended_object_embeddings)
v = []