def get_documents_matrix(self):
    """
    Get a matrix of documents vs. concepts.

    This is temporarily cached (besides what StudyDir does) because it
    will be needed multiple times in analyzing a study.

    Returns None when the study has no documents.

    FIXME: try to make canonical documents not change the results
    """
    self._step('Building document matrix...')
    if self.num_documents == 0:
        # Nothing to build. (Previously this branch hit `assert False`
        # before the return, crashing on empty studies except under -O;
        # returning None is the documented behavior.)
        return None
    if self._documents_matrix is not None:
        return self._documents_matrix
    entries = []
    for doc in self.study_documents:
        self._step(doc.name)
        # Cap each document at its first 1000 extracted concepts.
        for concept, value in doc.extract_concepts_with_negation()[:1000]:
            if concept not in PUNCTUATION and not en_nl.is_blacklisted(concept):
                entries.append((value, doc.name, concept))
    documents_matrix = divisi2.make_sparse(entries).normalize_tfidf(
        cols_are_terms=True)
    canon_entries = []
    for doc in self.canonical_documents:
        self._step(doc.name)
        for concept, value in doc.extract_concepts_with_negation()[:1000]:
            if concept not in PUNCTUATION and not en_nl.is_blacklisted(concept):
                canon_entries.append((value, doc.name, concept))
    if canon_entries:
        # Canonical documents are row-normalized rather than tf-idf
        # weighted before being added in (see FIXME above).
        canonical_matrix = divisi2.make_sparse(canon_entries).normalize_rows()
        self._documents_matrix = documents_matrix + canonical_matrix
    else:
        self._documents_matrix = documents_matrix
    return self._documents_matrix
def get_documents_matrix(self):
    """
    Get a matrix of documents vs. concepts.

    This is temporarily cached (besides what StudyDir does) because it
    will be needed multiple times in analyzing a study.

    Returns None when the study has no documents.

    FIXME: try to make canonical documents not change the results
    """
    self._step('Building document matrix...')
    if self.num_documents == 0:
        # Empty study: return None as documented. The previous code ran
        # `assert False` first, making this return unreachable except
        # under -O -- an assert used for control flow.
        return None
    if self._documents_matrix is not None:
        return self._documents_matrix
    entries = []
    for doc in self.study_documents:
        self._step(doc.name)
        # Only the first 1000 concepts per document are used.
        for concept, value in doc.extract_concepts_with_negation()[:1000]:
            if concept not in PUNCTUATION and not en_nl.is_blacklisted(concept):
                entries.append((value, doc.name, concept))
    documents_matrix = divisi2.make_sparse(entries).normalize_tfidf(
        cols_are_terms=True)
    canon_entries = []
    for doc in self.canonical_documents:
        self._step(doc.name)
        for concept, value in doc.extract_concepts_with_negation()[:1000]:
            if concept not in PUNCTUATION and not en_nl.is_blacklisted(concept):
                canon_entries.append((value, doc.name, concept))
    if canon_entries:
        # Canonical rows get plain row normalization, not tf-idf
        # (they would otherwise reweight the vocabulary; see FIXME).
        canonical_matrix = divisi2.make_sparse(
            canon_entries).normalize_rows()
        self._documents_matrix = documents_matrix + canonical_matrix
    else:
        self._documents_matrix = documents_matrix
    return self._documents_matrix
def build_matrix(query, cutoff=DEFAULT_CUTOFF,
                 identity_weight=DEFAULT_IDENTITY_WEIGHT,
                 data_source=conceptnet_quads,
                 transform=to_value_concept_feature):
    """
    Builds a Divisi2 SparseMatrix from relational data.

    One required argument is the `query`, which can be a QuerySet or just
    a language identifier.

    Optional arguments:

    - `cutoff`: specifies how common a concept has to be to appear in
      the matrix. Defaults to DEFAULT_CUTOFF=5.
    - `identity_weight`: weight of the (concept, InheritsFrom, concept)
      self-identity assertions added for every concept; set to 0 to skip
      adding identities entirely.
    - `data_source`: a function that produces (concept1, rel, concept2,
      value) quads given the `query` and `cutoff`. Defaults to
      :meth:`conceptnet_quads`.
    - `transform`: the function for transforming quads into
      (value, row_name, column_name) triples. Defaults to
      :meth:`to_value_concept_feature`, which yields
      (value, concept, feature) triples.
    """
    logger.info("Performing ConceptNet query")
    quads = list(data_source(query, cutoff))
    # TODO: separate this out into a customizable function
    if identity_weight > 0:
        logger.info("Adding identities")
        morequads = []
        concept_set = set(q[0] for q in quads)
        for concept in concept_set:
            morequads.append(
                (concept, 'InheritsFrom', concept, identity_weight))
        for c1, rel, c2, val in quads:
            if rel == 'IsA':
                # BUG FIX: previously appended (c1, 'InheritsFrom', c1, val),
                # which only duplicated the self-identity added above.
                # An IsA assertion means c1 inherits from c2.
                morequads.append((c1, 'InheritsFrom', c2, val))
        quads.extend(morequads)
    logger.info("Creating triples")
    triples = transform(quads)
    logger.info("Building matrix")
    matrix = divisi2.make_sparse(triples)
    logger.info("Squishing underused rows")
    return matrix.squish(cutoff)
def get_documents_matrix(self):
    """
    Get a matrix of documents vs. concepts.

    This is temporarily cached (besides what StudyDir does) because it
    will be needed multiple times in analyzing a study.
    """
    self._step('Building document matrix...')
    if self.num_documents == 0:
        return None
    if self._documents_matrix is not None:
        # Already built during this analysis run; reuse it.
        return self._documents_matrix
    # One (value, document, concept) entry per interesting concept.
    triples = [
        (value, doc.name, concept)
        for doc in self.documents
        for concept, value in doc.extract_concepts_with_negation()
        if concept not in PUNCTUATION and not en_nl.is_blacklisted(concept)
    ]
    self._documents_matrix = divisi2.make_sparse(triples)
    return self._documents_matrix
def make_divisi_matrix(filename):
    """Parse an Inform source file and build a normalized Divisi matrix.

    Side effects: writes nouns and verbs to '<game>.over', prints nouns
    to stdout, and pickles the matrix to '<game>.pickle'.

    Returns the normalized SparseMatrix.
    """
    parsedlist = inform_parser(filename)
    # NOTE(review): takes everything before the FIRST dot, so a filename
    # in a dotted directory would truncate wrongly -- os.path.splitext
    # may be what was meant; confirm against callers.
    game = filename.split('.')[0]
    # Each parsed quad becomes (sign, concept, ('right', rel, concept)):
    # x[3] truthy -> +1, else -1; '^' is Inform's escaped apostrophe.
    thinglist = [(1 if x[3] else -1,
                  english.normalize(x[0].replace('^', "'")),
                  ('right', x[1], english.normalize(x[2].replace('^', "'"))))
                 for x in parsedlist]
    # Write out the confusingly-named overlist. First, the nouns.
    overlist = open(game + '.over', 'w')
    for concept1, rel, concept2, val in parsedlist:
        if rel == 'HasProperty' and concept2 == 'mark_as_thing':
            print >> overlist, concept1
            # NOTE(review): also echoes each noun to stdout -- looks like
            # leftover debug output; verify before removing.
            print concept1
    # Now the verbs.
    verbs = verb_reader(filename)
    for verb in verbs:
        print >> overlist, verb
    overlist.close()
    game_matrix = divisi2.make_sparse(thinglist).normalize_all()
    divisi2.save(game_matrix, game + '.pickle')
    return game_matrix
def build_matrix(query, cutoff=DEFAULT_CUTOFF,
                 identity_weight=DEFAULT_IDENTITY_WEIGHT,
                 data_source=conceptnet_quads,
                 transform=to_value_concept_feature):
    """
    Builds a Divisi2 SparseMatrix from relational data.

    One required argument is the `query`, which can be a QuerySet or just
    a language identifier.

    Optional arguments:

    - `cutoff`: specifies how common a concept has to be to appear in
      the matrix. Defaults to DEFAULT_CUTOFF=5.
    - `identity_weight`: weight of the (concept, InheritsFrom, concept)
      self-identity assertions added for every concept; 0 disables the
      identity-adding step.
    - `data_source`: a function that produces (concept1, rel, concept2,
      value) quads given the `query` and `cutoff`. Defaults to
      :meth:`conceptnet_quads`.
    - `transform`: the function for transforming quads into
      (value, row_name, column_name) triples. Defaults to
      :meth:`to_value_concept_feature`, which yields
      (value, concept, feature) triples.
    """
    logger.info("Performing ConceptNet query")
    quads = list(data_source(query, cutoff))
    # TODO: separate this out into a customizable function
    if identity_weight > 0:
        logger.info("Adding identities")
        morequads = []
        concept_set = set(q[0] for q in quads)
        for concept in concept_set:
            morequads.append(
                (concept, 'InheritsFrom', concept, identity_weight)
            )
        for c1, rel, c2, val in quads:
            if rel == 'IsA':
                # BUG FIX: was (c1, 'InheritsFrom', c1, val), a redundant
                # repeat of the identity above. IsA propagates inheritance
                # from c1 to c2.
                morequads.append(
                    (c1, 'InheritsFrom', c2, val)
                )
        quads.extend(morequads)
    logger.info("Creating triples")
    triples = transform(quads)
    logger.info("Building matrix")
    matrix = divisi2.make_sparse(triples)
    logger.info("Squishing underused rows")
    return matrix.squish(cutoff)
from csc import divisi2
import cPickle as pickle

# Small fixture matrix: rows are foods, columns are colors.
mat_4x3 = divisi2.make_sparse([
    (2, "apple", "red"),
    (2, "orange", "orange"),
    (1, "apple", "green"),
    (1, "celery", "green"),
    (-1, "apple", "orange"),
    (-1, "banana", "orange"),
])


def pickle_bounce(obj):
    """Round-trip obj through pickle and return the reconstructed copy."""
    return pickle.loads(pickle.dumps(obj))


def test_sparse_pickle():
    """Sparse matrices and their slices survive state and pickle round-trips."""
    rebuilt = divisi2.SparseMatrix.from_state(mat_4x3.to_state())
    assert rebuilt == mat_4x3
    assert pickle_bounce(mat_4x3) == mat_4x3
    row = mat_4x3[0]
    assert pickle_bounce(row) == row
    col = mat_4x3[:, 0]
    assert pickle_bounce(col) == col


def test_dense_pickle():
    """Dense matrices and their slices survive pickle round-trips."""
    dmat = mat_4x3.to_dense()
    assert pickle_bounce(dmat).equals(dmat)
    drow = dmat[0]
    assert pickle_bounce(drow).equals(drow)
    dcol = dmat[:, 0]
    assert pickle_bounce(dcol).equals(dcol)
def make_svd(self): matrix = divisi2.make_sparse(self.data).normalize_all() self.U,self.s,self.V = matrix.svd(k=14) self.predictions = divisi2.reconstruct_activation(self.V, self.s) del self.data print "init end"
from csc import divisi2
import cPickle as pickle

# Fixture: a 4x3 sparse matrix of (value, food, color) entries.
_ENTRIES = [
    (2, "apple", "red"),
    (2, "orange", "orange"),
    (1, "apple", "green"),
    (1, "celery", "green"),
    (-1, "apple", "orange"),
    (-1, "banana", "orange"),
]
mat_4x3 = divisi2.make_sparse(_ENTRIES)


def pickle_bounce(obj):
    """Serialize obj with pickle and return the deserialized copy."""
    data = pickle.dumps(obj)
    return pickle.loads(data)


def test_sparse_pickle():
    """from_state/to_state and pickling preserve sparse matrices and slices."""
    assert divisi2.SparseMatrix.from_state(mat_4x3.to_state()) == mat_4x3
    for view in (mat_4x3, mat_4x3[0], mat_4x3[:, 0]):
        assert pickle_bounce(view) == view


def test_dense_pickle():
    """Pickling preserves dense matrices and their row/column slices."""
    dmat = mat_4x3.to_dense()
    for view in (dmat, dmat[0], dmat[:, 0]):
        assert pickle_bounce(view).equals(view)
cursor.execute(str % args) ret = cursor.fetchall() cursor.close() return ret f = open('../includes/config.yaml') config = yaml.load(f) f.close() conn = MySQLdb.connect(config['dbhost'],config['dbuser'],config['dbpass'] or '',config['dbname']) courses = db_query(conn, "SELECT * FROM comments WHERE parent='1'", ()) data = [] for c in courses: topics = db_query(conn, "SELECT * FROM comments WHERE parent='%d'", (c[0])) for t in topics: data.append( (1, c[0], t[1]) ) mat = divisi2.make_sparse(data) mat = mat.normalize_rows() mat_t = mat.T mult = divisi2.matrixmultiply(mat, mat_t) print mult similarities = mult.named_entries() for s in similarities: v,c1,c2 = s if c1 != c2: db_query(conn, "REPLACE INTO similarities (cid1,cid2,val) VALUE (%d,%d,%f)", (c1,c2,v)); conn.close()