def make_luminoso_space(docDir, studyDir):
    """Open (or create) the Luminoso model at ``studyDir`` and train it.

    When ``studyDir`` is already a directory on disk the model is read back
    with ``luminoso2.load``; otherwise a fresh one is created with
    ``luminoso2.make_common_sense(studyDir, 'en')``.  In both cases the model
    is then asked to ``learn_from(docDir, studyDir)`` before being returned.

    Parameters:
        docDir: passed as the first argument to ``model.learn_from``
            (presumably the directory of documents to learn -- confirm
            against the luminoso2 API).
        studyDir: path checked with ``os.path.isdir``; also used as the
            location handed to ``luminoso2`` and as the second argument
            to ``model.learn_from``.

    Returns:
        The model object produced by ``luminoso2`` after ``learn_from``
        has been called on it.
    """
    study_exists = os.path.isdir(studyDir)
    if study_exists:
        # A study directory is already present: reuse the stored model.
        space = luminoso2.load(studyDir)
    else:
        # Nothing on disk yet: start from a new common-sense model.
        space = luminoso2.make_common_sense(studyDir, 'en')

    space.learn_from(docDir, studyDir)
    return space
from csc import divisi2 import numpy as np import json, luminoso2, time, urllib2 from divisi2 import DenseMatrix from csc_utils.ordered_set import OrderedSet from charm_exceptions import * #model = luminoso2.load('pldb_2011_may') model = luminoso2.load('hack_2011_oct/Model') sponsormat = divisi2.load("sponsors.dmat") doc_matrix = model.get_doc_matrix('pldb') tag_matrix = model.get_tag_matrix() tag_matrix = model.get_tag_matrix() tag_matrix = DenseMatrix.concatenate(tag_matrix, sponsormat) for i in xrange(doc_matrix.shape[0]): doc_matrix.row_labels[i] = doc_matrix.row_labels[i].replace('hack_2011_oct/Documents', 'PLDBDocs') # print doc_matrix.row_labels[i] def get_related_sponsors(email, n=10): if not ('sponsor', email) in tag_matrix.row_labels: return [] vec = tag_matrix.row_named(('sponsor', email)) got = divisi2.dot(tag_matrix, vec) results = [] for tag, weight in got.top_items(len(got)): key, value = tag if key == 'sponsor': results.append((value, weight)) if len(results) >= n: