def __init__(self, preloaded_actor=None, subsampling=False, fast=False, test=True):
    """Load the word <-> index mappings, or borrow them from another actor.

    The mappings are kept on the instance so they are read once and then
    reused, which cuts down on IO costs.

    Args:
        preloaded_actor: Optional object that already holds ``aw2i`` and
            ``ai2w``. When truthy, both attributes are shared by reference
            instead of being loaded again.
        subsampling: Accepted for interface compatibility; not read here.
        fast: Accepted for interface compatibility; not read here.
        test: Accepted for interface compatibility; not read here.
    """
    # NOTE: the Wikipedia title maps (wc2t / wt2c) used to sit behind
    # `if False:` guards in both paths and were therefore never set; those
    # unreachable branches are omitted, so the attributes stay unset here.
    if preloaded_actor:
        # Reuse the mappings that the other actor already holds in memory.
        self.aw2i = preloaded_actor.aw2i
        self.ai2w = preloaded_actor.ai2w
        return

    trained = "data"
    # Word lists tried previously: vectors.fullwiki.1000.s50.words and
    # vectors.fullwiki.1000.s50.5k.words.
    fnw = '%s/freebase.words' % trained
    cache_path = fnw + '.pickle'

    # all words, word to index mappings w2i
    if os.path.exists(cache_path):
        # `with` closes the handle deterministically; binary mode keeps the
        # pickle bytes untouched on every platform.
        with open(cache_path, 'rb') as cache_file:
            self.aw2i, self.ai2w = cPickle.load(cache_file)
    else:
        self.aw2i, self.ai2w = veclib.get_words(fnw)
        # Persist the parsed mappings so that the next start-up can skip
        # veclib.get_words(); closing the file flushes it to disk.
        with open(cache_path, 'wb') as cache_file:
            cPickle.dump([self.aw2i, self.ai2w], cache_file)
    print(" done with aw2i")
def __init__(self, preloaded_actor=None, subsampling=False, fast=False, test=True):
    """Load the word <-> index mappings, or borrow them from another actor.

    The mappings are kept on the instance so they are read once and then
    reused, which cuts down on IO costs.

    Args:
        preloaded_actor: Optional object that already holds ``aw2i`` and
            ``ai2w``. When truthy, both attributes are shared by reference
            instead of being loaded again.
        subsampling: Accepted for interface compatibility; not read here.
        fast: Accepted for interface compatibility; not read here.
        test: Accepted for interface compatibility; not read here.
    """
    # NOTE: the Wikipedia title maps (wc2t / wt2c) used to sit behind
    # `if False:` guards in both paths and were therefore never set; those
    # unreachable branches are omitted, so the attributes stay unset here.
    if preloaded_actor:
        # Reuse the mappings that the other actor already holds in memory.
        self.aw2i = preloaded_actor.aw2i
        self.ai2w = preloaded_actor.ai2w
        return

    trained = "data"
    # Word lists tried previously: vectors.fullwiki.1000.s50.words and
    # vectors.fullwiki.1000.s50.5k.words.
    fnw = '%s/freebase.words' % trained
    cache_path = fnw + '.pickle'

    # all words, word to index mappings w2i
    if os.path.exists(cache_path):
        # `with` closes the handle deterministically; binary mode keeps the
        # pickle bytes untouched on every platform.
        with open(cache_path, 'rb') as cache_file:
            self.aw2i, self.ai2w = cPickle.load(cache_file)
    else:
        self.aw2i, self.ai2w = veclib.get_words(fnw)
        # Persist the parsed mappings so that the next start-up can skip
        # veclib.get_words(); closing the file flushes it to disk.
        with open(cache_path, 'wb') as cache_file:
            cPickle.dump([self.aw2i, self.ai2w], cache_file)
    print(" done with aw2i")
from utils import * app = Flask(__name__, static_folder='static', static_url_path='', template_folder='templates') trained = "/home/ubuntu/data" fnv = '%s/vectors.fullwiki.1000.s50.num.npy' % trained fnw = '%s/vectors.fullwiki.1000.s50.words' % trained ffb = '%s/freebase_types_and_fullwiki.1000.s50.words' % trained avl = veclib.get_vector_lib(fnv) #avl = veclib.normalize(avl) avl = veclib.split(veclib.normalize, avl) if os.path.exists(fnw + '.pickle'): aw2i, ai2w = cPickle.load(open(fnw + '.pickle')) else: aw2i, ai2w = veclib.get_words(fnw) cPickle.dump([aw2i, ai2w], open(fnw + '.pickle','w')) frac = None if frac: end = int(avl.shape[0] * frac) avl = avl[:end] for i in range(end, avl.shape): del aw2i[ai2w[i].pop()] @app.route('/farthest/<raw_query>') #@json_exception def farthest(raw_query='{"args":["iphone", "ipad", "ipod", "walkman"]}'): """Given a list of arguments, calculate all the N^2 distance matrix and return the item farthest away. The total distance is just the distance from a node to all other nodes seperately.""" print 'QUERY'