# Example no. 1
# 0
def run(source, target, num_topics = 100, passes = 20, lang = 'en', distance_measure = euclidean, percentage = 0.05):
	"""
	Main entry point for this package. Contains and executes the whole data pipeline.

	The pipeline reads one JSON document per line from source, filters the reviews by
	language, tokenizes them, builds the dictionary and document-term matrix, fits LDA,
	collects the posteriors, reduces them to one mean vector per key, and finally writes
	one '<user>.json' file per key with the result of Recommender.calc_neighbors.

	Arguments:
	source -- The path string to the source file containing all reviews (one JSON document per line)
	target -- The path string to the target directory where the neighbors for all users will be saved.
			An empty string writes the files relative to the current working directory.

	Keyword arguments:
	num_topics -- The number of topics LDA is supposed to discover (default 100)
	passes -- The number of iterations for the statistical inference algorithm (default 20)
	lang -- The language the reviews are filtered by (default 'en')
	distance_measure -- A python function that measures the distance between two vectors in a num_topics-dimensional vector space.
				Must take two numpy arrays and return a float. (default euclidean)
	percentage -- The cutoff for being a close neighbor, i.e. two users are close if their distance is
			within the closest percentage percent of all distances (default 0.05).
			It is handed to fivePercent together with all distances to derive the threshold.

	Returns:
	None. The results are only written to disk.
	"""
	# One JSON document per line. Blank lines (e.g. a trailing newline at the end of
	# the file) are skipped instead of making json.loads raise.
	with open(source) as f:
		all_reviews = [json.loads(line) for line in f if line.strip()]

	reviews = filter_by_language(all_reviews, lang)

	tokenizer = ReviewTokenizer(reviews)
	tokenizer.tokenize()

	dict_builder = DictionaryBuilder(tokenizer.tokenized_docs)
	dict_builder.build()

	dtm_builder = DTMBuilder(dict_builder.dictionary, dict_builder.srcTexts)
	dtm_builder.build()

	lda = LDAWrapper(dtm_builder.dtm, dict_builder.dictionary)
	lda.run(num_topics = num_topics, passes = passes)

	model = LDAModelWrapper(lda.ldamodel, dict_builder.dictionary, sortByUsers(tokenizer.tokenized_docs))
	posteriors = model.get_all_posteriors()

	# .items() (not the Python 2-only .iteritems()) keeps this working on Python 2 and 3.
	# .tolist() makes the mean vectors JSON-serializable.
	means = {}
	for user, vectors in posteriors.items():
		means[user] = mean(vectors).tolist()

	recommender = Recommender(means)
	distances = recommender.calc_distances(distance_measure)

	# NOTE: 'threshhold' (sic) is the keyword name expected by calc_neighbors, so the
	# spelling must stay as it is in the call below.
	cutoff = fivePercent(distances, percentage)

	# Iterating the dict directly yields its keys on Python 2 and 3 (replaces .iterkeys()).
	for user in means:
		neighbors = recommender.calc_neighbors(user, distance_measure, threshhold = cutoff)
		if target:
			file_name = target + '/' + user + '.json'
		else:
			file_name = user + '.json'
		with open(file_name, 'w') as g:
			json.dump(neighbors, g)
def euclidean(x,y):
    """
    Return the Euclidean (L2) distance between two vectors.

    Arguments:
    x -- first vector; a numpy array or any array-like such as a list of numbers
    y -- second vector; must have the same shape as x (or be broadcastable to it)

    Returns:
    The square root of the sum of the squared element-wise differences, as a numpy float.
    """
    # asanyarray passes ndarrays (and subclasses) through untouched, but lets plain
    # Python lists/tuples be subtracted element-wise instead of raising TypeError.
    diff = np.asanyarray(x) - np.asanyarray(y)
    return np.sqrt(np.sum(diff ** 2))

# Reduce every entry of `posteriors` to a single JSON-serializable list via mean().
# `posteriors` must already be defined at this point (it is not created in this part
# of the file); its keys are presumably user ids -- confirm against the producer.
# .items() replaces the Python 2-only .iteritems() so the script runs on Python 2 and 3.
means = {}
for key, value in posteriors.items():
	means[key] = mean(value).tolist()

# The bare string literal below is disabled code: it is evaluated as a string and
# discarded, so nothing inside it runs. If it is ever re-enabled, note that `neighbors`
# is only assigned in lines that are commented out, so the final dump would fail.
"""
with open('means_eachLine.json', 'w') as f:
	for key, value in means.iteritems():
		json.dump({key: value}, f)
		f.write('\n')
	#json.dump(means,f)

#neighbors = {}
x = Recommender(means)
for user in means.iterkeys():	
	y = x.calc_neighbors(user, euclidean)
	with open('neighbors/neighbors_' + user + '.json', 'w') as f:
		json.dump(y, f)
	#neighbors[user] = y


with open('neighbors_full.json', 'w') as f:
	json.dump(neighbors, f)
"""

# Compute the distances over the mean vectors with the euclidean measure and persist
# whatever calc_distances returns as JSON in the current working directory.
x = Recommender(means)
y = x.calc_distances(euclidean)
with open('all_distances.json', 'w') as f:
	json.dump(y, f)