def run(source, target, num_topics=100, passes=20, lang='en', distance_measure=euclidean, percentage=0.05):
    """Run the whole data pipeline and write one neighbor file per user.

    Main entry point for this package. The reviews are read from ``source``,
    filtered by language, tokenized, turned into a dictionary and a
    document-term matrix, and fed to LDA. The posteriors are averaged per
    key, all pairwise distances are computed, and for every user the close
    neighbors are dumped as JSON to ``<target>/<user>.json``.

    Args:
        source: Path string to the source file containing all reviews, one
            JSON document per line. Blank lines are skipped.
        target: Path string to the target directory where the neighbors for
            all users will be saved. If empty, the files are written
            relative to the current working directory.
        num_topics: The number of topics LDA is supposed to discover
            (default 100).
        passes: The number of iterations for the statistical inference
            algorithm (default 20).
        lang: The language the reviews shall be sorted by (default 'en').
        distance_measure: A python function that measures the distance
            between two vectors in a num_topics-dimensional vector space.
            Must take two numpy arrays and return a float
            (default euclidean).
        percentage: The cutoff for being a close neighbor, i.e. two users
            are close if their distance is within the closest
            ``percentage`` percent of all distances (default 0.05).
    """
    # NOTE(review): the default ``distance_measure=euclidean`` is bound when
    # this ``def`` executes, so ``euclidean`` must already be in scope at
    # that point -- confirm the definition/import order in this module.

    # One JSON document per line; skip blank lines (e.g. a trailing newline
    # at the end of the file) instead of failing inside json.loads.
    with open(source) as f:
        all_reviews = [json.loads(line) for line in f if line.strip()]

    reviews = filter_by_language(all_reviews, lang)

    rt = ReviewTokenizer(reviews)
    rt.tokenize()

    db = DictionaryBuilder(rt.tokenized_docs)
    db.build()

    dtmb = DTMBuilder(db.dictionary, db.srcTexts)
    dtmb.build()

    ldaw = LDAWrapper(dtmb.dtm, db.dictionary)
    ldaw.run(num_topics=num_topics, passes=passes)

    modelwrapper = LDAModelWrapper(ldaw.ldamodel, db.dictionary, sortByUsers(rt.tokenized_docs))
    posteriors = modelwrapper.get_all_posteriors()

    # ``.items()`` (rather than ``.iteritems()``) works on Python 2 and 3.
    # ``.tolist()`` makes the averaged posteriors plain Python lists.
    means = {user: mean(vectors).tolist() for user, vectors in posteriors.items()}

    recommender = Recommender(means)
    distances = recommender.calc_distances(distance_measure)
    threshold = fivePercent(distances, percentage)

    for user in means:
        # The keyword is spelled ``threshhold`` in the call below; it belongs
        # to Recommender.calc_neighbors and is kept as-is.
        neighbors = recommender.calc_neighbors(user, distance_measure, threshhold=threshold)
        if target:
            file_name = target + '/' + user + '.json'
        else:
            file_name = user + '.json'
        with open(file_name, 'w') as g:
            json.dump(neighbors, g)
def euclidean(x, y):
    """Return the Euclidean distance between two vectors.

    Args:
        x: First vector; a numpy array or any sequence numpy can convert
            to one (e.g. a list of floats).
        y: Second vector, with a shape compatible with ``x``.

    Returns:
        The square root of the sum of the squared element-wise differences.
    """
    # np.asarray leaves ndarrays untouched and lets plain lists be
    # subtracted element-wise instead of raising TypeError.
    diff = np.asarray(x) - np.asarray(y)
    return np.sqrt(np.sum(diff ** 2))


# ---------------------------------------------------------------------------
# Module-level script: average the posteriors per key, compute all pairwise
# distances and dump them to 'all_distances.json'.
#
# NOTE(review): ``posteriors``, ``mean`` and ``Recommender`` are not defined
# in the visible part of this file -- presumably they are provided earlier;
# confirm before relying on this section.
# ---------------------------------------------------------------------------

# ``.items()`` (rather than ``.iteritems()``) works on Python 2 and 3.
means = {key: mean(value).tolist() for key, value in posteriors.items()}

x = Recommender(means)
y = x.calc_distances(euclidean)
with open('all_distances.json', 'w') as f:
    json.dump(y, f)