def indexing():
    """Create the lookup indexes on the business and vector collections.

    The business collection is indexed on ``business_id``; the vector
    collection is indexed on ``id`` and on ``type``.
    """
    business_coll = mongodb_helper.get_coll(settings.BUSINESS_COLL)
    business_coll.create_index("business_id")

    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    for field in ("id", "type"):
        vector_coll.create_index(field)
def get_knn(type_, id_, k=10, approach='hin2vec'):
    """Return the ``k`` businesses closest to ``id_`` in one embedding space.

    Args:
        type_: Name of the measure: 'euclidean', 'manhattan', 'inner',
            'sigmoid' or 'cosine'.
        id_: Value of the ``id`` field of the query vector document.
        k: Maximum number of neighbours to return.
        approach: Embedding whose vector collection is searched
            ('hin2vec', 'deepwalk', 'pte' or 'esim').

    Returns:
        A list of ``(score, business_id)`` tuples, best match first.
        'inner', 'sigmoid' and 'cosine' are ranked from the largest score
        down, the other measures from the smallest score up.  An empty list
        is returned when no vector is stored for ``id_``.

    Raises:
        KeyError: If ``approach`` is not a known embedding.
        ValueError: If ``type_`` is not a known measure.
    """
    # TODO: refactor -- this mapping would be better kept next to the settings.
    coll_mapping = {
        'hin2vec': settings.VECTOR_COLL,
        'deepwalk': settings.VECTOR_DEEPWALK_COLL,
        'pte': settings.VECTOR_PTE_COLL,
        'esim': settings.VECTOR_ESIM_COLL,
    }
    vector_coll = mongodb_helper.get_coll(coll_mapping[approach])

    rest = vector_coll.find_one({'id': id_})
    if rest is None:
        return []
    v = rest['v']

    # Resolve the measure once.  An unknown name used to surface as an
    # UnboundLocalError (or silently reuse the previous score) inside the loop.
    if type_ == 'euclidean':
        measure = by_euclidean_distance
    elif type_ == 'manhattan':
        measure = by_manhattan_distance
    elif type_ == 'inner':
        measure = np.inner
    elif type_ == 'sigmoid':
        measure = by_sigmoid_inner_product
    elif type_ == 'cosine':
        measure = by_cosine
    else:
        raise ValueError('unknown similarity type: %r' % (type_,))

    distances = []
    for business in vector_coll.find({'type': settings.BUSINESS_COLL}):
        # Exclude the query explicitly.  Dropping the first sorted entry is
        # only right when the query happens to rank first, which is not
        # guaranteed (e.g. for a plain inner product, or for a query document
        # that is not a business).
        if business['id'] == id_:
            continue
        distances.append((measure(v, business['v']), business['id']))

    # For these measures a larger value means "more similar".
    larger_is_closer = type_ in ('inner', 'sigmoid', 'cosine')
    return sorted(distances, reverse=larger_is_closer)[:k]
def extract_all_review():
    """Return the ``text`` field of every review document, in cursor order."""
    reviews = mongodb_helper.get_coll(settings.REVIEW_COLL)
    return [review['text'] for review in reviews.find({})]
def insert_db(bid, keywords):
    """Store the keyword scores of one business as a single document.

    Args:
        bid: Business id, stored under the ``id`` field.
        keywords: Mapping of word -> score.  Each pair is stored as a
            ``{'word': ..., 'score': ...}`` entry of the ``keywords`` list.
    """
    keyword_coll = mongodb_helper.get_coll(settings.BUSINESS_KEYWORD_COLL)
    entries = [
        {'word': word, 'score': score} for word, score in keywords.items()
    ]
    keyword_coll.insert_one({'id': bid, 'keywords': entries})
def main(k): '''\ %prog [options] <k> ''' k = int(k) bid2indexes, texts = extract_all_review() print 'Business count:', len(bid2indexes) print 'Review count:', len(texts) words, counts, tfidfs = compute_tfidf(texts) print 'Distinct word count:', len(words) seq2word = {} for seq, word in enumerate(words): seq2word[seq] = word coll = mongodb_helper.get_coll(settings.BUSINESS_KEYWORD_COLL) coll.drop() ith = 0 for bid, indexes in bid2indexes.items(): keywords = get_topk_keywords(k, indexes, tfidfs, seq2word) insert_db(bid, keywords) if ith % 100 == 0: print ith ith += 1 coll.create_index('id') return 0
def search(request):
    """Render the search page for the ``q`` query parameter.

    The Solr core configured in ``settings`` is queried for up to 1000 hits.
    Only hits whose business id has a document in the vector collection are
    kept, and each kept hit gets a ``review_count`` entry.  Without a ``q``
    parameter the page is rendered with an empty result list.
    """
    filtered = []
    if 'q' in request.GET:
        solr_url = 'http://%s:%d/solr/%s/' % (
            settings.SOLR_HOST, settings.SOLR_PORT, settings.SOLR_CORE)
        solr = pysolr.Solr(solr_url, timeout=10)
        hits = solr.search(request.GET['q'], rows=1000)

        vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
        review_coll = mongodb_helper.get_coll(settings.REVIEW_COLL)
        for hit in hits:
            b_id = hit['business_id'][0]
            if vector_coll.find_one({'id': b_id}) is None:
                continue
            hit['review_count'] = review_coll.count({'business_id': b_id})
            filtered.append(hit)

    return render(request, 'se.html', {'rests': filtered})
def detail(request, rest_id):
    """Render the detail page of one business with its nearest neighbours.

    Neighbours come from ``knn.by_euclidean_distance``.  Every entry of their
    category distribution is flagged with whether the category is also one of
    the viewed business's categories.
    """
    business_coll = mongodb_helper.get_coll(settings.BUSINESS_COLL)
    rest_info = business_coll.find_one({'business_id': rest_id})
    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    rest_vec = vector_coll.find_one({'id': rest_id})

    knn_ids = [
        neighbour_id
        for _score, neighbour_id in knn.by_euclidean_distance(rest_id)
    ]
    knn_infos = []
    for neighbour_id in knn_ids:
        knn_infos.append(business_coll.find_one({'business_id': neighbour_id}))

    # (category, score, shared-with-this-business) triples.
    categories = rest_info['categories']
    knn_cat_dist = [
        (cat, score, cat in categories)
        for cat, score in distribution.category_distribution(knn_ids)
    ]

    context = {
        'rest_info': rest_info,
        'rest_vec': rest_vec,
        'knn_infos': knn_infos,
        'knn_cat_dist': knn_cat_dist,
    }
    return render(request, 'rest.html', context)
def by_sigmoid_inner_product(id_, k=10):
    """Return the ``k`` businesses most similar to ``id_``.

    Similarity is ``sigmoid(<v, v2>)`` between the stored vector of ``id_``
    and the vector of every business document in the vector collection.

    Args:
        id_: Value of the ``id`` field of the query vector document.
        k: Maximum number of neighbours to return.

    Returns:
        A list of ``(similarity, business_id)`` tuples ordered from the most
        to the least similar, or an empty list when ``id_`` has no vector.
    """
    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    rest = vector_coll.find_one({'id': id_})
    if rest is None:
        return []
    v = rest['v']

    similarities = []
    for business in vector_coll.find({'type': settings.BUSINESS_COLL}):
        # Skip the query itself by id rather than by sorted position.
        if business['id'] == id_:
            continue
        inner = sum(a * b for a, b in zip(v, business['v']))
        # Numerically stable sigmoid: never exponentiate a large positive
        # number, which would overflow for strongly negative inner products.
        if inner >= 0:
            score = 1 / (1 + exp(-inner))
        else:
            grown = exp(inner)
            score = grown / (1 + grown)
        similarities.append((score, business['id']))

    # A larger sigmoid means "more similar", so rank from the largest down.
    # An ascending sort would return the least similar businesses.
    return sorted(similarities, reverse=True)[:k]
def by_manhattan_distance(id_, k=10):
    """Return the ``k`` businesses closest to ``id_`` by Manhattan distance.

    Args:
        id_: Value of the ``id`` field of the query vector document.
        k: Maximum number of neighbours to return.

    Returns:
        A list of ``(distance, business_id)`` tuples ordered from the
        nearest to the farthest, or an empty list when ``id_`` has no vector.
    """
    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    rest = vector_coll.find_one({'id': id_})
    if rest is None:
        return []
    v = rest['v']

    distances = []
    for business in vector_coll.find({'type': settings.BUSINESS_COLL}):
        # Skip the query itself by id.  Dropping the first sorted entry is
        # wrong when another business has an identical vector (distance 0 tie)
        # or when the query document is not a business at all.
        if business['id'] == id_:
            continue
        distance = sum(abs(a - b) for a, b in zip(v, business['v']))
        distances.append((distance, business['id']))
    return sorted(distances)[:k]
def import_yelp_data(): colls = [ (settings.BUSINESS_COLL, settings.BUSINESS_FILE), (settings.USER_COLL, settings.USER_FILE), (settings.REVIEW_COLL, settings.REVIEW_FILE), (settings.TIP_COLL, settings.TIP_FILE), (settings.CHECKIN_COLL, settings.CHECKIN_FILE), ] for coll_name, fpath in colls: coll = mongodb_helper.get_coll(coll_name) for sub_dataset in load_dataset(fpath): coll.insert_many(sub_dataset) print coll_name, coll.count()
def extract_review_text(ids):
    """Map every business id in ``ids`` to the list of its review texts.

    The texts are returned exactly as stored; the ``preprocess_review_text``
    step is currently disabled.  An id without reviews maps to an empty list.
    """
    review_coll = mongodb_helper.get_coll(settings.REVIEW_COLL)
    return {
        id_: [
            review['text']
            for review in review_coll.find({'business_id': id_})
        ]
        for id_ in ids
    }
def search(request):
    """Render the search page for the ``q`` query parameter.

    The local ``gettingstarted`` Solr core is queried, and only hits whose
    business id has a document in the vector collection are kept.  Without a
    ``q`` parameter the page is rendered with an empty result list.
    """
    filtered = []
    if 'q' in request.GET:
        solr = pysolr.Solr('http://localhost:8983/solr/gettingstarted/',
                           timeout=10)
        hits = solr.search(request.GET['q'])
        vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
        filtered = [
            hit for hit in hits
            if vector_coll.find_one({'id': hit['business_id'][0]}) is not None
        ]
    return render(request, 'se.html', {'rests': filtered})
def category_distribution(ids):
    """Return how often each category occurs among the businesses ``ids``.

    Every business contributes ``1 / len(ids)`` to each of its categories.
    Businesses that cannot be found, or whose ``categories`` is ``None``,
    contribute nothing but still count towards ``len(ids)``.

    Args:
        ids: Sized collection of business ids.

    Returns:
        A list of ``(category, share)`` tuples sorted by descending share.
    """
    business_coll = mongodb_helper.get_coll(settings.BUSINESS_COLL)
    if not ids:
        return []

    weight = 1.0 / len(ids)  # loop-invariant share of a single business
    cat_dist = {}
    for id_ in ids:
        business = business_coll.find_one({'business_id': id_})
        # An unknown id used to crash with "NoneType is not subscriptable".
        if business is None:
            continue
        cats = business['categories']
        if cats is None:
            continue
        for cat in cats:
            cat_dist[cat] = cat_dist.get(cat, 0.0) + weight
    return sorted(cat_dist.items(), key=lambda x: x[1], reverse=True)
def extract_all_review():
    """Load every review and group the review positions by business.

    Returns:
        A ``(bid2indexes, text_lists)`` tuple.  ``text_lists[i]`` is the
        preprocessed text of the i-th review read from the collection, and
        ``bid2indexes`` maps each business id to the positions of its reviews
        in ``text_lists``.
    """
    review_coll = mongodb_helper.get_coll(settings.REVIEW_COLL)
    text_lists = []
    bid2indexes = {}
    # ``enumerate`` keeps the stored position in lock-step with
    # ``text_lists``.  A hand-maintained counter that is not advanced for the
    # first review of each business makes later positions point at the wrong
    # texts.
    for index, record in enumerate(review_coll.find({})):
        text_lists.append(preprocess_review_text(record['text']))
        bid2indexes.setdefault(record['business_id'], []).append(index)

        processed = index + 1
        if processed % 1000 == 0:
            print(processed)  # progress: number of reviews handled so far
    return bid2indexes, text_lists
def get_knn(type_, id_, k=10, approach='hin2vec'):
    """Return the ``k`` businesses closest to ``id_`` in one embedding space.

    Args:
        type_: Name of the measure: 'euclidean', 'manhattan', 'inner',
            'sigmoid' or 'cosine'.
        id_: Value of the ``id`` field of the query vector document.
        k: Maximum number of neighbours to return.
        approach: Embedding whose vector collection is searched
            ('hin2vec', 'deepwalk', 'pte' or 'esim').

    Returns:
        A list of ``(score, business_id)`` tuples, best match first, never
        containing ``id_`` itself.  'inner', 'sigmoid' and 'cosine' are ranked
        from the largest score down, the other measures from the smallest
        score up.  An empty list is returned when no vector is stored for
        ``id_``.

    Raises:
        KeyError: If ``approach`` is not a known embedding.
        ValueError: If ``type_`` is not a known measure.
    """
    # TODO: refactor -- this mapping would be better kept next to the settings.
    coll_mapping = {
        'hin2vec': settings.VECTOR_COLL,
        'deepwalk': settings.VECTOR_DEEPWALK_COLL,
        'pte': settings.VECTOR_PTE_COLL,
        'esim': settings.VECTOR_ESIM_COLL,
    }
    vector_coll = mongodb_helper.get_coll(coll_mapping[approach])

    rest = vector_coll.find_one({'id': id_})
    if rest is None:
        return []
    v = rest['v']

    # Resolve the measure once.  An unknown name used to surface as an
    # UnboundLocalError (or silently reuse the previous score) inside the loop.
    if type_ == 'euclidean':
        measure = by_euclidean_distance
    elif type_ == 'manhattan':
        measure = by_manhattan_distance
    elif type_ == 'inner':
        measure = np.inner
    elif type_ == 'sigmoid':
        measure = by_sigmoid_inner_product
    elif type_ == 'cosine':
        measure = by_cosine
    else:
        raise ValueError('unknown similarity type: %r' % (type_,))

    # Components that are zeroed in every hin2vec vector before comparison.
    # NOTE(review): presumably dimensions tied to specific meta-paths that
    # should not influence the ranking -- confirm with the author.
    hin2vec_path_dim = [
        2, 15, 24, 31, 46, 52, 58, 68, 78, 85, 87, 92, 93, 98, 122, 125
    ]
    masked_dims = hin2vec_path_dim if approach == 'hin2vec' else ()
    for i in masked_dims:
        v[i] = 0

    distances = []
    for business in vector_coll.find({'type': settings.BUSINESS_COLL}):
        if business['id'] == id_:
            continue
        v2 = business['v']
        for i in masked_dims:
            v2[i] = 0
        distances.append((measure(v, v2), business['id']))

    # For these measures a larger value means "more similar".
    larger_is_closer = type_ in ('inner', 'sigmoid', 'cosine')
    return sorted(distances, reverse=larger_is_closer)[:k]
def fetch_business_data(ids, key, data_type_flag):
    """Collect one field (or a coordinate pair) from several businesses.

    Args:
        ids: Iterable of business ids.
        key: Field name to read.  When ``data_type_flag`` equals
            ``GEO_COORDS`` it is a pair of field names instead, read in the
            order ``(key[0], key[1])``.
        data_type_flag: Selects the coordinate mode when equal to
            ``GEO_COORDS``; any other value selects the single-field mode.

    Returns:
        In coordinate mode, a list of ``(key[0] value, key[1] value, id)``
        tuples.  Otherwise a flat list of field values, where list-valued
        fields are spliced in element by element.  Businesses that are not
        found, and ``None`` values, are skipped.
    """
    business_coll = mongodb_helper.get_coll(settings.BUSINESS_COLL)
    want_coords = data_type_flag == GEO_COORDS

    result_list = []
    for id_ in ids:
        # One lookup per business; an unknown id is skipped instead of
        # crashing on ``None[...]``.
        business = business_coll.find_one({'business_id': id_})
        if business is None:
            continue

        if want_coords:
            longitude = business[key[0]]
            latitude = business[key[1]]
            # Test the coordinates themselves: a check on the assembled
            # tuple could never be true.
            if longitude is None or latitude is None:
                continue
            result_list.append((longitude, latitude, id_))
            continue

        result = business[key]
        if result is None:
            continue
        if isinstance(result, list):
            result_list.extend(result)
        else:
            result_list.append(result)
    return result_list
def get_keywords(bid):
    """Return the stored keyword strings of business ``bid``.

    An empty list is returned when the keyword collection has no document for
    that business.
    """
    keyword_coll = mongodb_helper.get_coll(settings.BUSINESS_KEYWORD_COLL)
    document = keyword_coll.find_one({'id': bid})
    if document is None:
        return []
    return [entry['word'] for entry in document['keywords']]
def detail(request, rest_id):
    """Render the detail page of one business with its nearest neighbours.

    The similarity measure and the embedding approach are read from the
    ``similarity`` and ``approach`` query parameters (defaults 'euclidean'
    and 'hin2vec').  Besides the neighbours themselves, the page receives
    keyword, category and city distributions of the neighbours, plotly charts
    of those distributions, a Google Maps embed URL, and, for up to ten
    neighbours, the meta-path counts and a drawn network connecting the
    business with that neighbour.

    Args:
        request: The incoming request; only ``request.GET`` is read.
        rest_id: ``business_id`` of the business to display.

    Returns:
        The response produced by rendering ``rest.html``.
    """
    business_coll = mongodb_helper.get_coll(settings.BUSINESS_COLL)
    rest_info = business_coll.find_one({'business_id': rest_id})
    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    rest_vec = vector_coll.find_one({'id': rest_id})

    # Google Maps embed URL for "<address>,<city>".
    # NOTE(review): the API key is hard-coded in source and the address/city
    # only have their spaces replaced (no URL escaping) -- consider moving the
    # key to settings and quoting the values.
    query = "https://www.google.com/maps/embed/v1/place?key=AIzaSyC0woDjDcggf1PhuX9POXxTO0F059_JpjU"
    query += "&q=" + "+".join(rest_info['address'].split(" "))
    query += "," + "+".join(rest_info['city'].split(" "))

    # [value, label, selected] triples for the two selection widgets.
    selected_sim_type = request.GET.get('similarity', 'euclidean')
    similarity_types = [
        [value, label, value == selected_sim_type]
        for value, label in (('euclidean', 'Euclidean distance'),
                             ('manhattan', 'Manhattan distance'),
                             ('inner', 'Inner product'),
                             ('cosine', 'Cosine'))
    ]
    selected_approach = request.GET.get('approach', 'hin2vec')
    approaches = [
        [value, label, value == selected_approach]
        for value, label in (('hin2vec', 'HIN2Vec'),
                             ('deepwalk', 'DeepWalk'),
                             ('pte', 'PTE'),
                             ('esim', 'Esim'))
    ]

    knn_result = knn.get_knn(selected_sim_type, rest_id,
                             approach=selected_approach)
    knn_ids = [id_ for _, id_ in knn_result]
    knn_infos = [
        business_coll.find_one({'business_id': id_}) for id_ in knn_ids
    ]
    for info, (score, _) in zip(knn_infos, knn_result):
        info['co_user_count'] = co_customers.get_number_com_customers(
            rest_id, info['business_id'])
        info['co_user_ratio'] = co_customers.get_ratio_com_customers(
            rest_id, info['business_id'])
        info['score'] = score

    rest_info['keywords'] = views_helper.get_keywords(rest_id)

    # Each distribution entry is flagged with whether the viewed business
    # shares that keyword / category / city.
    knn_keyword_dist = [
        (word, score, word in rest_info['keywords'])
        for word, score in distribution.keyword_distribution(knn_ids)
    ]

    # Coordinates of the neighbours for the map display.
    knn_lon_lat = [[row['longitude'], row['latitude']] for row in knn_infos]

    categories = rest_info['categories']
    knn_cat_dist = [
        (cat, score, cat in categories)
        for cat, score in distribution.category_distribution(knn_ids)
    ]
    barchart_data = [
        go.Bar(x=[row[0] for row in knn_cat_dist],
               y=[row[1] for row in knn_cat_dist])
    ]
    barchart_cat = plot(barchart_data, output_type="div").replace(
        "<div>", "<div style='height:500px'>")
    piechart_data_cat = [
        go.Pie(labels=[row[0] for row in knn_cat_dist],
               values=[row[1] for row in knn_cat_dist])
    ]
    piechart_cat = plot(piechart_data_cat, output_type="div").replace(
        "<div>", "<div style='height:500px'>")

    city = rest_info['city']
    knn_city_dist = [
        (c, score, c == city)
        for c, score in distribution.city_distribution(knn_ids)
    ]
    barchart_data = [
        go.Bar(x=[row[0] for row in knn_city_dist],
               y=[row[1] for row in knn_city_dist])
    ]
    barchart_city = plot(barchart_data, output_type="div").replace(
        "<div>", "<div style='height:500px'>")
    # NOTE(review): this dump into the working directory looks like a
    # debugging leftover -- confirm whether it can be removed.  The ``with``
    # block at least closes the handle when the write fails.
    with open("tmp.html", "w") as f:
        f.write(barchart_city)
    piechart_data = [
        go.Pie(labels=[row[0] for row in knn_city_dist],
               values=[row[1] for row in knn_city_dist])
    ]
    piechart_city = plot(piechart_data, output_type="div").replace(
        "<div>", "<div style='height:500px'>")

    # Network generation: one [title, drawing, meta-path counts] entry per
    # neighbour.  Only the neighbours that actually exist are visited (at
    # most ten, as before); indexing a fixed range(10) raises IndexError
    # whenever fewer neighbours were found.
    network_div = []
    rest_id1 = rest_info['business_id']
    for i, rest_id2 in enumerate(knn_ids[:10]):
        meta_paths_tmp = graph_db.get_meta_path_count(rest_id1, rest_id2, 2)
        # Shortest meta-paths first.
        meta_paths = [
            ('B-%s-B' % ('-'.join(mp)), count)
            for mp, count in sorted(meta_paths_tmp.items(),
                                    key=lambda x: len(x[0]))
        ]

        nodes, edges = graph_db.get_paths(rest_id1, rest_id2, 2)
        if len(nodes) == 0:
            drawing = ''
        else:
            drawing = draw_network(create_network(nodes, edges))
        network_div.append([
            rest_info["name"] + " v.s. " + knn_infos[i]["name"],
            drawing,
            meta_paths,
        ])

    return render(
        request, 'rest.html', {
            'rest_info': rest_info,
            'rest_vec': rest_vec,
            'query': query,
            'knn_infos': knn_infos,
            'knn_cat_dist': knn_cat_dist,
            'knn_keyword_dist': knn_keyword_dist,
            'knn_lon_lat': knn_lon_lat,
            'barchart_cat': barchart_cat,
            'piechart_data_cat': piechart_data_cat,
            'piechart_cat': piechart_cat,
            'knn_city_dist': knn_city_dist,
            'barchart_city': barchart_city,
            'piechart_city': piechart_city,
            'network_div': network_div,
            'similarity_types': similarity_types,
            'approaches': approaches,
        })
def imoprt_vectors():
    """Insert every batch yielded by ``load_vectors`` into the vector collection.

    The misspelled function name is kept as-is so existing callers keep
    working.
    """
    vector_coll = mongodb_helper.get_coll(settings.VECTOR_COLL)
    for batch in load_vectors():
        vector_coll.insert_many(batch)