# Bipartite projection of an attributed SNAP graph (assumes: import snap;
# copy_graph, delete_node_type and tempPrint are project helpers defined elsewhere).
def projection(graph, attr='type', on_attr='user', using_attr='business'):
    proj_graph = copy_graph(graph)
    print('> Computing projection on %s using %s' % (on_attr, using_attr))
    nbnodes = proj_graph.GetNodes()
    for node in proj_graph.Nodes():
        node_id = node.GetId()
        node_type = proj_graph.GetStrAttrDatN(node_id, attr)
        if node_type == on_attr:
            # Nodes at hop 2 are the other 'on_attr' nodes that share a 'using_attr' neighbor.
            nodeVec = snap.TIntV()
            snap.GetNodesAtHop(graph, node_id, 2, nodeVec, False)
            tempPrint('%d / %d' % (node_id, nbnodes))
            for next_neighbor_id in nodeVec:
                if proj_graph.GetStrAttrDatN(next_neighbor_id, attr) == on_attr \
                        and not proj_graph.IsEdge(node_id, next_neighbor_id):
                    proj_graph.AddEdge(node_id, next_neighbor_id)
    # Drop the 'using_attr' nodes (e.g. businesses) so only the projected graph remains.
    delete_node_type(proj_graph, attr=attr, value=using_attr)
    return proj_graph
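# Hedged usage sketch (not from the original source): G is assumed to be a snap.TNEANet
# whose nodes carry a string attribute 'type' set to 'user' or 'business', which is what
# the defaults of projection() expect.
#
#   user_graph = projection(G, attr='type', on_attr='user', using_attr='business')
#   print('%d user nodes, %d co-review edges' % (user_graph.GetNodes(), user_graph.GetEdges()))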
# TF-IDF over the per-review token counts (assumes: from collections import Counter;
# from math import log; data and disp are project helper modules).
root = data.getParent(__file__)
alltoken = data.loadFile(root + '/computed/alltoken.pkl')
reviews_feature = data.loadFile(root + '/computed/reviews_feature.pkl')
n = len(reviews_feature)
print "Total reviews:", n

# TF-IDF
print "> Computing TF"
TF = dict()
i = 0
for review in reviews_feature:
    i += 1
    disp.tempPrint(str(i))
    # Term frequency, normalized by the most frequent token of the review.
    TF[review] = Counter()
    for token in reviews_feature[review]:
        TF[review][token] = float(reviews_feature[review][token]) / float(max(reviews_feature[review].values()))

print "> Computing IDF"
IDF = dict()
i = 0
for token in alltoken:
    i += 1
    disp.tempPrint(str(i))
    # Inverse document frequency: log(#reviews / #reviews containing the token).
    IDF[token] = log(float(n) / float(len(alltoken[token])))

print "> Computing TFIDF"
TFIDF = dict()
i = 0
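# Hedged sketch (assumption, not the original code): the TFIDF dict is typically filled
# with the standard product TF * IDF, mirroring the two loops above:
#
#   for review in TF:
#       i += 1
#       disp.tempPrint(str(i))
#       TFIDF[review] = Counter()
#       for token in TF[review]:
#           TFIDF[review][token] = TF[review][token] * IDF[token]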
# Stochastic gradient descent for a linear model on sparse TF-IDF features
# (assumes: import math; from collections import Counter; from copy import copy;
# dot(a, b) is a sparse dot product over two Counters; disp is a project helper module).
def sgd(TFIDF, target, alpha=0.04, epsilon=0.01, alapcoeff=.01, nIter=1000):
    weights = Counter()
    # parameters
    iIter = 0
    bias = 0
    alphainit = alpha
    # variables
    RMSE = 2 * epsilon
    first = True
    #delta_weights = Counter()
    #delta_bias = 0
    while iIter < nIter:
        print "Iter", iIter
        # Update weights
        i = 0
        for review in TFIDF:
            i += 1
            disp.tempPrint(str(i))
            # Residual of the current prediction for this review.
            coeff = target[review]
            coeff -= dot(TFIDF[review], weights)
            coeff -= bias
            #old_delta_weights = copy(weights)
            #old_delta_bias = coeff
            for token in TFIDF[review]:
                weights[token] += alpha * coeff * TFIDF[review][token]
                #delta_weights[token] = coeff * TFIDF[review][token]
            # Constant term
            bias += alpha * coeff
            #delta_bias = coeff
            #if alapcoeff:
            #    alpha = alpha * (1 + alapcoeff * (delta_bias * old_delta_bias + dot(old_delta_weights, delta_weights)))
        # Compute the approximation error
        print "Computing error"
        error = 0.
        i = 0
        for review in TFIDF:
            i += 1
            disp.tempPrint(str(i))
            errori = target[review]
            errori -= dot(TFIDF[review], weights)
            errori -= bias
            error += errori ** 2.0
        iIter += 1
        RMSE_old = RMSE
        # Note: despite the name, this is the mean squared error (no square root is taken).
        RMSE = error / len(TFIDF)
        if first:
            first = False
        else:
            # Decaying learning rate.
            alpha = alphainit / math.sqrt(iIter)
        print '[', iIter, ']', " - Error =", RMSE, "- Alpha =", alpha
        if abs(RMSE_old - RMSE) < epsilon:
            break
    return (RMSE, weights, bias)
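# Hedged sketch of the dot(a, b) helper assumed by sgd() above: a sparse dot product of
# two Counter objects (assumption; the project's own implementation is not shown here).
#
#   def dot(a, b):
#       # Iterate over the smaller Counter; missing keys count as 0.
#       if len(b) < len(a):
#           a, b = b, a
#       return sum(value * b[key] for key, value in a.iteritems())
#
# Typical call, once TFIDF and a {review_id: star rating} target mapping are available:
#
#   mse, weights, bias = sgd(TFIDF, target, alpha=0.04, epsilon=0.01)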
# Build the train/test files expected by sLDA from the Yelp review dump
# (assumes: import json, nltk, numpy as np; from collections import Counter;
# tokenizer, data and disp are project helper modules; root is the project root).
def generate_slda_data(filename="yelp_academic_dataset_review_training.json", n_reviews=None, category=None):
    filepath = root + "/dataset/" + filename

    """ A couple of useful initializations """
    # Chris Potts tokenizer.
    tok = tokenizer.Tokenizer(preserve_case=False)
    # min and max ngram sizes
    MIN_NGRAM = 1
    MAX_NGRAM = 1
    word_set = set()      # set of unique words
    word2idx = dict()     # mapping from word to int representation of a word
    ratings_list = []
    reviews_list = []
    reviews = []
    data_list = []
    words_distr = dict()
    words_counts = Counter()

    """ PHASE 1 : Load file and get set of all words """
    stopwords = nltk.corpus.stopwords.words("english")
    print " PHASE 1 : Get all words "
    loaded_file = open(filepath)
    lines_file = loaded_file.readlines()
    if n_reviews is None:
        n_reviews = len(lines_file)
    loaded_file.close()
    i_review = 1
    # we randomly select n_reviews from the dataset
    permutation = np.random.permutation(len(lines_file))
    sample_reviews = permutation[0:n_reviews]
    for idx_review in sample_reviews:
        line_json = lines_file[idx_review]
        review_dict = json.loads(line_json)
        tokens_list = tok.ngrams(review_dict["text"], MIN_NGRAM, MAX_NGRAM, string=True)
        rating = review_dict["stars"]
        for token in tokens_list:
            if token not in stopwords:
                """
                if token not in words_distr:
                    words_distr[token] = Counter({5:0, 4:0, 3:0, 2:0, 1:0});
                words_distr[token][rating] += 1;
                """
                words_counts[token] += 1
        reviews_list.append(Counter(tokens_list))
        # sLDA expects 0-indexed labels; Yelp stars are 1..5.
        ratings_list.append(review_dict["stars"] - 1)
        word_set |= set(tokens_list)
        disp.tempPrint(str(i_review))
        i_review += 1

    """ PHASE 2 : Word to int conversion """
    # Drop rare tokens (and stopwords, which were never counted above).
    filter_threshold = 0.00001 * (max(words_counts.values()) * 1.0)
    print " PHASE 2 : Word to int conversion "
    i_word = 1
    for word in word_set:
        if words_counts[word] >= filter_threshold:
            word2idx[word] = i_word
            disp.tempPrint(str(i_word))
            i_word += 1
    print " Filtered. Before : %d words. After : %d" % (len(word_set), len(word2idx))

    """ PHASE 3 : Converting data to the right format """
    print " PHASE 3 : Converting data to the right format "
    i_review = 1
    for review in reviews_list:
        nwords = 0
        data_line = ""
        for word in review:
            if word in word2idx:
                data_line += " " + str(word2idx[word]) + ":" + str(review[word])
                nwords += 1
        data_line += "\n"
        if nwords != 0:
            data_line = str(nwords) + " " + data_line
            data_list.append(data_line)
        disp.tempPrint(str(i_review))
        i_review += 1

    """ PHASE 4 : Save into right files """
    print " PHASE 4 : Save into right files "
    n_reviews = len(data_list)
    # 80/20 train/test split.
    idx_test = n_reviews * 8 / 10
    if category:
        category = "_" + category
    else:
        category = ""
    data_train = open("/tmp/slda_data_train" + category + ".txt", "w")
    label_train = open("/tmp/slda_label_train" + category + ".txt", "w")
    data_test = open("/tmp/slda_data_test" + category + ".txt", "w")
    label_test = open("/tmp/slda_label_test" + category + ".txt", "w")
    for i_review in range(idx_test):
        data_train.write(data_list[i_review])
        label_train.write(str(ratings_list[i_review]) + "\n")
    for i_review in range(idx_test, n_reviews):
        data_test.write(data_list[i_review])
        label_test.write(str(ratings_list[i_review]) + "\n")
    data_train.close()
    data_test.close()
    label_train.close()
    label_test.close()

    """ PHASE 5 : Save useful datastructures """
    print " PHASE 5 : Save useful datastructures "
    data.save(reviews_list, "/tmp/slda_reviews" + category + ".pkl.gz")
    data.save(ratings_list, "/tmp/slda_ratings" + category + ".pkl.gz")
    data.save(word2idx, "/tmp/slda_word2idx" + category + ".pkl.gz")
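# Hedged usage sketch (assumption: the Yelp review dump lives under root + "/dataset/"
# and the NLTK stopword corpus has been downloaded, e.g. via nltk.download('stopwords')):
#
#   generate_slda_data(filename="yelp_academic_dataset_review_training.json",
#                      n_reviews=10000, category="restaurants")
#
# This writes /tmp/slda_data_train_restaurants.txt, /tmp/slda_label_train_restaurants.txt,
# the matching *_test files, and pickles of reviews_list, ratings_list and word2idx.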