def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d and q_d from the training set

    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the
    smoothed estimates of q_d
    """
    num_spam_emails = len(file_lists_by_category[0])
    num_ham_emails = len(file_lists_by_category[1])

    # Per-word counts for each class
    spam_word_to_count = util.get_counts(file_lists_by_category[0])
    ham_word_to_count = util.get_counts(file_lists_by_category[1])

    # Add-one (Laplace) smoothing: (count + 1) / (number of emails + 2)
    p_d = {k: ((v + 1.0) / (num_spam_emails + 2))
           for (k, v) in spam_word_to_count.items()}
    q_d = {k: ((v + 1.0) / (num_ham_emails + 2))
           for (k, v) in ham_word_to_count.items()}

    probabilities_by_category = (p_d, q_d)
    return probabilities_by_category
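# A minimal usage sketch, not part of the original module: once learn_distributions
# has produced the smoothed (p_d, q_d) dictionaries, a Bernoulli naive Bayes
# classifier can compare the log-likelihood of a new email under each class.
# Assumptions: `util.get_words_in_file` behaves as in the snippets in this file,
# and the `default_p`/`default_q` fallbacks for words missing from one dictionary
# are illustrative choices, not values mandated by the original code.
import math

def classify_email_sketch(email_file, probabilities_by_category,
                          default_p=1e-6, default_q=1e-6, prior_spam=0.5):
    """Hedged sketch of Bernoulli naive Bayes over the combined vocabulary."""
    p_d, q_d = probabilities_by_category
    vocabulary = set(p_d) | set(q_d)
    words_in_email = set(util.get_words_in_file(email_file))

    log_spam = math.log(prior_spam)
    log_ham = math.log(1.0 - prior_spam)
    for word in vocabulary:
        p = p_d.get(word, default_p)  # fallback for words unseen in one class
        q = q_d.get(word, default_q)
        if word in words_in_email:
            log_spam += math.log(p)
            log_ham += math.log(q)
        else:
            log_spam += math.log(1.0 - p)
            log_ham += math.log(1.0 - q)
    return 'spam' if log_spam >= log_ham else 'ham'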
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d and q_d from the training set

    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the
    smoothed estimates of q_d
    """
    ### TODO: Write your code here
    spam = file_lists_by_category[0]
    ham = file_lists_by_category[1]

    # Vocabulary is the union of all words seen in either class
    vocab = util.get_counts(ham + spam).keys()

    pdvalues = {}
    qdvalues = {}
    spamcounts = util.get_counts(spam)
    hamcounts = util.get_counts(ham)
    Ns = len(spam)
    Nh = len(ham)

    # Add-one (Laplace) smoothing: (count + 1) / (number of emails + 2)
    for word in vocab:
        pdvalues[word] = (spamcounts[word] + 1) / (Ns + 2)
        qdvalues[word] = (hamcounts[word] + 1) / (Nh + 2)

    probabilities_by_category = (pdvalues, qdvalues)
    return probabilities_by_category
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d and q_d from the training set

    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the
    smoothed estimates of q_d
    """
    # TODO: Write your code here

    # File lists
    spam_files = file_lists_by_category[0]
    ham_files = file_lists_by_category[1]

    # Target distributions
    pd = util.Counter()
    qd = util.Counter()

    # The number of times each word occurs in the spam and ham bags
    counts_in_spam = util.get_counts(spam_files)
    counts_in_ham = util.get_counts(ham_files)

    # SPAM bag size and HAM bag size
    spam_bag_size = sum(list(counts_in_spam.values()))
    ham_bag_size = sum(list(counts_in_ham.values()))

    # Dictionary of all words observed in either class
    dictionary = set(list(counts_in_spam.keys()) + list(counts_in_ham.keys()))

    # Assign distributions with add-one smoothing over the dictionary
    for word in dictionary:
        pd[word] = (counts_in_spam[word] + 1) / (spam_bag_size + len(dictionary))
        qd[word] = (counts_in_ham[word] + 1) / (ham_bag_size + len(dictionary))

    """
    # Sanity Check
    s = 0
    for word in pd:
        s += pd[word]
    print("total pd: {}".format(s))
    s = 0
    for word in qd:
        s += qd[word]
    print("total qd: {}".format(s))
    """

    return pd, qd
def generate_lsh_graph(data_set, num_hashes=3, num_bits=5, verbose=False):
    hashers = MultiLSHasher(num_hashes, num_bits)
    if verbose: print 'Hashers initialized'

    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    doc_features = {}
    word_counts = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            word = int(row[1])
            count = float(row[2])
            word_counts[word] += 1
            if doc not in doc_features:
                doc_features[doc] = []
            doc_features[doc].append((word, count))
    if verbose: print 'Loaded doc features'

    for doc, features in doc_features.items():
        if type(features[0]) is float:
            break
        feature_tfidf = []
        for w, c in features:
            tfidf = math.log(c + 1) * math.log(
                num_docs / float(word_counts[w]))
            feature_tfidf.append((w, tfidf))
        doc_features[doc] = feature_tfidf

    hashers.compute_stream(doc_features)
    signatures = hashers.compute_signatures()
    if verbose: print 'Computed signatures'

    doc_features = {}
    words_doc_count = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            count = float(row[2]) if '.' in row[2] else int(row[2])
            for hl, s in signatures.items():
                word = str(row[1]) + hl + s[doc]
                words_doc_count[word] += 1
                if doc not in doc_features:
                    doc_features[doc] = []
                doc_features[doc].append((word, count))
    if verbose: print 'Generated hashed doc features'

    filename = '%s-lsh-h%db%d' % (data_set, num_hashes, num_bits)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for doc, feature_counts in doc_features.items():
            for feature, count in feature_counts:
                tfidf = math.log(count + 1) * math.log(
                    num_docs / float(words_doc_count[feature]))
                datawriter.writerow([doc, feature, tfidf])
    if verbose: print 'Wrote graph file %s' % filename
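# The log-scaled TF-IDF weighting used above, tfidf = log(count + 1) *
# log(num_docs / doc_freq), recurs in the other graph builders in this file.
# Below is a minimal, self-contained sketch of that same formula in isolation;
# the function and variable names are illustrative and not part of the original
# module.
import math
from collections import Counter

def tfidf_weights_sketch(doc_word_counts, num_docs):
    """doc_word_counts: {doc_id: {word: raw_count}}; returns {doc_id: {word: tfidf}}."""
    # Document frequency: number of documents containing each word
    doc_freq = Counter()
    for counts in doc_word_counts.values():
        for word in counts:
            doc_freq[word] += 1

    weighted = {}
    for doc, counts in doc_word_counts.items():
        weighted[doc] = {
            word: math.log(count + 1) * math.log(num_docs / float(doc_freq[word]))
            for word, count in counts.items()
        }
    return weighted

# Example: tfidf_weights_sketch({1: {'a': 2}, 2: {'a': 1, 'b': 3}}, num_docs=2)
# gives weight 0 to 'a' (it appears in every document) and a positive weight to 'b'.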
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d and q_d from the training set

    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the
    smoothed estimates of q_d
    """
    ### TODO: Write your code here
    spam_list = file_lists_by_category[0]
    ham_list = file_lists_by_category[1]

    spam_counts = util.get_counts(spam_list)
    num_spam_words = len(spam_counts)
    ham_counts = util.get_counts(ham_list)
    num_ham_words = len(ham_counts)

    # Number of words shared by the two vocabularies
    D = len(spam_counts.keys() & ham_counts.keys())

    p_d = dict()
    q_d = dict()
    for word in spam_counts:
        p_d[word] = (spam_counts[word] + 1) / (num_spam_words + D)
    p_d["default val"] = 1 / (num_spam_words + D)
    for word in ham_counts:
        q_d[word] = (ham_counts[word] + 1) / (num_ham_words + D)
    q_d["default val"] = 1 / (num_ham_words + D)

    probabilities_by_category = (p_d, q_d)
    return probabilities_by_category
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d and q_d from the training set

    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the
    smoothed estimates of q_d
    """
    spamfiles = file_lists_by_category[0]
    hamfiles = file_lists_by_category[1]

    # Collect every word seen in either class
    w = []
    for spamfile in spamfiles:
        w.extend(util.get_words_in_file(spamfile))
    for hamfile in hamfiles:
        w.extend(util.get_words_in_file(hamfile))

    # n_spam = len(spam_words)
    # n_ham = len(ham_words)
    spam_count = util.get_counts(spamfiles)
    ham_count = util.get_counts(hamfiles)
    n = len(w)

    dict_spam = {wi: 0 for wi in w}
    dict_ham = {wi: 0 for wi in w}
    for key in dict_spam:
        dict_spam[key] = (spam_count[key] + 1) / (n + 2)
        dict_ham[key] = (ham_count[key] + 1) / (n + 2)

    probabilities_by_category = (dict_spam, dict_ham)
    return probabilities_by_category
def get_estimates(unique_words, files):
    ret = dict()
    num_files = len(files)
    counter = util.get_counts(files)
    # total_words = 0
    # for word in counter:
    #     total_words += counter[word]

    # Add-one smoothing: (count + 1) / (number of files + 2)
    for word in unique_words:
        ret[word] = (counter[word] + 1) / (num_files + 2)
    return ret
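# A hedged usage sketch for get_estimates, not part of the original module: it would
# be called once per class with the union of words seen in training as the
# vocabulary. This assumes, as the other snippets in this file do, that
# util.get_counts returns a Counter-like mapping that yields 0 for words not seen
# in the given files.
def learn_distributions_via_estimates(file_lists_by_category):
    spam_files, ham_files = file_lists_by_category
    # Vocabulary over both classes, mirroring the other learn_distributions variants
    unique_words = set(util.get_counts(spam_files + ham_files).keys())
    p_d = get_estimates(unique_words, spam_files)
    q_d = get_estimates(unique_words, ham_files)
    return (p_d, q_d)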
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d and q_d from the training set

    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the
    smoothed estimates of q_d
    """
    ### TODO: Write your code here
    spam_emails = file_lists_by_category[0]
    ham_emails = file_lists_by_category[1]

    spam_email_word_counts = util.get_counts(spam_emails)
    ham_email_word_counts = util.get_counts(ham_emails)

    file_list_train = list(itertools.chain.from_iterable(file_lists_by_category))
    N = len(file_list_train)

    vocabulary = set(util.get_counts(file_list_train).keys())
    D = len(vocabulary)

    words_p_d = {}
    words_q_d = {}
    for word in vocabulary:
        words_p_d[word] = (spam_email_word_counts[word] + 1) / (len(spam_emails) + 2)
        words_q_d[word] = (ham_email_word_counts[word] + 1) / (len(ham_emails) + 2)

    probabilities_by_category = (words_p_d, words_q_d)
    return probabilities_by_category
def generate_labeled_baseline_graph(output_file, percentile=95, verbose=False):
    data_set = output_file.split('-')[0]
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    test_data = []
    words_doc_count = Counter()
    for doc, features in get_new_doc_features(data_set, output_file, percentile).items():
        for word, count in features:
            words_doc_count[word] += 1
            test_data.append([doc, word, count])
    if verbose: print 'Loaded doc features'

    with open_graph_file(output_file) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for d, features in get_new_doc_features(data_set, output_file, percentile).items():
            for w, c in features:
                tfidf = math.log(c + 1) * math.log(num_docs / float(words_doc_count[w]))
                datawriter.writerow([d, w, tfidf])
    if verbose: print 'Wrote graph file %s' % output_file
def generate_baseline_graph(data_set, filename=None, verbose=False):
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    test_data = []
    words_doc_count = Counter()
    for doc, features in get_doc_features(data_set).items():
        for word, count in features:
            words_doc_count[word] += 1
            test_data.append([doc, word, count])
    if verbose: print 'Loaded doc features'

    if not filename:
        filename = data_set + '-baseline'
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for d, w, c in test_data:
            if type(c) is float:
                datawriter.writerow([str(d), str(w) + 'w', c])
            else:
                tfidf = math.log(c + 1) * math.log(num_docs / float(words_doc_count[w]))
                datawriter.writerow([str(d), str(w) + 'w', tfidf])
    if verbose: print 'Wrote graph file %s' % filename
def generate_knn_graphs(data_set, ks=[5, 10, 20, 30, 50, 100], verbose=False):
    '''
    since we get a list of *all* the neighbors ordered by "nearness",
    it makes more sense to iterate through the different k's within
    the function rather than outside it
    '''
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]
    max_k = max(ks)
    assert max_k < num_docs

    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'

    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc, word))
                        tfidf = math.log(count + 1) * math.log(
                            num_docs / float(words_doc_count[word]))
                        feature_matrix.itemset((doc, word), tfidf)
            if doc % 10 == 9:
                if verbose: print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated feature matrix'

    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i, i), 0.0)
        else:
            normalizing_matrix.itemset((i, i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'

    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    doc_neighbors = {}
    for doc in xrange(num_docs):
        Nv = np.matrix(np.zeros((num_docs, 1)))
        Nv.itemset(doc, N.item((doc, doc)))
        FtNv = F[doc].transpose() * N.item((doc, doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        neighbors = np.argsort(doc_weights)[0]
        doc_neighbors[doc] = [(neighbor, doc_weights.item(neighbor))
                              for neighbor in neighbors[-max_k:]]
        if doc % 10 == 9:
            if verbose: print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated folded graph'

    for k in ks:
        filename = '%s-knn-k%d' % (data_set, k)
        with open_graph_file(filename) as graph:
            datawriter = csv.writer(graph, delimiter='\t')
            for doc in xrange(num_docs):
                for neighbor, weight in doc_neighbors[doc][-k:]:
                    if weight >= 1e-9:
                        datawriter.writerow([str(doc + 1), str(neighbor + 1), weight])
        if verbose: print 'Wrote graph file %s' % filename
def generate_knn_graph(data_set, k, verbose=False):
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]
    assert k < num_docs

    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'

    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc, word))
                        tfidf = math.log(count + 1) * math.log(
                            num_docs / float(words_doc_count[word]))
                        feature_matrix.itemset((doc, word), tfidf)
            if doc % 10 == 9:
                if verbose: print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated feature matrix'

    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i, i), 0.0)
        else:
            normalizing_matrix.itemset((i, i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'

    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    for doc in xrange(num_docs):
        Nv = np.matrix(np.zeros((num_docs, 1)))
        Nv.itemset(doc, N.item((doc, doc)))
        FtNv = F[doc].transpose() * N.item((doc, doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        nearest_neighbors = np.argsort(doc_weights)
        for neighbor in nearest_neighbors[0][-k:]:
            if doc_weights.item(neighbor) < 1e-9:
                continue
            edges.append(((doc + 1, int(neighbor) + 1), doc_weights.item(neighbor)))
        if doc % 10 == 9:
            if verbose: print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated folded graph'

    filename = '%s-knn-k%d' % (data_set, k)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for edge, weight in edges:
            datawriter.writerow([edge[0], edge[1], weight])
    if verbose: print 'Wrote graph file %s' % filename
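# What the "folded graph" loop above computes, in isolation: with F the TF-IDF
# feature matrix and N the diagonal matrix of inverse row norms, the weight between
# documents i and j is N[i,i] * (F[i] . F[j]) * N[j,j], i.e. the cosine similarity
# of their TF-IDF vectors. The sketch below reproduces that quantity with plain
# numpy arrays; the function name and interface are illustrative assumptions, not
# part of the original module. As in the original, a document's top-k slice can
# include the document itself.
import numpy as np

def cosine_knn_sketch(feature_matrix, k):
    """feature_matrix: (num_docs, num_features) float array; returns top-k neighbor indices per doc."""
    norms = np.linalg.norm(feature_matrix, axis=1)
    norms[norms < 1e-9] = np.inf                  # zero rows get zero similarity, as above
    normalized = feature_matrix / norms[:, None]  # row-normalize, like N * F
    similarities = normalized.dot(normalized.T)   # cosine similarity between all doc pairs
    # argsort ascending, then keep the k largest weights, mirroring the [-k:] slice above
    return np.argsort(similarities, axis=1)[:, -k:]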
"_all.npy") # loading the halo mass and group identification Group_M_Mean200_fp = np.load(root + 'Group_M_Mean200_fp' + snap_dir + '.npy') * 1.e10 SubhaloGrNr_fp = np.load(root + 'SubhaloGrNr_fp' + snap_dir + '.npy') SubhaloPos_fp = np.load(root + 'SubhaloPos_fp' + snap_dir + '.npy') / 1.e3 GroupPos_fp = np.load(root + 'GroupPos_fp' + snap_dir + '.npy') / 1.e3 N_halos_fp = GroupPos_fp.shape[0] inds_halo_fp = np.arange(N_halos_fp, dtype=int) GroupEnv_fp = np.load(root + 'GroupEnv_fp' + snap_dir + '.npy') # get parent indices of the centrals and their subhalo indices in the original array unique_sub_grnr, firsts = np.unique(SubhaloGrNr_fp, return_index=True) count_halo_col_fp, count_halo_cents_col_fp, count_halo_sats_col_fp = get_counts( SubhaloGrNr_fp, firsts, N_halos_fp, sub_id_col) count_halo_sfg_fp, count_halo_cents_sfg_fp, count_halo_sats_sfg_fp = get_counts( SubhaloGrNr_fp, firsts, N_halos_fp, sub_id_sfg) count_halo_all_fp, count_halo_cents_all_fp, count_halo_sats_all_fp = get_counts( SubhaloGrNr_fp, firsts, N_halos_fp, sub_id_all) def get_env_pos(gal_inds, sub_grnr, sub_pos, group_env, group_inds, group_mass): # define mass bins log_min = 11. log_max = 15. N_bins = 41 bin_edges = np.linspace(log_min, log_max, N_bins) bin_cents = (bin_edges[1:] + bin_edges[:-1]) * .5