def docs(dataset_name):
    p = util.Progbar(target=util.lines_in_file(directories.RAW + dataset_name))
    for i, d in enumerate(util.load_json_lines(directories.RAW + dataset_name)):
        p.update(i + 1)
        yield d
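# A minimal usage sketch for docs(); count_docs and the "train" default are
# hypothetical examples, assuming util and directories are this repo's modules.
def count_docs(dataset_name="train"):
    n = 0
    for _ in docs(dataset_name):  # streams one parsed JSON document at a time
        n += 1
    return n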
def __init__(self, load=False, vectors_file=directories.PRETRAINED_WORD_VECTORS,
             keep_all_words=False):
    if load:
        # Load a vocabulary and vector matrix written out by a previous run.
        self.vocabulary = util.load_pickle(directories.RELEVANT_VECTORS + 'vocabulary.pkl')
        self.vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
        self.d = self.vectors.shape[1]
    else:
        # Build the vocabulary from pretrained embeddings, keeping only words that
        # occur in the corpus word counts (unless keep_all_words is set).
        self.vocabulary = {}
        self.vectors = []
        word_counts = util.load_pickle(directories.MISC + 'word_counts.pkl')
        with open(vectors_file) as f:
            for line in f:
                split = line.decode('utf8').split()
                w = normalize(split[0])
                if w not in self.vocabulary and \
                        (w == UNKNOWN_TOKEN or w in word_counts or keep_all_words):
                    vec = np.array(map(float, split[1:]), dtype='float32')
                    if not self.vectors:
                        self.d = vec.size
                        self.vectors.append(np.zeros(self.d))  # reserve index 0 for the mask
                    self.vocabulary[w] = len(self.vectors)
                    self.vectors.append(vec)

        n_unknowns = len([w for w in word_counts if w not in self.vocabulary])
        unknown_mass = sum(c for w, c in word_counts.iteritems()
                           if c < ADD_WORD_THRESHOLD and w not in self.vocabulary)
        total_mass = sum(word_counts.values())
        print "Pretrained embedding size:", util.lines_in_file(vectors_file)
        print "Unknowns by mass: {:}/{:} = {:.2f}%"\
            .format(unknown_mass, total_mass, 100 * unknown_mass / float(total_mass))
        print "Unknowns by count: {:}/{:} = {:.2f}%"\
            .format(n_unknowns, len(word_counts), 100 * n_unknowns / float(len(word_counts)))

        # Give frequent words that lack a pretrained embedding their own vectors.
        for c, w in sorted([(c, w) for w, c in word_counts.iteritems()], reverse=True):
            if w not in self.vocabulary and c > ADD_WORD_THRESHOLD:
                print "Adding", w, "count =", c
                self.add_vector(w)

    if UNKNOWN_TOKEN not in self.vocabulary:
        print "No presupplied unknown token"
        self.add_vector(UNKNOWN_TOKEN)
    self.add_vector(MISSING_TOKEN)
    self.unknown = self.vocabulary[UNKNOWN_TOKEN]
    self.missing = self.vocabulary[MISSING_TOKEN]
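# A minimal lookup sketch, assuming this __init__ belongs to a word-vectors class;
# token_to_index is a hypothetical helper, not part of the original code.
def token_to_index(word_vectors, token):
    # Out-of-vocabulary tokens fall back to the reserved unknown-token index.
    return word_vectors.vocabulary.get(normalize(token), word_vectors.unknown)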
def print_dataset_stats(data_dir):
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    print "Num nodes:", G.GetNodes()
    print "Num edges:", G.GetEdges()

    n_users = len(util.load_json(data_dir + "user.json"))
    n_businesses = len(util.load_json(data_dir + "business.json"))
    n_edges = util.lines_in_file(data_dir + "new_edges.txt")
    print "({:} users) * ({:} businesses) = {:.3e} candidate edges".format(
        n_users, n_businesses, n_users * n_businesses)
    print "{:} edges, {:0.5f}% of candidate edges".format(
        n_edges, 100 * n_edges / float(n_users * n_businesses))
def do_post_update(self):
    # Remove duplicate lines from the destination file, preserving order.
    if util.lines_in_file(self.fn_dest) > 0:
        lines_seen = set()  # holds lines already seen
        unique_lines = []
        with open(self.fn_dest, "r") as infile:
            for line in infile:
                if line not in lines_seen:  # not a duplicate
                    unique_lines.append(line)
                    lines_seen.add(line)
        with open(self.fn_dest, 'w') as outfile:
            outfile.writelines(unique_lines)
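# Illustrative standalone variant of the same order-preserving de-duplication;
# dedupe_file is a hypothetical helper name, not part of the original class.
def dedupe_file(path):
    seen = set()
    unique_lines = []
    with open(path, "r") as f:
        for line in f:
            if line not in seen:
                unique_lines.append(line)
                seen.add(line)
    with open(path, "w") as f:
        f.writelines(unique_lines)
    return len(seen)  # number of distinct lines kept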
def reviews_iterator(path='./data/provided/yelp_academic_dataset_review.json'):
    return util.logged_loop(
        util.load_json_lines(path),
        util.LoopLogger(100000, util.lines_in_file(path), True))
def build_dataset(vectors, name, tune_fraction=0.0, reduced=False, columns=None):
    doc_vectors = util.load_pickle(directories.MISC + name.replace("_reduced", "") +
                                   "_document_vectors.pkl")

    main_pairs = PairDataBuilder(columns)
    tune_pairs = PairDataBuilder(columns)
    main_mentions = MentionDataBuilder(columns)
    tune_mentions = MentionDataBuilder(columns)
    main_docs = DocumentDataBuilder(columns)
    tune_docs = DocumentDataBuilder(columns)

    print "Building dataset", name
    p = util.Progbar(target=(2 if reduced else util.lines_in_file(directories.RAW + name)))
    for i, d in enumerate(util.load_json_lines(directories.RAW + name)):
        if reduced and i > 2:
            break
        p.update(i + 1)

        # Route each document to either the main or the tune split.
        if reduced and tune_fraction != 0:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if i == 0 else (tune_pairs, tune_mentions, tune_docs)
        else:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if random.random() > tune_fraction else (tune_pairs, tune_mentions, tune_docs)

        ms, ps = mentions.size(), pairs.size()
        mention_positions = {}
        for mention_num in sorted(d["mentions"].keys(), key=int):
            mention_positions[mention_num] = mentions.size()
            mentions.add_mention(d["mentions"][mention_num], vectors,
                                 doc_vectors[d["mentions"][mention_num]["doc_id"]])

        for key in sorted(d["labels"].keys(),
                          key=lambda k: (int(k.split()[1]), int(k.split()[0]))):
            k1, k2 = key.split()
            pairs.add_pair(d["labels"][key], mention_positions[k1], mention_positions[k2],
                           int(d["mentions"][k1]["doc_id"]),
                           int(d["mentions"][k1]["mention_id"]),
                           int(d["mentions"][k2]["mention_id"]),
                           d["pair_features"][key])

        me, pe = mentions.size(), pairs.size()
        docs.add_doc(ms, me, ps, pe, d["document_features"])

    suffix = ("_reduced" if reduced else "")
    if tune_mentions.size() > 0:
        tune_mentions.write(name + "_tune" + suffix)
        tune_pairs.write(name + "_tune" + suffix)
        tune_docs.write(name + "_tune" + suffix)
        main_mentions.write(name + "_train" + suffix)
        main_pairs.write(name + "_train" + suffix)
        main_docs.write(name + "_train" + suffix)
    else:
        main_mentions.write(name + suffix)
        main_pairs.write(name + suffix)
        main_docs.write(name + suffix)
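# A minimal usage sketch for build_dataset(); WordVectors is an assumed name for
# the word-vectors class above, and "train" / tune_fraction=0.15 are example values.
def build_train_split():
    vectors = WordVectors(load=True)
    build_dataset(vectors, "train", tune_fraction=0.15)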