def acc_probe_lineplot_main():
    opt = utils.parse_args()
    sep_patt = re.compile(r'[\s/]+')
    #data_str1 = '0.178 / 35.0 0.39 / 289.0 0.495 / 769.0 0.565 / 1465.0 0.618 / 2372.0'
    with_catalyzer = True
    #if with_catalyzer:
    #    km_data_path = osp.join(utils.data_dir, 'km_plot_data_c')
    #else:
    km_data_path = osp.join(utils.data_dir, 'km_plot_data')
    train_data_path = osp.join(utils.data_dir, 'train_plot_data')
    #data format example:
    #data_str_km = '0.788 / 50803.0 0.791 / 53157.0 0.795 / 55564.0 0.798 / 58021.0'
    data_str_km = utils.load_lines(km_data_path)[0]
    data_str_train = utils.load_lines(train_data_path)[0]
    label_ar = ['km', 'neural']
    method_l = []
    acc_l = []
    probe_l = []
    #determine this dynamically
    probe95_plot_l = [True, True]
    for i, data_str in enumerate([data_str_km, data_str_train]):
        data_ar = sep_patt.split(data_str.strip())
        if probe95_plot_l[i]:
            assert (len(data_ar) % 3) == 0
            step = 3
        else:
            assert (len(data_ar) & 1) == 0
            step = 2
        data_ar = list(map(float, data_ar))
        acc_l.extend(data_ar[0::step])
        probe_l.extend(data_ar[1::step])
        method_l.extend([label_ar[i]] * (len(data_ar) // step))
        if False and probe95_plot_l[i]:
            acc_l.extend(data_ar[0::step])
            probe_l.extend(data_ar[2::step])
            method_l.extend([label_ar[i] + '95'] * (len(data_ar) // step))
    acc_ar = np.array(acc_l)
    probe_ar = np.array(probe_l)
    height = 1
    n_clusters = 256
    acc_probe_lineplot(probe_ar, acc_ar, method_l, height, n_clusters, opt)
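# Hedged sketch (not part of the original code): a toy demonstration of how the
# '[\s/]+' separator and strided slicing above decompose a plot-data string.
# The literal below is hypothetical example data; this shows the step=2 case
# (accuracy/probe pairs, no 95th-percentile column).
def _demo_parse_acc_probe():
    import re
    sep_patt = re.compile(r'[\s/]+')
    demo = '0.788 / 50803.0 0.791 / 53157.0'        # (accuracy, probe-count) pairs
    tokens = list(map(float, sep_patt.split(demo.strip())))
    acc, probe = tokens[0::2], tokens[1::2]
    assert acc == [0.788, 0.791] and probe == [50803.0, 53157.0]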
def main():
    prsd_ids = []
    # If a file with parsed IDs exists, we've already processed some of them
    # before -- possibly all of them, but with failures (connection, network).
    if not os.path.exists(processed_ids_fpath(FILE_PATH)):
        open(processed_ids_fpath(FILE_PATH), 'a').close()
    else:
        prsd_ids = load_lines(processed_ids_fpath(FILE_PATH))

    src_ids = load_lines(FILE_PATH)
    if prsd_ids:
        s = set(src_ids)
        s.difference_update(set(prsd_ids))
        ids = list(s)
    else:
        ids = src_ids

    if not ids:
        exit(1)

    ids = deque(ids)
    parser = JuridicalInfoParser(FILE_PATH, MAX_FILE_OUTPUT_SIZE,
                                 retries=RETRIES, backoff=BACKOFF, timeout=TIMEOUT)

    cnt = 0
    print('-' * 20)
    while ids:
        _row = None   # ensure _row is defined in `finally` even if process() raises
        try:
            idx = ids.popleft()
            _row = parser.process(idx)
            cnt += 1
        except ParseError:
            print('Failed to process {}'.format(idx))
            append_file(failed_ids_fpath(FILE_PATH), idx)
            print('-' * 20)
        except Exception as e:
            print('Failed to process {}'.format(idx))
            print(e)
        else:
            append_file(processed_ids_fpath(FILE_PATH), idx)
        finally:
            print(f'{cnt} - {_row}')
            print('-' * 20)
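# Hedged sketch (illustration only, with hypothetical IDs): the resume logic in
# main() reduces to a set difference between the source IDs and the IDs already
# recorded as processed, with the remainder queued in a deque.
def _demo_remaining_ids():
    from collections import deque
    src_ids = ['101', '102', '103', '104']           # hypothetical source list
    prsd_ids = ['102', '104']                         # hypothetical processed list
    remaining = deque(set(src_ids) - set(prsd_ids))   # order not preserved, as in main()
    assert sorted(remaining) == ['101', '103']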
def read_all_ranks_glove(opt):
    ##need weights!
    graph_path = osp.join(utils.glove_dir, 'normalized', 'knn_100', 'graph_10', 'graph.txt')
    ranks = []
    lines = utils.load_lines(graph_path)[1:]
    #tuples of 2 indices, and their weights
    idx2weights = {}
    for i, line in enumerate(lines, 1):
        cur_list = line.strip().split(' ')
        cur_ranks = []
        for j in range(0, len(cur_list), 2):
            neigh = int(cur_list[j])
            cur_ranks.append(neigh)
            neigh_weight = int(cur_list[j + 1])
            tup = (i, neigh) if i < neigh else (neigh, i)
            idx2weights[tup] = neigh_weight
        #ensure proper k for the resulting graph
        ranks.append(cur_ranks)
    return ranks, idx2weights
def words_from_file(filename):
    lines = load_lines(filename)
    words = []
    for line in lines:
        words.append(line.strip('\n'))
    return words
def __init__(self):
    #self.n_clusters = n_clusters
    #kahip partition top-level result, 64 clusters for now
    #loads partition data and prepares to make predictions
    self.kahip_path = osp.join(utils.data_dir, 'cache_partition64strong_0ht2')
    classes_l = utils.load_lines(self.kahip_path)
    self.classes_l = [int(c) for c in classes_l]
def process_child(ranks, graph_path, datalen, branching_l, height, idx2classes, proc_i, ht2cutsz, opt):
    n_edges = create_graph.write_knn_graph(ranks, graph_path)
    parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)
    lines = utils.load_lines(parts_path)
    idx2classes[proc_i] = [int(line) for line in lines]
def process_child(ranks, graph_path, datalen, branching_l, height, idx2classes, proc_i, ht2cutsz, opt):
    n_edges = create_graph.write_knn_graph(ranks, graph_path)
    parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)
    lines = utils.load_lines(parts_path)
    classes = [int(line) for line in lines]
    idx2classes[proc_i] = classes
    compute_cut_size_b = True and not opt.glove
    if compute_cut_size_b:
        cut_sz = compute_cut_size(classes, ranks)
        ht2cutsz[height].append((cut_sz, n_edges))
def get_bgl3_count_df(output_dir=None):
    """ combines the inp and sel variant lists into a single dataframe with counts """
    inp_fn = "source_data/bgl3/unlabeled_Bgl3_mutations.txt"
    sel_fn = "source_data/bgl3/positive_Bgl3_mutations.txt"
    cache_fn = "bgl3_raw_counts.tsv"

    if output_dir is None or not isfile(join(output_dir, cache_fn)):
        print("Computing bgl3 count df from raw counts")
        inp_variant_list = utils.load_lines(inp_fn)
        sel_variant_list = utils.load_lines(sel_fn)
        df = pd.concat([parse_bgl3_variant_list(inp_variant_list, "inp"),
                        parse_bgl3_variant_list(sel_variant_list, "sel")],
                       axis=1, sort=True).fillna(0)
        if output_dir is not None:
            df.to_csv(join(output_dir, cache_fn), sep="\t")
        return df

    print("Loading cached count df from file: {}".format(join(output_dir, cache_fn)))
    return pd.read_csv(join(output_dir, cache_fn), sep="\t", index_col=0)
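# Hedged sketch (toy data, not the real Bgl3 inputs): illustrates how
# pd.concat(..., axis=1, sort=True).fillna(0) above aligns the "inp" and "sel"
# counts on the union of variant indices and fills missing counts with 0.
def _demo_count_alignment():
    import pandas as pd
    inp = pd.Series({'A10V': 3, 'L5P': 1}, name='inp')   # hypothetical variant counts
    sel = pd.Series({'A10V': 7, 'G2S': 2}, name='sel')
    df = pd.concat([inp, sel], axis=1, sort=True).fillna(0)
    assert df.loc['L5P', 'sel'] == 0 and df.loc['G2S', 'inp'] == 0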
def process_document(path, vocab, title_start="========,", forbidden_start="***LIST***", test=False, ssplit=True):
    print("ssplit: " + str(ssplit))
    lines = ([l for l in utils.load_lines(path) if not l.startswith(forbidden_start)]
             if ssplit else sentence_split(utils.load_txt_file(path)))
    stride = 1 if test else config.sent_stride

    lab_lines = []
    lines_txt = []
    for i in range(len(lines)):
        if lines[i].startswith(title_start):
            continue
        if (i - 1) >= 0 and lines[i - 1].startswith(title_start):
            lab_lines.append((lines[i], 1))
        else:
            lab_lines.append((lines[i], 0))
        lines_txt.append(lines[i])

    raw_blocks = []
    i = 0
    while i < len(lab_lines):
        block = lab_lines[i:i + config.sent_window]
        if len(block) < config.sent_window:
            block.extend([(config.fake_sent, 0)] * (config.sent_window - len(block)))
        raw_blocks.append(block)
        i += stride

    if not test:
        random.shuffle(raw_blocks)
        raw_blocks = raw_blocks[:int(config.perc_blocks_train * len(raw_blocks))]

    doc_recs = []
    for rb in raw_blocks:
        records = create_one_instance(rb, lines_txt, vocab)
        doc_recs.extend(records)

    return doc_recs, len(raw_blocks), raw_blocks if test else None
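# Hedged sketch (standalone toy with hypothetical window/stride values): the block
# construction in process_document is a sliding window with stride over the
# labeled sentences, padded at the tail so every block has exactly sent_window entries.
def _demo_sliding_blocks(items=('s0', 's1', 's2', 's3', 's4'), window=3, stride=2, pad='<fake>'):
    blocks = []
    i = 0
    while i < len(items):
        block = list(items[i:i + window])
        block.extend([pad] * (window - len(block)))   # pad the last, short block
        blocks.append(block)
        i += stride
    return blocks   # [['s0','s1','s2'], ['s2','s3','s4'], ['s4','<fake>','<fake>']]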
def read_all_ranks(opt, path=None):
    if opt.glove:
        graph_path = osp.join(utils.glove_dir, 'graph.txt')
    elif opt.sift:
        graph_path = osp.join(utils.data_dir, 'sift_graph_10', 'graph.txt')
    elif opt.prefix10m:
        graph_path = osp.join(utils.data_dir, 'prefix10m_graph_10.txt')
    elif path is not None:
        graph_path = path
    else:
        raise Exception('Cannot read precomputed knn graph for unknown data type')
    ranks = []
    lines = utils.load_lines(graph_path)[1:]
    #tuples of 2 indices, and their weights
    idx2weights = {}
    for i, line in enumerate(lines, 1):
        cur_list = line.strip().split(' ')
        cur_ranks = []
        for j in range(0, len(cur_list), 2):
            neigh = int(cur_list[j])
            cur_ranks.append(neigh)
            neigh_weight = int(cur_list[j + 1])
            tup = (i, neigh) if i < neigh else (neigh, i)
            idx2weights[tup] = neigh_weight
        #ensure proper k for the resulting graph
        ranks.append(cur_ranks)
    #ranks = torch.LongTensor(ranks)
    return ranks, idx2weights
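# Hedged sketch (made-up file contents): the METIS-style adjacency format the
# readers above appear to expect -- a header line, then one line per node with
# alternating neighbor-index / edge-weight pairs (node indices are 1-based).
def _demo_parse_graph_lines():
    lines = ['3 2 1',      # header (e.g. #nodes #edges fmt), skipped via [1:]
             '2 5 3 1',    # node 1: neighbor 2 (weight 5), neighbor 3 (weight 1)
             '1 5',        # node 2: neighbor 1 (weight 5)
             '1 1']        # node 3: neighbor 1 (weight 1)
    ranks, idx2weights = [], {}
    for i, line in enumerate(lines[1:], 1):
        toks = line.strip().split(' ')
        neighbors = [int(t) for t in toks[0::2]]
        for neigh, w in zip(neighbors, (int(t) for t in toks[1::2])):
            idx2weights[(i, neigh) if i < neigh else (neigh, i)] = w
        ranks.append(neighbors)
    assert ranks == [[2, 3], [1], [1]] and idx2weights[(1, 2)] == 5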
def convert_file(fname: str, target_fname: str):
    triples = parse_lines(utils.load_lines(fname))
    records = (record_from_triple(*t) for t in triples)
    utils.write_file_and_print_stats(records, target_fname)
def lines(self):
    return load_lines(self._ids_fpath)
def main(arguments):
    """Main run function for processing the Cornell Movie Dialog data."""
    # Parse the arguments
    args = utils.parse_arguments(arguments)
    # movie lines file
    movie_lines_file = os.path.join(args.infile_path, 'movie_lines.txt')
    # movie conversations file
    movie_conversations_file = os.path.join(args.infile_path, 'movie_conversations.txt')

    if not args.pairs:
        tf.logging.log(
            tf.logging.INFO,
            "Selecting and saving {} random lines...".format(args.num_lines))
        lines = []
        try:
            with open(movie_lines_file, encoding='iso-8859-1') as f:
                for line in f:
                    values = line.split(" +++$+++ ")
                    lines.append(values[-1].strip())
        except FileNotFoundError as error:
            tf.logging.log(tf.logging.ERROR, error)
            tf.logging.log(
                tf.logging.ERROR,
                'Input file not found, correct the specified location.')
            sys.exit(0)
        tf.logging.info("Found {} input lines.".format(len(lines)))

        with open(args.outfile, 'w', encoding='iso-8859-1') as f:
            if args.num_lines != 0:
                for item in np.random.choice(lines, args.num_lines, replace=False):
                    f.write("%s\n" % item)
            else:
                for item in lines:
                    f.write("%s\n" % item)
        tf.logging.log(
            tf.logging.INFO,
            'Wrote {} lines to {}.'.format(args.num_lines, args.outfile))
    else:
        tf.logging.log(
            tf.logging.INFO,
            "Selecting and saving {} random pairs...".format(args.num_lines))
        tf.logging.log(tf.logging.INFO,
                       'CMDC movie_lines_path: {}'.format(movie_lines_file))
        tf.logging.log(
            tf.logging.INFO,
            'CMDC movie_conversations_path: {}'.format(movie_conversations_file))

        movie_lines_fields = [
            "lineID", "characterID", "movieID", "character", "text"
        ]
        movie_conversations_fields = [
            "character1ID", "character2ID", "movieID", "utteranceIDs"
        ]

        # load the lines
        lines = utils.load_lines(movie_lines_file, movie_lines_fields)
        tf.logging.log(
            tf.logging.INFO,
            "Loaded {} lines: {}".format(len(lines), movie_lines_file))

        # load the conversations
        conversations = utils.load_conversations(movie_conversations_file, lines,
                                                 movie_conversations_fields)
        tf.logging.info("Loaded {} conversations: {}".format(
            len(conversations), movie_conversations_file))

        with open(args.outfile, 'w', encoding='iso-8859-1') as outputfile:
            writer = csv.writer(outputfile, delimiter=args.delimiter)
            collected_pairs = utils.extract_pairs(conversations)
            tf.logging.log(tf.logging.INFO,
                           'Total of {} pairs'.format(len(collected_pairs)))
            if int(args.num_lines) != 0:
                random_idxs = np.random.choice(len(collected_pairs),
                                               args.num_lines,
                                               replace=False)
                for random_id in random_idxs:
                    pair = collected_pairs[random_id]
                    writer.writerow(pair)
                tf.logging.info("Wrote {} pairs to {}.".format(
                    args.num_lines, args.outfile))
            else:
                for item in collected_pairs:
                    writer.writerow(item)
                tf.logging.info("Wrote {} pairs to {}.".format(
                    len(collected_pairs), args.outfile))
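# Hedged sketch (toy pairs, not the CMDC data): the pair-sampling branch above
# draws row indices without replacement via np.random.choice and writes only the
# selected pairs; this shows that selection step in isolation.
def _demo_sample_pairs(num_lines=2):
    import numpy as np
    collected_pairs = [('hi', 'hello'), ('how are you?', 'fine'), ('bye', 'see you')]
    idxs = np.random.choice(len(collected_pairs), num_lines, replace=False)
    return [collected_pairs[i] for i in idxs]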
def create_data_tree_root(dataset, all_ranks, ds_idx, train_node, idx2bin, height, branching_l, ht2cutsz, opt):
    #create graph from data.
    data = dataset[ds_idx]
    datalen = len(data)
    if datalen <= opt.k:
        return None
    graph_path = os.path.join(opt.data_dir, opt.graph_file)  #'../data/knn.graph'
    #ranks are 1-based
    if opt.glove or opt.sift:  #and len(branching_l) == 1:
        parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)
        lines = utils.load_lines(parts_path)
        classes = [int(line) for line in lines]

        #read in all_ranks
        if opt.glove:
            all_ranks, idx2weights = read_all_ranks_glove(opt)
        elif opt.sift:
            all_ranks, idx2weights = read_all_ranks_sift(opt)  ###implement!!

        #create root DataNode: dataset, ds_idx, parent_train_node, idx2bin, height, opt
        dsnode = add_datanode_children(dataset, (all_ranks, idx2weights), ds_idx, train_node, idx2bin,
                                       height - 1, branching_l, classes, ht2cutsz, opt)
        return dsnode

    if len(branching_l) == 1:
        #this is always the case now
        #only use distance at top level of tree
        ranks = create_graph.create_knn_graph(data, k=opt.k, opt=opt)  #should supply opt
        all_ranks = ranks
    else:
        assert all_ranks is not None
        #else compute part of previous graph
        ranks = create_graph.create_knn_sub_graph(all_ranks, ds_idx, data, opt)

    n_edges = create_graph.write_knn_graph(ranks, graph_path)
    #graph_dir = create_graph.data_dir
    #graph_file = os.path.join(graph_dir, graph_file)

    #create partition from graph; this overrides the file each iteration
    #parts_path = opt.parts_path_root
    parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)
    lines = utils.load_lines(parts_path)
    classes = [int(line) for line in lines]

    compute_cut_size_b = True and not opt.glove
    if compute_cut_size_b:
        cut_sz = compute_cut_size(classes, ranks)
        ht2cutsz[height].append((cut_sz, n_edges))

    #create root DataNode: dataset, ds_idx, parent_train_node, idx2bin, height, opt
    dsnode = add_datanode_children(dataset, (all_ranks, None), ds_idx, train_node, idx2bin,
                                   height - 1, branching_l, classes, ht2cutsz, opt)
    return dsnode
def create_data_tree_root(dataset, all_ranks, ds_idx, train_node, idx2bin, height, branching_l, ht2cutsz, opt):
    datalen = len(ds_idx)
    if datalen <= opt.k:
        return None
    graph_path = os.path.join(opt.data_dir, opt.graph_file)  #'../data/knn.graph'
    #ranks are 1-based
    if opt.glove or opt.sift or opt.prefix10m:  #and len(branching_l) == 1:
        if opt.glove:
            #custom paths
            #if opt.glove and opt.k_graph==50: #april, 50NN graph file
            #    graph_path = os.path.join(opt.data_dir, 'glove50_'+opt.graph_file)
            graph_path = os.path.join(opt.data_dir, opt.graph_file)  #'../data/knn.graph'
            #graph_path = os.path.join(opt.data_dir, 'glove10_sub10knn.graph')
            print('graph file {}'.format(graph_path))

        parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)
        print('Done partitioning top level!')
        lines = utils.load_lines(parts_path)
        classes = [int(line) for line in lines]

        #read in all_ranks, for partitioning on further levels.
        all_ranks, idx2weights = read_all_ranks(opt)
        if opt.dataset_name != 'prefix10m':
            k1 = max(1, int(opt.nn_mult * opt.k))
            ranks = utils.dist_rank(dataset, k=k1)
        else:
            #subtract 1 as graph was created with 1-indexing for kahip.
            ranks = torch.load('/large/prefix10m10knn.graph.pt') - 1

        #create root DataNode: dataset, ds_idx, parent_train_node, idx2bin, height, opt
        dsnode = add_datanode_children(dataset, (all_ranks, idx2weights), ds_idx, train_node, idx2bin,
                                       height - 1, branching_l, classes, ht2cutsz, 0, opt, ranks,
                                       toplevel=True, root=True)
        return dsnode

    #create graph from data.
    data = dataset[ds_idx]
    if len(branching_l) == 1:
        #this is always the case now
        #use tree created at top level throughout the hierarchy
        ranks = create_graph.create_knn_graph(data, k=opt.k, opt=opt)  #should supply opt
        all_ranks = ranks
    else:
        assert all_ranks is not None
        #else compute part of previous graph
        ranks = create_graph.create_knn_sub_graph(all_ranks, ds_idx, data, opt)

    n_edges = create_graph.write_knn_graph(ranks, graph_path)
    _, idx2weights = read_all_ranks(opt, path=graph_path)

    #create partition from graph; this overrides the file each iteration
    parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)
    lines = utils.load_lines(parts_path)
    classes = [int(line) for line in lines]

    compute_cut_size_b = False and not opt.glove
    if compute_cut_size_b:
        cut_sz = compute_cut_size(classes, ranks)
        ht2cutsz[height].append((cut_sz, n_edges))

    #create root DataNode: dataset, ds_idx, parent_train_node, idx2bin, height, opt
    dsnode = add_datanode_children(dataset, (all_ranks, idx2weights), ds_idx, train_node, idx2bin,
                                   height - 1, branching_l, classes, ht2cutsz, 0, opt, all_ranks - 1,
                                   toplevel=True, root=True)
    #Note the above all_ranks is not 5*opt.k number of nearest neighbors.
    return dsnode
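# Hedged sketch (toy partition output, hypothetical values): KaHIP's partition
# file has one block label per line, in node order; the loaders above turn this
# into a per-point class list, which presumably drives how add_datanode_children
# splits the dataset into bins.
def _demo_partition_labels():
    parts_lines = ['0', '1', '1', '0']          # hypothetical contents of parts_path
    classes = [int(line) for line in parts_lines]
    bins = {}
    for point_idx, cls in enumerate(classes):
        bins.setdefault(cls, []).append(point_idx)
    assert bins == {0: [0, 3], 1: [1, 2]}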