def save_vaez_inference_time(self, model_dir):
    """
    Loop over all ndjson data and save the encoded z's for each.

    Args:
        model_dir (str): directory of the trained model we are using
    """
    torch.backends.cudnn.benchmark = True  # optimizes cudnn
    print('Saving to: ', os.path.join(model_dir, 'inference_vaez'))

    with torch.no_grad():
        for split in ['train', 'valid', 'test']:
            loader = self.get_data_loader(split, self.hp.batch_size,
                                          self.hp.categories, self.hp.max_len,
                                          self.hp.max_per_category, False)
            for bidx, batch in enumerate(loader):
                # Encode drawing and get z
                batch = self.preprocess_batch_from_data_loader(batch)
                strokes, stroke_lens, cats, cats_idx = batch
                max_len, cur_bsz, _ = strokes.size()
                z, _, _ = self.enc(strokes)  # z: [bsz, 128]
                z_np = z.cpu().numpy()

                for i in range(cur_bsz):
                    # Get index of this drawing within the dataset
                    # (loader has shuffle=False, so we can do this)
                    global_idx = bidx * self.hp.batch_size + i
                    drawing_id = loader.dataset.data[global_idx]['id']

                    # Save numpy version of z
                    out_dir = os.path.join(model_dir, 'inference_vaez', cats[i])
                    os.makedirs(out_dir, exist_ok=True)
                    out_fp = os.path.join(out_dir, f'{drawing_id}.pkl')
                    cur_z_np = z_np[i]
                    utils.save_file(cur_z_np, out_fp)
def generate_public_key(path):
    logger.info("Generating public/private key pair...")
    key = RSA.generate(2048)
    private_key = key.export_key()
    public_key = key.publickey().export_key()
    save_file(os.path.join(path, "private.pem"), private_key)
    save_file(os.path.join(path, "public.pem"), public_key)
    logger.info("Keys successfully generated.")
def inference_on_dataset(args):
    model, tokenizer = load_model(args)
    model.cuda()

    if args.dataset == 'ets':
        data_loader = get_etsnonnative_dataloader(max_len=args.max_len, batch_size=1)
    elif args.dataset == 'schoolreviews':
        data_loader = get_schoolreviewsreal_dataloader(max_len=args.max_len, batch_size=1)
    print('Dataset loaded')

    results = {}
    for i, batch in enumerate(data_loader):
        if i % 100 == 0:
            print('{} / {}'.format(i, len(data_loader)))

        token_ids_padded, atn_mask, label, text, id = batch
        query = text[0]  # bsz 1
        id = id[0]  # bsz 1

        tokens = tokenizer.encode(query)
        all_tokens = len(tokens)
        tokens = tokens[:tokenizer.max_len - 2]
        used_tokens = len(tokens)
        tokens = torch.tensor([tokenizer.bos_token_id] + tokens +
                              [tokenizer.eos_token_id]).unsqueeze(0)
        mask = torch.ones_like(tokens)

        with torch.no_grad():
            tokens = tokens.cuda()
            mask = mask.cuda()
            logits = model(tokens, attention_mask=mask)[0]
            probs = logits.softmax(dim=-1)
            fake, real = probs.detach().cpu().flatten().numpy().tolist()

        correct = False
        if ((real > fake) and (label == 1)) or ((fake > real) and (label == 0)):
            correct = True
        results[id] = {'fake': fake, 'real': real, 'correct': correct}

    save_file(results, os.path.join(args.output_dir, 'results.json'), verbose=True)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', default='roberta-base')
    parser.add_argument('--dataset', default='ets')
    parser.add_argument('--max_len', type=int, default=128)
    parser.add_argument('--output_dir', default=None, required=True)
    args = parser.parse_args()
    inference_on_dataset(args)
def main():
    path = create_dir()
    url = "https://brasil.diplo.de/br-de/service/matriculaconsular/" \
          "2222228?fbclid=IwAR0MojqudTlgMKQjZG7nNqp9gr3-QSKyiiRdY0jeeBY336zm_3yqr_Oc_nc"
    content = scraping_de(url)
    for c in content:
        save_file(pathname=path,
                  filename=c["file_name"],
                  content=requests.get(c["xls_url"]).content)
def segment_all_ndjson_data(self):
    """Segment all samples in the NdjsonStrokeDataset."""
    for split in ['train', 'valid', 'test']:
        for category in final_categories():
            # Skip if not in hparam's categories list
            if (self.hp.categories != 'all') and (category not in self.hp.categories):
                continue

            print(f'{split}: {category}')
            ds = NdjsonStrokeDataset(category, split,
                                     max_per_category=self.hp.max_per_category)
            loader = DataLoader(ds, batch_size=1, shuffle=False)

            n_segd = 0
            for i, sample in enumerate(loader):
                try:
                    id, category = (loader.dataset.data[i]['id'],
                                    loader.dataset.data[i]['category'])
                    out_dir = self.save_dir / category
                    out_fp = out_dir / f'{id}.json'
                    if os.path.exists(out_fp):
                        continue

                    # Note: we are NOT saving into separate split directories, in case
                    # we want to train on 30 categories and then test on 5 held-out
                    # categories (i.e. keep it flexible to splitting within categories
                    # vs. across categories, which can be specified in that Dataset).
                    # TODO: should we do the same for ProgressionPair?

                    # Save segmentations
                    segmented = self.segment_sample(sample, dataset='ndjson')
                    # TODO: save sample / strokes as well so that we have all the data in one place?
                    utils.save_file(segmented, out_fp)

                    # Save original image too for comparisons
                    ndjson_strokes = loader.dataset.data[i]['ndjson_strokes']
                    img = create_progression_image_from_ndjson_seq(ndjson_strokes)
                    out_fp = out_dir / f'{id}.jpg'
                    img.save(out_fp)

                    n_segd += 1
                    if n_segd == self.hp.max_per_category:
                        break
                except Exception as e:
                    print(e)
                    continue
def segment_all_progressionpair_data(self):
    """Segment all samples in the ProgressionPairDataset."""
    for split in ['train', 'valid', 'test']:
        print(split)
        if self.s2i_hp.drawing_type == 'stroke':
            self.ds = ProgressionPairDataset(split, use_full_drawings=True)
            loader = DataLoader(self.ds, batch_size=1, shuffle=False,
                                collate_fn=ProgressionPairDataset.collate_fn)
        elif self.s2i_hp.drawing_type == 'image':
            self.ds = DrawingsAsImagesAnnotatedDataset(
                split, images=self.s2i_hp.images, data_aug_on_text=False)
            loader = DataLoader(self.ds, batch_size=1, shuffle=False,
                                collate_fn=DrawingsAsImagesAnnotatedDataset.collate_fn)

        for i, sample in enumerate(loader):
            try:
                id, category = (loader.dataset.data[i]['id'],
                                loader.dataset.data[i]['category'])
                out_dir = self.save_dir / split
                if self.s2i_hp.drawing_type == 'image':
                    # contains the fp, n_segments data we need
                    sample = loader.dataset.data[i]

                # Save segmentations
                segmented = self.segment_sample(sample, dataset='progressionpair')
                # TODO: save sample / strokes as well so that we have all the data in one place?
                out_fp = out_dir / f'{category}_{id}.json'
                utils.save_file(segmented, out_fp)

                # Save original image too for comparisons
                # TODO: image dataset doesn't have ndjson_strokes
                # ndjson_strokes = loader.dataset.data[i]['ndjson_strokes']
                # img = create_progression_image_from_ndjson_seq(ndjson_strokes)
                out_fp = out_dir / f'{category}_{id}.jpg'
                open(out_fp, 'a').close()  # img.save(out_fp)
            except Exception as e:
                print(e)
                continue
def save_run_data(path_to_dir, hp, ask_if_exists=True):
    """
    1) Save stdout to file
    2) Save files to path_to_dir:
        - code_snapshot/: snapshot of code (.py files)
        - hp.json: dict of HParams object
        - run_details.txt: command used and start time
    """
    print('Saving run data to: {}'.format(path_to_dir))

    parent_dir = os.path.dirname(path_to_dir)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    if os.path.isdir(path_to_dir):
        print("Data already exists in this directory (presumably from a previous run)")
        if ask_if_exists:
            inp = input('Enter "y" if you are sure you want to remove all the old contents: ')
            if inp in ["y", "yes"]:
                print("Removing old contents")
                shutil.rmtree(path_to_dir)
            else:
                print("Exiting")
                raise SystemExit

    print("Creating directory and saving data")
    if not os.path.exists(path_to_dir):
        os.makedirs(path_to_dir)

    # Save snapshot of code
    snapshot_dir = os.path.join(path_to_dir, "code_snapshot")
    print('Saving code snapshot to: {}'.format(snapshot_dir))
    if os.path.exists(snapshot_dir):  # shutil.copytree fails if dest already exists
        shutil.rmtree(snapshot_dir)
    shutil.copytree("src", snapshot_dir)

    # Save hyperparams
    save_file(vars(hp), os.path.join(path_to_dir, "hp.json"), verbose=True)

    # Save command used to run, and start time
    with open(os.path.join(path_to_dir, "run_details.txt"), "w") as f:
        cmd = " ".join(sys.argv)
        start_time = datetime.now().strftime("%B%d_%H-%M-%S")
        f.write("Command:\n")
        f.write(cmd + "\n")
        f.write('Start time: {}'.format(start_time))
    print("Command used to start program:\n", cmd)
    print('Start time: {}'.format(start_time))
def save_instruction_vocabulary_distribution():
    df = pd.read_csv(ANNOTATED_PROGRESSION_PAIRS_CSV_PATH)
    tokens = Counter()
    for i in range(len(df)):
        annotation = df.iloc[i]['Answer.annotation'].replace('\r', '')
        for token in utils.normalize_sentence(annotation):
            tokens[token] += 1
    norm = sum(tokens.values())
    distribution = {tok: count / norm for tok, count in tokens.items()}
    utils.save_file(distribution, INSTRUCTIONS_VOCAB_DISTRIBUTION_PATH, verbose=True)
def prep_and_save_data(gen_method='gpt2-xl_p96', max_len=192):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    for split in ['test', 'valid', 'train']:
        split_data = []

        # Load real (human-written) text
        fp = REAL_TEXT_PATH / f'webtext.{split}.jsonl'
        for line in open(fp):
            item = json.loads(line)
            id, ended, text = item['id'], item['ended'], item['text']
            text_trunc, text_len, token_ids_padded = tokenize_and_prep(tokenizer, text, max_len)
            if text_len < max_len:
                continue
            prepped = {
                'id': id,
                'text': text_trunc,
                'token_ids_padded': token_ids_padded,
                'label': 1,
                'label_text': 'human-written'
            }
            split_data.append(prepped)

        # Load fake (machine-generated) text
        gen_dir = GENERATED_TEXT_PATH / f'fake_{gen_method}'
        for fn in os.listdir(gen_dir):
            if fn.startswith(split):
                fp = os.path.join(gen_dir, fn)
                for line in open(fp):
                    item = json.loads(line)
                    id, text = item['id'], item['text']
                    text_trunc, text_len, token_ids_padded = tokenize_and_prep(tokenizer, text, max_len)
                    if text_len < max_len:
                        continue
                    prepped = {
                        'id': id,
                        'text': text_trunc,
                        'token_ids_padded': token_ids_padded,
                        'label': 0,
                        'label_text': 'machine-generated'
                    }
                    split_data.append(prepped)

        out_fp = PREPPED_REALGEN_TEXT_PATH / f'{gen_method}_{split}.pkl'
        save_file(split_data, out_fp, verbose=True)
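# tokenize_and_prep() is called above but defined elsewhere. A minimal sketch of
# what it plausibly does, assuming truncation to max_len tokens and padding with
# the tokenizer's pad token id; the real helper may differ in details. The
# _sketch suffix marks it as hypothetical.
def tokenize_and_prep_sketch(tokenizer, text, max_len):
    # Tokenize and truncate to a fixed maximum length
    token_ids = tokenizer.encode(text, add_special_tokens=False)[:max_len]
    text_len = len(token_ids)
    # Reconstruct the (possibly truncated) text and pad ids to max_len
    text_trunc = tokenizer.decode(token_ids)
    token_ids_padded = token_ids + [tokenizer.pad_token_id] * (max_len - text_len)
    return text_trunc, text_len, token_ids_padded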
def end_of_epoch_hook(self, data_loader, epoch, outputs_path=None, writer=None):
    """
    Args:
        data_loader: DataLoader
        epoch: int
        outputs_path: str
        writer: Tensorboard Writer
    """
    for model in self.models:
        model.eval()

    with torch.no_grad():
        # Generate texts on validation set
        inference = self.inference_loop(data_loader, writer=writer, epoch=epoch)
        out_fp = os.path.join(outputs_path, 'samples_e{}.json'.format(epoch))
        utils.save_file(inference, out_fp, verbose=True)
def decrypt_data(filename_path, new_filename_path, private_key_path):
    with open(filename_path, "rb") as file_in:
        logger.info(f"Decrypting file {filename_path}...")
        private_key = RSA.import_key(read_file(private_key_path))
        enc_session_key, nonce, tag, ciphertext = [
            file_in.read(x)
            for x in (private_key.size_in_bytes(), 16, 16, -1)
        ]

    # Decrypt the session key with the private RSA key
    cipher_rsa = PKCS1_OAEP.new(private_key)
    session_key = cipher_rsa.decrypt(enc_session_key)

    # Decrypt the data with the AES session key
    cipher_aes = AES.new(session_key, AES.MODE_EAX, nonce)
    data = cipher_aes.decrypt_and_verify(ciphertext, tag)

    decrypt_filename = extract_or_create_filename(new_filename_path, filename_path)
    save_file(decrypt_filename, data)
    logger.info("File successfully decrypted.")
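# The encryption counterpart is not shown. A minimal sketch, assuming the same
# hybrid scheme decrypt_data() expects: a random AES session key encrypted with
# the RSA public key, followed by the EAX nonce, tag, and ciphertext, written in
# that order. The name encrypt_data and the reuse of read_file() are assumptions
# mirroring the snippet above; the real implementation may differ.
from Crypto.Cipher import AES, PKCS1_OAEP
from Crypto.PublicKey import RSA
from Crypto.Random import get_random_bytes

def encrypt_data(filename_path, out_path, public_key_path):
    public_key = RSA.import_key(read_file(public_key_path))
    session_key = get_random_bytes(16)

    # Encrypt the session key with the public RSA key
    cipher_rsa = PKCS1_OAEP.new(public_key)
    enc_session_key = cipher_rsa.encrypt(session_key)

    # Encrypt the data with the AES session key (EAX provides nonce + tag)
    with open(filename_path, "rb") as file_in:
        data = file_in.read()
    cipher_aes = AES.new(session_key, AES.MODE_EAX)
    ciphertext, tag = cipher_aes.encrypt_and_digest(data)

    with open(out_path, "wb") as file_out:
        # Layout matches what decrypt_data() reads back
        for chunk in (enc_session_key, cipher_aes.nonce, tag, ciphertext):
            file_out.write(chunk)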
def save_inference_on_split(self, loader=None, dataset_split=None, dir=None, ext=None):
    """
    Args:
        loader: DataLoader
        dataset_split: str
        dir: str (location to save inference/<dataset_split>.<ext>)
        ext: str (e.g. 'json', 'pkl'; extension of file)
    """
    if loader is None:
        loader = self.get_data_loader(dataset_split, self.hp.batch_size, shuffle=False)
    inference = self.inference_loop(loader)
    fp = os.path.join(dir, 'inference', '{}.{}'.format(dataset_split, ext))
    utils.save_file(inference, fp, verbose=True)
def test_epoch_end(self, outputs):
    avg_test_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
    avg_test_acc = torch.stack([x['test_acc'] for x in outputs]).mean()
    metrics = {
        'test_loss': avg_test_loss,
        'avg_test_acc': avg_test_acc,
        'log': {
            'test_epoch_loss': avg_test_loss,
            'test_epoch_acc': avg_test_acc
        }
    }

    # Save per-item correctness (this is coded with the ETS dataset in mind
    # right now, so we can check accuracy across different cuts)
    correctness = {}
    for x in outputs:
        for i, item_id in enumerate(x['id']):
            item_correct = x['correct'][i]
            correctness[item_id] = item_correct
    save_file(correctness, self.args.eval_out_fp)

    return metrics
def put(self, data):
    UPLOADS_FOLDER = current_app.config['UPLOADS_FOLDER']
    user = current_user
    if 'picture' in req.files:
        file = req.files['picture']
        filename = save_file(file, UPLOADS_FOLDER)
        data['avatar'] = filename
        if user.avatar:
            delete_file(UPLOADS_FOLDER, user.avatar)
    user.update(**data)
    return user
def generate_and_save(self, data_loader, epoch, n_gens, outputs_path=None):
    """Generate and save drawings."""
    n = 0
    gen_strokes = []
    gt_strokes = []
    gt_texts = []
    for i, batch in enumerate(data_loader):
        batch = self.preprocess_batch_from_data_loader(batch)
        strokes, stroke_lens, texts, text_lens, text_indices, cats, cats_idx, urls = batch
        max_len, bsz, _ = strokes.size()

        if self.hp.cond_instructions == 'decinputs':
            # Encode instructions
            # text_indices: [len, bsz], text_lens: [bsz]
            instructions_emb = self.enc(
                text_indices, text_lens, self.text_embedding,
                category_embedding=self.category_embedding,
                categories=cats_idx)  # [bsz, enc_dim]
            z = instructions_emb

        hidden_cell = (nn_utils.move_to_cuda(torch.zeros(1, bsz, self.hp.dec_dim)),
                       nn_utils.move_to_cuda(torch.zeros(1, bsz, self.hp.dec_dim)))

        # Initialize state with start-of-sequence stroke-5 stroke
        sos = torch.stack([torch.Tensor([0, 0, 1, 0, 0])] * bsz).unsqueeze(0)  # [1 (len), bsz, 5 (stroke-5)]
        sos = nn_utils.move_to_cuda(sos)

        # Generate until end of sequence or maximum sequence length
        s = sos
        seq_x = []  # delta-x
        seq_y = []  # delta-y
        seq_pen = []  # pen-down
        for _ in range(max_len):
            if self.hp.cond_instructions == 'decinputs':
                # Input is last state, z, and hidden_cell
                input = torch.cat([s, z.unsqueeze(0)], dim=2)  # [1 (len), 1 (bsz), input_dim (5) + z_dim (128)]
            elif self.hp.cond_instructions == 'match':
                # Input is last state and hidden_cell
                input = s  # [1, bsz (1), 5]

            if self.hp.use_categories_dec and hasattr(self, 'category_embedding'):
                # Hack because VAE was trained with use_categories_dec=True
                # but didn't actually have a category embedding
                cat_embs = self.category_embedding(cats_idx)  # [bsz (1), cat_dim]
                # dim = 5 + cat_dim if decodergmm, 5 + z_dim + cat_dim if vae
                input = torch.cat([input, cat_embs.unsqueeze(0)], dim=2)  # [1, 1, dim]

            outputs, pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy, q, hidden, cell = \
                self.dec(input, stroke_lens=stroke_lens, output_all=False,
                         hidden_cell=hidden_cell)
            hidden_cell = (hidden, cell)  # for next time step

            # Sample next state
            s, dx, dy, pen_up, eos = self.sample_next_state(pi, mu_x, mu_y,
                                                            sigma_x, sigma_y,
                                                            rho_xy, q)
            seq_x.append(dx)
            seq_y.append(dy)
            seq_pen.append(pen_up)

            if eos:  # done drawing
                break

        # Get in format to draw image. Cumulative sum because seq_x and seq_y
        # are deltas, so get x (or y) at each stroke
        sample_x = np.cumsum(seq_x, 0)
        sample_y = np.cumsum(seq_y, 0)
        sample_pen = np.array(seq_pen)
        sequence = np.stack([sample_x, sample_y, sample_pen]).T
        # output_fp = os.path.join(outputs_path, f'e{epoch}-gen{n}.jpg')
        # save_strokes_as_img(sequence, output_fp)

        # Save original as well
        output_fp = os.path.join(outputs_path, f'e{epoch}-gt{n}.jpg')
        # First 0 indexes the 0-th batch item (sample_next_state etc. only use
        # the 0-th batch item); last index selects dx (0) / dy (1)
        strokes_x = strokes[:, 0, 0]
        strokes_y = strokes[:, 0, 1]
        strokes_x = np.cumsum(strokes_x.cpu().numpy())
        strokes_y = np.cumsum(strokes_y.cpu().numpy())
        strokes_pen = strokes[:, 0, 3].cpu().numpy()
        strokes_out = np.stack([strokes_x, strokes_y, strokes_pen]).T
        # save_strokes_as_img(strokes_out, output_fp)

        gen_strokes.append(sequence)
        gt_strokes.append(strokes_out)
        gt_texts.append(texts[0])  # 0 because batch size is 1

        n += 1
        if n == n_gens:
            break

    # Save grid drawings
    rowcol_size = 5
    chunk_size = rowcol_size ** 2
    for i in range(0, len(gen_strokes), chunk_size):
        output_fp = os.path.join(outputs_path, f'e{epoch}_gen{i}-{i+chunk_size}.jpg')
        save_multiple_strokes_as_img(gen_strokes[i:i + chunk_size], output_fp)
        output_fp = os.path.join(outputs_path, f'e{epoch}_gt{i}-{i+chunk_size}.jpg')
        save_multiple_strokes_as_img(gt_strokes[i:i + chunk_size], output_fp)

        # Save texts
        output_fp = os.path.join(outputs_path, f'e{epoch}_texts{i}-{i+chunk_size}.json')
        utils.save_file(gt_texts[i:i + chunk_size], output_fp)
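# sample_next_state() is referenced above but not shown. A minimal sketch of
# sketch-rnn-style GMM sampling, assuming pi/mu/sigma/rho are the mixture
# parameters for the 0-th batch item and q holds the three pen-state
# probabilities; the real method likely also applies a temperature. The
# _sketch suffix marks it as hypothetical.
def sample_next_state_sketch(pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy, q):
    # Pick a mixture component, then sample (dx, dy) from its bivariate normal
    pi_np = pi.detach().cpu().numpy().flatten()
    k = np.random.choice(len(pi_np), p=pi_np / pi_np.sum())
    mx, my = float(mu_x.flatten()[k]), float(mu_y.flatten()[k])
    sx, sy = float(sigma_x.flatten()[k]), float(sigma_y.flatten()[k])
    r = float(rho_xy.flatten()[k])
    cov = [[sx * sx, r * sx * sy], [r * sx * sy, sy * sy]]
    dx, dy = np.random.multivariate_normal([mx, my], cov)

    # Sample pen state: 0 = pen down, 1 = pen up, 2 = end of sequence
    q_np = q.detach().cpu().numpy().flatten()
    pen = np.random.choice(3, p=q_np / q_np.sum())
    eos = (pen == 2)

    # Next stroke-5 input: (dx, dy, pen_down, pen_up, eos)
    s = torch.zeros(1, 1, 5)
    s[0, 0, 0], s[0, 0, 1] = dx, dy
    s[0, 0, 2 + pen] = 1
    return nn_utils.move_to_cuda(s), dx, dy, int(pen == 1), eos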
def create_retrieval_set(N=200, instruction='toplevel_s2iprob'):
    """
    Create a retrieval set by selecting N drawings per category.
    Uses generated instruction trees.

    Args:
        N (int): size of retrieval set per category
        instruction (str): method for extracting instruction
    """
    # Walk over instruction trees
    seg_tree_path = BEST_SEG_NDJSON_PATH
    # NOTE: overrides the path above
    seg_tree_path = 'data/quickdraw/segmentations/greedy_parsing/progressionpair/Feb18_2020/strokes_to_instruction/S2IimgsFeb13/'
    for root, dirs, fns in os.walk(seg_tree_path):
        pqueue = []
        category = os.path.basename(root)
        for fn in fns:
            if (fn != 'hp.json') and fn.endswith('json') and ('treant' not in fn):
                fp = os.path.join(root, fn)
                seg_tree = utils.load_file(fp)
                drawing_id = fn.replace('.json', '')
                # drawing_id = fn.replace('.json', '').split('_')[1]  # for progression pair?
                if instruction == 'toplevel_s2iprob':
                    text = seg_tree[0]['text']
                    heapq.heappush(pqueue,
                                   (seg_tree[0]['score'], drawing_id, text, seg_tree))

        # We are in a directory with seg_trees
        if len(pqueue) > 0:
            print(category)

            # Get best instructions
            best = heapq.nlargest(N, pqueue)

            # Load drawings
            cat_drawings = ndjson_drawings(category)
            id_to_idx = {d['key_id']: idx for idx, d in enumerate(cat_drawings)}

            # Save best
            best_out = []
            for score, id, text, seg_tree in best:
                stroke3 = ndjson_to_stroke3(cat_drawings[id_to_idx[id]]['drawing'])
                out = {'score': score, 'id': id, 'text': text, 'stroke3': stroke3}
                best_out.append(out)
            out_fp = RETRIEVAL_SET_PATH / instruction / 'data' / f'{category}.pkl'
            utils.save_file(best_out, out_fp)

            # Save a version with just the non-stroke data for easy viewing
            best_out_no_drawing = []
            for d in best_out:
                best_out_no_drawing.append(
                    {'score': float(d['score']), 'id': d['id'], 'text': d['text']})
            out_fp = RETRIEVAL_SET_PATH / instruction / 'data' / f'{category}_nodrawing.json'
            utils.save_file(best_out_no_drawing, out_fp)

            # Save drawings
            chunk_n = 25
            for i in range(0, N, chunk_n):
                best_chunk = best_out[i:i + chunk_n]
                drawings = []
                for b in best_chunk:
                    # stroke3 format is in x y deltas; save_multiple_strokes_as_img()
                    # expects the actual x y points
                    b['stroke3'][:, 0] = np.cumsum(b['stroke3'][:, 0])
                    b['stroke3'][:, 1] = np.cumsum(b['stroke3'][:, 1])
                    drawings.append(b['stroke3'])
                out_dir = RETRIEVAL_SET_PATH / instruction / 'drawings'
                os.makedirs(out_dir, exist_ok=True)
                out_fp = out_dir / f'{category}_{i}-{i+chunk_n}.jpg'
                save_multiple_strokes_as_img(drawings, out_fp)
def save_annotated_progression_pairs_data():
    """
    Save <category>.pkl files, each a dictionary from id to data.

    Data is a dictionary that contains:
        url: S3 url of progression pair
        annotation: instruction written by MTurker
        ndjson_strokes: drawing in ndjson format (list of subsegments,
            each subsegment is a list of x y points)
        ndjson_start: ndjson_strokes index of start of annotated segment
            - Offset by 1 relative to ndjson_strokes
            - When 0, this is the start of the drawing (before any strokes)
        ndjson_end: ndjson_strokes index of end of annotated segment
            - Offset by 1 relative to ndjson_strokes
        stroke3: drawing in stroke-3 format: numpy array of shape [len, 3] (x, y, pen_up)
        stroke3_start: stroke3 index of start of annotated segment
        stroke3_end: stroke3 index of end of annotated segment
        stroke3_segment: numpy array of shape [len, 3] (x, y, pen_up);
            segment that was annotated (drawing from _start to _end of progression pair)
    """
    os.makedirs(LABELED_PROGRESSION_PAIRS_DATA_PATH, exist_ok=True)

    df = pd.read_csv(ANNOTATED_PROGRESSION_PAIRS_CSV_PATH)
    for cat in df['Input.category'].unique():
        df_cat = df[df['Input.category'] == cat]
        print(cat, len(df_cat))
        id_to_data = defaultdict(dict)

        # Get ndjson stroke data
        drawings = ndjson_drawings(cat)
        id_to_strokes = defaultdict(dict)
        for data in drawings:
            id = data['key_id']
            id_to_strokes[id]['ndjson_strokes'] = data['drawing']
            id_to_strokes[id]['stroke3'] = ndjson_to_stroke3(data['drawing'])

        # Map annotations to strokes
        for i in range(len(df_cat)):
            id = str(df_cat.iloc[i]['Input.id'])
            annotation = df_cat.iloc[i]['Answer.annotation'].replace('\r', '')
            ndjson_start = df_cat.iloc[i]['Input.start']
            ndjson_end = df_cat.iloc[i]['Input.end']
            url = df_cat.iloc[i]['Input.url']

            id_to_data[id]['ndjson_start'] = int(ndjson_start)
            id_to_data[id]['ndjson_end'] = int(ndjson_end)
            id_to_data[id]['url'] = url
            id_to_data[id]['annotation'] = annotation

            stroke3 = id_to_strokes[id]['stroke3']
            id_to_data[id]['ndjson_strokes'] = id_to_strokes[id]['ndjson_strokes']
            id_to_data[id]['stroke3'] = stroke3

            # Save portion of stroke3 corresponding to start and end
            pen_up = np.where(id_to_strokes[id]['stroke3'][:, 2] == 1)[0].tolist()
            # Insert 0 so the indexing lines up (ndjson_start == 0 means the
            # beginning of the drawing)
            pen_up.insert(0, 0)
            stroke3_start = 0 if (ndjson_start == 0) else (pen_up[ndjson_start] + 1)
            stroke3_end = pen_up[ndjson_end]
            id_to_data[id]['stroke3_start'] = stroke3_start
            id_to_data[id]['stroke3_end'] = stroke3_end
            id_to_data[id]['stroke3_segment'] = stroke3[stroke3_start:stroke3_end + 1, :]

        # Flatten
        result = []
        for id, data in id_to_data.items():
            data['id'] = id
            data['category'] = cat
            result.append(data)

        # Save
        out_fn = f'{cat}.pkl'
        out_fp = LABELED_PROGRESSION_PAIRS_DATA_PATH / out_fn
        utils.save_file(result, out_fp)
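# ndjson_to_stroke3() is used above but defined elsewhere. A minimal sketch of
# the conversion this code assumes, where an ndjson drawing is a list of strokes
# [[x0, x1, ...], [y0, y1, ...]] and stroke-3 is a [len, 3] array of
# (dx, dy, pen_up) with pen_up = 1 on the last point of each stroke. The real
# helper may also normalize or rescale; the _sketch suffix marks it as
# hypothetical.
def ndjson_to_stroke3_sketch(ndjson_strokes):
    points = []
    for stroke in ndjson_strokes:
        xs, ys = stroke[0], stroke[1]
        for j in range(len(xs)):
            pen_up = 1 if (j == len(xs) - 1) else 0
            points.append([xs[j], ys[j], pen_up])
    arr = np.array(points, dtype=np.float32)
    arr[1:, 0:2] = np.diff(arr[:, 0:2], axis=0)  # absolute points -> deltas
    arr[0, 0:2] = 0  # first point is the origin
    return arr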
def save_annotated_precurrentpost_data():
    """
    Combine precurrentpost images (generated from
    save_drawings_split_into_precurrentpost()) with annotations.
    Preprocesses for use with a data loader.

    Save <category>.pkl files, each a dictionary from id to data.

    Data is a dictionary that contains:
        url: S3 url of progression pair
        annotation: instruction written by MTurker
        ndjson_strokes: drawing in ndjson format (list of subsegments,
            each subsegment is a list of x y points)
        ndjson_start: ndjson_strokes index of start of annotated segment
            - Offset by 1 relative to ndjson_strokes
            - When 0, this is the start of the drawing (before any strokes)
        ndjson_end: ndjson_strokes index of end of annotated segment
            - Offset by 1 relative to ndjson_strokes
        pre_seg_fp: filepath to image of strokes before annotated segment
        annotated_seg_fp: filepath to image of annotated segment
        post_seg_fp: filepath to image of strokes after annotated segment
        full_fp: filepath to image of full drawing
    """
    os.makedirs(PRECURRENTPOST_DATAWITHANNOTATIONS_PATH, exist_ok=True)

    df = pd.read_csv(ANNOTATED_PROGRESSION_PAIRS_CSV_PATH)
    for cat in df['Input.category'].unique():
        df_cat = df[df['Input.category'] == cat]
        print(cat, len(df_cat))
        id_to_data = defaultdict(dict)

        # Map annotations to strokes
        for i in range(len(df_cat)):
            id = str(df_cat.iloc[i]['Input.id'])
            annotation = df_cat.iloc[i]['Answer.annotation'].replace('\r', '')
            ndjson_start = df_cat.iloc[i]['Input.start']
            ndjson_end = df_cat.iloc[i]['Input.end']
            url = df_cat.iloc[i]['Input.url']
            n_segments = df_cat.iloc[i]['Input.n_segments']

            id_to_data[id]['ndjson_start'] = int(ndjson_start)
            id_to_data[id]['ndjson_end'] = int(ndjson_end)
            id_to_data[id]['url'] = url
            id_to_data[id]['annotation'] = annotation
            id_to_data[id]['pre_seg_fp'] = str(
                PRECURRENTPOST_DATA_PATH / cat / id / f'0-{ndjson_start}.jpg')
            id_to_data[id]['annotated_seg_fp'] = str(
                PRECURRENTPOST_DATA_PATH / cat / id / f'{ndjson_start}-{ndjson_end}.jpg')
            id_to_data[id]['start_to_annotated_fp'] = str(
                PRECURRENTPOST_DATA_PATH / cat / id / f'0-{ndjson_end}.jpg')
            id_to_data[id]['post_seg_fp'] = str(
                PRECURRENTPOST_DATA_PATH / cat / id / f'{ndjson_end}-{n_segments}.jpg')
            id_to_data[id]['full_fp'] = str(
                PRECURRENTPOST_DATA_PATH / cat / id / 'full.jpg')

        # Flatten
        result = []
        for id, data in id_to_data.items():
            data['id'] = id
            data['category'] = cat
            result.append(data)

        # Save
        out_fn = f'{cat}.pkl'
        out_fp = PRECURRENTPOST_DATAWITHANNOTATIONS_PATH / out_fn
        utils.save_file(result, out_fp)
import argparse

from src.Criterion import Criterion
from src.utils import load_file, save_file

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", help="path to input (.bin) file", required=True)
    parser.add_argument("-t", help="path to target (.bin) file", required=True)
    parser.add_argument("-ig", help="path to gradInput (.bin) file", required=True)
    args = parser.parse_args()

    inp = load_file(args.i)
    target = load_file(args.t)

    ce_loss = Criterion()
    loss = ce_loss.forward(inp, target)
    print(loss)

    gradInput = ce_loss.backward(inp, target)
    save_file(gradInput, args.ig)
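# Criterion comes from src and is not shown. A minimal sketch of a cross-entropy
# criterion with the forward/backward interface used above, assuming inp is a
# [batch, classes] score matrix and target holds integer class labels; the
# actual src.Criterion may differ.
import numpy as np

class CrossEntropyCriterionSketch:
    def forward(self, inp, target):
        # Softmax with max-subtraction for numerical stability
        shifted = inp - inp.max(axis=1, keepdims=True)
        exp = np.exp(shifted)
        self.probs = exp / exp.sum(axis=1, keepdims=True)
        idx = (np.arange(len(target)), np.asarray(target, dtype=int))
        return -np.log(self.probs[idx]).mean()

    def backward(self, inp, target):
        # d(loss)/d(inp) = (softmax(inp) - one_hot(target)) / batch_size
        grad = self.probs.copy()
        grad[np.arange(len(target)), np.asarray(target, dtype=int)] -= 1.0
        return grad / len(target)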
parser = add_generation_args(parser)
args = parser.parse_args()
set_seed_for_gen(args)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load fine-tuned model and tokenizer
model_fp = 'trained_models/school_reviews/gpt2/wandb/model_e0.pkl'
tokenizer_fp = 'trained_models/school_reviews/gpt2/wandb/tokenizer.pkl'

print('Loading')
model = load_file(model_fp)
tokenizer = load_file(tokenizer_fp)
gpt2 = GPT2Wrapper(args, model=model, tokenizer=tokenizer)
gpt2 = gpt2.to(device)
print('Loaded')

OUT_FP = 'data/school_reviews/train_detector/trainedonallreviews_gpt2-xl_e0.json'
texts = []
for i in range(5000):
    text = gpt2.generate_unconditional(n=1, bsz=1, stdout=True)[0]
    texts.append(text)
    if i % 10 == 0:
        save_file(texts, OUT_FP)
def save(self, file_path):
    model_data = [x.as_dict() for x in self.Layers]
    save_file(model_data, file_path)
def calc_stats_for_runs_in_dir(dir, best_n=10):
    """
    Print runs with best stats in <dir>.
    Assumes each run has a model file with the name: 'e<epoch>_loss<loss>.pt'.

    Args:
        dir (str)
        best_n (int)
    """
    print(f'Looking in: {dir}\n')

    n = 0
    runs_stats = []
    hiplot_stats = []
    for root, dirs, fns in os.walk(dir):
        for fn in fns:
            # Get loss from model fn (e<epoch>_loss<loss>.pt)
            if fn.endswith('pt') and ('loss' in fn):
                # Get best samples
                epoch = fn.split('_')[0].replace('e', '')
                loss = float(fn.split('loss')[1].strip('.pt'))
                run = root.replace(dir + '/', '')
                best_sample_fp = os.path.join(root, 'outputs', f'samples_e{epoch}.json')

                # Calculate stats
                m2scores, m2cat2scores = calc_bleu_and_rouge_on_samples(best_sample_fp, print=False)
                gt_toks, gen_toks = calc_rare_words_stats(best_sample_fp, print=False)
                run_results = {
                    'n_gen_toks': len(gen_toks),
                    'n_gt_toks': len(gt_toks),
                    'loss': loss,
                    'rouge1': np.mean(m2scores['rouge1']),
                    'rouge2': np.mean(m2scores['rouge2']),
                    'rougeL': np.mean(m2scores['rougeL']),
                    'bleu1': np.mean(m2scores['bleu1']),
                    'bleu2': np.mean(m2scores['bleu2']),
                }
                runs_stats.append([run, run_results])

                # Save json data to be visualized by hiplot
                hp_dict = utils.load_file(os.path.join(root, 'hp.json'))
                run_hiplot = dict(hp_dict)
                run_hiplot.update(run_results)
                hiplot_stats.append(run_hiplot)

                n += 1

    # Write best runs sorted to file
    out_fp = os.path.join(dir, 'best_runs.txt')
    with open(out_fp, 'w') as f:
        print('-' * 100)
        for main_stat in runs_stats[0][1].keys():  # n_gen_toks, loss, rougeL, bleu1, bleu2, ...
            print(f'RUNS WITH BEST: {main_stat}', file=f)
            if main_stat == 'loss':  # lower is better
                sorted_by_main_stat = sorted(runs_stats, key=lambda x: -x[1][main_stat])[-best_n:]
            else:  # higher is better
                sorted_by_main_stat = sorted(runs_stats, key=lambda x: x[1][main_stat])[-best_n:]
            for run, stats in sorted_by_main_stat:
                main_stat_val = stats[main_stat]
                other_stats_str = ', '.join([
                    '{}: {:.4f}'.format(stat, val)
                    for stat, val in stats.items() if (main_stat != stat)
                ])
                out_str = '{}: {:.4f}'.format(main_stat, main_stat_val)
                print(out_str + ', ' + other_stats_str + ', run: ' + run, file=f)
            print(file=f)

    # Print to stdout
    for line in open(out_fp, 'r').readlines():
        print(line.strip())
    print('\nWrote best runs sorted to: ', out_fp)

    # Save hiplot data
    out_fp = os.path.join(dir, 'hiplot_data.json')
    utils.save_file(hiplot_stats, out_fp, verbose=True)
parser.add_argument("-ig", help="path to gradInput(.bin) file", required=True) args = parser.parse_args() model = create_model(args.config) inp = load_file(args.i) num_input_nodes = np.prod(inp.shape[1:]) inp = inp.reshape(-1, (num_input_nodes)) out = model.forward(inp) model.clearGradParam() gradOutput = load_file(args.og) model.backward(inp, gradOutput) # save output save_file(out, args.o) # save gradW and gradB gradW, gradB = model.getGradParam() save_file(gradW, args.ow) save_file(gradB, args.ob) # save gradInput save_file(model.Layers[0].gradInput, args.ig) model.save("outputs/model2.bin")
    os.makedirs(out + '/mismatch/hex')

if not os.path.isdir(out + '/illegal'):
    os.makedirs(out + '/illegal')
    os.makedirs(out + '/illegal/sim_input')
    os.makedirs(out + '/illegal/elf')
    os.makedirs(out + '/illegal/asm')
    os.makedirs(out + '/illegal/hex')

if not os.path.isdir(out + '/corpus'):
    os.makedirs(out + '/corpus')

date = datetime.today().strftime('%Y%m%d')
cov_log = out + '/cov_log_{}.txt'.format(date)
if (multicore or record) and not os.path.isfile(cov_log):
    save_file(cov_log, 'w',
              '{:<10}\t{:<10}\t{:<10}\n'.format('time', 'iter', 'coverage'))

start_time = time.time()
if not multicore:
    if minimize:
        factory = TestFactory(Minimize)
        factory.add_option('toplevel', [toplevel])
        factory.add_option('template', [template])
        factory.add_option('out', [out])
        factory.add_option('debug', [debug])
    else:
        factory = TestFactory(Run)
        parser.register_option(factory)
        factory.add_option('cov_log', [cov_log])