def create_model(config_file):
    """Build a ``Model`` from a plain-text configuration file.

    The file is consumed line by line: the first line holds the number of
    layers, then comes one space-separated line per layer (its first field
    is compared against the ``LINEAR`` and ``RELU`` constants; a linear
    layer also carries its input and output sizes), followed by one line
    naming the weight file and one line naming the bias file.

    Args:
        config_file: path to the configuration file.

    Returns:
        The assembled model, with ``W`` and ``B`` of every ``Linear`` layer
        set from the loaded weight/bias collections, in layer order.
    """
    model = Model()
    with open(config_file, 'r') as cfg:
        layer_count = int(cfg.readline().strip())
        for _ in range(layer_count):
            fields = cfg.readline().strip().split(' ')
            kind = fields[0]
            if kind == LINEAR:
                model.addLayer(Linear(int(fields[1]), int(fields[2])))
            elif kind == RELU:
                model.addLayer(ReLU())
        weight_path = cfg.readline().strip()
        bias_path = cfg.readline().strip()

    weights = load_file(weight_path)
    biases = load_file(bias_path)

    # The k-th Linear layer receives the k-th entry of each collection.
    linear_layers = [
        layer for layer in model.Layers if isinstance(layer, Linear)
    ]
    for position, layer in enumerate(linear_layers):
        layer.W = weights[position]
        layer.B = biases[position]
    return model
def __iter__(self):
    """Yield batches of tokenised document pairs.

    Base names are read from ``self.filelist_path`` (optionally shuffled).
    For each suffix 1..20 and each base name, the file
    ``<base>_<suffix>`` under ``self.path`` is loaded; pairs whose first
    two entries are equal are skipped. Every sentence of every entry is
    split on whitespace.

    Yields:
        ``(batch, batch_length, batch_fname)`` each time ``batch_size``
        items have accumulated. ``batch_length`` holds the number of
        sentences in the first entry of each item. A trailing partial
        batch is not yielded.
    """
    with open(self.filelist_path, 'r') as listing:
        base_names = [entry.strip() for entry in listing.readlines()]
    if self.shuffle == True:
        random.shuffle(base_names)

    docs, names, lengths = [], [], []
    for suffix in range(1, 21):
        for base in base_names:
            fname = base + "_" + str(suffix)
            pair = load_file(os.path.join(self.path, fname), self.file_type)

            # Identical first and second entries (same permutation): skip.
            if pair[0] == pair[1]:
                continue

            # Index 0 is the positive document, index 1 the negative one.
            for position, document in enumerate(pair):
                pair[position] = [sentence.split() for sentence in document]

            docs.append(pair)
            lengths.append(len(pair[0]))
            names.append(fname)

            if len(docs) == self.batch_size:
                yield docs, lengths, names
                # Start fresh containers for the next batch.
                docs, names, lengths = [], [], []
def calc_rare_words_stats(samples_fp, print=True):
    """Collect the unique tokens of reference and generated instructions.

    Args:
        samples_fp: str, path to samples.json. Each sample must provide
            the keys 'ground_truth' and 'generated'.
        print: when truthy, a short summary is written to stdout.

    Returns:
        (gt_toks, gen_toks): sets of normalised tokens found in the
        ground-truth and in the generated instructions respectively.
    """
    # The `print` parameter shadows the builtin inside this function, so
    # calling `print(...)` with the default `print=True` would try to call
    # a bool. Reach the real function through the builtins module instead.
    import builtins

    samples = utils.load_file(samples_fp)
    gt_toks = set()
    gen_toks = set()
    for sample in samples:
        gt, gen = sample['ground_truth'], sample['generated']
        gt_toks.update(utils.normalize_sentence(gt))
        gen_toks.update(utils.normalize_sentence(gen))

    if print:
        builtins.print('\nRare words stats:')
        builtins.print('Number of unique tokens in reference instructions: ',
                       len(gt_toks))
        builtins.print('Number of unique tokens in generated instructions: ',
                       len(gen_toks))
    return gt_toks, gen_toks
def __iter__(self):
    """Yield batches of documents together with their integer labels.

    File names come from ``self.filelist_path`` and labels (one integer
    per line, aligned with the file list by position) from ``self.label``.
    Names whose file does not exist under ``self.path`` are skipped.
    Shuffling is currently disabled.

    For each loaded file, entry 0 is joined with spaces, cut at every
    '<eos>' marker, and each piece is re-terminated with ' <eos>' and
    split on whitespace; the piece after the last marker is discarded.

    Yields:
        ``(batch, batch_length, batch_fname, label)`` each time
        ``batch_size`` items have accumulated. A trailing partial batch is
        not yielded.
    """
    with open(self.filelist_path, 'r') as listing:
        names = [row.strip() for row in listing.readlines()]
    with open(self.label, 'r') as label_file:
        all_labels = [int(row.strip()) for row in label_file.readlines()]

    docs, doc_names, doc_lengths, doc_labels = [], [], [], []
    for position, fname in enumerate(names):
        loadpath = os.path.join(self.path, fname)
        if not os.path.exists(loadpath):
            continue

        document = load_file(loadpath, self.file_type)
        joined = ' '.join(document[0])
        document[0] = [
            (piece + ' <eos>').split() for piece in joined.split('<eos>')
        ][:-1]

        docs.append(document)
        doc_lengths.append(len(document[0]))
        doc_names.append(fname)
        doc_labels.append(all_labels[position])

        if len(docs) == self.batch_size:
            yield docs, doc_lengths, doc_names, doc_labels
            # Start fresh containers for the next batch.
            docs, doc_names, doc_lengths, doc_labels = [], [], [], []
def __init__(self, hp, save_dir):
    """Set up the text embedding, instruction encoder and stroke decoder.

    The token embedding is initialised from a previously trained
    strokes-to-instruction checkpoint, so the encoder dimension is taken
    from that checkpoint rather than from ``hp``.

    Args:
        hp: hyperparameter object (reads enc_num_layers, dropout,
            cond_instructions, dec_dim, M, lr).
        save_dir: forwarded to the parent constructor.
    """
    super().__init__(hp, save_dir)
    self.end_epoch_loader = None  # TODO: not generating yet, need to refactor that

    # --- Text embedding, loaded from a trained checkpoint ---
    # TODO: move this into some config file
    idx2token = utils.load_file(LABELED_PROGRESSION_PAIRS_IDX2TOKEN_PATH)
    vocab_size = len(idx2token)
    strokes_to_instruction_fp = 'best_models/strokes_to_instruction/catsdecoder-dim_512-model_type_cnn_lstm-use_prestrokes_False/model.pt'
    checkpoint = torch.load(strokes_to_instruction_fp)
    pretrained_embedding = checkpoint['token_embedding.weight']
    # The embedding must match the size of the loaded weights.
    enc_dim = pretrained_embedding.size(1)
    self.text_embedding = nn.Embedding(vocab_size, enc_dim)
    self.text_embedding.weight = nn.Parameter(pretrained_embedding)

    # --- Encoder ---
    # TODO: should use_categories be a hparam
    self.enc = InstructionEncoderTransformer(
        enc_dim, hp.enc_num_layers, hp.dropout, use_categories=False)

    # --- Decoder ---
    # With 'initdec' conditioning the decoder input is 5-dimensional;
    # otherwise the encoding (enc_dim) is added to that width.
    if hp.cond_instructions == 'initdec':
        dec_input_dim = 5
    else:
        dec_input_dim = 5 + enc_dim
    # Method 1 (see one_forward_pass, i.e. decinputs)
    self.dec = SketchRNNDecoderGMM(dec_input_dim, hp.dec_dim, hp.M)

    self.models.extend([self.text_embedding, self.enc, self.dec])
    if USE_CUDA:
        for module in self.models:
            module.cuda()

    self.optimizers.append(optim.Adam(self.parameters(), hp.lr))
def load(self, file_path):
    """Append layers described in a saved model file.

    The loaded data is iterated as a sequence of layer descriptions. A
    description with type "Linear" supplies "num_inputs", "num_outputs",
    "W" and "B"; type "ReLU" needs nothing else. Other types are ignored.

    Args:
        file_path: location of the saved model data.
    """
    for spec in load_file(file_path):
        kind = spec["type"]
        if kind == "Linear":
            linear = Linear(spec["num_inputs"], spec["num_outputs"],
                            spec["W"], spec["B"])
            self.addLayer(linear)
        elif kind == "ReLU":
            self.addLayer(ReLU())
def __init__(self, args):
    '''
    Load the vocabulary and build the token -> id lookup.

    args.vocab_path = the vocabulary file (JSON). It should contain
    <unk>, <bos>, <eos>; a <pad> entry, if present, is removed before
    ids are assigned.
    '''
    self.vocabs = utils.load_file(args.vocab_path, file_type='json')
    if '<pad>' in self.vocabs:
        self.vocabs.remove('<pad>')

    lookup = {}
    for index, token in enumerate(self.vocabs):
        lookup[token] = index
    self._word_to_id = lookup

    self._bos = lookup['<bos>']
    self._eos = lookup['<eos>']
def analyze_school_reviews():
    """Break down detector correctness on school reviews by demographics.

    Reads the school-review table and the OpenAI-detector results, marks a
    review as correctly classified when its 'real' score exceeds the
    threshold, and prints the mean correctness grouped by 'urbanicity' and
    by two above-average indicators ('singleparent_share2010', 'perwht').
    Ends in a ``breakpoint()`` for interactive inspection.
    """
    df = pd.read_csv(SCHOOL_REVIEWS_DATA)

    ###### OpenAI detector on ets / school reviews
    data = load_file('outputs/openai_detector/school_reviews/results.json')
    # The 'correct' value stored in the results is based on a 0.5
    # threshold. Set a new threshold here.
    threshold = 0.5
    print(threshold)
    correctness = {fn: d['real'] > threshold for fn, d in data.items()}
    ######

    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    df['correct'] = np.nan
    df.set_index('url', inplace=True)
    for fn, correct in correctness.items():
        # DataFrame.set_value was removed in pandas 1.0; .at is its
        # documented replacement for label-based scalar assignment.
        df.at[fn, 'correct'] = 1 if correct else 0
    # Keep only rows that received a detector result. Copy so the column
    # assignments below act on an independent frame.
    df = df[df.correct.notnull()].copy()

    # create some categorical (binary) buckets out of continuous variables
    df['singleparent_share2010_aboveavg'] = (
        df.singleparent_share2010 > df.singleparent_share2010.mean())
    df['perwht_aboveavg'] = df.perwht > df.perwht.mean()

    for column in ('urbanicity', 'singleparent_share2010_aboveavg',
                   'perwht_aboveavg'):
        print('-' * 100)
        print('Mean')
        print(df.groupby(column).correct.mean())

    breakpoint()
def __init__(self, split, gen_method):
    """Load the prepared real/generated text data for one split.

    Args:
        split: split name; for 'train' the data is shuffled and cut to
            its first 5000 items.
        gen_method: generation method name, used in the file name
            '<gen_method>_<split>.pkl'.
    """
    super().__init__()
    self.split = split

    data_path = PREPPED_REALGEN_TEXT_PATH / f'{gen_method}_{split}.pkl'
    self.data = load_file(data_path)

    if split != 'train':
        return

    # Training uses a random subset of at most 5000 items.
    import random
    random.shuffle(self.data)
    self.data = self.data[:5000]
def load_hp(hp_obj, dir):
    """
    Overwrite attributes of an HParams object with saved values.

    Args:
        hp_obj: existing HParams object (modified in place)
        dir: directory containing the saved hyperparameters as 'hp.json'

    Returns:
        the same hp_obj, updated
    """
    saved_path = os.path.join(dir, "hp.json")
    for name, value in load_file(saved_path).items():
        setattr(hp_obj, name, value)
    return hp_obj
def score_segtree_on_parent_child_splits(seg_dir, prob_threshold):
    """Score how well each parent's text matches its children's texts.

    Walks seg_dir for segmentation JSON files (skipping 'hp.json' and
    anything with 'treant' in its name), prunes each tree with
    prob_threshold, scores every parent text against the space-joined
    texts of its children with BLEU and ROUGE, and prints the mean and
    standard deviation of the per-tree averages.

    Args:
        seg_dir (str): directory searched recursively
        prob_threshold: forwarded to prune_seg_tree
    """

    def map_parents_to_children(seg_tree):
        """seg_tree is list of dicts; returns (id -> node, parent id -> child ids)"""
        nodes_by_id = {}
        children_of = defaultdict(list)
        for node in seg_tree:
            node_id = node['id']
            parent_id = node['parent']
            nodes_by_id[node_id] = node
            # An empty parent marks the root node, which is nobody's child.
            if parent_id != '':
                children_of[parent_id].append(node_id)
        return nodes_by_id, children_of

    def calc_seg_score(id_to_node, parid_to_childids, scorers):
        """Average every metric over all parent/children splits of one tree."""
        collected = defaultdict(list)
        for parid, childids in parid_to_childids.items():
            par_text = id_to_node[parid]['text']
            child_texts = [id_to_node[childid]['text'] for childid in childids]
            child_text_concat = ' '.join(child_texts)
            for scorer in scorers:
                result = scorer.score(par_text, child_text_concat)
                for metric, value in result.items():
                    collected[metric].append(value)
        return {metric: np.mean(values) for metric, values in collected.items()}

    scorers = [InstructionScorer('bleu'), InstructionScorer('rouge')]
    metric2allscores = defaultdict(list)

    for root, _dirs, fns in os.walk(seg_dir):
        for fn in fns:
            if fn == 'hp.json' or not fn.endswith('json') or 'treant' in fn:
                continue
            seg_tree = utils.load_file(os.path.join(root, fn))
            seg_tree = prune_seg_tree(seg_tree, prob_threshold)

            # calculate score for this tree
            id_to_node, parid_to_childids = map_parents_to_children(seg_tree)
            tree_scores = calc_seg_score(id_to_node, parid_to_childids, scorers)
            for metric, score in tree_scores.items():
                metric2allscores[metric].append(score)

    metric2allscores_mean = {}
    metric2allscores_std = {}
    for metric, scores in metric2allscores.items():
        metric2allscores_mean[metric] = np.mean(scores)
        metric2allscores_std[metric] = np.std(scores)

    print('-' * 100)
    print(f'Scores for: {seg_dir}')
    print('Mean:')
    pprint(metric2allscores_mean)
    print()
    print('Std:')
    pprint(metric2allscores_std)
def __iter__(self):
    """Yield batches built from every file in ``self.path``.

    Directory entries are optionally shuffled, loaded one by one and
    grouped.

    Yields:
        ``(batch, batch_length, batch_fname)`` each time ``batch_size``
        items have accumulated. ``batch_length`` holds the length of the
        first entry of each loaded item. A trailing partial batch is not
        yielded.
    """
    entries = os.listdir(self.path)
    if self.shuffle == True:
        random.shuffle(entries)

    loaded, lengths, names = [], [], []
    for fname in entries:
        item = load_file(os.path.join(self.path, fname), self.file_type)
        loaded.append(item)
        names.append(fname)
        lengths.append(len(item[0]))

        if len(loaded) == self.batch_size:
            yield loaded, lengths, names
            # Start fresh containers for the next batch.
            loaded, lengths, names = [], [], []
def load_model(self, dir):
    """
    Restore hyperparameters and trained weights from a model directory.

    Args:
        dir: str (location of trained model; must contain 'hp.json' and
            'model.pt')
    """
    # Hyperparameters used to train the model overwrite the current ones.
    # TODO: we may want to change certain hyperparams at inference time
    # (e.g. decode_method). Also want the updated values saved next to
    # inference/train.json....
    hp_path = os.path.join(dir, 'hp.json')
    for key, value in utils.load_file(hp_path).items():
        setattr(self.hp, key, value)

    # Trained weights
    weights_fp = os.path.join(dir, 'model.pt')
    print('Loading model weights from: ', weights_fp)
    state = torch.load(weights_fp)
    self.load_state_dict(state)
def __init__(self, hp, save_dir):
    """
    Load the trained models needed for segmentation.

    Always loads the strokes-to-instruction model; loads the
    instruction-to-strokes model and the spaCy pipeline only when the
    corresponding hyperparameters ask for them.

    Args:
        hp: HParams object
        save_dir: str
    """
    self.hp = hp
    self.save_dir = save_dir

    # --- strokes -> instruction ---
    # Start from the hp the model was trained with ...
    self.s2i_hp = experiments.load_hp(copy.deepcopy(hp),
                                      hp.strokes_to_instruction_dir)
    # ... and, for backwards compatibility, fill in any hparams that were
    # added since the model was trained.
    for name, default in vars(s2i_default_hparams()).items():
        if hasattr(self.s2i_hp, name):
            continue
        setattr(self.s2i_hp, name, default)
    self.s2i_hp.drawing_type = 'stroke'  # TODO: this should be image if we switch to the images model

    # save_dir=None means inference mode
    self.strokes_to_instruction = StrokesToInstructionModel(self.s2i_hp,
                                                            save_dir=None)
    self.strokes_to_instruction.load_model(hp.strokes_to_instruction_dir)
    self.strokes_to_instruction.cuda()

    # --- instruction -> strokes (optional) ---
    needs_i2s = (hp.split_scorer == 'instruction_to_strokes') or (
        hp.score_childinst_parstroke)
    if needs_i2s:
        # TODO: should do same backwards compatibility as above
        self.i2s_hp = experiments.load_hp(copy.deepcopy(hp),
                                          hp.instruction_to_strokes_dir)
        self.instruction_to_strokes = InstructionToStrokesModel(
            self.i2s_hp, save_dir=None)
        # TODO: change param for load_model
        self.instruction_to_strokes.load_model(hp.instruction_to_strokes_dir)
        self.instruction_to_strokes.cuda()

    # --- text similarity (optional) ---
    if hp.score_parent_child_text_sim:
        spacy.prefer_gpu()
        self.nlp = spacy.load('en_core_web_md')

    # TODO: this should be probably be contained in some model...
    self.token2idx = utils.load_file(LABELED_PROGRESSION_PAIRS_TOKEN2IDX_PATH)
def convert_all_segmentations_to_treants(seg_dir, prob_threshold):
    """
    Recursively walk through a directory and find instruction trees (i.e.
    segmentations) generated by src/model/segmentation.py. Each one is
    pruned and saved next to the original as '<name>_treant.js'.

    Files named 'hp.json', files not ending in 'json', and files with
    'treant' in their name are ignored.

    Args:
        seg_dir (str): directory to search
        prob_threshold: forwarded to prune_seg_tree
    """
    for root, _dirs, fns in os.walk(seg_dir):
        for fn in fns:
            if fn == 'hp.json' or not fn.endswith('json') or 'treant' in fn:
                continue

            fp = os.path.join(root, fn)
            seg_tree = utils.load_file(fp)
            pruned_seg_tree = prune_seg_tree(seg_tree, prob_threshold)

            n_og = len(seg_tree)
            n_pruned = len(pruned_seg_tree)
            print(f'N segments before vs. after pruning: {n_og}, {n_pruned}')

            # TODO: save prob_threshold in filename?
            out_fp = fp.replace('.json', '_treant.js')
            save_segmentation_in_treant_format(pruned_seg_tree, out_fp)
def calc_bleu_and_rouge_on_samples(samples_fp, print=True):
    """
    Score generated instructions against ground truth with BLEU and ROUGE.

    Args:
        samples_fp: str to samples.json. Each sample must provide the keys
            'url', 'ground_truth' and 'generated'.
        print: when truthy, per-category and overall averages are written
            to stdout.

    Returns:
        (m2scores, m2cat2scores): metric name -> list of scores, and
        metric name -> category -> list of scores.
    """
    # The `print` parameter shadows the builtin inside this function, so
    # calling `print(...)` with the default `print=True` would try to call
    # a bool. Reach the real function through the builtins module instead.
    import builtins

    samples = utils.load_file(samples_fp)

    scorers = [InstructionScorer('bleu'), InstructionScorer('rouge')]

    m2scores = defaultdict(list)
    m2cat2scores = defaultdict(lambda: defaultdict(list))
    for sample in samples:
        # The category was not saved in earlier runs, so it is recovered
        # from the URL instead of sample['category'].
        cat = sample['url'].split('fullinput/')[1].split('/progress')[0]
        gt, gen = sample['ground_truth'], sample['generated']
        for scorer in scorers:
            for name, value in scorer.score(gt, gen).items():
                m2scores[name].append(value)
                m2cat2scores[name][cat].append(value)

    if print:
        builtins.print('\nROUGE and BLEU:')
        builtins.print('\nAverage per category:')
        for metric_name, cat2scores in m2cat2scores.items():
            builtins.print('-' * 50)
            builtins.print(metric_name)
            cat2avgs = {k: np.mean(v) for k, v in cat2scores.items()}
            pprint(sorted(cat2avgs.items(), key=lambda x: x[1]))
        builtins.print('Average:')
        pprint({
            metric_name: np.mean(vals)
            for metric_name, vals in m2scores.items()
        })

    return m2scores, m2cat2scores
def send_email(subject, text):
    """Send a plain-text email through Gmail's SMTP server (best effort).

    Credentials and recipients are read from '.email_config.json', which
    must provide the keys 'email', 'password' and 'email_to'. Failures are
    reported on stdout and never raised to the caller.

    Args:
        subject: subject line of the message.
        text: message body.
    """
    email_data = load_file('.email_config.json')
    sending_address = email_data['email']
    sending_password = email_data['password']
    to_addr_list = email_data['email_to']

    # Render several recipients as a comma-separated header rather than
    # as the repr of a Python list.
    if isinstance(to_addr_list, str):
        to_header = to_addr_list
    else:
        to_header = ', '.join(to_addr_list)

    body = '\r\n'.join([
        'From: {}'.format(sending_address),
        'To: {}'.format(to_header),
        'Subject: {}'.format(subject),
        '',
        text,
    ])

    try:
        # Port 587 is the mail submission port; the plain connection is
        # upgraded with STARTTLS below. The context manager guarantees the
        # connection is closed even when login or sending fails.
        with smtplib.SMTP('smtp.gmail.com', 587) as server:
            server.ehlo()
            server.starttls()
            server.login(sending_address, sending_password)
            server.sendmail(sending_address, to_addr_list, body)
        print('Email sent successfully!')
    except Exception as e:
        # Deliberately broad: notification emails must never take down
        # the calling job.
        print('Email failed to send!')
        print(str(e))
def __init__(self, split, gen_method=None, max_len=192):
    """Prepare human and generated school reviews for one split.

    Human reviews come from the 'review_text' column of the school-review
    table and generated reviews from the detector training file. Both
    lists are cut to the same index window: 'train' -> [0, 5000),
    'valid' -> [5000, 6000), 'test' -> [6000, 7000). Any other split name
    keeps everything. Human reviews come first in ``self.data``.

    Args:
        split: name of the split.
        gen_method: accepted but not used here.
        max_len: forwarded to tokenize_and_prep.
    """
    super().__init__()
    self.split = split
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.max_len = max_len

    # Load data
    human_revs = pd.read_csv(SCHOOL_REVIEWS_DATA).review_text.tolist()
    gen_revs = load_file(SCHOOL_REVIEWS_TRAIN_DETECTOR_PATH)

    # Get data for this split
    if split == 'train':
        window = slice(None, 5000)
    elif split == 'valid':
        window = slice(5000, 6000)
    elif split == 'test':
        window = slice(6000, 7000)
    else:
        window = None
    if window is not None:
        human_revs = human_revs[window]
        gen_revs = gen_revs[window]

    # Prep data
    self.data = []
    for rev in human_revs + gen_revs:
        text_trunc, text_len, token_ids_padded = tokenize_and_prep(
            self.tokenizer, rev, max_len)
        self.data.append({
            'review': rev,
            'text_trunc': text_trunc,
            'text_len': text_len,
            'token_ids_padded': token_ids_padded,
        })
parser.add_argument('--inference_vaez', action='store_true') parser.add_argument('--load_model_path', help='path to directory containing model to load for inference') opt = parser.parse_args() nn_utils.setup_seeds() save_dir = os.path.join(RUNS_PATH, 'sketchrnn', datetime.today().strftime('%b%d_%Y'), opt.groupname, run_name) # If inference, load hparams if opt.inference or opt.inference_vaez: temp = hp.temperature # store this because we will vary at inference max_per_category = hp.max_per_category categories = hp.categories batch_size = hp.batch_size # overwrite hparmas with original hparmas orig_hp = utils.load_file(os.path.join(opt.load_model_path, 'hp.json')) # dict for k, v in orig_hp.items(): if k not in ['temperature', 'categories', 'max_per_category', 'batch_size']: setattr(hp, k, v) else: experiments.save_run_data(save_dir, hp) model = None skip_data = opt.inference or opt.inference_vaez if hp.model_type == 'vae': model = SketchRNNVAEModel(hp, save_dir, skip_data=skip_data) elif hp.model_type == 'decodergmm': model = SketchRNNDecoderGMMOnlyModel(hp, save_dir, skip_data=skip_data) elif hp.model_type == 'decoderlstm': model = SketchRNNDecoderLSTMOnlyModel(hp, save_dir, skip_data=skip_data)
def analyze_ets():
    """Relate detector correctness on ETS essays to score level and language.

    Reads the ETS index table and the OpenAI-detector results, marks an
    essay as correctly classified when its 'real' score exceeds the
    threshold, then prints counts and means per 'Score Level', pairwise
    Fisher exact tests between score levels, and per-'Language' counts
    and means. Stops at two ``breakpoint()`` calls for interactive
    inspection.
    """
    fp = ETS_NONNATIVE_PATH / 'index.csv'
    df = pd.read_csv(fp)

    #####################################################
    # Loading inference results

    ###### OpenAI detector on ets / school reviews
    data = load_file('outputs/openai_detector/ets/results.json')
    # The 'correct' value stored in the results is based on a 0.5
    # threshold. Set a new threshold here.
    threshold = 0.94
    print(threshold)
    correctness = {fn: d['real'] > threshold for fn, d in data.items()}
    ######
    ####################################################

    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    df['correct'] = np.nan
    df.set_index('Filename', inplace=True)
    for fn, correct in correctness.items():
        # DataFrame.set_value was removed in pandas 1.0; .at is its
        # documented replacement for label-based scalar assignment.
        df.at[fn, 'correct'] = 1 if correct else 0
    # Keep only rows that received a detector result.
    df = df[df.correct.notnull()]

    print('-' * 100)
    print('Counts')
    print(df.groupby('Score Level').correct.count())

    print('-' * 100)
    print('Mean')
    print(df.groupby('Score Level').correct.mean())

    print('-' * 100)
    print('Fisher tabs')
    # fisher exact
    # https://medium.com/@robertmckee/statistical-analysis-hypothesis-testing-of-binary-data-b0dce43306
    # https://en.wikipedia.org/wiki/Fisher%27s_exact_test
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html
    level_pairs = (
        ('high-medium', ['high', 'medium']),
        ('medium-low', ['medium', 'low']),
        ('high-low', ['high', 'low']),
    )
    for title, levels in level_pairs:
        print(title)
        in_pair = df[df['Score Level'].isin(levels)]['Score Level']
        tab = pd.crosstab(in_pair, df.correct)
        print(fisher_exact(tab))
    breakpoint()

    print('=' * 100)
    print(df.groupby('Language').count())
    print('-' * 100)
    # numeric_only=True: pandas 2.x raises on non-numeric columns instead
    # of silently dropping them as older releases did.
    print(df.groupby('Language').mean(numeric_only=True))
    breakpoint()
def create_retrieval_set(N=200, instruction='toplevel_s2iprob'):
    """
    Create a retrieval set by selecting the N best-scoring drawings per
    category. Uses generated instruction trees.

    For every directory that contains instruction trees, three outputs
    are written under RETRIEVAL_SET_PATH / instruction:
      - data/<category>.pkl: score, id, text and stroke3 of each drawing
      - data/<category>_nodrawing.json: the same without stroke data
      - drawings/<category>_<i>-<i+25>.jpg: the drawings in chunks of 25

    Args:
        N (int): size of retrieval set per category
        instruction (str): method for extracting instruction; only
            'toplevel_s2iprob' (text and score of the first tree node)
            selects anything
    """
    # Walk over instruction trees. The literal path below overrides
    # BEST_SEG_NDJSON_PATH.
    seg_tree_path = BEST_SEG_NDJSON_PATH
    seg_tree_path = 'data/quickdraw/segmentations/greedy_parsing/progressionpair/Feb18_2020/strokes_to_instruction/S2IimgsFeb13/'

    chunk_n = 25
    for root, _dirs, fns in os.walk(seg_tree_path):
        category = os.path.basename(root)

        candidates = []
        for fn in fns:
            if fn == 'hp.json' or not fn.endswith('json') or 'treant' in fn:
                continue
            seg_tree = utils.load_file(os.path.join(root, fn))
            drawing_id = fn.replace('.json', '')
            if instruction == 'toplevel_s2iprob':
                top_node = seg_tree[0]
                entry = (top_node['score'], drawing_id, top_node['text'],
                         seg_tree)
                heapq.heappush(candidates, entry)

        # Only directories that actually hold seg_trees produce output.
        if len(candidates) == 0:
            continue
        print(category)

        # get best instructions
        best = heapq.nlargest(N, candidates)

        # load drawings
        cat_drawings = ndjson_drawings(category)
        id_to_idx = {}
        for idx, drawing in enumerate(cat_drawings):
            id_to_idx[drawing['key_id']] = idx

        # save best
        best_out = []
        for score, drawing_id, text, _tree in best:
            source = cat_drawings[id_to_idx[drawing_id]]
            best_out.append({
                'score': score,
                'id': drawing_id,
                'text': text,
                'stroke3': ndjson_to_stroke3(source['drawing']),
            })
        data_dir = RETRIEVAL_SET_PATH / instruction / 'data'
        utils.save_file(best_out, data_dir / f'{category}.pkl')

        # save a version with just the non-stroke data for easy viewing
        best_out_no_drawing = [
            {'score': float(d['score']), 'id': d['id'], 'text': d['text']}
            for d in best_out
        ]
        utils.save_file(best_out_no_drawing,
                        data_dir / f'{category}_nodrawing.json')

        # Save drawings
        for start in range(0, N, chunk_n):
            drawings = []
            for item in best_out[start:start + chunk_n]:
                # stroke3 format is in x y deltas,
                # save_multiple_strokes...() expects the actual x y points
                strokes = item['stroke3']
                strokes[:, 0] = np.cumsum(strokes[:, 0])
                strokes[:, 1] = np.cumsum(strokes[:, 1])
                drawings.append(strokes)

            out_dir = RETRIEVAL_SET_PATH / instruction / 'drawings'
            os.makedirs(out_dir, exist_ok=True)
            out_fp = out_dir / f'{category}_{start}-{start+chunk_n}.jpg'
            save_multiple_strokes_as_img(drawings, out_fp)
import numpy as np from copy import deepcopy import argparse from src.Linear import Linear from src.ReLU import ReLU from src.Model import Model from src.Criterion import Criterion from src.Optimizer import SGDOptimizer from src.utils import load_file, save_file if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("-modelName", help="model name", required=True) parser.add_argument("-data", help="location of the training data", required=True) parser.add_argument("-target", help="location of the target labels", required=True) args = parser.parse_args() os.makedirs(args.modelName, exist_ok=True) data = load_file(args.data) target = load_file(args.target) print(data.shape, target.shape)
import torch
import os
import sys
import numpy as np
from copy import deepcopy
import argparse
from src.Criterion import Criterion
from src.utils import load_file, save_file

if __name__ == '__main__':
    # Command line: input, target and destination for the input gradient.
    cli = argparse.ArgumentParser()
    cli.add_argument("-i", help="path to input(.bin) file", required=True)
    cli.add_argument("-t", help="path to target(.bin) file", required=True)
    cli.add_argument("-ig", help="path to gradInput(.bin) file",
                     required=True)
    options = cli.parse_args()

    inputs = load_file(options.i)
    targets = load_file(options.t)

    # Forward pass prints the loss; backward pass produces the gradient
    # with respect to the input, which is written to the -ig path.
    criterion = Criterion()
    print(criterion.forward(inputs, targets))

    grad_input = criterion.backward(inputs, targets)
    save_file(grad_input, options.ig)
def calc_stats_for_runs_in_dir(dir, best_n=10):
    """
    Print runs with best stats in <dir>.

    Assumes each run has a file with the name 'e<epoch>_loss<loss>.pt', a
    matching 'outputs/samples_e<epoch>.json' and an 'hp.json'. For every
    stat, the best_n runs are written to '<dir>/best_runs.txt' (best run
    last) and echoed to stdout. The stats merged with each run's
    hyperparameters are saved to '<dir>/hiplot_data.json'.

    When no run is found, both files are still written but are empty.

    Args:
        dir (str)
        best_n (int)
    """
    print(f'Looking in: {dir}\n')

    runs_stats = []
    hiplot_stats = []
    for root, _dirs, fns in os.walk(dir):
        for fn in fns:
            # Get loss from model fn (e<epoch>_loss<loss>.pt)
            if not (fn.endswith('pt') and ('loss' in fn)):
                continue

            # Get best samples
            epoch = fn.split('_')[0].replace('e', '')
            # strip('.pt') removes the characters '.', 'p', 't' from both
            # ends, which leaves a plain decimal number intact.
            loss = float(fn.split('loss')[1].strip('.pt'))
            run = root.replace(dir + '/', '')
            best_sample_fp = os.path.join(root, 'outputs',
                                          f'samples_e{epoch}.json')

            # Calculate stats
            m2scores, _ = calc_bleu_and_rouge_on_samples(best_sample_fp,
                                                         print=False)
            gt_toks, gen_toks = calc_rare_words_stats(best_sample_fp,
                                                      print=False)
            run_results = {
                'n_gen_toks': len(gen_toks),
                'n_gt_toks': len(gt_toks),
                'loss': loss,
                'rouge1': np.mean(m2scores['rouge1']),
                'rouge2': np.mean(m2scores['rouge2']),
                'rougeL': np.mean(m2scores['rougeL']),
                'bleu1': np.mean(m2scores['bleu1']),
                'bleu2': np.mean(m2scores['bleu2']),
            }
            runs_stats.append([run, run_results])

            # Save json data to be visualized by hiplot
            hp_dict = utils.load_file(os.path.join(root, 'hp.json'))
            run_hiplot = dict(hp_dict.items())
            run_hiplot.update(run_results)
            hiplot_stats.append(run_hiplot)

    #
    # Write best runs sorted to file
    #
    # Without any run there is no first entry to take the stat names
    # from; fall back to an empty list instead of raising IndexError.
    main_stats = list(runs_stats[0][1].keys()) if runs_stats else []
    out_fp = os.path.join(dir, 'best_runs.txt')
    with open(out_fp, 'w') as f:
        print('-' * 100)
        for main_stat in main_stats:
            print(f'RUNS WITH BEST: {main_stat}', file=f)
            if main_stat == 'loss':  # lower is better
                sorted_by_main_stat = sorted(
                    runs_stats, key=lambda x: -x[1][main_stat])[-best_n:]
            else:  # higher is better
                sorted_by_main_stat = sorted(
                    runs_stats, key=lambda x: x[1][main_stat])[-best_n:]
            for run, stats in sorted_by_main_stat:
                other_stats_str = ', '.join(
                    '{}: {:.4f}'.format(stat, val)
                    for stat, val in stats.items() if main_stat != stat)
                out_str = '{}: {:.4f}'.format(main_stat, stats[main_stat])
                print(out_str + ', ' + other_stats_str + ', run: ' + run,
                      file=f)
            print(file=f)

    # Print to stdout (the context manager closes the handle afterwards)
    with open(out_fp, 'r') as f:
        for line in f:
            print(line.strip())
    print('\nWrote best runs sorted to: ', out_fp)

    #
    # Save hiplot data in the searched directory
    #
    out_fp = os.path.join(dir, 'hiplot_data.json')
    print()
    utils.save_file(hiplot_stats, out_fp, verbose=True)
def score_segtree_match_with_annotations(seg_dir, prob_threshold):
    """Score generated instructions against matching human annotations.

    Walks seg_dir for segmentation JSON files (skipping 'hp.json' and
    anything with 'treant' in its name), prunes each tree with
    prob_threshold and looks for a node whose 'left'/'right' bounds equal
    the annotated start/end. The matching node's text is scored against
    the annotation with BLEU and ROUGE; match counts and mean scores are
    printed.

    Args:
        seg_dir (str): directory searched recursively
        prob_threshold: forwarded to prune_seg_tree
    """

    def load_annotations():
        """
        Returns:
            dict: drawing_id -> row from dataframe of Mturk annotations
        """
        df = pd.read_csv(ANNOTATED_PROGRESSION_PAIRS_CSV_PATH)
        return {row['Input.id']: row for _, row in df.iterrows()}

    def calc_seg_score(drawing_id, seg_tree, id_to_annotations, scorers):
        """
        Calculate score for one tree.

        Args:
            drawing_id (int)
            seg_tree (list of dicts): nodes with 'left', 'right', 'text'
            id_to_annotations (dict): drawing_id (int) -> row from dataframe of Mturk annotations
            scorers (list): Scorers (bleu, rouge)

        Returns:
            (metric2score, match): both reflect the last matching node;
            an empty dict and None when no node matches.
        """
        annotations = id_to_annotations[drawing_id]  # TODO: check that this exists...
        category = annotations['Input.category']
        gt_instruction = annotations['Answer.annotation'].replace('\r', '')
        ndjson_start = annotations['Input.start']
        ndjson_end = annotations['Input.end']
        n_segs = annotations['Input.n_segments']  # read but not used below
        url = annotations['Input.url']

        metric2score = {}
        match = None
        # There may be one segment within the instruction tree that
        # matches the annotated segment
        for node in seg_tree:
            # TODO: check offsets etc.
            if node['left'] != ndjson_start or node['right'] != ndjson_end:
                continue
            gen_instruction = node['text']
            for scorer in scorers:
                result = scorer.score(gt_instruction, gen_instruction)
                for metric, value in result.items():
                    metric2score[metric] = value
            match = {
                'id': drawing_id,
                'gen_instruction': gen_instruction,
                'gt_instruction': gt_instruction,
                'category': category,
                'url': url,
            }
        return metric2score, match

    scorers = [InstructionScorer('bleu'), InstructionScorer('rouge')]
    metric2allscores = defaultdict(list)
    all_matches = []
    id_to_annotations = load_annotations()

    # Find instruction trees
    n_segs = 0
    for root, _dirs, fns in os.walk(seg_dir):
        for fn in fns:
            if fn == 'hp.json' or not fn.endswith('json') or 'treant' in fn:
                continue
            # The id is the second '_'-separated field of the file name
            # (e.g. lion_6247028344487936.<ext>).
            drawing_id = int(fn.split('_')[1].replace('.json', ''))
            seg_tree = utils.load_file(os.path.join(root, fn))
            seg_tree = prune_seg_tree(seg_tree, prob_threshold)

            # calculate score for this tree
            metric2score, match = calc_seg_score(drawing_id, seg_tree,
                                                 id_to_annotations, scorers)
            if match:
                all_matches.append(match)
                for metric, score in metric2score.items():
                    metric2allscores[metric].append(score)
            n_segs += 1

    metric2allscores_mean = {}
    metric2allscores_std = {}
    for metric, scores in metric2allscores.items():
        metric2allscores_mean[metric] = np.mean(scores)
        metric2allscores_std[metric] = np.std(scores)

    print('-' * 100)
    print(f'Number of matches: {len(all_matches)} / {n_segs}')
    print(f'Scores for: {seg_dir}')
    print('Mean:')
    pprint(metric2allscores_mean)
def convert_generated_instruction_samples_to_html(samples_fp):
    """
    Convert outputs from StrokeToInstructionRNN model to html.

    The page is written next to the samples file, with '.json' replaced
    by '.html'. Samples are laid out two per row; an odd final sample
    gets a row of its own.

    Args:
        samples_fp: str, path to the samples JSON file. Each sample must
            provide the keys 'url', 'ground_truth' and 'generated'.
    """
    PAGE_HEAD = """
<html lang="en">
<head>
  <title>Bootstrap Example</title>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/css/bootstrap.min.css">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/js/bootstrap.min.js"></script>
</head>
<body>

<div class="container">
  <h2>MTurk Results</h2>
"""

    CELL_TEMPLATE = """
    <div class="col-md-5">
      <div class="thumbnail">
        <div>
          <p><strong>Category: {}</strong></p>
        </div>
        <img src="{}" style="max-width:100%">
        <div class="caption">
          <p>Ground truth: {}</p>
          <p>Generated: {}</p>
        </div>
      </div>
    </div>
"""

    PAGE_TAIL = """
</div>

</body>
</html>
"""

    def render_cell(sample):
        """Render one sample as a half-width column."""
        # cat = sample['category'] is not used; the category is taken
        # from the URL instead.
        url = sample['url']
        cat = url.split('fullinput/')[1].split('/progress')[0]
        gt = ' '.join(utils.normalize_sentence(sample['ground_truth']))
        return CELL_TEMPLATE.format(cat, url, gt, sample['generated'])

    samples = utils.load_file(samples_fp)
    html_path = samples_fp.replace('.json', '.html')
    with open(html_path, 'w') as out_f:
        out_f.write(PAGE_HEAD)
        for i in range(0, len(samples), 2):
            # Slicing yields one or two samples, so an odd number of
            # samples no longer raises IndexError on the last row.
            cells = ''.join(render_cell(s) for s in samples[i:i + 2])
            out_f.write('\n  <div class="row">' + cells + '  </div>\n')
        out_f.write(PAGE_TAIL)
print("**word2vec Embeddings!") args = parser.parse_args() random.seed(0) torch.manual_seed(6) now = datetime.datetime.now() args.experiment_folder = args.experiment_path + \ f"{now.year}_{now.month}_{now.day}_{now.hour}_{now.minute}/" if not os.path.exists(args.experiment_folder) and args.save_model: os.makedirs(args.experiment_folder) utils.print_args(args) # vocabs contain all vocab + <pad>, <bos>, <eos>, <unk> args.vocabs = utils.load_file(args.vocab_path, file_type='json') args.n_vocabs = len(args.vocabs) args.word2idx = {tok: i for i, tok in enumerate(args.vocabs)} args.idx2word = {i: tok for i, tok in enumerate(args.vocabs)} args.padding_idx = args.word2idx[args.padding_symbol] batch_gen_train, batch_gen_test = data_load.create_batch_generators(args) batcher = lm_model.TokenBatcher(args) # Sentence encoder sentence_encoder = model.SentenceEmbeddingModel(args).to(args.device) # Convolution layer for extracting global coherence patterns global_feature_extractor = model.LightweightConvolution(args).to(args.device) # Bilinear layer for modeling inter-sentence relation bilinear_layer = model.BiAffine(args).to(args.device) # Linear layer coherence_scorer = model.LocalCoherenceScore(args).to(args.device)
parser = add_generation_args(parser) args = parser.parse_args() set_seed_for_gen(args) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # gpt2-medium trained on one week of covid news model_fp = 'trained_models/school_reviews/gpt2/wandb/model_e0.pkl' tokenizer_fp = 'trained_models/school_reviews/gpt2/wandb/tokenizer.pkl' print('Loading') from transformers import GPT2LMHeadModel, GPT2TokenizerFast model, tokenizer = GPT2LMHeadModel, GPT2TokenizerFast model = model.from_pretrained('gpt2-xl') model = load_file(model_fp) tokenizer = load_file(tokenizer_fp) gpt2 = GPT2Wrapper(args, model=model, tokenizer=tokenizer) gpt2 = gpt2.to(device) print('Loaded') OUT_FP = 'data/school_reviews/train_detector/trainedonallreviews_gpt2-xl_e0.json' texts = [] for i in range(5000): text = model.generate_unconditional(self, n=1, bsz=1, stdout=True)[0] texts.append(text) if i % 10 == 0: save_file(texts, OUT_FP)
required=True) parser.add_argument("-i", help="path to input(.bin) file", required=True) parser.add_argument("-og", help="path to gradOuput(.bin) file", required=True) parser.add_argument("-o", help="path to output(.bin) file", required=True) parser.add_argument("-ow", help="path to gradW(.bin) file", required=True) parser.add_argument("-ob", help="path to gradB(.bin) file", required=True) parser.add_argument("-ig", help="path to gradInput(.bin) file", required=True) args = parser.parse_args() model = create_model(args.config) inp = load_file(args.i) num_input_nodes = np.prod(inp.shape[1:]) inp = inp.reshape(-1, (num_input_nodes)) out = model.forward(inp) model.clearGradParam() gradOutput = load_file(args.og) model.backward(inp, gradOutput) # save output save_file(out, args.o) # save gradW and gradB gradW, gradB = model.getGradParam()