# LINEAR and RELU are assumed to be module-level string constants matching the
# layer tags used in the config file (e.g. LINEAR, RELU = 'linear', 'relu').
def create_model(config_file):

    model = Model()

    with open(config_file, 'r') as f:
        num_layers = int(f.readline().strip())
        for i in range(num_layers):
            layer_info = f.readline().strip().split(' ')
            layer_type = layer_info[0]

            if layer_type == LINEAR:
                num_inputs = int(layer_info[1])
                num_outputs = int(layer_info[2])
                model.addLayer(Linear(num_inputs, num_outputs))
            elif layer_type == RELU:
                model.addLayer(ReLU())

        weight_file = f.readline().strip()
        bias_file = f.readline().strip()

        weights = load_file(weight_file)
        biases = load_file(bias_file)

    linear_index = 0
    for layer in model.Layers:
        if isinstance(layer, Linear):
            layer.W = weights[linear_index]
            layer.B = biases[linear_index]
            linear_index += 1

    return model
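# For reference, a sketch of the config file layout that create_model() parses;
# the tag spellings and file names below are illustrative assumptions:
#
#     3                  <- number of layers
#     linear 784 256     <- layer type, num_inputs, num_outputs
#     relu               <- layer type only
#     linear 256 10
#     weights.bin        <- weight matrices, one per Linear layer, in order
#     biases.bin         <- bias vectors, one per Linear layer, in order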
    def __iter__(self):
        with open(self.filelist_path, 'r') as f:
            items = [line.strip() for line in f]
        if self.shuffle:
            random.shuffle(items)
        batch = []
        batch_fname = []
        batch_length = []
        for i in range(20):  # each base filename has 20 variants, suffixed _1.._20
            for fname in items:
                fname = fname + "_" + str(i + 1)
                loadpath = os.path.join(self.path, fname)
                batch_file = load_file(loadpath, self.file_type)
                # if pos and neg docs are the same file (i.e. the permutation is identical), skip it
                if batch_file[0] == batch_file[1]:
                    continue
                for z in range(len(batch_file)):  # z=0 -> pos_doc; z=1 -> neg_doc
                    batch_file[z] = [
                        sentence.split() for sentence in batch_file[z]
                    ]
                batch.append(batch_file)
                batch_length.append(len(batch_file[0]))
                batch_fname.append(fname)
                if len(batch) == self.batch_size:
                    yield batch, batch_length, batch_fname
                    batch = []  # reset the batch for the next iteration
                    batch_fname = []
                    batch_length = []
def calc_rare_words_stats(samples_fp, verbose=True):
    """
    Stats for whether rare words are being generated

    Args:
        samples_fp: str to samples.json
        verbose: bool (whether to print the stats)
    """
    samples = utils.load_file(samples_fp)
    gt_toks = set()
    gen_toks = set()
    for sample in samples:
        gt, gen = sample['ground_truth'], sample['generated']
        for tok in utils.normalize_sentence(gt):
            gt_toks.add(tok)
        for tok in utils.normalize_sentence(gen):
            gen_toks.add(tok)

    if verbose:
        print('\nRare words stats:')
        print('Number of unique tokens in reference instructions: ',
              len(gt_toks))
        print('Number of unique tokens in generated instructions: ',
              len(gen_toks))

    return gt_toks, gen_toks
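# Example follow-up (hypothetical usage): tokens that were generated but never
# appear in the references are candidates for invented or misspelled words.
#
#     gt_toks, gen_toks = calc_rare_words_stats('runs/.../samples.json', verbose=False)
#     novel_toks = gen_toks - gt_toks
#     print('Generated tokens absent from references:', len(novel_toks))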
    def __iter__(self):
        with open(self.filelist_path, 'r') as f:
            items = [line.strip() for line in f.readlines()]

        with open(self.label, 'r') as f:
            labels = [int(line.strip()) for line in f]
        # if self.shuffle == True:
        #     random.shuffle(items)
        batch = []
        batch_fname = []
        batch_length = []
        label = []
        #for i in range(20):
        for i, fname in enumerate(items):
            if os.path.exists(os.path.join(self.path, fname)):
                #print(fname)
                loadpath = os.path.join(self.path, fname)
                batch_file = load_file(loadpath, self.file_type)
                # if pos and neg are same file i.e. perm is same, skip it
                # if batch_file[0] == batch_file[1]:
                #     continue
                # for z in range(len(batch_file)):  # z=0 -> pos_doc; z=1 -> neg_doc
                # rejoin the doc and re-split it into sentences on '<eos>' (the trailing empty split is dropped)
                batch_file[0] = [(seg + ' <eos>').split()
                                 for seg in ' '.join(batch_file[0]).split('<eos>')][:-1]
                batch.append(batch_file)
                batch_length.append(len(batch_file[0]))
                batch_fname.append(fname)
                label.append(labels[i])
                if len(batch) == self.batch_size:
                    yield batch, batch_length, batch_fname, label
                    batch = []  # reset the batch for the next iteration
                    batch_fname = []
                    batch_length = []
                    label = []
    def __init__(self, hp, save_dir):
        super().__init__(hp, save_dir)

        self.end_epoch_loader = None  # TODO: not generating yet, need to refactor that

        # Model
        # Load text embeddings
        # TODO: move this into some config file
        vocab_size = len(
            utils.load_file(LABELED_PROGRESSION_PAIRS_IDX2TOKEN_PATH))
        strokes_to_instruction_fp = 'best_models/strokes_to_instruction/catsdecoder-dim_512-model_type_cnn_lstm-use_prestrokes_False/model.pt'
        weights = torch.load(strokes_to_instruction_fp)
        enc_dim = weights['token_embedding.weight'].size(1)
        # enc_dim = hp.enc_dim
        # self.text_embedding = nn.Embedding(vocab_size, hp.enc_dim)
        self.text_embedding = nn.Embedding(
            vocab_size, enc_dim)  # if we're loading, must be the same size
        self.text_embedding.weight = nn.Parameter(
            weights['token_embedding.weight'])

        self.enc = InstructionEncoderTransformer(
            enc_dim, hp.enc_num_layers, hp.dropout,
            use_categories=False)  # TODO: should this be a hparam
        dec_input_dim = 5 if (hp.cond_instructions == 'initdec') else (
            5 + enc_dim)  # dec_inputs
        self.dec = SketchRNNDecoderGMM(
            dec_input_dim, hp.dec_dim,
            hp.M)  # Method 1 (see one_forward_pass, i.e. decinputs)

        self.models.extend([self.text_embedding, self.enc, self.dec])
        if USE_CUDA:
            for model in self.models:
                model.cuda()

        self.optimizers.append(optim.Adam(self.parameters(), hp.lr))
    def load(self, file_path):
        model_data = load_file(file_path)
        for x in model_data:
            if x["type"] == "Linear":
                self.addLayer(
                    Linear(x["num_inputs"], x["num_outputs"], x["W"], x["B"]))
            elif x["type"] == "ReLU":
                self.addLayer(ReLU())
    def __init__(self, args):
        '''
        args.vocab_path points to the vocabulary file. It should contain
        <unk>, <bos>, <eos>, but not <pad>.
        '''
        self.vocabs = utils.load_file(args.vocab_path, file_type='json')
        if '<pad>' in self.vocabs:
            self.vocabs.remove('<pad>')
        self._word_to_id = {tok: i for i, tok in enumerate(self.vocabs)}
        self._bos = self._word_to_id['<bos>']
        self._eos = self._word_to_id['<eos>']
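# The vocab file loaded above is assumed to be a JSON list of tokens, e.g.:
#
#     ["<unk>", "<bos>", "<eos>", "the", "draw", "a", "circle"]
#
# If '<pad>' is present it is removed before ids are assigned, so it never
# receives a word id here.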
def analyze_school_reviews():
    df = pd.read_csv(SCHOOL_REVIEWS_DATA)


    ###### OpenAI detector on ets / school reviews
    correctness = {}
    data = load_file('outputs/openai_detector/school_reviews/results.json')

    # correctness = {fn: d['correct'] for fn, d in data.items()}
    # The default 'correct' value is based on a 0.5 threshold. Set a new threshold here
    threshold = 0.5
    print(threshold)
    for fn, d in data.items():
        correct = d['real'] > threshold
        correctness[fn] = correct
    ######

    # Index(['url', 'review_text', 'mn_grd_eb', 'mn_avg_eb', 'top_level', 'perwht',
    #    'perfrl', 'totenrl', 'gifted_tot', 'lep', 'disab_tot_idea', 'disab_tot',
    #    'perind', 'perasn', 'perhsp', 'perblk', 'perfl', 'perrl',
    #    'nonwhite_share2010', 'med_hhinc2016', 'mail_return_rate2010',
    #    'traveltime15_2010', 'poor_share2010', 'frac_coll_plus2010',
    #    'jobs_total_5mi_2015', 'jobs_highpay_5mi_2015',
    #    'ann_avg_job_growth_2004_2013', 'singleparent_share2010',
    #    'popdensity2010', 'urbanicity'],
    #   dtype='object')

    ####################################################
    df['correct'] = np.nan
    # df.correct = df.correct.astype('bool')
    df.set_index('url', inplace=True)

    for fn, correct in correctness.items():
        correct = 1 if correct else 0
        df.at[fn, 'correct'] = correct  # DataFrame.set_value was removed in pandas 1.0
    df = df[df.correct.notnull()]

    # create some categorical (binary) buckets out of continuous variables
    df['singleparent_share2010_aboveavg'] = df.singleparent_share2010 > df.singleparent_share2010.mean()
    df['perwht_aboveavg'] = df.perwht > df.perwht.mean()

    # breakpoint()

    print('-' * 100)
    print('Mean')
    print(df.groupby('urbanicity').correct.mean())
    print('-' * 100)
    print('Mean')
    print(df.groupby('singleparent_share2010_aboveavg').correct.mean())
    print('-' * 100)
    print('Mean')
    print(df.groupby('perwht_aboveavg').correct.mean())

    # breakpoint()
Example #9
    def __init__(self, split, gen_method):
        super().__init__()
        self.split = split

        fp = PREPPED_REALGEN_TEXT_PATH / f'{gen_method}_{split}.pkl'
        self.data = load_file(fp)

        if split == 'train':
            import random
            random.shuffle(self.data)
            self.data = self.data[:5000]
Example #10
def load_hp(hp_obj, dir):
    """
    Args:
        hp_obj: existing HParams object
        dir: directory with existing hp_obj, saved as 'hp.json'
    Returns:
        hp_object updated
    """
    existing_hp = load_file(os.path.join(dir, "hp.json"))
    for k, v in existing_hp.items():
        setattr(hp_obj, k, v)
    return hp_obj
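# Hedged usage sketch for load_hp(); the HParams class and the run directory
# below are assumptions for illustration:
#
#     hp = HParams()                                           # default values
#     hp = load_hp(hp, 'runs/strokes_to_instruction/my_run')   # fields overwritten from hp.json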
def score_segtree_on_parent_child_splits(seg_dir, prob_threshold):

    def map_parents_to_children(seg_tree):
        """seg_tree is list of dicts"""
        id_to_node = {}
        parid_to_childids = defaultdict(list)
        for node in seg_tree:
            id, parid = node['id'], node['parent']
            id_to_node[id] = node
            if parid != '':  # the root node has an empty parent id
                parid_to_childids[parid].append(id)
        return id_to_node, parid_to_childids

    def calc_seg_score(id_to_node, parid_to_childids, scorers):
        metric2scores = defaultdict(list)
        for parid, childids in parid_to_childids.items():
            par_text = id_to_node[parid]['text']
            child_text_concat = ' '.join([id_to_node[childid]['text'] for childid in childids])

            for scorer in scorers:
                for metric, value in scorer.score(par_text, child_text_concat).items():
                    metric2scores[metric].append(value)

        metric2scores = {metric: np.mean(scores) for metric, scores in metric2scores.items()}
        return metric2scores


    scorers = [InstructionScorer('bleu'), InstructionScorer('rouge')]

    metric2allscores = defaultdict(list)
    for root, dirs, fns in os.walk(seg_dir):
        for fn in fns:
            if (fn != 'hp.json') and fn.endswith('json') and ('treant' not in fn):
                fp = os.path.join(root, fn)
                seg_tree = utils.load_file(fp)
                seg_tree = prune_seg_tree(seg_tree, prob_threshold)

                # calculate score for this tree
                id_to_node, parid_to_childids = map_parents_to_children(seg_tree)
                metric2scores = calc_seg_score(id_to_node, parid_to_childids, scorers)
                for metric, score in metric2scores.items():
                    metric2allscores[metric].append(score)

    metric2allscores_mean = {metric: np.mean(scores) for metric, scores in metric2allscores.items()}
    metric2allscores_std = {metric: np.std(scores) for metric, scores in metric2allscores.items()}

    print('-' * 100)
    print(f'Scores for: {seg_dir}')
    print('Mean:')
    pprint(metric2allscores_mean)
    print()
    print('Std:')
    pprint(metric2allscores_std)
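# For reference, the seg_tree node fields this function relies on, inferred
# from usage here and in the other segmentation functions (other fields may
# exist; values are illustrative):
#
#     [{'id': '0', 'parent': '',  'text': 'draw the cat',  'score': 0.9, 'left': 0, 'right': 8},
#      {'id': '1', 'parent': '0', 'text': 'draw the head', 'score': 0.8, 'left': 0, 'right': 3}]
#
# An empty 'parent' marks the root; 'left'/'right' appear to be stroke-segment
# offsets (they are compared against ndjson start/end indices elsewhere).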
    def __iter__(self):
        items = os.listdir(self.path)
        if self.shuffle:
            random.shuffle(items)
        batch = []
        batch_fname = []
        batch_length = []
        for fname in items:
            loadpath = os.path.join(self.path, fname)
            batch_file = load_file(loadpath, self.file_type)
            batch.append(batch_file)
            batch_fname.append(fname)
            batch_length.append(len(batch_file[0]))
            if len(batch) == self.batch_size:
                yield batch, batch_length, batch_fname
                batch = []  # reset the batch for the next iteration
                batch_fname = []
                batch_length = []
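# Hedged usage sketch for the batch iterators above (the class name and
# constructor arguments are assumptions):
#
#     loader = FileBatchIterator(path='data/train', file_type='pkl',
#                                batch_size=32, shuffle=True)
#     for batch, batch_length, batch_fname in loader:
#         ...
#
# Note that all of the __iter__ variants above silently drop a final partial
# batch (len(batch) < batch_size) at the end of an epoch.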
Example #13
    def load_model(self, dir):
        """
        Args:
            dir: str (location of trained model)
        """

        # Load hyperparams used to train model
        # TODO: we may want to change certain hyperparams at inference time (e.g. decode_method)
        # Currently, the below just overwrites it
        model_hp = utils.load_file(os.path.join(dir, 'hp.json'))
        for key, value in model_hp.items():
            setattr(self.hp, key, value)
        # We also want the updated values saved next to inference/train.json

        # Load trained weights
        weights_fp = os.path.join(dir, 'model.pt')
        print('Loading model weights from: ', weights_fp)
        self.load_state_dict(torch.load(weights_fp))
Example #14
    def __init__(self, hp, save_dir):
        """
        Args:
            hp: HParams object
            save_dir: str
        """
        self.hp = hp
        self.save_dir = save_dir

        # Load hp used to train model
        self.s2i_hp = experiments.load_hp(copy.deepcopy(hp),
                                          hp.strokes_to_instruction_dir)
        default_s2i_hp = s2i_default_hparams()
        # For backwards compatibility:
        # hparams may have been added since model was trained; add them to s2i_hp
        for k, v in vars(default_s2i_hp).items():
            if not hasattr(self.s2i_hp, k):
                setattr(self.s2i_hp, k, v)
        self.s2i_hp.drawing_type = 'stroke'  # TODO: this should be image if we switch to the images model

        self.strokes_to_instruction = StrokesToInstructionModel(
            self.s2i_hp, save_dir=None)  # save_dir=None means inference mode
        self.strokes_to_instruction.load_model(hp.strokes_to_instruction_dir)
        self.strokes_to_instruction.cuda()

        if (hp.split_scorer
                == 'instruction_to_strokes') or (hp.score_childinst_parstroke):
            self.i2s_hp = experiments.load_hp(copy.deepcopy(hp),
                                              hp.instruction_to_strokes_dir)
            # TODO: should do same backwards compatibility as above
            self.instruction_to_strokes = InstructionToStrokesModel(
                self.i2s_hp, save_dir=None)
            self.instruction_to_strokes.load_model(
                hp.instruction_to_strokes_dir
            )  # TODO: change param for load_model
            self.instruction_to_strokes.cuda()

        if hp.score_parent_child_text_sim:
            spacy.prefer_gpu()
            self.nlp = spacy.load('en_core_web_md')

        # TODO: this should be probably be contained in some model...
        self.token2idx = utils.load_file(
            LABELED_PROGRESSION_PAIRS_TOKEN2IDX_PATH)
def convert_all_segmentations_to_treants(seg_dir, prob_threshold):
    """
    Recursively walk through directory and find instruction trees (i.e. segmentations)
    generated by src/model/segmentation.py. For each one, prune it using
    prob_threshold and save it in treant format next to the original file.

    Args:
        seg_dir (str):
    """
    for root, dirs, fns in os.walk(seg_dir):
        for fn in fns:
            if (fn != 'hp.json') and fn.endswith('json') and ('treant' not in fn):
                fp = os.path.join(root, fn)
                seg_tree = utils.load_file(fp)
                # TODO: save prob_threshold in filename?
                out_fp = fp.replace('.json', '_treant.js')
                pruned_seg_tree = prune_seg_tree(seg_tree, prob_threshold)
                n_og, n_pruned = len(seg_tree), len(pruned_seg_tree)
                # pprint(seg_tree)
                # pprint(pruned_seg_tree)
                print(f'N segments before vs. after pruning: {n_og}, {n_pruned}')
                save_segmentation_in_treant_format(pruned_seg_tree, out_fp)
def calc_bleu_and_rouge_on_samples(samples_fp, verbose=True):
    """
    Args:
        samples_fp: str to samples.json
        verbose: bool (whether to print the scores)
    """
    samples = utils.load_file(samples_fp)

    scorers = [InstructionScorer('bleu'), InstructionScorer('rouge')]

    m2scores = defaultdict(list)
    m2cat2scores = defaultdict(lambda: defaultdict(list))
    for sample in samples:
        # cat = sample['category']  # this wasn't saved in earlier runs.
        cat = sample['url'].split('fullinput/')[1].split('/progress')[0]
        gt, gen = sample['ground_truth'], sample['generated']
        # gt, gen = gt.lower(), gen.lower()
        # gt = gt.replace('draw', 'add')
        # gen = gen.replace('draw', 'add')
        for scorer in scorers:
            for name, value in scorer.score(gt, gen).items():
                m2scores[name].append(value)
                m2cat2scores[name][cat].append(value)

    if verbose:
        print('\nROUGE and BLEU:')
        print('\nAverage per category:')
        for rouge_name, cat2scores in m2cat2scores.items():
            print('-' * 50)
            print(rouge_name)
            cat2avgs = {k: np.mean(v) for k, v in cat2scores.items()}
            pprint(sorted(cat2avgs.items(), key=lambda x: x[1]))

        print('Average:')
        pprint({
            rouge_name: np.mean(vals)
            for rouge_name, vals in m2scores.items()
        })

    return m2scores, m2cat2scores
Example #17
def send_email(subject, text):
    email_data = load_file('.email_config.json')
    SENDING_ADDRESS = email_data['email']
    SENDING_PASSWORD = email_data['password']
    to_addr_list = email_data['email_to']

    body = '\r\n'.join([
        'From: {}'.format(SENDING_ADDRESS), 'To: {}'.format(to_addr_list),
        'Subject: {}'.format(subject), '', text
    ])
    try:
        server = smtplib.SMTP('smtp.gmail.com',
                              587)  # NOTE: 587 is Gmail's STARTTLS port (465 is the SSL port).
        server.ehlo()
        server.starttls()
        server.login(SENDING_ADDRESS, SENDING_PASSWORD)
        server.sendmail(SENDING_ADDRESS, to_addr_list, body)
        server.quit()
        print('Email sent successfully!')
    except Exception as e:
        print('Email failed to send!')
        print(str(e))
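# send_email() expects .email_config.json to provide the sender credentials and
# recipient list; a sketch with placeholder values:
#
#     {
#         "email": "sender@gmail.com",
#         "password": "app-specific-password",
#         "email_to": ["recipient@example.com"]
#     }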
    def __init__(self, split, gen_method=None, max_len=192):
        super().__init__()
        self.split = split
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len = max_len
        
        # Load data
        df = pd.read_csv(SCHOOL_REVIEWS_DATA)
        human_revs = df.review_text.tolist()
        gen_revs = load_file(SCHOOL_REVIEWS_TRAIN_DETECTOR_PATH)

        # Get data for this split
        if split == 'train':
            human_revs = human_revs[:5000]
            gen_revs = gen_revs[:5000]
        elif split == 'valid':
            human_revs = human_revs[5000:6000]
            gen_revs = gen_revs[5000:6000]
        elif split == 'test':
            human_revs = human_revs[6000:7000]
            gen_revs = gen_revs[6000:7000]
        revs = human_revs + gen_revs

        # Prep data
        self.data = []
        for rev in revs:
            text_trunc, text_len, token_ids_padded = tokenize_and_prep(
                self.tokenizer, rev, max_len)

            d = {
                'review': rev,
                'text_trunc': text_trunc,
                'text_len': text_len,
                'token_ids_padded': token_ids_padded,
            }
            
            self.data.append(d)
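# tokenize_and_prep() is not shown in this snippet. Below is a minimal sketch
# consistent with how it is called above (truncate to max_len, record the true
# length, pad the token ids); the original helper's behavior may differ:
def tokenize_and_prep(tokenizer, text, max_len):
    token_ids = tokenizer.encode(text, truncation=True, max_length=max_len)
    text_trunc = tokenizer.decode(token_ids)  # text after truncation
    text_len = len(token_ids)
    pad_id = tokenizer.pad_token_id  # 0 for bert-base-uncased
    token_ids_padded = token_ids + [pad_id] * (max_len - text_len)
    return text_trunc, text_len, token_ids_padded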
Example #19
    parser.add_argument('--inference_vaez', action='store_true')
    parser.add_argument('--load_model_path', help='path to directory containing model to load for inference')
    opt = parser.parse_args()
    nn_utils.setup_seeds()

    save_dir = os.path.join(RUNS_PATH, 'sketchrnn', datetime.today().strftime('%b%d_%Y'), opt.groupname, run_name)

    # If inference, load hparams
    if opt.inference or opt.inference_vaez:
        temp = hp.temperature  # store this because we will vary at inference
        max_per_category = hp.max_per_category
        categories = hp.categories
        batch_size = hp.batch_size

        # overwrite hparams with the original hparams
        orig_hp = utils.load_file(os.path.join(opt.load_model_path, 'hp.json'))  # dict
        for k, v in orig_hp.items():
            if k not in ['temperature', 'categories', 'max_per_category', 'batch_size']:
                setattr(hp, k, v)
    else:
        experiments.save_run_data(save_dir, hp)


    model = None
    skip_data = opt.inference or opt.inference_vaez
    if hp.model_type == 'vae':
        model = SketchRNNVAEModel(hp, save_dir, skip_data=skip_data)
    elif hp.model_type == 'decodergmm':
        model = SketchRNNDecoderGMMOnlyModel(hp, save_dir, skip_data=skip_data)
    elif hp.model_type == 'decoderlstm':
        model = SketchRNNDecoderLSTMOnlyModel(hp, save_dir, skip_data=skip_data)
def analyze_ets():
    fp = ETS_NONNATIVE_PATH / 'index.csv'
    df = pd.read_csv(fp)

    #####################################################
    # Loading inference results

    ###### First pass with trainer (model was trained on gpt2 gen text)
    # correctness = load_file('outputs/gpt2gens_detector/ets/evalETS_v0.0.json')
    # correctness = load_file('outputs/gpt2gens_detector/ets/evalETS_v0.1.json')
    ######

    ###### OpenAI detector on ets / school reviews
    correctness = {}
    data = load_file('outputs/openai_detector/ets/results.json')
    # breakpoint()

    # correctness = {fn: d['correct'] for fn, d in data.items()}
    # The default 'correct' value is based on a 0.5 threshold. Set a new threshold here
    threshold = 0.94
    print(threshold)
    for fn, d in data.items():
        correct = d['real'] > threshold
        correctness[fn] = correct
    ######

    ####################################################
    df['correct'] = np.nan
    # df.correct = df.correct.astype('bool')
    df.set_index('Filename', inplace=True)

    for fn, correct in correctness.items():
        correct = 1 if correct else 0
        df.at[fn, 'correct'] = correct  # DataFrame.set_value was removed in pandas 1.0

    df = df[df.correct.notnull()]

    print('-' * 100)
    print('Counts')
    print(df.groupby('Score Level').correct.count())
    print('-' * 100)
    print('Mean')
    print(df.groupby('Score Level').correct.mean())
    # print('-' * 100)
    # print('Std')
    # print(df.groupby('Score Level').std())

    print('-' * 100)
    print('Fisher tabs')
    # fisher exact
    # https://medium.com/@robertmckee/statistical-analysis-hypothesis-testing-of-binary-data-b0dce43306
    # https://en.wikipedia.org/wiki/Fisher%27s_exact_test
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html
    print('high-medium')
    tab = pd.crosstab(df[df['Score Level'].isin(['high', 'medium'])]['Score Level'], df.correct)
    print(fisher_exact(tab))
    print('medium-low')
    tab = pd.crosstab(df[df['Score Level'].isin(['medium', 'low'])]['Score Level'], df.correct)
    print(fisher_exact(tab))
    print('high-low')
    tab = pd.crosstab(df[df['Score Level'].isin(['high', 'low'])]['Score Level'], df.correct)
    print(fisher_exact(tab))


    # breakpoint()
    # print(ttest_ind(df[df['Score Level'] == 'high'].correct, df[df['Score Level'] == 'medium'].correct))
    # print(ttest_ind(df[df['Score Level'] == 'medium'].correct, df[df['Score Level'] == 'low'].correct))
    # print(ttest_ind(df[df['Score Level'] == 'high'].correct, df[df['Score Level'] == 'low'].correct))


    print('=' * 100)
    print(df.groupby('Language').count())
    print('-' * 100)
    print(df.groupby('Language').mean(numeric_only=True))


    # breakpoint()
Example #21
def create_retrieval_set(N=200, instruction='toplevel_s2iprob'):
    """
    Create a retrieval set by selecting N drawings per category.
    Uses generated instruction trees.

    Args:
        N (int): size of retrieval set per category
        instruction (str): method for extracting instruction
    """

    # Walk over instruction trees
    # seg_tree_path = BEST_SEG_NDJSON_PATH  # overridden by the hardcoded path below
    seg_tree_path = 'data/quickdraw/segmentations/greedy_parsing/progressionpair/Feb18_2020/strokes_to_instruction/S2IimgsFeb13/'
    for root, dirs, fns in os.walk(seg_tree_path):
        pqueue = []
        category = os.path.basename(root)

        # n = 0
        for fn in fns:
            if (fn != 'hp.json') and fn.endswith('json') and ('treant' not in fn):
                fp = os.path.join(root, fn)
                seg_tree = utils.load_file(fp)
                drawing_id = fn.replace('.json', '')
                # drawing_id = fn.replace('.json', '').split('_')[1]  # for progression pair?

                if instruction == 'toplevel_s2iprob':  # currently the only extraction method handled
                    text = seg_tree[0]['text']

                heapq.heappush(
                    # cat_to_pqueue[category],
                    pqueue,
                    (seg_tree[0]['score'], drawing_id, text, seg_tree)
                )
                # n += 1
                # if n == 250:
                #     break

        # We are in a directory with seg_trees
        if len(pqueue) > 0:
            print(category)
            # get best instructions
            best = heapq.nlargest(N, pqueue)

            # load drawings
            cat_drawings = ndjson_drawings(category)
            id_to_idx = {d['key_id']: idx for idx, d in enumerate(cat_drawings)}

            # save best
            best_out = []
            for score, id, text, seg_tree in best:
                stroke3 = ndjson_to_stroke3(cat_drawings[id_to_idx[id]]['drawing'])
                out = {
                    'score': score,
                    'id': id,
                    'text': text,
                    'stroke3': stroke3
                }
                best_out.append(out)

            # id = best_out[1]['id']
            # save_img(category, id, cat_drawings, id_to_idx)
            # pp(best[1][3])
            # import pdb; pdb.set_trace()

            out_fp = RETRIEVAL_SET_PATH / instruction / 'data' / f'{category}.pkl'
            utils.save_file(best_out, out_fp)

            # save a version with just the non-stroke data for easy viewing
            best_out_no_drawing = []
            for d in best_out:
                best_out_no_drawing.append({'score': float(d['score']), 'id': d['id'], 'text': d['text']})
            out_fp = RETRIEVAL_SET_PATH / instruction / 'data' / f'{category}_nodrawing.json'
            utils.save_file(best_out_no_drawing, out_fp)

            # Save drawings
            chunk_n = 25
            for i in range(0, N, chunk_n):
                best_chunk = best_out[i:i+chunk_n]
                drawings = []
                for b in best_chunk:
                    # stroke3 format is in x y deltas, save_multiple_strokes...() expects the actual x y points
                    b['stroke3'][:,0] = np.cumsum(b['stroke3'][:,0])
                    b['stroke3'][:,1] = np.cumsum(b['stroke3'][:,1])
                    drawings.append(b['stroke3'])
                out_dir = RETRIEVAL_SET_PATH / instruction / 'drawings'
                os.makedirs(out_dir, exist_ok=True)
                out_fp = out_dir / f'{category}_{i}-{i+chunk_n}.jpg'
                save_multiple_strokes_as_img(drawings, out_fp)
import os
import numpy as np
from copy import deepcopy
import argparse

from src.Linear import Linear
from src.ReLU import ReLU
from src.Model import Model
from src.Criterion import Criterion
from src.Optimizer import SGDOptimizer
from src.utils import load_file, save_file

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("-modelName", help="model name", required=True)
    parser.add_argument("-data",
                        help="location of the training data",
                        required=True)
    parser.add_argument("-target",
                        help="location of the target labels",
                        required=True)

    args = parser.parse_args()

    os.makedirs(args.modelName, exist_ok=True)

    data = load_file(args.data)
    target = load_file(args.target)

    print(data.shape, target.shape)
import torch
import os
import sys
import numpy as np
from copy import deepcopy
import argparse

from src.Criterion import Criterion
from src.utils import load_file, save_file


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", help="path to input(.bin) file", required=True)
    parser.add_argument("-t", help="path to target(.bin) file", required=True)
    parser.add_argument("-ig", help="path to gradInput(.bin) file", required=True)

    args = parser.parse_args()

    inp = load_file(args.i)
    target = load_file(args.t)

    ce_loss = Criterion()
    loss = ce_loss.forward(inp, target)
    print(loss)

    gradInput = ce_loss.backward(inp, target)
    save_file(gradInput, args.ig)
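# For reference, a minimal cross-entropy Criterion consistent with this
# script's usage (a sketch, not the repo's src.Criterion; assumes inp holds
# logits of shape [N, C] and target holds integer class labels):
import numpy as np

class CrossEntropyCriterion:
    def forward(self, inp, target):
        # numerically stable softmax
        exp = np.exp(inp - inp.max(axis=1, keepdims=True))
        self.probs = exp / exp.sum(axis=1, keepdims=True)
        n = inp.shape[0]
        # mean negative log-likelihood of the true classes
        return -np.log(self.probs[np.arange(n), target.astype(int)]).mean()

    def backward(self, inp, target):
        # dL/dlogits = (softmax - one_hot(target)) / N
        n = inp.shape[0]
        grad = self.probs.copy()
        grad[np.arange(n), target.astype(int)] -= 1.0
        return grad / n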
def calc_stats_for_runs_in_dir(dir, best_n=10):
    """
    Print runs with best stats in <dir>
    Assumes each run has file with the name: 'e<epoch>_loss<loss>.pt'.

    Args:
        dir (str)
        best_n (int)
    """
    print(f'Looking in: {dir}\n')

    n = 0
    runs_stats = []
    hiplot_stats = []
    for root, dirs, fns in os.walk(dir):
        for fn in fns:
            # Get loss from model fn (e<epoch>_loss<loss>.pt)
            if fn.endswith('pt') and ('loss' in fn):
                # Get best samples
                epoch = fn.split('_')[0].replace('e', '')
                loss = float(fn.split('loss')[1][:-len('.pt')])  # drop the '.pt' suffix
                run = root.replace(dir + '/', '')
                best_sample_fp = os.path.join(root, 'outputs',
                                              f'samples_e{epoch}.json')

                # Calculate stats
                m2scores, m2cat2scores = calc_bleu_and_rouge_on_samples(
                    best_sample_fp, verbose=False)
                gt_toks, gen_toks = calc_rare_words_stats(best_sample_fp,
                                                          verbose=False)
                run_results = {
                    'n_gen_toks': len(gen_toks),
                    'n_gt_toks': len(gt_toks),
                    'loss': loss,
                    'rouge1': np.mean(m2scores['rouge1']),
                    'rouge2': np.mean(m2scores['rouge2']),
                    'rougeL': np.mean(m2scores['rougeL']),
                    'bleu1': np.mean(m2scores['bleu1']),
                    'bleu2': np.mean(m2scores['bleu2']),
                }
                runs_stats.append([run, run_results])

                # Save json data to be visualized by hiplot
                hp_dict = utils.load_file(os.path.join(root, 'hp.json'))
                run_hiplot = {}
                for k, val in hp_dict.items():
                    run_hiplot[k] = val
                run_hiplot.update(run_results)
                hiplot_stats.append(run_hiplot)

                n += 1

    #
    # Write best runs sorted to file
    out_fp = os.path.join(dir, 'best_runs.txt')
    with open(out_fp, 'w') as f:
        print('-' * 100)
        for main_stat in runs_stats[0][1].keys():  # n_gen_toks, loss, rougeL, bleu1, bleu2
            print(f'RUNS WITH BEST: {main_stat}', file=f)
            if main_stat == 'loss':  # lower is better
                sorted_by_main_stat = sorted(
                    runs_stats, key=lambda x: -x[1][main_stat])[-best_n:]
            else:  # higher is better
                sorted_by_main_stat = sorted(
                    runs_stats, key=lambda x: x[1][main_stat])[-best_n:]

            for run, stats in sorted_by_main_stat:
                main_stat_val = stats[main_stat]
                other_stats_str = ', '.join([
                    '{}: {:.4f}'.format(stat, val)
                    for stat, val in stats.items() if (main_stat != stat)
                ])
                out_str = '{}: {:.4f}'.format(main_stat, main_stat_val)
                print(out_str + ', ' + other_stats_str + ', run: ' + run,
                      file=f)
            print(file=f)

    # Print to stdout
    with open(out_fp, 'r') as f:
        for line in f:
            print(line.strip())
    print('\nWrote best runs sorted to: ', out_fp)

    #
    # Save hiplot data in runs/strokes_to_instruction/Feb14_2020/imagesweep_textaug_rankimgs/
    #
    out_fn = 'hiplot_data.json'
    out_fp = os.path.join(dir, out_fn)
    print()
    utils.save_file(hiplot_stats, out_fp, verbose=True)
def score_segtree_match_with_annotations(seg_dir, prob_threshold):
    def load_annotations():
        """
        Returns:
            dict: drawing_id -> row from dataframe of Mturk annotations
        """
        df = pd.read_csv(ANNOTATED_PROGRESSION_PAIRS_CSV_PATH)
        id_to_annotations = {}
        for i, row in df.iterrows():
            drawing_id = row['Input.id']
            id_to_annotations[drawing_id] = row
        return id_to_annotations

    def calc_seg_score(drawing_id, seg_tree, id_to_annotations, scorers):
        """
        Calculate score for one tree.

        Args:
            drawing_id (int)
            seg_tree (list of dicts): [description]
            id_to_annotations (dict): drawing_id (int) -> row from dataframe of Mturk annotations
            scorers (list): Scorers (bleu, rouge)
        """
        annotations = id_to_annotations[drawing_id]  # TODO: check that this exists...

        category = annotations['Input.category']
        gt_instruction = annotations['Answer.annotation'].replace('\r', '')
        ndjson_start = annotations['Input.start']
        ndjson_end = annotations['Input.end']
        n_segs = annotations['Input.n_segments']
        url = annotations['Input.url']

        metric2score = {}
        match = None
        # There may be one segment within the instruction tree that matches the annotated segment
        for node in seg_tree:
            if (node['left'] == ndjson_start) and (node['right'] == ndjson_end):  # TODO: check offsets etc.
                gen_instruction = node['text']
                for scorer in scorers:
                    for metric, value in scorer.score(gt_instruction, gen_instruction).items():
                        metric2score[metric] = value

                match = {
                    'id': drawing_id,
                    'gen_instruction': gen_instruction,
                    'gt_instruction': gt_instruction,
                    'category': category,
                    'url': url,
                }

        return metric2score, match


    scorers = [InstructionScorer('bleu'), InstructionScorer('rouge')]
    metric2allscores = defaultdict(list)
    all_matches = []
    id_to_annotations = load_annotations()
    n_segs = 0

    # Find instruction trees
    for root, dirs, fns in os.walk(seg_dir):
        for fn in fns:
            if (fn != 'hp.json') and fn.endswith('json') and ('treant' not in fn):
                fp = os.path.join(root, fn)
                drawing_id = fn.split('_')[1].replace('.json', '')  # fn: lion_6247028344487936.json
                drawing_id = int(drawing_id)
                seg_tree = utils.load_file(fp)
                seg_tree = prune_seg_tree(seg_tree, prob_threshold)

                # calculate score for this tree
                metric2score, match = calc_seg_score(drawing_id, seg_tree, id_to_annotations, scorers)
                if match:
                    all_matches.append(match)
                    for metric, score in metric2score.items():
                        metric2allscores[metric].append(score)

                n_segs += 1

    metric2allscores_mean = {metric: np.mean(scores) for metric, scores in metric2allscores.items()}
    metric2allscores_std = {metric: np.std(scores) for metric, scores in metric2allscores.items()}

    print('-' * 100)
    print(f'Number of matches: {len(all_matches)} / {n_segs}')

    print(f'Scores for: {seg_dir}')
    print('Mean:')
    pprint(metric2allscores_mean)
Example #26
def convert_generated_instruction_samples_to_html(samples_fp):
    """
    Convert outputs from StrokeToInstructionRNN model to html
    """
    html_path = samples_fp.replace('.json', '.html')
    with open(html_path, 'w') as out_f:
        out_f.write("""
        <html lang="en">
            <head>
              <title>Bootstrap Example</title>
              <meta charset="utf-8">
              <meta name="viewport" content="width=device-width, initial-scale=1">
              <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/css/bootstrap.min.css">
              <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
              <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/js/bootstrap.min.js"></script>
            </head>
            <body>

            <div class="container">
                <h2>MTurk Results</h2>
        """)

        ROW_TEMPLATE = """
        <div class="row">
            <div class="col-md-5">
              <div class="thumbnail">
                  <div>
                   <p><strong>Category: {}</strong></p>
                  </div>
                  <img src="{}" style="max-width:100%">
                  <div class="caption">
                    <p>Ground truth: {}</p>
                    <p>Generated: {}</p>
                  </div>
              </div>
            </div>
            <div class="col-md-5">
              <div class="thumbnail">
                  <div>
                   <p><strong>Category: {}</strong></p>
                  </div>
                  <img src="{}" style="max-width:100%">
                  <div class="caption">
                    <p>Ground truth: {}</p>
                    <p>Generated: {}</p>
                  </div>
              </div>
            </div>
          </div>
        """

        samples = utils.load_file(samples_fp)
        for i in range(0, len(samples), 2):
            # cat = sample['category']
            cat1 = samples[i]['url'].split('fullinput/')[1].split(
                '/progress')[0]
            url1 = samples[i]['url']
            gt1 = ' '.join(utils.normalize_sentence(
                samples[i]['ground_truth']))
            gen1 = samples[i]['generated']

            cat2 = samples[i + 1]['url'].split('fullinput/')[1].split(
                '/progress')[0]
            url2 = samples[i + 1]['url']
            gt2 = ' '.join(
                utils.normalize_sentence(samples[i + 1]['ground_truth']))
            gen2 = samples[i + 1]['generated']

            row = ROW_TEMPLATE.format(cat1, url1, gt1, gen1, cat2, url2, gt2,
                                      gen2)
            out_f.write(row)

        out_f.write("""
            </div>
            </body>
        </html>
        """)
    print("**word2vec Embeddings!")
args = parser.parse_args()

random.seed(0)
torch.manual_seed(6)

now = datetime.datetime.now()
args.experiment_folder = args.experiment_path + \
    f"{now.year}_{now.month}_{now.day}_{now.hour}_{now.minute}/"
if not os.path.exists(args.experiment_folder) and args.save_model:
    os.makedirs(args.experiment_folder)

utils.print_args(args)

# vocabs contain all vocab + <pad>, <bos>, <eos>, <unk>
args.vocabs = utils.load_file(args.vocab_path, file_type='json')
args.n_vocabs = len(args.vocabs)
args.word2idx = {tok: i for i, tok in enumerate(args.vocabs)}
args.idx2word = {i: tok for i, tok in enumerate(args.vocabs)}
args.padding_idx = args.word2idx[args.padding_symbol]

batch_gen_train, batch_gen_test = data_load.create_batch_generators(args)
batcher = lm_model.TokenBatcher(args)
# Sentence encoder
sentence_encoder = model.SentenceEmbeddingModel(args).to(args.device)
# Convolution layer for extracting global coherence patterns
global_feature_extractor = model.LightweightConvolution(args).to(args.device)
# Bilinear layer for modeling inter-sentence relation
bilinear_layer = model.BiAffine(args).to(args.device)
# Linear layer
coherence_scorer = model.LocalCoherenceScore(args).to(args.device)
Example #28
    parser = add_generation_args(parser)
    args = parser.parse_args()
    set_seed_for_gen(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # gpt2 fine-tuned on school reviews (see paths below)
    model_fp = 'trained_models/school_reviews/gpt2/wandb/model_e0.pkl'
    tokenizer_fp = 'trained_models/school_reviews/gpt2/wandb/tokenizer.pkl'
    print('Loading')

    from transformers import GPT2LMHeadModel, GPT2TokenizerFast
    # model = GPT2LMHeadModel.from_pretrained('gpt2-xl')  # unused: overwritten by the pickled model below

    model = load_file(model_fp)
    tokenizer = load_file(tokenizer_fp)

    gpt2 = GPT2Wrapper(args, model=model, tokenizer=tokenizer)
    gpt2 = gpt2.to(device)
    print('Loaded')

    OUT_FP = 'data/school_reviews/train_detector/trainedonallreviews_gpt2-xl_e0.json'

    texts = []
    for i in range(5000):
        text = gpt2.generate_unconditional(n=1, bsz=1, stdout=True)[0]
        texts.append(text)
        if i % 10 == 0:
            save_file(texts, OUT_FP)
                        required=True)
    parser.add_argument("-i", help="path to input(.bin) file", required=True)
    parser.add_argument("-og",
                        help="path to gradOuput(.bin) file",
                        required=True)
    parser.add_argument("-o", help="path to output(.bin) file", required=True)
    parser.add_argument("-ow", help="path to gradW(.bin) file", required=True)
    parser.add_argument("-ob", help="path to gradB(.bin) file", required=True)
    parser.add_argument("-ig",
                        help="path to gradInput(.bin) file",
                        required=True)

    args = parser.parse_args()

    model = create_model(args.config)
    inp = load_file(args.i)
    num_input_nodes = np.prod(inp.shape[1:])

    inp = inp.reshape(-1, num_input_nodes)
    out = model.forward(inp)

    model.clearGradParam()

    gradOutput = load_file(args.og)
    model.backward(inp, gradOutput)

    # save output
    save_file(out, args.o)

    # save gradW and gradB
    gradW, gradB = model.getGradParam()