Example #1
    def save_vaez_inference_time(self, model_dir):
        """
        Loop over all ndjson data and save the encoded z's for each.

        Args:
            model_dir (str): directory of the trained model we are using
        """
        torch.backends.cudnn.benchmark = True # Optimizes cudnn
        print('Saving to: ', os.path.join(model_dir, 'inference_vaez'))
        with torch.no_grad():
            for split in ['train', 'valid', 'test']:
                loader = self.get_data_loader(split, self.hp.batch_size, self.hp.categories, self.hp.max_len, self.hp.max_per_category, False)

                for bidx, batch in enumerate(loader):
                    # encode drawing and get z
                    batch = self.preprocess_batch_from_data_loader(batch)
                    strokes, stroke_lens, cats, cats_idx = batch
                    max_len, cur_bsz, _ = strokes.size()
                    z, _, _ = self.enc(strokes)  # z: [bsz, 128]
                    z_np = z.cpu().numpy()

                    for i in range(cur_bsz):
                        # get index of this drawing within the dataset (loader has shuffle=False so we can do this)
                        global_idx = bidx * self.hp.batch_size + i
                        drawing_id = loader.dataset.data[global_idx]['id']

                        # save numpy version of z
                        out_dir = os.path.join(model_dir, 'inference_vaez', cats[i])
                        os.makedirs(out_dir, exist_ok=True)
                        out_fp = os.path.join(out_dir, f'{drawing_id}.pkl')

                        cur_z_np = z_np[i]
                        utils.save_file(cur_z_np, out_fp)
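Most of the ML examples on this page call a project-level utils.save_file(obj, path, verbose=...) with .pkl or .json targets (a few later examples use same-named helpers with different signatures). The helper itself is not shown in this listing; a minimal sketch, assuming it simply dispatches on the file extension, might look like:

import json
import os
import pickle

def save_file(obj, path, verbose=False):
    """Hypothetical sketch of the utils.save_file used above: serialize obj based on its extension."""
    out_dir = os.path.dirname(str(path))
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    if str(path).endswith('.json'):
        with open(path, 'w') as f:
            json.dump(obj, f, indent=2)
    elif str(path).endswith('.pkl'):
        with open(path, 'wb') as f:
            pickle.dump(obj, f)
    else:
        raise ValueError('Unsupported extension: {}'.format(path))
    if verbose:
        print('Saved: {}'.format(path))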
Example #2
def generate_public_key(path):
    logger.info("Generating public/private key pair...")
    key = RSA.generate(2048)
    private_key = key.export_key()
    public_key = key.publickey().export_key()
    save_file(os.path.join(path, "private.pem"), private_key)
    save_file(os.path.join(path, "public.pem"), public_key)
    logger.info("Keys successfully generated.")
Example #3
def inference_on_dataset(args):
    model, tokenizer = load_model(args)
    model.cuda()
    if args.dataset == 'ets':
        data_loader = get_etsnonnative_dataloader(
            max_len=args.max_len,
            batch_size=1)
    elif args.dataset == 'schoolreviews':
        data_loader = get_schoolreviewsreal_dataloader(
            max_len=args.max_len,
            batch_size=1)
    print('Dataset loaded')
    

    results = {}
    for i, batch in enumerate(data_loader):
        if i % 100 == 0:
            print('{} / {}'.format(i, len(data_loader)))

        token_ids_padded, atn_mask, label, text, id = batch
        query = text[0]  # bsz 1
        id = id[0]  # bsz 1

        tokens = tokenizer.encode(query)
        all_tokens = len(tokens)
        tokens = tokens[:tokenizer.max_len - 2]
        used_tokens = len(tokens)
        tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
        mask = torch.ones_like(tokens)

        with torch.no_grad():
            tokens = tokens.cuda()
            mask = mask.cuda()
            logits = model(tokens, attention_mask=mask)[0]
            probs = logits.softmax(dim=-1)

            fake, real = probs.detach().cpu().flatten().numpy().tolist()

            correct = False
            if ((real > fake) and (label == 1)) or \
                    ((fake > real) and (label == 0)):
                correct = True
            results[id] = {'fake': fake, 'real': real, 'correct': correct}

    save_file(results, os.path.join(args.output_dir, 'results.json'), verbose=True)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', default='roberta-base')
    parser.add_argument('--dataset', default='ets')
    parser.add_argument('--max_len', default=128, type=int)
    parser.add_argument('--output_dir', default=None, required=True)
    args = parser.parse_args()
    
    inference_on_dataset(args)
Example #4
def main():
    path = create_dir()
    url = "https://brasil.diplo.de/br-de/service/matriculaconsular/" \
          "2222228?fbclid=IwAR0MojqudTlgMKQjZG7nNqp9gr3-QSKyiiRdY0jeeBY336zm_3yqr_Oc_nc"

    content = scraping_de(url)

    for c in content:
        save_file(pathname=path,
                  filename=c["file_name"],
                  content=requests.get(c["xls_url"]).content)
Example #5
    def segment_all_ndjson_data(self):
        """
        Segment all samples in the NdjsonStrokeDataset
        """
        for split in ['train', 'valid', 'test']:
            for category in final_categories():
                # Skip if not in hparam's categories list
                if (self.hp.categories !=
                        'all') and (category not in self.hp.categories):
                    continue
                print(f'{split}: {category}')
                # ds = NdjsonStrokeDataset(category, split)
                ds = NdjsonStrokeDataset(
                    category, split, max_per_category=self.hp.max_per_category)
                loader = DataLoader(ds, batch_size=1, shuffle=False)
                n_segd = 0
                for i, sample in enumerate(loader):
                    try:
                        id, category = loader.dataset.data[i][
                            'id'], loader.dataset.data[i]['category']
                        out_dir = self.save_dir / category
                        out_fp = out_dir / f'{id}.json'
                        if os.path.exists(out_fp):
                            continue
                        # note: we are NOT saving it into separate split categories in the case that
                        # we want to train on 30 categories and then do test on 5 held out categories.
                        # (i.e. keep it flexible to splitting within categories vs. across categories, which
                        # can be specified in that Dataset)
                        # TODO: should we do the same for ProgressionPair?

                        # save segmentations
                        segmented = self.segment_sample(sample,
                                                        dataset='ndjson')
                        # TODO: save sample / strokes as well so that we have all the data in one place?
                        utils.save_file(segmented, out_fp)

                        # save original image too for comparisons
                        ndjson_strokes = loader.dataset.data[i][
                            'ndjson_strokes']
                        img = create_progression_image_from_ndjson_seq(
                            ndjson_strokes)
                        out_fp = out_dir / f'{id}.jpg'
                        img.save(out_fp)

                        n_segd += 1
                        if n_segd == self.hp.max_per_category:
                            break

                    except Exception as e:
                        print(e)
                        continue
Example #6
    def segment_all_progressionpair_data(self):
        """
        Segment all samples in the ProgressionPairDataset
        """
        for split in ['train', 'valid', 'test']:
            print(split)
            if self.s2i_hp.drawing_type == 'stroke':
                self.ds = ProgressionPairDataset(split, use_full_drawings=True)
                loader = DataLoader(
                    self.ds,
                    batch_size=1,
                    shuffle=False,
                    collate_fn=ProgressionPairDataset.collate_fn)
            elif self.s2i_hp.drawing_type == 'image':
                self.ds = DrawingsAsImagesAnnotatedDataset(
                    split, images=self.s2i_hp.images, data_aug_on_text=False)
                loader = DataLoader(
                    self.ds,
                    batch_size=1,
                    shuffle=False,
                    collate_fn=DrawingsAsImagesAnnotatedDataset.collate_fn)

            for i, sample in enumerate(loader):
                try:
                    id, category = loader.dataset.data[i][
                        'id'], loader.dataset.data[i]['category']
                    out_dir = self.save_dir / split

                    if self.s2i_hp.drawing_type == 'image':
                        sample = loader.dataset.data[
                            i]  # contains the fp, n_segments data we need

                    # save segmentations
                    segmented = self.segment_sample(sample,
                                                    dataset='progressionpair')
                    # TODO: save sample / strokes as well so that we have all the data in one place?
                    out_fp = out_dir / f'{category}_{id}.json'
                    utils.save_file(segmented, out_fp)

                    # save original image too for comparisons
                    # TODO: image dataset doesn't have ndjson_strokes
                    # ndjson_strokes = loader.dataset.data[i]['ndjson_strokes']
                    # img = create_progression_image_from_ndjson_seq(ndjson_strokes)
                    out_fp = out_dir / f'{category}_{id}.jpg'
                    open(out_fp, 'a').close()
                    # img.save(out_fp)

                except Exception as e:
                    print(e)
                    continue
Example #7
def save_run_data(path_to_dir, hp, ask_if_exists=True):
    """
    1) Save stdout to file
    2) Save files to path_to_dir:
        - code_snapshot/: Snapshot of code (.py files)
        - hp.json: dict of HParams object
        - run_details.txt: command used and start time
    """
    print('Saving run data to: {}'.format(path_to_dir))
    parent_dir = os.path.dirname(path_to_dir)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    if os.path.isdir(path_to_dir):
        print(
            "Data already exists in this directory (presumably from a previous run)"
        )
        if ask_if_exists:
            inp = input(
                'Enter "y" if you are sure you want to remove all the old contents: '
            )
            if inp in ["y", "yes"]:
                print("Removing old contents")
                shutil.rmtree(path_to_dir)
            else:
                print("Exiting")
                raise SystemExit
    print("Creating directory and saving data")
    if not os.path.exists(path_to_dir):
        os.makedirs(path_to_dir)

    # Save snapshot of code
    snapshot_dir = os.path.join(path_to_dir, "code_snapshot")
    print('Saving code snapshot to: {}'.format(snapshot_dir))
    if os.path.exists(
            snapshot_dir):  # shutil doesn't work if dest already exists
        shutil.rmtree(snapshot_dir)
    shutil.copytree("src", snapshot_dir)

    # Save hyperparms
    save_file(vars(hp), os.path.join(path_to_dir, "hp.json"), verbose=True)

    # Save some command used to run, start time
    with open(os.path.join(path_to_dir, "run_details.txt"), "w") as f:
        f.write("Command:\n")
        cmd = " ".join(sys.argv)
        start_time = datetime.now().strftime("%B%d_%H-%M-%S")
        f.write(cmd + "\n")
        f.write('Start time: {}'.format(start_time))
        print("Command used to start program:\n", cmd)
        print('Start time: {}'.format(start_time))
Example #8
def save_instruction_vocabulary_distribution():

    df = pd.read_csv(ANNOTATED_PROGRESSION_PAIRS_CSV_PATH)

    tokens = Counter()
    for i in range(len(df)):
        annotation = df.iloc[i]['Answer.annotation'].replace('\r', '')
        for token in utils.normalize_sentence(annotation):
            tokens[token] += 1

    norm = sum(tokens.values())
    distribution = {tok: count / norm for tok, count in tokens.items()}
    utils.save_file(distribution,
                    INSTRUCTIONS_VOCAB_DISTRIBUTION_PATH,
                    verbose=True)
Example #9
def prep_and_save_data(gen_method='gpt2-xl_p96', max_len=192):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    for split in ['test', 'valid', 'train']:
        split_data = []

        # load real
        fp = REAL_TEXT_PATH / f'webtext.{split}.jsonl'
        for line in open(fp):
            item = json.loads(line)
            id, ended, text = item['id'], item['ended'], item['text']

            text_trunc, text_len, token_ids_padded = tokenize_and_prep(
                tokenizer, text, max_len)
            if text_len < max_len:
                continue
            prepped = {
                'id': id,
                'text': text_trunc,
                'token_ids_padded': token_ids_padded,
                'label': 1,
                'label_text': 'human-written'
            }
            split_data.append(prepped)

        # load fake
        dir = GENERATED_TEXT_PATH / f'fake_{gen_method}'
        for fn in os.listdir(dir):
            if fn.startswith(split):
                fp = os.path.join(dir, fn)
                for line in open(fp):
                    item = json.loads(line)
                    id, text = item['id'], item['text']

                    text_trunc, text_len, token_ids_padded = tokenize_and_prep(
                        tokenizer, text, max_len)
                    if text_len < max_len:
                        continue
                    prepped = {
                        'id': id,
                        'text': text_trunc,
                        'token_ids_padded': token_ids_padded,
                        'label': 0,
                        'label_text': 'machine-generated'
                    }
                    split_data.append(prepped)

        out_fp = PREPPED_REALGEN_TEXT_PATH / f'{gen_method}_{split}.pkl'
        save_file(split_data, out_fp, verbose=True)
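tokenize_and_prep is not shown in this listing. A minimal sketch under the assumption that it truncates to max_len and pads the token ids (the exact padding scheme and return values are inferred from the call site above):

def tokenize_and_prep(tokenizer, text, max_len):
    # Hypothetical helper: tokenize, truncate to max_len, and pad the token ids
    token_ids = tokenizer.encode(text, add_special_tokens=True)[:max_len]
    text_len = len(token_ids)
    text_trunc = tokenizer.decode(token_ids, skip_special_tokens=True)
    token_ids_padded = token_ids + [tokenizer.pad_token_id] * (max_len - text_len)
    return text_trunc, text_len, token_ids_padded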
Example #10
    def end_of_epoch_hook(self, data_loader, epoch, outputs_path=None, writer=None):
        """
        Args:
            data_loader: DataLoader
            epoch: int
            outputs_path: str
            writer: Tensorboard Writer
        """
        for model in self.models:
            model.eval()

        with torch.no_grad():
            # Generate texts on validation set
            inference = self.inference_loop(data_loader, writer=writer, epoch=epoch)
            out_fp = os.path.join(outputs_path, 'samples_e{}.json'.format(epoch))
            utils.save_file(inference, out_fp, verbose=True)
Example #11
def decrypt_data(filename_path, new_filename_path, private_key_path):
    with open(filename_path, "rb") as file_in:
        logger.info(f"Decrypting file {filename_path}...")
        private_key = RSA.import_key(read_file(private_key_path))
        enc_session_key, nonce, tag, ciphertext = [
            file_in.read(x) for x in (private_key.size_in_bytes(), 16, 16, -1)
        ]

        # Decrypt the session key with the private RSA key
        cipher_rsa = PKCS1_OAEP.new(private_key)
        session_key = cipher_rsa.decrypt(enc_session_key)

        # Decrypt the data with the AES session key
        cipher_aes = AES.new(session_key, AES.MODE_EAX, nonce)
        data = cipher_aes.decrypt_and_verify(ciphertext, tag)
        decrypt_filename = extract_or_create_filename(new_filename_path,
                                                      filename_path)
        save_file(decrypt_filename, data)
        logger.info("File successfully decrypted.")
Example #12
    def save_inference_on_split(self,
                                loader=None,
                                dataset_split=None,
                                dir=None,
                                ext=None):
        """
		Args:
			loader: DataLoader
			dataset_split: str
			dir: str (location to save inference/<dataset_split>.pkl>
			ext: str (e.g. 'json', 'pkl'; extension of file)
		"""
        if loader is None:
            loader = self.get_data_loader(dataset_split,
                                          self.hp.batch_size,
                                          shuffle=False)
        inference = self.inference_loop(loader)
        fp = os.path.join(dir, 'inference', '{}.{}'.format(dataset_split, ext))
        utils.save_file(inference, fp, verbose=True)
Example #13
    def test_epoch_end(self, outputs):

        avg_test_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        avg_test_acc = torch.stack([x['test_acc'] for x in outputs]).mean()

        metrics = {
            'test_loss': avg_test_loss,
            'avg_test_acc': avg_test_acc,
            'log': {
                'test_epoch_loss': avg_test_loss,
                'test_epoch_acc': avg_test_acc
            }
        }

        # Save (this is coded with the ETS dataset in mind right now, so we can
        # check accuracy across different cuts)
        correctness = {}
        for x in outputs:
            for i, item_id in enumerate(x['id']):
                item_correct = x['correct'][i]
                correctness[item_id] = item_correct
        save_file(correctness, self.args.eval_out_fp)

        return metrics
Example #14
    def put(self, data):
        UPLOADS_FOLDER = current_app.config['UPLOADS_FOLDER']
        user = current_user

        if 'picture' in req.files:
            file = req.files['picture']
            filename = save_file(file, UPLOADS_FOLDER)
            data['avatar'] = filename

            if user.avatar:
                delete_file(UPLOADS_FOLDER, user.avatar)

        user.update(**data)

        return user
Example #15
    def generate_and_save(self, data_loader, epoch, n_gens, outputs_path=None):
        """
        Generate and save drawings
        """
        n = 0
        gen_strokes = []
        gt_strokes = []
        gt_texts = []
        for i, batch in enumerate(data_loader):
            batch = self.preprocess_batch_from_data_loader(batch)
            strokes, stroke_lens, texts, text_lens, text_indices, cats, cats_idx, urls = batch

            max_len, bsz, _ = strokes.size()

            if self.hp.cond_instructions == 'decinputs':
                # Encode instructions
                # text_indices: [len, bsz], text_lens: [bsz]
                instructions_emb = self.enc(
                    text_indices,
                    text_lens,
                    self.text_embedding,
                    category_embedding=self.category_embedding,
                    categories=cats_idx)  # [bsz, enc_dim]
                z = instructions_emb

            hidden_cell = (nn_utils.move_to_cuda(
                torch.zeros(1, bsz, self.hp.dec_dim)),
                           nn_utils.move_to_cuda(
                               torch.zeros(1, bsz, self.hp.dec_dim)))

            # initialize state with start of sequence stroke-5 stroke
            sos = torch.stack([torch.Tensor([0, 0, 1, 0, 0])] * bsz).unsqueeze(
                0)  # [1 (len), bsz, 5 (stroke-5)]
            sos = nn_utils.move_to_cuda(sos)

            # generate until end of sequence or maximum sequence length
            s = sos
            seq_x = []  # delta-x
            seq_y = []  # delta-y
            seq_pen = []  # pen-down
            for _ in range(max_len):
                if self.hp.cond_instructions == 'decinputs':  # input is last state, z, and hidden_cell
                    input = torch.cat(
                        [s, z.unsqueeze(0)], dim=2
                    )  # [1 (len), 1 (bsz), input_dim (5) + z_dim (128)]

                elif self.hp.cond_instructions == 'match':  # input is last state and hidden_cell
                    input = s  # [1, bsz (1), 5]

                if self.hp.use_categories_dec \
                    and hasattr(self, 'category_embedding'):
                    # hack because VAE was trained with use_categories_dec=True but didn't actually have a category embedding
                    cat_embs = self.category_embedding(
                        cats_idx)  # [bsz (1), cat_dim]
                    input = torch.cat([input, cat_embs.unsqueeze(0)],
                                      dim=2)  # [1, 1, dim]
                    # dim = 5 + cat_dim if decodergmm, 5 + z_dim + cat_dim if vae

                outputs, pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy, q, hidden, cell = \
                    self.dec(input, stroke_lens=stroke_lens, output_all=False, hidden_cell=hidden_cell)
                hidden_cell = (hidden, cell)  # for next time step
                # sample next state
                s, dx, dy, pen_up, eos = self.sample_next_state(
                    pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy, q)

                seq_x.append(dx)
                seq_y.append(dy)
                seq_pen.append(pen_up)

                if eos:  # done drawing
                    break

            # get in format to draw image
            # Cumulative sum because seq_x and seq_y are deltas, so get x (or y) at each stroke
            sample_x = np.cumsum(seq_x, 0)
            sample_y = np.cumsum(seq_y, 0)
            sample_pen = np.array(seq_pen)
            sequence = np.stack([sample_x, sample_y, sample_pen]).T
            # output_fp = os.path.join(outputs_path, f'e{epoch}-gen{n}.jpg')
            # save_strokes_as_img(sequence, output_fp)

            # Save original as well
            output_fp = os.path.join(outputs_path, f'e{epoch}-gt{n}.jpg')
            strokes_x = strokes[:, 0,
                                0]  # first 0 for x because sample_next_state etc. only using 0-th batch item; 2nd 0 for dx
            strokes_y = strokes[:, 0, 1]  # 1 for dy
            strokes_x = np.cumsum(strokes_x.cpu().numpy())
            strokes_y = np.cumsum(strokes_y.cpu().numpy())
            strokes_pen = strokes[:, 0, 3].cpu().numpy()
            strokes_out = np.stack([strokes_x, strokes_y, strokes_pen]).T
            # save_strokes_as_img(strokes_out, output_fp)

            gen_strokes.append(sequence)
            gt_strokes.append(strokes_out)
            gt_texts.append(texts[0])  # 0 because batch size is 1

            n += 1
            if n == n_gens:
                break

        # save grid drawings
        rowcol_size = 5
        chunk_size = rowcol_size**2
        for i in range(0, len(gen_strokes), chunk_size):
            output_fp = os.path.join(outputs_path,
                                     f'e{epoch}_gen{i}-{i+chunk_size}.jpg')
            save_multiple_strokes_as_img(gen_strokes[i:i + chunk_size],
                                         output_fp)

            output_fp = os.path.join(outputs_path,
                                     f'e{epoch}_gt{i}-{i+chunk_size}.jpg')
            save_multiple_strokes_as_img(gt_strokes[i:i + chunk_size],
                                         output_fp)

            # save texts
            output_fp = os.path.join(outputs_path,
                                     f'e{epoch}_texts{i}-{i+chunk_size}.json')
            utils.save_file(gt_texts[i:i + chunk_size], output_fp)
Example #16
def create_retrieval_set(N=200, instruction='toplevel_s2iprob'):
    """
    Create a retrieval set by selecting N drawings per category.
    Uses generated instruction trees.

    Args:
        N (int): size of retrieval set per category
        instruction (str): method for extracting instruction
    """

    # Walk over instruction trees
    seg_tree_path = BEST_SEG_NDJSON_PATH
    seg_tree_path = 'data/quickdraw/segmentations/greedy_parsing/progressionpair/Feb18_2020/strokes_to_instruction/S2IimgsFeb13/'
    for root, dirs, fns in os.walk(seg_tree_path):
        pqueue = []
        category = os.path.basename(root)

        # n = 0
        for fn in fns:
            if (fn != 'hp.json') and fn.endswith('json') and ('treant' not in fn):
                fp = os.path.join(root, fn)
                seg_tree = utils.load_file(fp)
                drawing_id = fn.replace('.json', '')
                # drawing_id = fn.replace('.json', '').split('_')[1]  # for progression pair?

                if instruction == 'toplevel_s2iprob':
                    text = seg_tree[0]['text']

                heapq.heappush(
                    # cat_to_pqueue[category],
                    pqueue,
                    (seg_tree[0]['score'], drawing_id, text, seg_tree)
                )
                # n += 1
                # if n == 250:
                #     break

        # We are in a directory with seg_trees
        if len(pqueue) > 0:
            print(category)
            # get best instructions
            best = heapq.nlargest(N, pqueue)

            # load drawings
            cat_drawings = ndjson_drawings(category)
            id_to_idx = {d['key_id']: idx for idx, d in enumerate(cat_drawings)}

            # save best
            best_out = []
            for score, id, text, seg_tree in best:
                stroke3 = ndjson_to_stroke3(cat_drawings[id_to_idx[id]]['drawing'])
                out = {
                    'score': score,
                    'id': id,
                    'text': text,
                    'stroke3': stroke3
                }
                best_out.append(out)

            # id = best_out[1]['id']
            # save_img(category, id, cat_drawings, id_to_idx)
            # pp(best[1][3])
            # import pdb; pdb.set_trace()

            out_fp = RETRIEVAL_SET_PATH / instruction / 'data' / f'{category}.pkl'
            utils.save_file(best_out, out_fp)

            # save a version with just the non-stroke data for easy viewing
            best_out_no_drawing = []
            for d in best_out:
                best_out_no_drawing.append({'score': float(d['score']), 'id': d['id'], 'text': d['text']})
            out_fp = RETRIEVAL_SET_PATH / instruction / 'data' / f'{category}_nodrawing.json'
            utils.save_file(best_out_no_drawing, out_fp)

            # Save drawings
            chunk_n = 25
            for i in range(0, N, chunk_n):
                best_chunk = best_out[i:i+chunk_n]
                drawings = []
                for b in best_chunk:
                    # stroke3 format is in x y deltas, save_multiple_strokes...() expects the actual x y points
                    b['stroke3'][:,0] = np.cumsum(b['stroke3'][:,0])
                    b['stroke3'][:,1] = np.cumsum(b['stroke3'][:,1])
                    drawings.append(b['stroke3'])
                out_dir = RETRIEVAL_SET_PATH / instruction / 'drawings'
                os.makedirs(out_dir, exist_ok=True)
                out_fp = out_dir / f'{category}_{i}-{i+chunk_n}.jpg'
                save_multiple_strokes_as_img(drawings, out_fp)
Example #17
def save_annotated_progression_pairs_data():
    """
    Save one <category>.pkl file per category: a list of per-drawing data dicts.
    Each data dict contains:
        url: S3 url of progression pair
        annotation: instruction written by MTurker

        ndjson_strokes: drawing in ndjson format (list of subsegments, each subsegment is list of x y points)
        ndjson_start: ndjson_strokes index of start of annotated segment
            - Offset by 1 relative to ndjson_strokes
            - When 0, this is the start of the drawing (before any strokes)
        ndjson_end: ndjson_strokes index of end of annotated segment
            - Offset by 1 relative to ndjson_strokes

        stroke3: drawing in stroke-3 format: numpy array of shape [len, 3] (x, y, pen_up)
        stroke3_start: stroke3 index of start of annotated segment
        stroke3_end: stroke3 index of end of annotated segment
        stroke3_segment: numpy array of shape [len, 3] (x, y, pen_up)
            segment that was annotated (drawing from _start to _end of progression pair)
    """
    os.makedirs(LABELED_PROGRESSION_PAIRS_DATA_PATH, exist_ok=True)

    df = pd.read_csv(ANNOTATED_PROGRESSION_PAIRS_CSV_PATH)
    for cat in df['Input.category'].unique():
        df_cat = df[df['Input.category'] == cat]
        print(cat, len(df_cat))

        id_to_data = defaultdict(dict)

        # get ndjson stroke data
        drawings = ndjson_drawings(cat)
        id_to_strokes = defaultdict(dict)
        for data in drawings:
            id = data['key_id']
            id_to_strokes[id]['ndjson_strokes'] = data['drawing']
            stroke3 = ndjson_to_stroke3(data['drawing'])
            id_to_strokes[id]['stroke3'] = stroke3

        # map annotations to strokes
        for i in range(len(df_cat)):
            id = df_cat.iloc[i]['Input.id']
            id = str(id)
            annotation = df_cat.iloc[i]['Answer.annotation'].replace('\r', '')
            ndjson_start = df_cat.iloc[i]['Input.start']
            ndjson_end = df_cat.iloc[i]['Input.end']
            url = df_cat.iloc[i]['Input.url']

            id_to_data[id]['ndjson_start'] = int(ndjson_start)
            id_to_data[id]['ndjson_end'] = int(ndjson_end)
            id_to_data[id]['url'] = url
            id_to_data[id]['annotation'] = annotation

            ndjson_strokes = id_to_strokes[id]
            stroke3 = id_to_strokes[id]['stroke3']
            id_to_data[id]['ndjson_strokes'] = id_to_strokes[id][
                'ndjson_strokes']
            id_to_data[id]['stroke3'] = stroke3

            # save portion of stroke3 corresponding to start and end
            pen_up = np.where(
                id_to_strokes[id]['stroke3'][:, 2] == 1)[0].tolist()
            # prepend 0 so that ndjson_start == 0 indexes the beginning of the drawing
            pen_up.insert(0, 0)
            stroke3_start = 0 if (ndjson_start
                                  == 0) else (pen_up[ndjson_start] + 1)
            stroke3_end = pen_up[ndjson_end]
            id_to_data[id]['stroke3_start'] = stroke3_start
            id_to_data[id]['stroke3_end'] = stroke3_end
            id_to_data[id]['stroke3_segment'] = stroke3[
                stroke3_start:stroke3_end + 1, :]

        # flatten
        result = []
        for id, data in id_to_data.items():
            data['id'] = id
            data['category'] = cat
            result.append(data)

        # save
        out_fn = f'{cat}.pkl'
        out_fp = LABELED_PROGRESSION_PAIRS_DATA_PATH / out_fn
        utils.save_file(result, out_fp)
Example #18
def save_annotated_precurrentpost_data():
    """
    Combine precurrentpost images (generated from save_drawings_split_into_precurrentpost())
    with annotations. Preprocesses for use with a data loader.

    Save one <category>.pkl file per category: a list of per-drawing data dicts.
    Each data dict contains:
        url: S3 url of progression pair
        annotation: instruction written by MTurker

        ndjson_strokes: drawing in ndjson format (list of subsegments, each subsegment is list of x y points)
        ndjson_start: ndjson_strokes index of start of annotated segment
            - Offset by 1 relative to ndjson_strokes
            - When 0, this is the start of the drawing (before any strokes)
        ndjson_end: ndjson_strokes index of end of annotated segment
            - Offset by 1 relative to ndjson_strokes

        pre_seg_fp: filepath to image of strokes before annotated segment
        annotated_seg_fp: filepath to image of annotated segment
        post_seg_fp: filepath to image of strokes after annotated segment
        full_fp: filepath to image of full drawing
    """
    os.makedirs(PRECURRENTPOST_DATAWITHANNOTATIONS_PATH, exist_ok=True)

    df = pd.read_csv(ANNOTATED_PROGRESSION_PAIRS_CSV_PATH)
    for cat in df['Input.category'].unique():
        df_cat = df[df['Input.category'] == cat]
        print(cat, len(df_cat))

        id_to_data = defaultdict(dict)

        # map annotations to strokes
        for i in range(len(df_cat)):
            id = df_cat.iloc[i]['Input.id']
            id = str(id)
            annotation = df_cat.iloc[i]['Answer.annotation'].replace('\r', '')
            ndjson_start = df_cat.iloc[i]['Input.start']
            ndjson_end = df_cat.iloc[i]['Input.end']
            url = df_cat.iloc[i]['Input.url']
            n_segments = df_cat.iloc[i]['Input.n_segments']

            id_to_data[id]['ndjson_start'] = int(ndjson_start)
            id_to_data[id]['ndjson_end'] = int(ndjson_end)
            id_to_data[id]['url'] = url
            id_to_data[id]['annotation'] = annotation

            id_to_data[id]['pre_seg_fp'] = str(PRECURRENTPOST_DATA_PATH / cat /
                                               id / f'0-{ndjson_start}.jpg')
            id_to_data[id]['annotated_seg_fp'] = str(
                PRECURRENTPOST_DATA_PATH / cat / id /
                f'{ndjson_start}-{ndjson_end}.jpg')
            id_to_data[id]['start_to_annotated_fp'] = str(
                PRECURRENTPOST_DATA_PATH / cat / id / f'0-{ndjson_end}.jpg')
            id_to_data[id]['post_seg_fp'] = str(
                PRECURRENTPOST_DATA_PATH / cat / id /
                f'{ndjson_end}-{n_segments}.jpg')
            id_to_data[id]['full_fp'] = str(PRECURRENTPOST_DATA_PATH / cat /
                                            id / 'full.jpg')

        # flatten
        result = []
        for id, data in id_to_data.items():
            data['id'] = id
            data['category'] = cat
            result.append(data)

        # save
        out_fn = f'{cat}.pkl'
        out_fp = PRECURRENTPOST_DATAWITHANNOTATIONS_PATH / out_fn
        utils.save_file(result, out_fp)
Example #19
import torch
import os
import sys
import numpy as np
from copy import deepcopy
import argparse

from src.Criterion import Criterion
from src.utils import load_file, save_file


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", help="path to input(.bin) file", required=True)
    parser.add_argument("-t", help="path to target(.bin) file", required=True)
    parser.add_argument("-ig", help="path to gradInput(.bin) file", required=True)

    args = parser.parse_args()

    inp = load_file(args.i)
    target = load_file(args.t)

    ce_loss = Criterion()
    loss = ce_loss.forward(inp, target)
    print(loss)

    gradInput = ce_loss.backward(inp, target)
    save_file(gradInput, args.ig)
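Criterion comes from the repository's own src package and its implementation is not part of this listing. Purely as an illustrative sketch (an assumption about the interface, not the actual class), a softmax cross-entropy criterion with the same forward/backward signature could look like:

import torch

class CrossEntropyCriterion:
    """Hypothetical stand-in for src.Criterion: mean softmax cross-entropy."""

    def forward(self, inp, target):
        # inp: [batch, num_classes] scores; target: [batch] integer class labels
        log_probs = torch.log_softmax(inp, dim=1)
        return -log_probs[torch.arange(inp.size(0)), target.long()].mean()

    def backward(self, inp, target):
        # d(mean loss)/d(inp) = (softmax(inp) - one_hot(target)) / batch_size
        grad = torch.softmax(inp, dim=1)
        grad[torch.arange(inp.size(0)), target.long()] -= 1.0
        return grad / inp.size(0)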
Example #20
    parser = add_generation_args(parser)
    args = parser.parse_args()
    set_seed_for_gen(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # GPT-2 model and tokenizer trained on school reviews, saved as pickles
    model_fp = 'trained_models/school_reviews/gpt2/wandb/model_e0.pkl'
    tokenizer_fp = 'trained_models/school_reviews/gpt2/wandb/tokenizer.pkl'
    print('Loading')

    from transformers import GPT2LMHeadModel, GPT2TokenizerFast

    model = load_file(model_fp)
    tokenizer = load_file(tokenizer_fp)

    gpt2 = GPT2Wrapper(args, model=model, tokenizer=tokenizer)
    gpt2 = gpt2.to(device)
    print('Loaded')

    OUT_FP = 'data/school_reviews/train_detector/trainedonallreviews_gpt2-xl_e0.json'

    texts = []
    for i in range(5000):
        text = gpt2.generate_unconditional(n=1, bsz=1, stdout=True)[0]
        texts.append(text)
        if i % 10 == 0:
            save_file(texts, OUT_FP)
Example #21
    def save(self, file_path):
        model_data = [x.as_dict() for x in self.Layers]
        save_file(model_data, file_path)
Example #22
def calc_stats_for_runs_in_dir(dir, best_n=10):
    """
    Print runs with best stats in <dir>
    Assumes each run has file with the name: 'e<epoch>_loss<loss>.pt'.

    Args:
        dir (str)
        best_n (int)
    """
    print(f'Looking in: {dir}\n')

    n = 0
    runs_stats = []
    hiplot_stats = []
    for root, dirs, fns in os.walk(dir):
        for fn in fns:
            # Get loss from model fn (e<epoch>_loss<loss>.pt)
            if fn.endswith('pt') and ('loss' in fn):
                # Get best samples
                epoch = fn.split('_')[0].replace('e', '')
                loss = float(fn.split('loss')[1].strip('.pt'))
                run = root.replace(dir + '/', '')
                best_sample_fp = os.path.join(root, 'outputs',
                                              f'samples_e{epoch}.json')

                # Calculate stats
                m2scores, m2cat2scores = calc_bleu_and_rouge_on_samples(
                    best_sample_fp, print=False)
                gt_toks, gen_toks = calc_rare_words_stats(best_sample_fp,
                                                          print=False)
                run_results = {
                    'n_gen_toks': len(gen_toks),
                    'n_gt_toks': len(gt_toks),
                    'loss': loss,
                    'rouge1': np.mean(m2scores['rouge1']),
                    'rouge2': np.mean(m2scores['rouge2']),
                    'rougeL': np.mean(m2scores['rougeL']),
                    'bleu1': np.mean(m2scores['bleu1']),
                    'bleu2': np.mean(m2scores['bleu2']),
                }
                runs_stats.append([run, run_results])

                # Save json data to be visualized by hiplot
                hp_dict = utils.load_file(os.path.join(root, 'hp.json'))
                run_hiplot = {}
                for k, val in hp_dict.items():
                    run_hiplot[k] = val
                run_hiplot.update(run_results)
                hiplot_stats.append(run_hiplot)

                n += 1

    #
    # Write best runs sorted to file
    out_fp = os.path.join(dir, 'best_runs.txt')
    with open(out_fp, 'w') as f:
        print('-' * 100)
        for main_stat in runs_stats[0][1].keys():  # n_gen_toks, loss, rougeL, bleu1, bleu2
            print(f'RUNS WITH BEST: {main_stat}', file=f)
            if main_stat == 'loss':  # lower is better
                sorted_by_main_stat = sorted(
                    runs_stats, key=lambda x: -x[1][main_stat])[-best_n:]
            else:  # higher is better
                sorted_by_main_stat = sorted(
                    runs_stats, key=lambda x: x[1][main_stat])[-best_n:]

            for run, stats in sorted_by_main_stat:
                main_stat_val = stats[main_stat]
                other_stats_str = ', '.join([
                    '{}: {:.4f}'.format(stat, val)
                    for stat, val in stats.items() if (main_stat != stat)
                ])
                out_str = '{}: {:.4f}'.format(main_stat, main_stat_val)
                print(out_str + ', ' + other_stats_str + ', run: ' + run,
                      file=f)
            print(file=f)

    # Print to stdout
    for line in open(out_fp, 'r').readlines():
        print(line.strip())
    print('\nWrote best runs sorted to: ', out_fp)

    #
    # Save hiplot data in runs/strokes_to_instruction/Feb14_2020/imagesweep_textaug_rankimgs/
    #
    out_fn = 'hiplot_data.json'
    out_fp = os.path.join(dir, out_fn)
    print()
    utils.save_file(hiplot_stats, out_fp, verbose=True)
Example #23
    parser.add_argument("-ig",
                        help="path to gradInput(.bin) file",
                        required=True)

    args = parser.parse_args()

    model = create_model(args.config)
    inp = load_file(args.i)
    num_input_nodes = np.prod(inp.shape[1:])

    inp = inp.reshape(-1, (num_input_nodes))
    out = model.forward(inp)

    model.clearGradParam()

    gradOutput = load_file(args.og)
    model.backward(inp, gradOutput)

    # save output
    save_file(out, args.o)

    # save gradW and gradB
    gradW, gradB = model.getGradParam()
    save_file(gradW, args.ow)
    save_file(gradB, args.ob)

    # save gradInput
    save_file(model.Layers[0].gradInput, args.ig)

    model.save("outputs/model2.bin")
Example #24
    os.makedirs(out + '/mismatch/hex')

if not os.path.isdir(out + '/illegal'):
    os.makedirs(out + '/illegal')
    os.makedirs(out + '/illegal/sim_input')
    os.makedirs(out + '/illegal/elf')
    os.makedirs(out + '/illegal/asm')
    os.makedirs(out + '/illegal/hex')

if not os.path.isdir(out + '/corpus'):
    os.makedirs(out + '/corpus')

date = datetime.today().strftime('%Y%m%d')
cov_log = out + '/cov_log_{}.txt'.format(date)
if (multicore or record) and not os.path.isfile(cov_log):
    save_file(cov_log, 'w',
              '{:<10}\t{:<10}\t{:<10}\n'.format('time', 'iter', 'coverage'))

start_time = time.time()

if not multicore:
    if minimize:
        factory = TestFactory(Minimize)
        factory.add_option('toplevel', [toplevel])
        factory.add_option('template', [template])
        factory.add_option('out', [out])
        factory.add_option('debug', [debug])

    else:
        factory = TestFactory(Run)
        parser.register_option(factory)
        factory.add_option('cov_log', [cov_log])