Esempio n. 1
0
    def __init__(self, data_path, tokenizer, transforms, vocab, args):
        """Build the dataset from a tab-separated file of sentence pairs.

        Args:
            data_path: Path to a TSV file that contains at least the columns
                ``gold_label``, ``sentence1``, ``sentence2`` and ``image``.
            tokenizer: Stored as ``self.tokenizer``; not used in this method.
            transforms: Stored as ``self.transforms``; not used in this method.
            vocab: Stored as ``self.vocab``; not used in this method.
            args: Configuration object. This method reads ``img_path``,
                ``labels``, ``model``, ``drop_img_percent``, ``max_seq_len``
                and, when ``model == "mmbt"``, ``num_image_embeds``.
        """
        wanted_columns = ['gold_label', 'sentence1', 'sentence2', 'image']
        table = pd.read_csv(data_path, sep='\t')[wanted_columns]
        print(f'{data_path}, number of rows: {len(table)}')

        # Rows without a second sentence are reported, then discarded.
        missing_sentence2 = table['sentence2'].isnull()
        n_missing = missing_sentence2.sum()
        if n_missing > 0:
            print(
                f" drop number of lines because of missing sentence2: {n_missing}"
            )
        table = table.loc[~missing_sentence2]

        table = table.rename(columns={'gold_label': 'label', 'image': 'img'})
        # Prefix each image file name with the configured image directory.
        table['img'] = args.img_path + '/' + table['img']

        records = table.to_dict('records')
        uses_mmbt = args.model == "mmbt"

        self.data = records
        self.data_dir = str(os.path.dirname(data_path))
        self.tokenizer = tokenizer
        self.transforms = transforms
        self.vocab = vocab
        self.args = args
        self.n_classes = len(args.labels)

        # Every model except "mmbt" starts the text with [CLS]; mmbt uses [SEP].
        if uses_mmbt:
            self.text_start_token = ["[SEP]"]
        else:
            self.text_start_token = ["[CLS]"]

        # Blank out the image of a random subset of rows, one RNG draw per row.
        # NOTE(review): numpy_seed(0) presumably pins NumPy's RNG inside the
        # block so the same rows are picked on every run -- confirm against
        # its definition.
        with numpy_seed(0):
            for record in records:
                if np.random.random() < args.drop_img_percent:
                    record["img"] = None

        # For mmbt the image embeddings are subtracted from the text budget.
        seq_budget = args.max_seq_len
        if uses_mmbt:
            seq_budget -= args.num_image_embeds
        self.max_seq_len = seq_budget
Esempio n. 2
0
    def __init__(self, data_path, tokenizer, transforms, vocab, args):
        """Build the dataset from a JSON-lines file (one JSON value per line).

        Args:
            data_path: Path to the JSON-lines file. Lines containing only
                whitespace are skipped.
            tokenizer: Stored as ``self.tokenizer``; not used in this method.
            transforms: Stored as ``self.transforms``; not used in this method.
            vocab: Stored as ``self.vocab``; not used in this method.
            args: Configuration object. This method reads ``labels``,
                ``model``, ``drop_img_percent``, ``max_seq_len`` and, when
                ``model == "mmbt"``, ``num_image_embeds``.

        Raises:
            OSError: If ``data_path`` cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        # Use a context manager so the file handle is closed deterministically
        # instead of being left for the garbage collector.
        with open(data_path) as data_file:
            # A trailing blank line is common in JSON-lines files and would
            # otherwise make json.loads raise.
            self.data = [
                json.loads(line) for line in data_file if line.strip()
            ]
        self.data_dir = os.path.dirname(data_path)
        self.tokenizer = tokenizer
        self.args = args
        self.vocab = vocab
        self.n_classes = len(args.labels)
        # Every model except "mmbt" starts the text with [CLS]; mmbt uses [SEP].
        self.text_start_token = ["[CLS]"
                                 ] if args.model != "mmbt" else ["[SEP]"]

        # Blank out the image of a random subset of rows, one RNG draw per row.
        # NOTE(review): numpy_seed(0) presumably pins NumPy's RNG inside the
        # block so the same rows are picked on every run -- confirm against
        # its definition.
        with numpy_seed(0):
            for row in self.data:
                if np.random.random() < args.drop_img_percent:
                    row["img"] = None

        # For mmbt the image embeddings are subtracted from the text budget.
        self.max_seq_len = args.max_seq_len
        if args.model == "mmbt":
            self.max_seq_len -= args.num_image_embeds

        self.transforms = transforms