from functools import partial

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import BertConfig, BertModel, BertTokenizer

from catalyst import utils
from catalyst.data import LambdaReader

# ``tokenize_text``, ``process_bert_output`` and ``_detach`` are helpers
# defined alongside this script in the catalyst codebase.


def main(args, _=None):
    """Run the ``catalyst-data text2embeddings`` script."""
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    # build the model and tokenizer, either from a HuggingFace checkpoint
    # or from a local config/vocab pair
    if hasattr(args, "in_huggingface"):
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(
            args.in_huggingface, config=model_config
        )
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    if hasattr(args, "in_model"):
        checkpoint = utils.load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    # read the texts, drop rows without text, and keep a copy of the
    # cleaned table next to the embeddings
    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df, open_fn, batch_size=batch_size, num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            bert_output = model(**batch)
            mask = (
                batch["attention_mask"].unsqueeze(-1)
                if args.mask_for_max_length
                else None
            )

            if utils.check_ddp_wrapped(model):
                # several GPUs: the config lives on ``model.module``
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states
            else:
                # CPU or a single GPU
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            features_ = process_bert_output(
                bert_output=bert_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create the memmap storage based on the network output
            if idx == 0:
                for key, value in features_.items():
                    name_ = key if isinstance(key, str) else f"{key:02d}"
                    _, embedding_size = value.shape
                    features[name_] = np.memmap(
                        f"{args.out_prefix}.{name_}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(
                idx * batch_size, min((idx + 1) * batch_size, num_samples)
            )
            for key, value in features_.items():
                name_ = key if isinstance(key, str) else f"{key:02d}"
                features[name_][indices] = _detach(value)
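# Usage sketch (hypothetical file names and argument values; in the real
# script the ``catalyst-data text2embeddings`` CLI builds ``args`` through
# argparse, so treat every field below as an assumption about that parser):
def _example_text2embeddings_usage():
    from argparse import Namespace

    args = Namespace(
        in_huggingface="bert-base-uncased",  # any HuggingFace BERT checkpoint
        in_csv="texts.csv",                  # hypothetical input table
        txt_col="text",                      # column holding the raw text
        out_prefix="embeddings/bert",        # prefix for the memmap outputs
        batch_size=32,
        num_workers=4,
        max_length=128,
        pooling="avg",                       # comma-separated pooling groups
        seed=42,
        deterministic=True,
        benchmark=False,
        output_hidden_states=False,
        strip=True,
        lowercase=True,
        remove_punctuation=True,
        mask_for_max_length=True,
        verbose=True,
    )
    # no ``in_model`` attribute is set, so the checkpoint-loading branch is
    # skipped; each pooling group ends up in ``{out_prefix}.{name}.npy`` as a
    # float32 memmap of shape (num_samples, embedding_size)
    main(args)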
import collections
import json
import os

import cv2

# ``ReaderCompose``, ``ImageReader``, ``LambdaReader``, ``ScalarReader``,
# ``ListDataset`` and ``read_csv_data`` come from the surrounding
# catalyst imports.


def get_datasets(
    self,
    stage: str,
    datapath: str = None,
    in_csv: str = None,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    train_folds: str = None,
    valid_folds: str = None,
    tag2class: str = None,
    class_column: str = None,
    tag_column: str = None,
    folds_seed: int = 42,
    n_folds: int = 5,
    image_size: int = 256,
):
    """Build ordered train/valid/infer datasets for the given stage."""
    datasets = collections.OrderedDict()
    if tag2class is not None:
        with open(tag2class) as fin:
            tag2class = json.load(fin)

    df, df_train, df_valid, df_infer = read_csv_data(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        seed=folds_seed,
        n_folds=n_folds,
    )

    def encode_fn_lambda(fname, datapath):
        # read a mask file and binarize it into an (H, W, 1) array
        mask = cv2.imread(os.path.join(datapath, fname))
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
        return (mask // 255)[:, :, None]

    open_fn = ReaderCompose(
        readers=[
            ImageReader(
                input_key="images", output_key="image", datapath=datapath
            ),
            LambdaReader(
                input_key="masks",
                output_key="mask",
                datapath=datapath,
                encode_fn=encode_fn_lambda,
            ),
            # MaskReader(
            #     input_key="masks", output_key="mask", datapath=datapath
            # ),
            ScalarReader(
                input_key="name",
                output_key="name",
                default_value=-1,
                dtype=str,
            ),
        ]
    )

    for mode, source in zip(
        ("train", "valid", "infer"), (df_train, df_valid, df_infer)
    ):
        if len(source) > 0:
            datasets[mode] = ListDataset(
                list_data=source,
                open_fn=open_fn,
                dict_transform=self.get_transforms(
                    stage=stage, mode=mode, image_size=image_size
                ),
            )

    return datasets
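# Usage sketch (hypothetical paths and fold values; ``experiment`` stands in
# for an instance of the experiment class this method belongs to, and the
# fold arguments assume catalyst's usual splitting in ``read_csv_data``):
def _example_get_datasets_usage(experiment):
    datasets = experiment.get_datasets(
        stage="stage1",
        datapath="data/",            # folder holding the image and mask files
        in_csv="data/dataset.csv",   # single CSV, split into folds below
        train_folds="0,1,2,3",
        valid_folds="4",
        folds_seed=42,
        n_folds=5,
        image_size=256,
    )
    sample = datasets["train"][0]    # dict with "image", "mask" and "name"
    return datasets, sample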