def dcase_predict(self,
                  experiment_path: str,
                  feature_file: str,
                  predict_scp: str,
                  output: str = "prediction.csv",
                  **kwargs):
    """kwargs: {'max_length': int, 'method': str, 'beam_size': int}"""
    dump = torch.load(os.path.join(experiment_path, "saved.pth"),
                      map_location="cpu")
    # Load previous training config
    config = dump["config"]
    vocabulary = torch.load(config["vocab_file"])
    model = self._get_model(config, len(vocabulary))
    model.load_state_dict(dump["model"])
    # Feature scaler fitted during training (e.g. sklearn StandardScaler)
    scaler = dump["scaler"]
    zh = config["zh"]
    model = model.to(self.device)

    dataset = SJTUDatasetEval(feature=feature_file,
                              eval_scp=predict_scp,
                              transform=scaler.transform)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             shuffle=False,
                                             collate_fn=collate_fn((1, )),
                                             batch_size=32,
                                             num_workers=0)

    pbar = ProgressBar(persist=False, ascii=True)
    predictions = []

    def _sample(engine, batch):
        # batch: [keys, feats, feat_lens]
        with torch.no_grad():
            model.eval()
            keys = batch[0]
            output = self._forward(model, batch, mode="sample", **kwargs)
            seqs = output["seqs"].cpu().numpy()
            for idx, seq in enumerate(seqs):
                caption = self._convert_idx2sentence(seq, vocabulary, zh=zh)
                predictions.append({
                    "file_name": keys[idx] + ".wav",
                    "caption_predicted": caption
                })

    sample_engine = Engine(_sample)
    pbar.attach(sample_engine)
    sample_engine.run(dataloader)

    pred_df = pd.DataFrame(predictions)
    pred_df.to_csv(os.path.join(experiment_path, output), index=False)
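# --- Illustrative sketch (not part of the runner) ---
# dcase_predict collects one row per clip with the columns "file_name" and
# "caption_predicted" before writing them to a CSV. The toy values below are
# assumptions used only to show the resulting layout.
import pandas as pd

_toy_predictions = [
    {"file_name": "clip_0001.wav", "caption_predicted": "a dog barks in the distance"},
    {"file_name": "clip_0002.wav", "caption_predicted": "rain falls on a metal roof"},
]
print(pd.DataFrame(_toy_predictions))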
def _ensemble(self,
              path1: str,
              path2: str,
              kaldi_stream: str,
              kaldi_scp: str,
              max_length: int = None):
    dump1 = torch.load(path1, map_location="cpu")
    dump2 = torch.load(path2, map_location="cpu")
    model1 = dump1["model"].to(self.device)
    model2 = dump2["model"].to(self.device)
    # The scaler and config of the first model are used for both models
    scaler = dump1["scaler"]
    config = dump1["config"]
    vocabulary = torch.load(config["vocab_file"])

    dataset = SJTUDatasetEval(kaldi_stream=kaldi_stream,
                              kaldi_scp=kaldi_scp,
                              transform=scaler.transform)
    # dataset[i]: key, feature
    dataloader = torch.utils.data.DataLoader(dataset,
                                             shuffle=False,
                                             collate_fn=collate_fn((1, )),
                                             batch_size=32,
                                             num_workers=0)

    if max_length is None:
        max_length = model1.max_length

    pbar = ProgressBar(persist=False, ascii=True)
    key2pred = {}

    def _sample(engine, batch):
        # batch: [ids, feats, feat_lens]
        ids = batch[0]
        feats = batch[1]
        feat_lens = batch[-1]
        seqs = self._sample_batch(model1, model2, feats, feat_lens, max_length)
        seqs = seqs.cpu().numpy()
        for idx, seq in enumerate(seqs):
            caption = []
            for word_id in seq:
                word = vocabulary.idx2word[word_id]
                if word == "<start>":
                    continue
                elif word == "<end>":
                    break
                else:
                    caption.append(word)
            caption = " ".join(caption)
            key2pred[ids[idx]] = [caption, ]

    sampler = Engine(_sample)
    pbar.attach(sampler)
    sampler.run(dataloader)
    return key2pred
def _get_dataloaders(config, vocabulary):
    scaler = getattr(pre, config["scaler"])(**config["scaler_args"])
    inputdim = -1
    caption_df = pd.read_json(config["caption_file"], dtype={"key": str})
    for batch in tqdm(torch.utils.data.DataLoader(
            SJTUDataset(
                feature=config["feature_file"],
                caption_df=caption_df,
                vocabulary=vocabulary,
            ),
            collate_fn=collate_fn([0, 1]),
            **config["dataloader_args"]),
                      ascii=True):
        feat = batch[0]
        feat_lens = batch[-2]
        packed_feat = torch.nn.utils.rnn.pack_padded_sequence(
            feat, feat_lens, batch_first=True, enforce_sorted=False).data
        scaler.partial_fit(packed_feat)
        inputdim = feat.shape[-1]
    assert inputdim > 0, "Reading input feature failed"

    augments = train_util.parse_augments(config["augments"])
    train_keys = np.random.choice(
        caption_df["key"].unique(),
        int(len(caption_df["key"].unique()) * (config["train_percent"] / 100.)),
        replace=False)
    train_df = caption_df[caption_df["key"].apply(lambda x: x in train_keys)]
    val_df = caption_df[~caption_df.index.isin(train_df.index)]

    train_loader = torch.utils.data.DataLoader(
        SJTUDataset(feature=config["feature_file"],
                    caption_df=train_df,
                    vocabulary=vocabulary,
                    transform=[scaler.transform, augments]),
        shuffle=True,
        collate_fn=collate_fn([0, 1]),
        **config["dataloader_args"])

    if config["zh"]:
        train_key2refs = train_df.groupby("key")["tokens"].apply(list).to_dict()
        val_key2refs = val_df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        train_key2refs = train_df.groupby("key")["caption"].apply(list).to_dict()
        val_key2refs = val_df.groupby("key")["caption"].apply(list).to_dict()

    val_loader = torch.utils.data.DataLoader(
        SJTUDataset(feature=config["feature_file"],
                    caption_df=val_df,
                    vocabulary=vocabulary,
                    transform=scaler.transform),
        shuffle=False,
        collate_fn=collate_fn([0, 1]),
        **config["dataloader_args"])

    return train_loader, val_loader, {
        "scaler": scaler,
        "inputdim": inputdim,
        "train_key2refs": train_key2refs,
        "val_key2refs": val_key2refs
    }
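# --- Illustrative sketch (not part of the loader code) ---
# A hypothetical config dict showing the keys _get_dataloaders reads; all values
# are placeholders, not project defaults. "scaler" is resolved via
# getattr(pre, ...), where pre is assumed to be sklearn.preprocessing.
example_config = {
    "caption_file": "data/captions/train.json",   # placeholder path
    "feature_file": "data/features/train.hdf5",   # placeholder path
    "scaler": "StandardScaler",
    "scaler_args": {},
    "dataloader_args": {"batch_size": 32, "num_workers": 4},
    "augments": [],          # parsed by train_util.parse_augments
    "train_percent": 90,     # share of keys assigned to the training split
    "zh": False,             # True: references come from "tokens" instead of "caption"
}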
def evaluate(self,
             experiment_path: str,
             feature_file: str,
             feature_scp: str,
             caption_file: str,
             caption_output: str = "eval_output.json",
             score_output: str = "scores.txt",
             **kwargs):
    """kwargs: {'max_length': int, 'method': str, 'beam_size': int}"""
    dump = torch.load(os.path.join(experiment_path, "saved.pth"),
                      map_location="cpu")
    # Load previous training config
    config = dump["config"]
    vocabulary = torch.load(config["vocab_file"])
    model = self._get_model(config, vocabulary)
    model.load_state_dict(dump["model"])
    # Feature scaler fitted during training (e.g. sklearn StandardScaler)
    scaler = dump["scaler"]
    zh = config["zh"]
    model = model.to(self.device)

    dataset = SJTUDatasetEval(feature=feature_file,
                              eval_scp=feature_scp,
                              transform=scaler.transform)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             shuffle=False,
                                             collate_fn=collate_fn((1, )),
                                             batch_size=32,
                                             num_workers=0)

    caption_df = pd.read_json(caption_file, dtype={"key": str})
    if zh:
        key2refs = caption_df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        key2refs = caption_df.groupby("key")["caption"].apply(list).to_dict()

    model.eval()
    key2pred = {}

    def _sample(engine, batch):
        with torch.no_grad():
            model.eval()
            keys = batch[0]
            output = self._forward(model, batch, mode="sample", **kwargs)
            seqs = output["seqs"].cpu().numpy()
            for idx, seq in enumerate(seqs):
                caption = self._convert_idx2sentence(seq, vocabulary, zh)
                key2pred[keys[idx]] = [caption, ]

    pbar = ProgressBar(persist=False, ascii=True)
    sampler = Engine(_sample)
    pbar.attach(sampler)
    sampler.run(dataloader)

    pred_df = []
    for key, pred in key2pred.items():
        pred_df.append({
            "filename": key + ".wav",
            "caption": "".join(pred[0]) if zh else pred[0],
            "tokens": pred[0] if zh else pred[0].split()
        })
    pred_df = pd.DataFrame(pred_df)
    pred_df.to_json(os.path.join(experiment_path, caption_output))

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.spice.spice import Spice

    f = open(os.path.join(experiment_path, score_output), "w")

    scorer = Bleu(n=4, zh=zh)
    score, scores = scorer.compute_score(key2refs, key2pred)
    for n in range(4):
        f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))

    scorer = Rouge(zh=zh)
    score, scores = scorer.compute_score(key2refs, key2pred)
    f.write("ROUGE: {:6.3f}\n".format(score))

    scorer = Cider(zh=zh)
    score, scores = scorer.compute_score(key2refs, key2pred)
    f.write("CIDEr: {:6.3f}\n".format(score))

    # Meteor and Spice are only computed for English captions
    if not zh:
        scorer = Meteor()
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("Meteor: {:6.3f}\n".format(score))

        scorer = Spice()
        score, scores = scorer.compute_score(key2refs, key2pred)
        f.write("Spice: {:6.3f}\n".format(score))

    f.close()
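# --- Illustrative sketch (not part of the evaluation code) ---
# Toy example (assumed values) of the dict format passed to
# scorer.compute_score(key2refs, key2pred) above: each key maps to a list of
# caption strings; references may hold several captions, predictions exactly one.
_key2refs_example = {
    "clip_0001": ["a dog barks loudly", "a dog is barking outside"],
    "clip_0002": ["rain hits a window"],
}
_key2pred_example = {
    "clip_0001": ["a dog barks in the distance"],
    "clip_0002": ["rain falls on a metal roof"],
}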
def sample(self,
           experiment_path: str,
           feature_file: str,
           feature_scp: str,
           output: str = "output_word.txt",
           **kwargs):
    """Generate captions given an experiment model.

    kwargs: {'max_length': int, 'method': str, 'beam_size': int}
    """
    import tableprint as tp

    dump = torch.load(os.path.join(experiment_path, "saved.pth"),
                      map_location="cpu")
    # Load previous training config
    config = dump["config"]
    vocabulary = torch.load(config["vocab_file"])
    model = self._get_model(config, len(vocabulary))
    model.load_state_dict(dump["model"])
    # Feature scaler fitted during training (e.g. sklearn StandardScaler)
    scaler = dump["scaler"]
    zh = config["zh"]
    model = model.to(self.device)

    dataset = SJTUDatasetEval(feature=feature_file,
                              eval_scp=feature_scp,
                              transform=scaler.transform)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             shuffle=False,
                                             collate_fn=collate_fn((1, )),
                                             batch_size=16,
                                             num_workers=0)

    width_length = 80
    pbar = ProgressBar(persist=False, ascii=True)
    writer = open(os.path.join(experiment_path, output), "w")
    writer.write(
        tp.header(["InputUtterance", "Output Sentence"],
                  width=[len("InputUtterance"), width_length]))
    writer.write('\n')

    sentences = []

    def _sample(engine, batch):
        # batch: [keys, feats, feat_lens]
        with torch.no_grad():
            model.eval()
            keys = batch[0]
            output = self._forward(model, batch, mode="sample", **kwargs)
            seqs = output["seqs"].cpu().numpy()
            for idx, seq in enumerate(seqs):
                caption = self._convert_idx2sentence(seq, vocabulary, zh=zh)
                if zh:
                    sentence = " ".join(caption)
                else:
                    sentence = caption
                writer.write(
                    tp.row([keys[idx], sentence],
                           width=[len("InputUtterance"), width_length]) + "\n")
                sentences.append(sentence)

    sample_engine = Engine(_sample)
    pbar.attach(sample_engine)
    sample_engine.run(dataloader)

    writer.write(
        tp.bottom(2, width=[len("InputUtterance"), width_length]) + "\n")
    writer.write("Unique sentence number: {}\n".format(len(set(sentences))))
    writer.close()
def _get_dataloaders(config, vocabulary):
    scaler = getattr(pre, config["scaler"])(**config["scaler_args"])
    inputdim = -1
    caption_df = pd.read_json(config["caption_file"], dtype={"key": str})
    for batch in tqdm(torch.utils.data.DataLoader(
            SJTUDataset(
                feature_file=config["feature_file"],
                caption_df=caption_df,
                vocabulary=vocabulary,
            ),
            collate_fn=collate_fn([0, 1]),
            **config["dataloader_args"]),
                      ascii=True):
        feat = batch[0]
        feat = feat.reshape(-1, feat.shape[-1])
        scaler.partial_fit(feat)
        inputdim = feat.shape[-1]
    assert inputdim > 0, "Reading input stream failed"

    augments = train_util.parse_augments(config["augments"])
    cv_keys = np.random.choice(
        caption_df["key"].unique(),
        int(len(caption_df["key"].unique()) *
            (1 - config["train_percent"] / 100.)),
        replace=False)
    cv_df = caption_df[caption_df["key"].apply(lambda x: x in cv_keys)]
    train_df = caption_df[~caption_df.index.isin(cv_df.index)]

    trainloader = torch.utils.data.DataLoader(
        SJTUDataset(feature_file=config["feature_file"],
                    caption_df=train_df,
                    vocabulary=vocabulary,
                    transform=[scaler.transform, augments]),
        shuffle=True,
        collate_fn=collate_fn([0, 1]),
        **config["dataloader_args"])

    if config["zh"]:
        train_key2refs = train_df.groupby("key")["tokens"].apply(list).to_dict()
        cv_key2refs = cv_df.groupby("key")["tokens"].apply(list).to_dict()
    else:
        train_key2refs = train_df.groupby("key")["caption"].apply(list).to_dict()
        cv_key2refs = cv_df.groupby("key")["caption"].apply(list).to_dict()

    cvloader = torch.utils.data.DataLoader(
        SJTUDataset(feature_file=config["feature_file"],
                    caption_df=cv_df,
                    vocabulary=vocabulary,
                    transform=[scaler.transform]),
        shuffle=False,
        collate_fn=collate_fn([0, 1]),
        **config["dataloader_args"])

    return trainloader, cvloader, {
        "scaler": scaler,
        "inputdim": inputdim,
        "train_key2refs": train_key2refs,
        "cv_key2refs": cv_key2refs
    }