Example #1
    def dcase_predict(self,
                      experiment_path: str,
                      feature_file: str,
                      predict_scp: str,
                      output: str = "predition.csv",
                      **kwargs):
        """kwargs: {'max_length': int, 'method': str, 'beam_size': int}"""

        dump = torch.load(os.path.join(experiment_path, "saved.pth"),
                          map_location="cpu")
        # Load previous training config
        config = dump["config"]

        vocabulary = torch.load(config["vocab_file"])
        model = self._get_model(config, len(vocabulary))
        model.load_state_dict(dump["model"])
        # Feature scaler (sklearn StandardScaler) saved at training time
        scaler = dump["scaler"]
        zh = config["zh"]
        model = model.to(self.device)

        dataset = SJTUDatasetEval(feature=feature_file,
                                  eval_scp=predict_scp,
                                  transform=scaler.transform)
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 shuffle=False,
                                                 collate_fn=collate_fn((1, )),
                                                 batch_size=32,
                                                 num_workers=0)

        pbar = ProgressBar(persist=False, ascii=True)
        predictions = []

        def _sample(engine, batch):
            # batch: [keys, feats, feat_lens]
            with torch.no_grad():
                model.eval()
                keys = batch[0]
                out = self._forward(model, batch, mode="sample", **kwargs)
                seqs = out["seqs"].cpu().numpy()
                for idx, seq in enumerate(seqs):
                    caption = self._convert_idx2sentence(seq,
                                                         vocabulary,
                                                         zh=zh)
                    predictions.append({
                        "file_name": keys[idx] + ".wav",
                        "caption_predicted": caption
                    })

        sample_engine = Engine(_sample)
        pbar.attach(sample_engine)
        sample_engine.run(dataloader)

        pred_df = pd.DataFrame(predictions)
        pred_df.to_csv(os.path.join(experiment_path, output), index=False)
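The helper `self._convert_idx2sentence` is not shown in these snippets, but Example #2 below inlines the same decoding logic. A minimal standalone sketch based on that loop (the function name, signature, and the zh return type are assumptions):

    def convert_idx2sentence(seq, vocabulary, zh=False):
        # Skip <start>, stop at <end>, map the remaining ids to words.
        words = []
        for word_id in seq:
            word = vocabulary.idx2word[word_id]
            if word == "<start>":
                continue
            if word == "<end>":
                break
            words.append(word)
        # Example #5 joins Chinese captions itself, so return the token list
        # for zh and a space-joined sentence otherwise (an assumption here).
        return words if zh else " ".join(words)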
Example #2
    def _ensemble(self,
                  path1: str,
                  path2: str,
                  kaldi_stream: str,
                  kaldi_scp: str,
                  max_length: int = None):
        dump1 = torch.load(path1, map_location="cpu")
        dump2 = torch.load(path2, map_location="cpu")

        model1 = dump1["model"].to(self.device)
        model2 = dump2["model"].to(self.device)

        scaler = dump1["scaler"]
        config = dump1["config"]

        vocabulary = torch.load(config["vocab_file"])

        dataset = SJTUDatasetEval(kaldi_stream=kaldi_stream,
                                  kaldi_scp=kaldi_scp,
                                  transform=scaler.transform)
        # dataset[i]: key, feature
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 shuffle=False,
                                                 collate_fn=collate_fn((1, )),
                                                 batch_size=32,
                                                 num_workers=0)

        if max_length is None:
            max_length = model1.max_length

        pbar = ProgressBar(persist=False, ascii=True)

        key2pred = {}

        def _sample(engine, batch):
            # batch: [ids, feats, feat_lens]

            ids = batch[0]
            feats = batch[1]
            feat_lens = batch[-1]
            seqs = self._sample_batch(model1, model2, feats, feat_lens,
                                      max_length)
            seqs = seqs.cpu().numpy()

            for idx, seq in enumerate(seqs):
                caption = []
                for word_id in seq:
                    word = vocabulary.idx2word[word_id]
                    if word == "<start>":
                        continue
                    elif word == "<end>":
                        break
                    else:
                        caption.append(word)
                caption = " ".join(caption)
                key2pred[ids[idx]] = [caption]

        sampler = Engine(_sample)
        pbar.attach(sampler)
        sampler.run(dataloader)

        return key2pred
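`_ensemble` returns `key2pred` in exactly the shape the pycocoevalcap scorers in Example #4 consume: a single-hypothesis list per utterance key, scored against several references per key. A self-contained illustration with invented keys and captions (using the stock `Cider`; the `zh=` keyword seen in Example #4 appears to come from a patched fork):

    from pycocoevalcap.cider.cider import Cider

    # Illustrative data only; keys and captions are invented.
    key2pred = {"utt_0001": ["a dog barks in the distance"],
                "utt_0002": ["rain falls on a roof"]}
    key2refs = {"utt_0001": ["a dog is barking", "a dog barks far away"],
                "utt_0002": ["rain hits a rooftop", "it rains on a roof"]}

    scorer = Cider()
    score, per_key = scorer.compute_score(key2refs, key2pred)
    print("CIDEr: {:6.3f}".format(score))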
Example #3
    def _get_dataloaders(config, vocabulary):
        scaler = getattr(pre, config["scaler"])(**config["scaler_args"])
        inputdim = -1
        caption_df = pd.read_json(config["caption_file"], dtype={"key": str})

        dataloader = torch.utils.data.DataLoader(
            SJTUDataset(
                feature=config["feature_file"],
                caption_df=caption_df,
                vocabulary=vocabulary,
            ),
            collate_fn=collate_fn([0, 1]),
            **config["dataloader_args"])
        for batch in tqdm(dataloader, ascii=True):
            feat = batch[0]
            feat_lens = batch[-2]
            packed_feat = torch.nn.utils.rnn.pack_padded_sequence(
                feat, feat_lens, batch_first=True, enforce_sorted=False).data
            scaler.partial_fit(packed_feat)
            inputdim = feat.shape[-1]
        assert inputdim > 0, "Reading input feature failed"

        augments = train_util.parse_augments(config["augments"])
        train_keys = np.random.choice(
            caption_df["key"].unique(),
            int(len(caption_df["key"].unique()) *
                (config["train_percent"] / 100.)),
            replace=False)
        train_df = caption_df[caption_df["key"].isin(train_keys)]
        val_df = caption_df[~caption_df.index.isin(train_df.index)]

        train_loader = torch.utils.data.DataLoader(
            SJTUDataset(feature=config["feature_file"],
                        caption_df=train_df,
                        vocabulary=vocabulary,
                        transform=[scaler.transform, augments]),
            shuffle=True,
            collate_fn=collate_fn([0, 1]),
            **config["dataloader_args"])

        if config["zh"]:
            train_key2refs = train_df.groupby("key")["tokens"].apply(
                list).to_dict()
            val_key2refs = val_df.groupby("key")["tokens"].apply(
                list).to_dict()
        else:
            train_key2refs = train_df.groupby("key")["caption"].apply(
                list).to_dict()
            val_key2refs = val_df.groupby("key")["caption"].apply(
                list).to_dict()
        val_loader = torch.utils.data.DataLoader(
            SJTUDataset(feature=config["feature_file"],
                        caption_df=val_df,
                        vocabulary=vocabulary,
                        transform=scaler.transform),
            shuffle=False,
            collate_fn=collate_fn([0, 1]),
            **config["dataloader_args"])

        return train_loader, val_loader, {
            "scaler": scaler,
            "inputdim": inputdim,
            "train_key2refs": train_key2refs,
            "val_key2refs": val_key2refs
        }
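The first loop is a streaming fit: the scaler sees every frame once through `partial_fit` before `scaler.transform` is handed to the datasets, so the normalization statistics come entirely from this pass. The same pattern in isolation, with dummy arrays standing in for the batched features:

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    for _ in range(10):                    # stand-in for the feature batches
        frames = np.random.randn(100, 64)  # (time, feature) dummy frames
        scaler.partial_fit(frames)         # accumulate running mean/variance

    normalized = scaler.transform(np.random.randn(5, 64))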
Example #4
    def evaluate(self,
                 experiment_path: str,
                 feature_file: str,
                 feature_scp: str,
                 caption_file: str,
                 caption_output: str = "eval_output.json",
                 score_output: str = "scores.txt",
                 **kwargs):
        """kwargs: {'max_length': int, 'method': str, 'beam_size': int}"""

        dump = torch.load(os.path.join(experiment_path, "saved.pth"),
                          map_location="cpu")
        # Load previous training config
        config = dump["config"]

        vocabulary = torch.load(config["vocab_file"])
        model = self._get_model(config, vocabulary)
        model.load_state_dict(dump["model"])
        # Feature scaler (sklearn StandardScaler) saved at training time
        scaler = dump["scaler"]
        zh = config["zh"]
        model = model.to(self.device)

        dataset = SJTUDatasetEval(feature=feature_file,
                                  eval_scp=feature_scp,
                                  transform=scaler.transform)
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 shuffle=False,
                                                 collate_fn=collate_fn((1, )),
                                                 batch_size=32,
                                                 num_workers=0)

        caption_df = pd.read_json(caption_file, dtype={"key": str})
        if zh:
            key2refs = caption_df.groupby("key")["tokens"].apply(
                list).to_dict()
        else:
            key2refs = caption_df.groupby("key")["caption"].apply(
                list).to_dict()

        model.eval()

        key2pred = {}

        def _sample(engine, batch):
            with torch.no_grad():
                model.eval()
                keys = batch[0]
                output = self._forward(model, batch, mode="sample", **kwargs)
                seqs = output["seqs"].cpu().numpy()

                for idx, seq in enumerate(seqs):
                    caption = self._convert_idx2sentence(seq, vocabulary, zh)
                    key2pred[keys[idx]] = [caption]

        pbar = ProgressBar(persist=False, ascii=True)
        sampler = Engine(_sample)
        pbar.attach(sampler)
        sampler.run(dataloader)

        pred_data = []
        for key, pred in key2pred.items():
            pred_data.append({
                "filename": key + ".wav",
                "caption": "".join(pred[0]) if zh else pred[0],
                "tokens": pred[0] if zh else pred[0].split()
            })
        pred_df = pd.DataFrame(pred_data)
        pred_df.to_json(os.path.join(experiment_path, caption_output))

        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.cider.cider import Cider
        from pycocoevalcap.meteor.meteor import Meteor
        from pycocoevalcap.spice.spice import Spice

        # Write all metrics to the score file
        with open(os.path.join(experiment_path, score_output), "w") as f:
            scorer = Bleu(n=4, zh=zh)
            score, scores = scorer.compute_score(key2refs, key2pred)
            for n in range(4):
                f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))

            scorer = Rouge(zh=zh)
            score, scores = scorer.compute_score(key2refs, key2pred)
            f.write("ROUGE: {:6.3f}\n".format(score))

            scorer = Cider(zh=zh)
            score, scores = scorer.compute_score(key2refs, key2pred)
            f.write("CIDEr: {:6.3f}\n".format(score))

            if not zh:
                scorer = Meteor()
                score, scores = scorer.compute_score(key2refs, key2pred)
                f.write("Meteor: {:6.3f}\n".format(score))

                scorer = Spice()
                score, scores = scorer.compute_score(key2refs, key2pred)
                f.write("Spice: {:6.3f}\n".format(score))
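`caption_file` is loaded with `pd.read_json` and grouped on `key`, so it needs at least `key` and `caption` columns (plus `tokens` for Chinese). A minimal file this loader would accept, with invented contents:

    import pandas as pd

    records = [
        {"key": "utt_0001", "caption": "a dog barks", "tokens": "a dog barks"},
        {"key": "utt_0001", "caption": "dog barking", "tokens": "dog barking"},
    ]
    pd.DataFrame(records).to_json("captions.json")

    caption_df = pd.read_json("captions.json", dtype={"key": str})
    key2refs = caption_df.groupby("key")["caption"].apply(list).to_dict()
    # {"utt_0001": ["a dog barks", "dog barking"]}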
Example #5
    def sample(self,
               experiment_path: str,
               feature_file: str,
               feature_scp: str,
               output: str = "output_word.txt",
               **kwargs):
        """Generate captions given experiment model"""
        """kwargs: {'max_length': int, 'method': str, 'beam_size': int}"""
        import tableprint as tp

        dump = torch.load(os.path.join(experiment_path, "saved.pth"),
                          map_location="cpu")
        # Load previous training config
        config = dump["config"]

        vocabulary = torch.load(config["vocab_file"])
        model = self._get_model(config, len(vocabulary))
        model.load_state_dict(dump["model"])
        # Feature scaler (sklearn StandardScaler) saved at training time
        scaler = dump["scaler"]
        zh = config["zh"]
        model = model.to(self.device)
        dataset = SJTUDatasetEval(feature=feature_file,
                                  eval_scp=feature_scp,
                                  transform=scaler.transform)
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 shuffle=False,
                                                 collate_fn=collate_fn((1, )),
                                                 batch_size=16,
                                                 num_workers=0)

        width_length = 80
        pbar = ProgressBar(persist=False, ascii=True)
        writer = open(os.path.join(experiment_path, output), "w")
        writer.write(
            tp.header(["InputUtterance", "Output Sentence"],
                      width=[len("InputUtterance"), width_length]))
        writer.write('\n')

        sentences = []

        def _sample(engine, batch):
            # batch: [keys, feats, feat_lens]
            with torch.no_grad():
                model.eval()
                keys = batch[0]
                out = self._forward(model, batch, mode="sample", **kwargs)
                seqs = out["seqs"].cpu().numpy()
                for idx, seq in enumerate(seqs):
                    caption = self._convert_idx2sentence(seq,
                                                         vocabulary,
                                                         zh=zh)
                    if zh:
                        sentence = " ".join(caption)
                    else:
                        sentence = caption
                    writer.write(
                        tp.row([keys[idx], sentence],
                               width=[len("InputUtterance"), width_length]) +
                        "\n")
                    sentences.append(sentence)

        sample_engine = Engine(_sample)
        pbar.attach(sample_engine)
        sample_engine.run(dataloader)
        writer.write(
            tp.bottom(2, width=[len("InputUtterance"), width_length]) + "\n")
        writer.write("Unique sentence number: {}\n".format(len(
            set(sentences))))
        writer.close()
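The `tableprint` calls above assemble a fixed-width ASCII table by hand: `tp.header` once, `tp.row` per caption, and `tp.bottom(n_columns, ...)` to close it. The same pattern standalone:

    import tableprint as tp

    widths = [len("InputUtterance"), 80]
    print(tp.header(["InputUtterance", "Output Sentence"], width=widths))
    print(tp.row(["utt_0001", "a dog barks in the distance"], width=widths))
    print(tp.bottom(2, width=widths))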
Example #6
    def _get_dataloaders(config, vocabulary):
        scaler = getattr(pre, config["scaler"])(**config["scaler_args"])
        inputdim = -1
        caption_df = pd.read_json(config["caption_file"], dtype={"key": str})

        dataloader = torch.utils.data.DataLoader(
            SJTUDataset(
                feature_file=config["feature_file"],
                caption_df=caption_df,
                vocabulary=vocabulary,
            ),
            collate_fn=collate_fn([0, 1]),
            **config["dataloader_args"])
        for batch in tqdm(dataloader, ascii=True):
            feat = batch[0]
            feat = feat.reshape(-1, feat.shape[-1])
            scaler.partial_fit(feat)
            inputdim = feat.shape[-1]
        assert inputdim > 0, "Reading input stream failed"

        augments = train_util.parse_augments(config["augments"])
        cv_keys = np.random.choice(
            caption_df["key"].unique(),
            int(len(caption_df["key"].unique()) *
                (1 - config["train_percent"] / 100.)),
            replace=False)
        cv_df = caption_df[caption_df["key"].isin(cv_keys)]
        train_df = caption_df[~caption_df.index.isin(cv_df.index)]

        trainloader = torch.utils.data.DataLoader(
            SJTUDataset(feature_file=config["feature_file"],
                        caption_df=train_df,
                        vocabulary=vocabulary,
                        transform=[scaler.transform, augments]),
            shuffle=True,
            collate_fn=collate_fn([0, 1]),
            **config["dataloader_args"])

        if config["zh"]:
            train_key2refs = train_df.groupby("key")["tokens"].apply(
                list).to_dict()
            cv_key2refs = cv_df.groupby("key")["tokens"].apply(list).to_dict()
        else:
            train_key2refs = train_df.groupby("key")["caption"].apply(
                list).to_dict()
            cv_key2refs = cv_df.groupby("key")["caption"].apply(list).to_dict()
        cvloader = torch.utils.data.DataLoader(
            SJTUDataset(feature_file=config["feature_file"],
                        caption_df=cv_df,
                        vocabulary=vocabulary,
                        transform=[scaler.transform]),
            shuffle=False,
            collate_fn=collate_fn([0, 1]),
            **config["dataloader_args"])

        return trainloader, cvloader, {
            "scaler": scaler,
            "inputdim": inputdim,
            "train_key2refs": train_key2refs,
            "cv_key2refs": cv_key2refs
        }
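Both `_get_dataloaders` variants split at the key level rather than the row level, so the several captions of one audio clip always land on the same side of the train/validation split. A small sketch of that split with dummy data (`isin` replaces the `apply(lambda ...)` form used above; same result):

    import numpy as np
    import pandas as pd

    caption_df = pd.DataFrame({
        "key": ["a", "a", "b", "b", "c", "c"],  # one clip, several captions
        "caption": ["c1", "c2", "c3", "c4", "c5", "c6"],
    })
    train_percent = 50
    keys = caption_df["key"].unique()
    cv_keys = np.random.choice(
        keys, int(len(keys) * (1 - train_percent / 100.)), replace=False)
    cv_df = caption_df[caption_df["key"].isin(cv_keys)]
    train_df = caption_df[~caption_df.index.isin(cv_df.index)]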