class BaseParser(Base):
    """Base class for parsers."""

    def __init__(self, config):
        super(BaseParser, self).__init__(config)
        self._init_vocab()

    @classmethod
    def load_default_config(cls) -> ConfigDict:
        config = ConfigDict(vocab_path=None)
        return config

    @Base.log
    def _init_vocab(self):
        self.vocab = Vocab()
        self.vocab.load(self.config.vocab_path)

    def parse_train(self):
        pass

    def parse_test(self):
        pass

    def parse_predump(self):
        pass

    def parse_predump_train(self):
        pass
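# The snippets in this file rely on a project-level `Vocab` class whose
# definition is not shown. A minimal sketch of the interface that
# BaseParser._init_vocab needs (a no-arg constructor plus a `load` method) is
# given below; the one-token-per-line file format and the attribute names
# `index2text`/`text2index` are assumptions, not the project's actual
# implementation. Note that other snippets below construct Vocab differently
# (from a config, a word dict, or a tokenizer), so this covers only the
# load-from-file variant.
class Vocab:
    def __init__(self):
        self.index2text = []   # index -> token
        self.text2index = {}   # token -> index

    def load(self, vocab_path):
        # Assumed format: one token per line.
        with open(vocab_path, 'r') as f:
            for line in f:
                token = line.strip()
                if token and token not in self.text2index:
                    self.text2index[token] = len(self.index2text)
                    self.index2text.append(token)

    def __len__(self):
        return len(self.index2text)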
import logging
import os

import soundfile as sf
import torch
import torchaudio
from tqdm import tqdm

# `Vocab` and `data_loader` come from the surrounding project.

logger = logging.getLogger(__name__)


def main():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    model = torch.load('CNNmodel/model.pt')
    model.eval()
    config = torch.load('CNNmodel/config.pt')
    audio2mfcc = torchaudio.transforms.MFCC(sample_rate=config.sample_rate,
                                            n_mfcc=config.n_mfcc,
                                            log_mels=False,
                                            melkwargs={
                                                'n_fft': config.n_fft_size
                                            }).to(config.device)

    logger.info('Start cache for test data')
    if not os.path.isfile("./mfcc/test_input.pt"):
        os.makedirs("./mfcc", exist_ok=True)
        val_root = os.path.join(config.data_path, 'test')
        val_files = [p for p in os.listdir(val_root) if 'pcm' in p]
        val_mfccs = {}
        for p in tqdm(val_files):
            sound_path = os.path.join(val_root, p)
            data, _ = sf.read(sound_path,
                              channels=1,
                              samplerate=16000,
                              format='raw',
                              subtype='PCM_16')
            mfcc = audio2mfcc(torch.Tensor(data).to(config.device))
            # Pad (or truncate) the MFCC sequence to a fixed input length.
            audio_array = torch.zeros(config.n_mfcc, config.input_max_len)
            sel_ix = min(mfcc.shape[1], config.input_max_len)
            audio_array[:, :sel_ix] = mfcc[:, :sel_ix]
            val_mfccs[sound_path] = audio_array.transpose(0, 1)
        torch.save(val_mfccs, './mfcc/test_input.pt')
    logger.info('Done cache for test data')

    # sentence index
    vocab = Vocab(config)
    test_loader, test_label_path = data_loader(config, 'test', vocab)

    with open('prediction.txt', 'w') as file_writer:
        for tst_step, (file_name, mfcc, target_index) in enumerate(
                tqdm(test_loader, desc="Evaluating")):
            with torch.no_grad():
                logit, feature = model(mfcc.to(config.device))
            y_max = logit.max(dim=1)[1]
            pred = [vocab.index2text[i] for i in y_max.cpu().numpy()]
            for f, line in zip(file_name, pred):
                file_writer.write(f + " " + str(line) + '\n')
def __init__(self, image_folder, image_fn, label_fn, min_freq=1, vocab=None):
    with open(image_fn, 'r') as f:
        self.image_fns = [
            os.path.join(image_folder, _fn.strip()) for _fn in f.readlines()
        ]
    with open(label_fn, 'r') as f:
        self.labels = [_fn.strip() for _fn in f.readlines()]
    assert len(self.image_fns) == len(self.labels)

    if vocab is None:
        # Build a vocabulary from the labels when none is supplied.
        dict_of_unique_words = TextUtils.get_dict_of_unique_words(
            self.labels, min_freq=min_freq)
        self.vocab = Vocab(dict_of_unique_words)
    else:
        self.vocab = vocab
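# Hypothetical usage of the constructor above. The class name `OCRDataset`,
# the file names, and the directory layout are illustrative assumptions only;
# the original file does not show the enclosing class:
dataset = OCRDataset(
    image_folder='data/images',
    image_fn='data/train_images.txt',   # one image filename per line
    label_fn='data/train_labels.txt',   # one transcription per line
    min_freq=1,
)
print(len(dataset.image_fns), 'samples,', len(dataset.vocab), 'vocab entries')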
import argparse
import logging

import torch

# `set_seed`, `createMFCC`, `Vocab`, `data_loader`, `CNN2D`, and `train`
# come from the surrounding project.

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_max_len", default=400, type=int,
                        help="maximum sequence length for audio")
    parser.add_argument("--num_epochs", default=300, type=int,
                        help="number of training epochs")
    parser.add_argument("--data_path", default='data', type=str,
                        help="data root directory")
    parser.add_argument("--sample_rate", default=16000, type=int,
                        help="sampling rate for audio")
    parser.add_argument("--n_fft_size", default=400, type=int,
                        help="time window for Fourier transform")
    parser.add_argument("--n_mfcc", default=40, type=int,
                        help="low frequency range (from 0 to n_mfcc)")
    parser.add_argument("--max_len", default=30, type=int,
                        help="maximum target length")
    parser.add_argument("--batch_size", default=128, type=int,
                        help="batch size")
    parser.add_argument("--warmup_percent", default=0.1, type=float,
                        help="linear warmup over warmup_percent")
    parser.add_argument("--when", type=int, default=5,
                        help="when to decay learning rate (default: 5)")
    parser.add_argument("--clip", type=float, default=0.8,
                        help="gradient clip value (default: 0.8)")
    parser.add_argument("--lr", type=float, default=1e-4,
                        help="initial learning rate (default: 1e-4)")
    parser.add_argument("--seed", type=int, default=1234,
                        help="random seed")
    parser.add_argument("--logging_steps", type=int, default=50,
                        help="frequency of result logging (default: 50)")
    config = parser.parse_args()
    set_seed(config)

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    config.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    # get mfcc
    createMFCC(config)

    # sentence index
    vocab = Vocab(config)

    # data loaders
    train_loader, train_label_path = data_loader(config, 'train', vocab)
    validate_loader, validate_label_path = data_loader(config, 'validate',
                                                       vocab)

    # build model (+1 class for unknown sentence)
    model = CNN2D(len(vocab) + 1).to(config.device)

    # loss function
    loss_fct = torch.nn.CrossEntropyLoss()

    train(model, train_loader, validate_loader, loss_fct, config, vocab)
    logger.info('Done Training')
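# `set_seed` is a project helper not shown in this file. Given that the
# script seeds from `config.seed`, a common implementation is sketched below;
# this is an assumption about the helper, not its actual body:
import random

import numpy as np
import torch


def set_seed(config):
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.seed)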
conversation_length = [
    min(len(conv["lines"]), max_conv_len) for conv in conv_objects
]
sentences, sentence_length = pad_sentences(
    conversations,
    max_sentence_length=max_sent_len,
    max_conversation_length=max_conv_len,
)

print("Saving preprocessed data at", split_data_dir)
to_pickle(conversation_length,
          split_data_dir.joinpath("conversation_length.pkl"))
to_pickle(sentences, split_data_dir.joinpath("sentences.pkl"))
to_pickle(sentence_length, split_data_dir.joinpath("sentence_length.pkl"))

if split_type == "train":
    print("Save Vocabulary...")
    vocab = Vocab(tokenizer)
    vocab.add_dataframe(conversations)
    vocab.update(max_size=max_vocab_size, min_freq=min_freq)
    print("Vocabulary size: ", len(vocab))
    vocab.pickle(convai2_dir.joinpath("word2id.pkl"),
                 convai2_dir.joinpath("id2word.pkl"))

print("Done!")
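# `pad_sentences` is a project helper not shown in this file. Based on how it
# is called above (conversations in, fixed-size sentence arrays plus true
# lengths out), a plausible sketch is given below; the padding and EOS tokens,
# the assumed input shape (a list of conversations, each a list of token
# lists), and the exact return structure are assumptions:
def pad_sentences(conversations, max_sentence_length, max_conversation_length,
                  pad_token="<pad>", eos_token="<eos>"):
    all_sentences = []
    all_lengths = []
    for conversation in conversations:
        sentences = []
        lengths = []
        for sentence in conversation[:max_conversation_length]:
            # Truncate, append EOS, record the true length, then pad.
            tokens = sentence[:max_sentence_length - 1] + [eos_token]
            lengths.append(len(tokens))
            tokens = tokens + [pad_token] * (max_sentence_length - len(tokens))
            sentences.append(tokens)
        all_sentences.append(sentences)
        all_lengths.append(lengths)
    return all_sentences, all_lengths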