def create_model(args):
    latent_size = args['n_latents']
    n_layers = args['n_decoder_layers']
    mask_activ_fn = args['mask_activation']
    spect_dim_low = args['low_dim']
    spect_dim_high = args['high_dim']
    time_step = args['time_step']
    #activation = args['activation_fn']
    joint_latent_size = args['n_joint_dim']
    n_heads = args['n_heads']

    print("making Transformer Decoder", file=sys.stderr)
    return (Transformer(spect_dim_low, latent_size, n_layers, n_heads, mask_activ_fn),
            Transformer(spect_dim_high, latent_size, n_layers, n_heads, mask_activ_fn),
            FFLayer(joint_latent_size, latent_size * time_step, 'relu'),
            FFLayer(joint_latent_size, latent_size * time_step, 'relu'))
def create_model(args):
    latent_size = args['n_latents']
    n_hidden_units = args['n_hidden_units']
    n_layers = args['n_decoder_layers']
    if isinstance(n_hidden_units, int) and args['decoder'] == 'fc':
        hidden_sizes = [n_hidden_units] * n_layers
    mask_activ_fn = args['mask_activation']
    spect_dim = args['spect_dim']
    pcm_dim = args['pcm_dim']
    #activation = args['activation_fn']
    n_heads = args['n_heads']

    if not args['backprop_pcm']:
        pcm_dim = None

    if args['decoder'] == 'fc':
        print("making FC Decoder", file=sys.stderr)
        return Decoder(latent_size, hidden_sizes, spect_dim, pcm_dim, 'relu')
    elif args['decoder'] == 'transformer':
        print("making Transformer Decoder", file=sys.stderr)
        return (Transformer(spect_dim, pcm_dim, latent_size, n_layers, n_heads, None),
                Transformer(spect_dim, pcm_dim, latent_size, n_layers, n_heads, 'sigmoid'))
    raise NotImplementedError
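# Hedged usage sketch (not part of the original source). The keys below are the
# ones the second create_model variant reads; the concrete values are
# illustrative assumptions only.
example_args = {
    'n_latents': 64,
    'n_hidden_units': 512,
    'n_decoder_layers': 4,
    'mask_activation': 'sigmoid',
    'spect_dim': 257,
    'pcm_dim': 400,
    'n_heads': 4,
    'backprop_pcm': True,
    'decoder': 'transformer',
}
# spect_decoder, mask_decoder = create_model(example_args)   # hypothetical unpacking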
num_encoder_layers = 2  # 6
num_decoder_layers = 2  # 6
dropout = 0.10
max_len = 150
forward_expansion = 2048

sp.Load(sami_model)
src_pad_idx = sp.pad_id()

logging.info("Loading trans model")
model = Trans_model(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
    'gelu',
)

sp.Load(swedish_model)
criterion = nn.CrossEntropyLoss(ignore_index=sp.pad_id())
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

load_model = False
if load_model:
    # Body completed from the checkpoint-loading pattern used for the
    # synthetic-Swedish model below; the checkpoint path is an assumption.
    checkpoint = torch.load(model_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
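# Hedged sketch (assumption, not in the original source): the src_vocab_size and
# trg_vocab_size passed to Trans_model above are presumably read off the two
# SentencePiece models, roughly like this.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load(sami_model)                  # source-side model file
src_vocab_size = sp.GetPieceSize()
sp.Load(swedish_model)               # target-side model file
trg_vocab_size = sp.GetPieceSize()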
sent4 = sami_short[11]
trans4 = swedish_short[11]
print(sent4, "\n", trans4)
# [27] END

# [28] START
# Synth SWE
# [28] END

# [29] START
model_path = "uni_joint_2layer_gelu_synth.pth.tar"
# [29] END

# [30] START
model_synth_swe = Trans_model(embedding_size, src_vocab_size, trg_vocab_size,
                              src_pad_idx, num_heads, num_encoder_layers,
                              num_decoder_layers, forward_expansion, dropout,
                              max_len, device, 'gelu')
# [30] END

# [31] START
sp.Load(swedish_model)
criterion = nn.CrossEntropyLoss(ignore_index=sp.pad_id())
optimizer = optim.Adam(model_synth_swe.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
# [31] END

# [32] START
if load_model:
    checkpoint = torch.load(model_path, map_location='cpu')
    model_synth_swe.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
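# Hedged sketch (not from the original source): the checkpoints loaded above are
# assumed to have been saved with matching 'state_dict' / 'optimizer' keys,
# roughly via a hypothetical helper like this.
def save_checkpoint(model, optimizer, path):
    torch.save({'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()}, path)

# e.g. save_checkpoint(model_synth_swe, optimizer, model_path)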
def main(DATA, MAX_LINES, MAX_PADDING, MIN_LEN_SENTENCE, SIZE_VOCAB, SHOW_SENTENCES, LR, EPOCHS):
    if DATA == 'europarl':
        URLS = ["http://www.statmt.org/europarl/v10/training-monolingual/europarl-v10.es.tsv.gz"]
        FILES = ["europarl-v10.es.tsv.gz"]
        CORPORA = ["europarl-v10.es.tsv"]
    elif DATA == 'newscarl':
        # Spanish news-crawl corpora for the years 2007-2019
        URLS = ["http://data.statmt.org/news-crawl/es/news.{}.es.shuffled.deduped.gz".format(year)
                for year in range(2007, 2020)]
        FILES = ["news.{}.es.shuffled.deduped.gz".format(year) for year in range(2007, 2020)]
        CORPORA = ["news.{}.es.shuffled.deduped".format(year) for year in range(2007, 2020)]

    print('File download')
    #---------------------------------------------
    for u, f in zip(URLS, FILES):
        print(u)
        sys.stdout.flush()
        if path.exists(f):
            print('File {} already downloaded'.format(f))
        else:
            wget.download(u, './' + f)

    print('Unzip files')
    #----------------------------------------------
    for f, c in zip(FILES, CORPORA):
        print('Unzipping {}'.format(f))
        sys.stdout.flush()
        if path.exists(f):
            with gzip.open(f, 'rb') as f_in:
                with open(c, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
        else:
            print('File already unzipped')

    print('Join all into one')
    #---------------------------------------------------
    sys.stdout.flush()
    with open('corpus.es', 'wb') as outfile:
        for fname in CORPORA:
            print('Joining file:', fname)
            sys.stdout.flush()
            with open(fname, 'rb') as infile:
                for line in infile:
                    outfile.write(line)

    print('Delete auxiliary files')
    #-----------------------------------------------
    sys.stdout.flush()
    for f in CORPORA:
        os.remove(f)

    print('Reduced File')
    #--------------------------------------------------------
    sys.stdout.flush()
    with open('corpus.es', 'rb') as cfile:
        with open('corpus.reduced.es', 'wb') as rfile:
            count = 0
            while count < MAX_LINES:
                line = cfile.readline()
                rfile.write(line)
                count += 1
    print('Number of lines in the reduced file:', count)

    print('Read Data')
    #----------------------------------------------------------
    sys.stdout.flush()
    FILE = 'corpus.reduced.es'
    data = []
    with open(FILE, 'rb') as corpus_file:
        Lines = corpus_file.readlines()
        for line in Lines:
            data.append(line)

    print('Preprocessing the Data')
    #----------------------------------------------
    sys.stdout.flush()
    print(data[50])
    print(sentence_to_words(data[50]))
    sys.stdout.flush()

    preprocessed_data = []
    perc = 0
    for ind, s in enumerate(data):
        words = sentence_to_words(s)
        if len(words) >= MIN_LEN_SENTENCE and len(words) <= MAX_PADDING:
            preprocessed_data.append(words)
        if ind > perc:
            print('{}/{} sentences preprocessed'.format(perc, len(data)))
            sys.stdout.flush()
            perc += SHOW_SENTENCES
    print('Length of data read: ', len(data))
    print('Length after preprocessing: ', len(preprocessed_data))
    print(preprocessed_data[50])
    sys.stdout.flush()

    print('Build Dictionary')
    #---------------------------------------------------
    sys.stdout.flush()
    FED = 20  # number of dictionary entries to show
    word_dict, complete_dict = build_dict(preprocessed_data, SIZE_VOCAB)
    list_dict = [key for key in word_dict.keys()]
    print('First {} elements of the dictionary: {}'.format(FED, list_dict[:FED]))
    sys.stdout.flush()
    print('Selected {}/{} words'.format(SIZE_VOCAB, len(complete_dict)))
    sys.stdout.flush()

    # Add XXX as index 0 in the dictionary: in a custom input sentence,
    # XXX marks the word the model has to guess.
    word_dict['XXX'] = 0
    word_dict['PAD'] = 1
    word_dict['INFREQ'] = 2

    print('Convert and Pad')
    #-----------------------------------------------------
    sys.stdout.flush()
    int_data, int_data_len = convert_and_pad_data(word_dict, preprocessed_data, MAX_PADDING)
    print(int_data[50], int_data_len[50])
    sys.stdout.flush()

    print('Extract Word')
    #-------------------------------------------------------
    sys.stdout.flush()
    # Check there is no sentence consisting only of 2's (INFREQ)
    int_data_pre = [d for d, lend in zip(int_data, int_data_len) if len(set(d[:lend])) > 1]
    len_data_pre = [lend for d, lend in zip(int_data, int_data_len) if len(set(d[:lend])) > 1]
    print("{} of the {} sentences were only 2's".format(len(int_data) - len(int_data_pre), len(int_data)))
    sys.stdout.flush()

    word_masked = []
    masked_data = int_data_pre.copy()
    for idx, (sentence, len_sentence) in enumerate(zip(int_data_pre, len_data_pre)):
        acceptable_value = False
        while not acceptable_value:
            idx_word = random.randint(0, len_sentence - 1)
            if int_data_pre[idx][idx_word] != 2:
                acceptable_value = True
        word_masked.append(int_data_pre[idx][idx_word])  # save the extracted word
        masked_data[idx][idx_word] = 0                   # replace it with the mask token (0)
    print(masked_data[50])
    print(word_masked[50])
    sys.stdout.flush()

    print('Split Train, Valid and Test')
    #-----------------------------------------
    sys.stdout.flush()
    train_x, valid_x, train_y, valid_y, train_len, valid_len = train_test_split(
        masked_data, word_masked, len_data_pre, test_size=0.25, random_state=42)
    valid_x, test_x, valid_y, test_y, valid_len, test_len = train_test_split(
        valid_x, valid_y, valid_len, test_size=0.4, random_state=42)
    print('train: ', len(train_x), len(train_y), len(train_len))
    print('valid: ', len(valid_x), len(valid_y), len(valid_len))
    print('test: ', len(test_x), len(test_y), len(test_len))
    sys.stdout.flush()

    print('Cleaning Variables')
    #--------------------------------------------------
    sys.stdout.flush()
    preprocessed_data = None
    list_dict = None
    int_data = None
    int_data_len = None
    int_data_pre = None
    len_data_pre = None
    masked_data = None
    word_masked = None
    data = None
    Lines = None
    complete_dict = None

    # Training ------------------------------------------------------------------
    BATCH_SIZE = 128
    d_model = 256
    heads = 8
    N = 6
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    sys.stdout.flush()

    print('Preparing Input Masks')
    sys.stdout.flush()
    msk_input_loader = create_mask(train_x, BATCH_SIZE)
    valid_msk_input_loader = create_mask(valid_x, BATCH_SIZE)

    print('Preparing Train Data Loaders')
    sys.stdout.flush()
    train_torch_x = torch.tensor(train_x).clone()
    train_torch_y = torch.tensor(train_y).clone()
    train_sample_ds = torch.utils.data.TensorDataset(train_torch_x, train_torch_y)
    train_loader = torch.utils.data.DataLoader(train_sample_ds, batch_size=BATCH_SIZE)

    print('Preparing Validation Data Loaders')
    sys.stdout.flush()
    valid_torch_x = torch.tensor(valid_x).clone()
    valid_torch_y = torch.tensor(valid_y).clone()
    valid_sample_ds = torch.utils.data.TensorDataset(valid_torch_x, valid_torch_y)
    valid_loader = torch.utils.data.DataLoader(valid_sample_ds, batch_size=BATCH_SIZE)

    print('Initialize Model')
    sys.stdout.flush()
    model = Transformer(SIZE_VOCAB, d_model, N, heads).to(device)
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    print('Define Loss and Optimizer')
    sys.stdout.flush()
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR, betas=(0.9, 0.98), eps=1e-9)

    mask_sample_ds = None
    train_torch_x = None
    train_torch_y = None
    train_sample_ds = None

    print('Going to train')
    sys.stdout.flush()
    losses, valid_losses = train(model, train_loader, msk_input_loader,
                                 valid_loader, valid_msk_input_loader,
                                 EPOCHS, optimizer, loss_function, device)

    # Testing ------------------------------------------------------------------
    print('Preparing Input Masks')
    msk_test_input_loader = create_mask(test_x, BATCH_SIZE)

    print('Preparing Data Loaders')
    test_torch_x = torch.tensor(test_x).clone()
    test_torch_y = torch.tensor(test_y).clone()
    test_sample_ds = torch.utils.data.TensorDataset(test_torch_x, test_torch_y)
    test_loader = torch.utils.data.DataLoader(test_sample_ds, batch_size=BATCH_SIZE)
    test_torch_x = None
    test_torch_y = None
    test_sample_ds = None

    print('Going to test')
    test_loss = evaluate(model, test_loader, msk_test_input_loader, loss_function, BATCH_SIZE, device)
    print(test_loss)

    # Custom Sentence ------------------------------------------------------------
    # Spanish test sentences; XXX marks the word the model has to guess.
    test_sentences = ["Ha habido una XXX en Colombia durant la presentación del presidente",
                      "Todas las tropas han sido XXX a America",
                      "Estaba pensando que quizas XXX deberías hacerlo",
                      "Todo lo que llevo esta dentro de mí XXX"]
    for custom_sentence in test_sentences:
        resulting_word = guess_word(custom_sentence)
        print('Initial Sentence: \t {}'.format(custom_sentence))
        print('Word Guessed: \t\t {}'.format(resulting_word))
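# Hedged entry-point sketch (assumption, not from the original source): main()
# takes the corpus choice and the hyperparameters used above; the values here
# are illustrative defaults only.
if __name__ == '__main__':
    main(DATA='newscarl',
         MAX_LINES=1000000,
         MAX_PADDING=30,
         MIN_LEN_SENTENCE=5,
         SIZE_VOCAB=20000,
         SHOW_SENTENCES=100000,
         LR=0.0001,
         EPOCHS=10)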