def tweet_transformer(lang, n_gram, voc=None):
    """
    Get the tweet pre-processing transformer.

    :param lang: language key used to look up the vocabulary size
    :param n_gram: 'c1' for single characters, anything else for character 2-grams
    :param voc: optional pre-built token-to-index vocabulary; a fresh dict is used when None
    :return: a transforms.Compose pipeline producing fixed-length, clamped index sequences
    """
    # Reuse the caller's vocabulary when given so indices stay consistent across datasets.
    token_to_ix = dict() if voc is None else voc

    # Only the tokenisation step differs between the two n-gram settings.
    if n_gram == 'c1':
        gram = ltransforms.Character()
    else:
        gram = ltransforms.Character2Gram()
    # end if

    return transforms.Compose([
        # Strip URLs before tokenisation.
        ltransforms.RemoveRegex(
            regex=r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
        ltransforms.ToLower(),
        gram,
        ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
        ltransforms.ToLength(length=settings.min_length),
        # Clamp indices into this language's vocabulary range.
        ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram][lang] - 1)
    ])
# end tweet_transformer
def text_transformer(n_gram, window_size):
    """
    Get the text pre-processing transformer.

    :param n_gram: 'c1' for single characters, anything else for character 2-grams
    :param window_size: length of each window the index sequence is reshaped into
    :return: a transforms.Compose pipeline producing (-1, window_size) index windows
    """
    # Only the tokenisation step differs between the two n-gram settings.
    if n_gram == 'c1':
        gram = ltransforms.Character()
    else:
        gram = ltransforms.Character2Gram()
    # end if

    return transforms.Compose([
        ltransforms.ToLower(),
        gram,
        ltransforms.ToIndex(start_ix=0),
        # Overlapping windows of window_size tokens.
        ltransforms.ToNGram(n=window_size, overlapse=True),
        ltransforms.Reshape((-1, window_size)),
        # Clamp indices into the vocabulary range.
        ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram] - 1)
    ])
# end text_transformer
def text_transformer_cnn(window_size, n_gram, token_to_ix):
    """
    Get text transformer for CNNSCD.

    :param window_size: fixed length each sample is padded/truncated to
    :param n_gram: 'c1' for single characters, anything else for character 2-grams
    :param token_to_ix: pre-built token-to-index vocabulary
    :return: a ltransforms.Compose pipeline producing flat, fixed-length index tensors
    """
    # Only the tokenisation step differs between the two n-gram settings.
    if n_gram == 'c1':
        gram = ltransforms.Character()
    else:
        gram = ltransforms.Character2Gram()
    # end if

    return ltransforms.Compose([
        ltransforms.ToLower(),
        gram,
        ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
        ltransforms.ToLength(length=window_size),
        ltransforms.Reshape((-1)),
        ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram])
    ])
# end text_transformer_cnn
# Load model weights and vocabulary.
# Use context managers so the checkpoint file handles are closed
# deterministically (the original open() calls leaked them).
with open(args.model, 'rb') as model_file:
    model.load_state_dict(torch.load(model_file))
if args.cuda:
    model.cuda()
# end if
with open(args.voc, 'rb') as voc_file:
    voc = torch.load(voc_file)

# Eval
model.eval()

# Build the evaluation transform pipeline; only the tokenisation
# step depends on the n-gram setting.
if args.n_gram == 'c1':
    gram = ltransforms.Character()
else:
    gram = ltransforms.Character2Gram()
# end if
transforms = ltransforms.Compose([
    ltransforms.ToLower(),
    gram,
    ltransforms.ToIndex(start_ix=1, token_to_ix=voc),
    ltransforms.ToLength(length=window_size),
    ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram])
])

# Validation losses
validation_total = 0
def train_ccsaa(fold=0, ccsaa_epoch=100, text_length=20, n_gram='c1', dataset_size=100,
                dataset_start=0, cuda=True, save=False, save_dir='.'):
    """
    Train CCSAA.

    :param fold: cross-validation fold to train/test on
    :param ccsaa_epoch: patience — consecutive non-improving epochs before stopping
    :param text_length: length of each character window
    :param n_gram: 'c1' for single characters, anything else for character 2-grams
    :param dataset_size: number of samples to load
    :param dataset_start: offset of the first sample
    :param cuda: move model and tensors to GPU when True
    :param save: load/save checkpoint and vocabulary under save_dir when True
    :param save_dir: root directory for checkpoints
    :return: (trained model, token-to-index vocabulary)
    """
    # Save path
    save_path = os.path.join(save_dir, str(int(dataset_size)), str(int(dataset_start)))

    # Transforms: only the tokenisation step differs between n-gram settings.
    if n_gram == 'c1':
        gram = transforms.Character()
    else:
        gram = transforms.Character2Gram()
    # end if
    transform = transforms.Compose([
        gram,
        transforms.ToIndex(start_ix=0),
        transforms.ToNGram(n=text_length, overlapse=True),
        transforms.Reshape((-1, text_length))
    ])

    # Load from directory
    reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
        dataset_size=dataset_size, dataset_start=dataset_start)
    reutersc50_dataset.transform = transform

    # Loss function
    loss_function = nn.CrossEntropyLoss()

    # Set fold
    reuters_loader_train.dataset.set_fold(fold)
    reuters_loader_test.dataset.set_fold(fold)

    # Model
    model = torchlanguage.models.CCSAA(
        text_length=text_length,
        vocab_size=settings.ccsaa_voc_size,
        embedding_dim=settings.ccsaa_embedding_dim,
        n_classes=settings.n_authors)
    if cuda:
        model.cuda()
    # end if

    # Resume from a saved checkpoint when available.
    model_path = os.path.join(save_path, u"ccsaa." + str(fold) + u".pth")
    voc_path = os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth")
    if save and os.path.exists(model_path) and os.path.exists(voc_path):
        # Context managers close the handles (the original open() calls leaked them).
        with open(model_path, 'rb') as model_file:
            model.load_state_dict(torch.load(model_file))
        with open(voc_path, 'rb') as voc_file:
            voc = torch.load(voc_file)
        return model, voc
    # end if

    # Optimizer
    optimizer = optim.SGD(model.parameters(), lr=settings.ccsaa_lr,
                          momentum=settings.ccsaa_momentum)

    # Best model so far. NOTE: state_dict() tensors alias the live parameters,
    # so we clone them whenever we record a new best — otherwise best_model
    # silently tracks the current weights and reloading it is a no-op.
    best_acc = 0.0
    best_model = {k: v.clone() for k, v in model.state_dict().items()}

    # Consecutive non-improving epochs.
    fail_count = 0

    # Epoch (upper bound is effectively "until patience runs out")
    for epoch in range(10000):
        # Total losses
        training_loss = 0.0
        training_total = 0.0
        test_loss = 0.0
        test_total = 0.0

        # Training pass for this fold
        for i, data in enumerate(reuters_loader_train):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape into (n_windows, text_length)
            inputs = inputs.view(-1, text_length)

            # Every window of a document gets the document's author label.
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Zero grad
            model.zero_grad()

            # Compute output
            log_probs = model(inputs)

            # Loss
            loss = loss_function(log_probs, outputs)

            # Backward and step
            loss.backward()
            optimizer.step()

            # Add (legacy .data[0] API — consistent with the rest of the file)
            training_loss += loss.data[0]
            training_total += 1.0
        # end for

        # Counters
        total = 0.0
        success = 0.0

        # Test pass
        for i, data in enumerate(reuters_loader_test):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Forward
            model_outputs = model(inputs)
            loss = loss_function(model_outputs, outputs)

            # Take the max as predicted
            _, predicted = torch.max(model_outputs.data, 1)

            # Add to correctly classified word
            success += (predicted == outputs.data).sum()
            total += predicted.size(0)

            # Add loss
            test_loss += loss.data[0]
            test_total += 1.0
        # end for

        # Accuracy
        accuracy = success / total * 100.0
        # print(u"Epoch {}, train loss {}, test loss {}, accuracy {}".format(epoch, training_loss / training_total, test_loss / test_total, accuracy))

        # Save if best (10-epoch warm-up before early stopping kicks in)
        if accuracy > best_acc and epoch > 10:
            best_acc = accuracy
            # Clone so the snapshot is decoupled from further training.
            best_model = {k: v.clone() for k, v in model.state_dict().items()}
            fail_count = 0
        elif epoch > 10:
            fail_count += 1
        # end if
        if fail_count > ccsaa_epoch:
            break
        # end if
    # end for

    # Load best
    model.load_state_dict(best_model)

    # Save
    if save:
        # Create dir if not exists (makedirs: save_path is nested under save_dir)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # end if

        # Save model
        with open(model_path, 'wb') as model_file:
            torch.save(model.state_dict(), model_file)

        # Save vocabulary (transforms[1] is the ToIndex step holding token_to_ix)
        with open(voc_path, 'wb') as voc_file:
            torch.save(transform.transforms[1].token_to_ix, voc_file)
    # end if
    return model, transform.transforms[1].token_to_ix
# end train_ccsaa
import numpy as np
from torchlanguage import transforms as ltransforms

# Settings
stride = 100
window_size = 3000
security_border = 200

# Parse arguments
args = functions.argument_parser_training_model()

# Build the transform pipeline; only the tokenisation step depends
# on the n-gram setting — the rest of the pipeline is shared.
if args.n_gram == 'c1':
    gram = ltransforms.Character()
else:
    gram = ltransforms.Character2Gram()
# end if
transforms = ltransforms.Compose([
    ltransforms.ToLower(),
    gram,
    ltransforms.ToIndex(start_ix=1),
    ltransforms.ToLength(length=window_size),
    ltransforms.Reshape((-1)),
    ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram])
])
# CLI options for CCSAA training.
parser.add_argument("--text-length", type=int, help="Text length", default=20)
parser.add_argument("--batch-size", type=int, help="Batch-size", default=64)
# NOTE(review): the help text looks inverted — this flag DISABLES CUDA
# (see args.cuda below); confirm and fix the string upstream.
parser.add_argument("--no-cuda", action='store_true', default=False, help="Enables CUDA training")
args = parser.parse_args()

# Use CUDA?
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Transforms: the two branches differ only in the tokenisation step
# (single characters vs character 2-grams).
if args.n_gram == 'c1':
    transform = transforms.Compose([
        transforms.Character(),
        transforms.ToIndex(start_ix=0),
        transforms.ToNGram(n=args.text_length, overlapse=True),
        transforms.Reshape((-1, args.text_length))
    ])
else:
    transform = transforms.Compose([
        transforms.Character2Gram(),
        transforms.ToIndex(start_ix=0),
        transforms.ToNGram(n=args.text_length, overlapse=True),
        transforms.Reshape((-1, args.text_length))
    ])
# end if

# Dataset
# NOTE(review): this call is truncated at the chunk boundary — the remaining
# keyword arguments continue outside this view.
dataset = datasets.ReutersC50Dataset(download=True, n_authors=15,
# Load feature selector (CCSAA trunk reused as a feature extractor:
# the classification head is replaced by Identity).
feature_selector = torchlanguage.models.CCSAA(text_length=20, vocab_size=84,
                                              embedding_dim=50, n_classes=15)
# Context managers close the checkpoint handles (the original open() calls leaked them).
with open(args.feature_selector, 'rb') as fs_file:
    feature_selector.load_state_dict(torch.load(fs_file))
feature_selector.linear = etnn.Identity()
if args.cuda:
    feature_selector.cuda()
# end if
with open(args.feature_selector_voc, 'rb') as fs_voc_file:
    feature_selector_voc = torch.load(fs_voc_file)

# Transforms: characters -> indices -> overlapping windows -> CCSAA features.
transform = transforms.Compose([
    transforms.Character(),
    transforms.ToIndex(token_to_ix=feature_selector_voc),
    transforms.MaxIndex(max_id=83),
    transforms.ToNGram(n=20, overlapse=True),
    transforms.Reshape((-1, 20)),
    transforms.ToCUDA(),
    transforms.FeatureSelector(model=feature_selector, n_features=150, to_variable=True),
    transforms.ToCPU(),
    # Normalisation constants presumably estimated on the training set —
    # TODO confirm against where they were computed.
    transforms.Normalize(mean=-5.08, std=0.3294)
])

# Results
parameter_averages = np.zeros(n_test)
parameter_max = np.zeros(n_test)