def _fuse_input_processors(ip_form: InputProcessor, ip_context: InputProcessor) -> InputProcessor:
    # both input processors must agree on all shared parameters
    assert ip_form.word_embeddings_file == ip_context.word_embeddings_file
    assert ip_form.word_embeddings_format == ip_context.word_embeddings_format
    assert set(ip_form.train_files) == set(ip_context.train_files)
    assert ip_form.vocab_file == ip_context.vocab_file
    assert ip_form.vector_size == ip_context.vector_size
    assert ip_form.min_word_count == ip_context.min_word_count
    assert ip_form.sep_symbol == ip_context.sep_symbol

    # take the form (ngram) parameters from ip_form and the context parameters from ip_context
    return InputProcessor(
        word_embeddings_file=ip_form.word_embeddings_file,
        word_embeddings_format=ip_form.word_embeddings_format,
        train_files=ip_form.train_files,
        vocab_file=ip_form.vocab_file,
        vector_size=ip_form.vector_size,
        ngram_threshold=ip_form.ngram_threshold,
        nmin=ip_form.nmin,
        nmax=ip_form.nmax,
        ngram_dropout=ip_form.ngram_dropout,
        min_word_count=ip_form.min_word_count,
        max_copies=ip_context.max_copies,
        smin=ip_context.smin,
        smax=ip_context.smax,
        max_seq_length=ip_context.max_seq_length,
        model_cls=ip_context.model_cls,
        bert_model=ip_context.bert_model)
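# A minimal fusion sketch (the model directories below are placeholders): load a
# form-only and a context-only InputProcessor and fuse them, taking the ngram
# parameters from the former and the context parameters from the latter.
ip_form = InputProcessor.load(os.path.join('form_model_dir', IP_NAME))
ip_context = InputProcessor.load(os.path.join('context_model_dir', IP_NAME))
ip_fused = _fuse_input_processors(ip_form, ip_context)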
def __init__(self, model_path: str, device: str = 'cpu'):
    """
    Initialize a new wrapper from a given model directory.
    :param model_path: the directory that contains the trained BERTRAM model
    :param device: the device to use for inferring word vectors
    """
    self.device = device

    # load the input processor corresponding to the model
    self.input_processor = InputProcessor.load(os.path.join(model_path, IP_NAME))
    self.input_processor.ngram_dropout = 0

    # load the model config and the actual model
    bertram_config = BertramConfig.load(os.path.join(model_path, CONFIG_NAME))
    _, _, bertram_cls = MODELS[bertram_config.transformer_cls]
    self.model, loading_info = bertram_cls.from_pretrained(
        model_path, bertram_config=bertram_config, output_loading_info=True)  # type: Bertram

    if loading_info['missing_keys']:
        logger.info('Reloading with do_setup=True because of missing keys: {}'.format(loading_info))
        del self.model
        self.model, loading_info = bertram_cls.from_pretrained(
            model_path, bertram_config=bertram_config, output_loading_info=True,
            do_setup=True)  # type: Bertram
        if loading_info['missing_keys']:
            raise ValueError('Something went wrong loading a pretrained model: {}'.format(loading_info))

    self.model.setup()
    self.model.to(self.device)
    self.model.eval()
    for param in self.model.parameters():
        param.requires_grad = False
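# A minimal usage sketch. The wrapper class name 'BertramWrapper' and the model
# directory are assumptions for illustration; only the constructor signature
# shown above is taken from the source.
wrapper = BertramWrapper('models/bertram-add', device='cuda')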
def _load_model_components(path: str) -> Tuple[InputProcessor, bertram.BertramConfig, bertram.Bertram]:
    input_processor = InputProcessor.load(os.path.join(path, bertram.IP_NAME))
    bertram_config = bertram.BertramConfig.load(os.path.join(path, bertram.CONFIG_NAME))
    _, _, bertram_cls = bertram.MODELS[bertram_config.transformer_cls]
    model = bertram_cls.from_pretrained(path, bertram_config=bertram_config)
    return input_processor, bertram_config, model
def _save_model_components(path: str, input_processor: InputProcessor,
                           bertram_config: bertram.BertramConfig, model: bertram.Bertram,
                           meta_info: Dict) -> None:
    # save the input processor and the BERTRAM config
    output_ip_file = os.path.join(path, bertram.IP_NAME)
    input_processor.save(output_ip_file)
    output_bc_file = os.path.join(path, bertram.CONFIG_NAME)
    bertram_config.save(output_bc_file)

    # save the model weights (unwrapping DataParallel if necessary) and the transformer config
    model_to_save = model.module if hasattr(model, 'module') else model
    output_model_file = os.path.join(path, WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), output_model_file)
    output_config_file = os.path.join(path, CONFIG_NAME)
    with open(output_config_file, 'w') as f:
        f.write(model_to_save.config.to_json_string())

    # write the meta information as simple key-value pairs
    meta_file = os.path.join(path, META_NAME)
    with open(meta_file, 'w') as f:
        for k, v in meta_info.items():
            f.write('{}: {}\n'.format(k, v))
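# A minimal round-trip sketch (directory names and meta_info keys are
# placeholders): load the components of one model and save them to a new
# directory together with some meta information.
input_processor, bertram_config, model = _load_model_components('source_model_dir')
_save_model_components('target_model_dir', input_processor, bertram_config, model,
                       meta_info={'source': 'source_model_dir'})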
def prepare_data(embedding_path, num_words=200000, max_len=50):
    inp_processor = InputProcessor("train.csv", "test.csv", num_words=num_words, max_len=max_len)
    inp_processor.pickle_data_sets()
    inp_processor.pickle_embeddings(embedding_path, "embedding_matrix.p")
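# Example call, assuming a word-embedding file in the format expected by
# pickle_embeddings (the file name is a placeholder):
prepare_data('glove.840B.300d.txt', num_words=100000, max_len=60)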
import sys


class Program:
    def __init__(self):
        self.__processor = InputProcessor()
        self.__sparqlQuery = SparqlQuery()

    def run(self):
        print("Welcome to the DBPedia Olympic Games question answerer.\nType 'q' and press enter to quit.\n")
        self.__processor.process_input()
        while self.__processor.get_input("raw") != 'q':
            try:
                category = self.__processor.category_from_input()
                # print the category and its link
                output = category + " ".join(["" for x in range(50 - len(category))]) + "http://dbpedia.org/resources/" + category
                dashes = len(output) * '-'
                print("\n\n" + dashes + "\n" + output + "\n" + dashes + "\n\n")
                self.__sparqlQuery.setQueryFromCategory(category)
                answer = self.__sparqlQuery.query()
                if answer:
                    self.print_answer(answer)
                else:
                    print("No answer found, searching again with subcategories...")
                    answer = self.__sparqlQuery.query(True)
                    if answer:
                        self.print_answer(answer)
                    else:
                        print("Unfortunately, no answer to the question could be found. Please try again.")
            except Exception as error:
                print("\n-------------------\nAn error occurred:")
                print(error.args[0])
                print("-------------------\n")
            self.__processor.process_input()

    def print_answer(self, answer, recurse=False):
        cont = "y"
        if len(answer) // 50 == 0:
            print("".join(answer))
        else:
            if not recurse:
                print("More than 50 answers. Printing the first 50:\n")
            print("".join(answer[:50]))
            cont = input("Press enter to continue. Type 'n' and press enter to abort. Type 'q' to quit.\n")
            if cont == 'n':
                return
            if cont == 'q':
                sys.exit()
            # continue with the remaining answers (the first 50 have already been printed)
            self.print_answer(answer[50:], True)
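# A minimal entry point sketch (an assumption; the original entry point is not shown):
if __name__ == '__main__':
    Program().run()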
from input_processor import InputProcessor
from file_processor import FileProcessor
from config import config

if __name__ == '__main__':
    # list of testcases to be run
    testcases = ['testcase1/testcase1.txt', 'testcase2/testcase2.txt', 'testcase3/testcase3.txt']

    # run all testcases, save each output as a JSON object in the output.json file and print the result
    for testcase in testcases:
        print("############## Running " + testcase + " ##############")
        query = input("Input Query for " + testcase + ": ")
        testcase_dir = config.TESTCASES_DIR + testcase.split('/')[0]
        ip_obj = InputProcessor()

        # check whether the query already exists in the output file
        return_dict = ip_obj.query_already_exists(testcase_dir, query)

        # otherwise, process the file and store the output in the output file
        if not return_dict:
            print("Processing File")
            return_dict = ip_obj.process_input(config.TESTCASES_DIR + testcase, query)
            fp_obj = FileProcessor()
            if not fp_obj.create_outfile(testcase_dir, return_dict):
                fp_obj.update_outfile(testcase_dir, return_dict)

        print(return_dict[query])
    default=100,
    help="The maximum number of contexts per word. Words with more contexts are discarded.")
parser.add_argument(
    '--split_contexts', type=int, default=None,
    help="If given, the list of contexts per word is split into chunks of size 'split_contexts'. "
         "Each chunk is processed separately and the results are then averaged. This can be used "
         "for words with too many contexts to fit into GPU memory.")
args = parser.parse_args()

input_processor = InputProcessor.load(os.path.join(args.model, bertram.IP_NAME))
input_processor.ngram_dropout = 0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bertram_config = bertram.BertramConfig.load(os.path.join(args.model, bertram.CONFIG_NAME))
_, _, bertram_cls = bertram.MODELS[bertram_config.transformer_cls]
model, loading_info = bertram_cls.from_pretrained(
    args.model, bertram_config=bertram_config, output_loading_info=True)  # type: bertram.Bertram

if loading_info['missing_keys']:
    logger.info('Reloading with do_setup=True because of missing keys: {}'.format(loading_info))
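# A minimal sketch (not from the source) of the chunk-and-average strategy that
# '--split_contexts' describes: split a word's contexts into fixed-size chunks,
# infer one vector per chunk, and average the resulting vectors. The function
# 'infer_vector' is a hypothetical stand-in for the model's inference call.
def embed_with_chunking(word, contexts, chunk_size, infer_vector):
    chunks = [contexts[i:i + chunk_size] for i in range(0, len(contexts), chunk_size)]
    vectors = [infer_vector(word, chunk) for chunk in chunks]
    return torch.stack(vectors).mean(dim=0)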
]
t = temp[temp.TYPO.map(lambda x: x.count('.')) == 2]
t = t[t.REGI.map(lambda x: x.count('.')) == 2]
t.reset_index(drop=True, inplace=True)

reg_list = list()
typo_list = list()
for i in range(t.shape[0]):
    reg_list.append(t['REGI'][i].split('.')[0])
    typo_list.append(t['TYPO'][i].split('.')[0])

ttt = t.drop_duplicates()
ttt.reset_index(drop=True, inplace=True)
top10 = ttt['REGI'].value_counts()[:10].index
topTypo10 = ttt['TYPO'].value_counts()[:10].index

token_size = 3
inputProcessor = InputProcessor()
in_list, out_list = inputProcessor.processInput(typo_list, reg_list, token_size)

# build the character vocabularies of the input and output names
in_vocab = set()
out_vocab = set()
for name in in_list:
    for char in name:
        in_vocab.add(char)
for name in out_list:
    for char in name:
        out_vocab.add(char)
vocab = in_vocab.union(out_vocab)
num_encoder_tokens = len(in_vocab)
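# A short continuation sketch (an assumption, not from the source): for a
# character-level encoder-decoder, the vocabularies built above are typically
# turned into index lookups like this.
num_decoder_tokens = len(out_vocab)
input_token_index = {char: i for i, char in enumerate(sorted(in_vocab))}
target_token_index = {char: i for i, char in enumerate(sorted(out_vocab))}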
def main(args):
    parser = argparse.ArgumentParser("Train a new BERTRAM instance")

    # Required parameters
    parser.add_argument('--model_cls', default='bert', choices=['bert', 'roberta'],
                        help="The transformer model class, either 'bert' or 'roberta'.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="The pretrained model to use (e.g., 'bert-base-uncased').")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument('--train_dir', type=str, required=True,
                        help="The directory in which the buckets for training are stored. "
                             "Each bucket should be a text file containing lines of the form "
                             "<WORD><TAB><CONTEXT_1><TAB>...<CONTEXT_n>")
    parser.add_argument('--vocab', type=str, required=True,
                        help="The file in which the vocabulary to be used for training is stored. "
                             "Each line should be of the form <WORD> <COUNT>")
    parser.add_argument('--emb_file', type=str, required=True,
                        help="The file in which the target embeddings for mimicking are stored.")
    parser.add_argument('--emb_dim', type=int, required=True,
                        help="The number of dimensions for the target embeddings.")
    parser.add_argument('--mode', choices=bertram.MODES, required=True,
                        help="The BERTRAM mode (e.g., 'form', 'add', 'replace').")

    # Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where to store the pre-trained models downloaded from s3.")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for.")
    parser.add_argument('--seed', type=int, default=42,
                        help="Random seed for initialization.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--save_epochs', type=int, nargs='+', default=[1, 5, 10],
                        help="The epochs after which a checkpoint is saved.")
    parser.add_argument('--min_word_count', '-mwc', type=int, default=100,
                        help="The minimum number of occurrences for a word to be used as a training target.")
    parser.add_argument('--num_buckets', type=int, default=25,
                        help="The number of buckets in the training directory.")
    parser.add_argument('--emb_format', type=str, choices=['text', 'gensim'], default='text',
                        help="The format in which the target embeddings are stored.")
    parser.add_argument('--no_finetuning', action='store_true',
                        help="If set, the underlying transformer language model is not finetuned.")
    parser.add_argument('--optimize_only_combinator', action='store_true',
                        help="If set, both the underlying transformer language model and the ngram "
                             "embeddings are frozen during training.")

    # Context parameters
    parser.add_argument('--smin', type=int, default=20,
                        help="The minimum number of contexts per word.")
    parser.add_argument('--smax', type=int, default=20,
                        help="The maximum number of contexts per word.")

    # Form parameters
    parser.add_argument('--nmin', type=int, default=3,
                        help="The minimum number of characters per ngram.")
    parser.add_argument('--nmax', type=int, default=5,
                        help="The maximum number of characters per ngram.")
    parser.add_argument('--dropout', type=float, default=0,
                        help="The ngram dropout probability.")
    parser.add_argument('--ngram_threshold', type=int, default=4,
                        help="The minimum number of occurrences for an ngram to get its own embedding.")

    # Visdom parameters
    parser.add_argument('--visdom_port', type=int, default=8098)
    parser.add_argument('--visdom_server', type=str, default=None)

    args = parser.parse_args(args)

    vis = visdom.Visdom(port=args.visdom_port, server=args.visdom_server) if args.visdom_server else None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps: {} < 1".format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # generate output directories for all save_epochs
    for epoch in range(int(args.num_train_epochs)):
        if (epoch + 1) in args.save_epochs:
            out_dir = args.output_dir + '-e' + str(epoch + 1)
            if os.path.exists(out_dir) and os.listdir(out_dir):
                raise ValueError("Output directory ({}) already exists and is not empty.".format(out_dir))
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)

    train_files = [args.train_dir + 'train.bucket' + str(i) + '.txt' for i in range(args.num_buckets)]
    _, _, bertram_cls = bertram.MODELS[args.model_cls]

    if args.bert_model in ['bert-base-uncased', 'bert-large-uncased', 'roberta-base', 'roberta-large']:
        logger.info("Initializing new BERTRAM instance from {}.".format(args.bert_model))
        input_processor = InputProcessor(
            word_embeddings_file=args.emb_file,
            word_embeddings_format=args.emb_format,
            train_files=train_files,
            vocab_file=args.vocab,
            vector_size=args.emb_dim,
            nmin=args.nmin,
            nmax=args.nmax,
            ngram_dropout=args.dropout,
            ngram_threshold=args.ngram_threshold,
            smin=args.smin,
            smax=args.smax,
            min_word_count=args.min_word_count,
            max_seq_length=args.max_seq_length,
            form_only=(args.mode == bertram.MODE_FORM),
            model_cls=args.model_cls,
            bert_model=args.bert_model
        )

        cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_TRANSFORMERS_CACHE, 'distributed_-1')
        bertram_config = bertram.BertramConfig(
            transformer_cls=args.model_cls,
            output_size=args.emb_dim,
            mode=args.mode,
            ngram_vocab_size=input_processor.ngram_builder.get_number_of_ngrams()
        )
        model = bertram_cls.from_pretrained(args.bert_model, cache_dir=cache_dir, bertram_config=bertram_config)
    else:
        logger.info("Initializing pretrained BERTRAM instance from {}.".format(args.bert_model))
        input_processor = InputProcessor.load(os.path.join(args.bert_model, IP_NAME))
        bertram_config = bertram.BertramConfig.load(os.path.join(args.bert_model, CONFIG_NAME))
        model, loading_info = bertram_cls.from_pretrained(args.bert_model, bertram_config=bertram_config,
                                                          output_loading_info=True)  # type: Bertram
        if loading_info['missing_keys']:
            raise ValueError('Something went wrong loading a pretrained model: {}'.format(loading_info))

    if args.mode != model.bertram_config.mode:
        logger.warning("Overwriting original mode {} with {}.".format(model.bertram_config.mode, args.mode))
        model.bertram_config.mode = args.mode
        input_processor.mode = args.mode

    model.setup()
    model = model.to(device)
    model_orig = model
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    if args.no_finetuning:
        # optimize everything except the transformer itself, which is frozen
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if ('bert.' not in n and 'roberta.' not in n)],
            'weight_decay': 0.01
        }]
        model_orig.transformer.eval()
        for name, param in model_orig.transformer.named_parameters():
            param.requires_grad = False
    elif args.optimize_only_combinator:
        # optimize only the combinator, freezing both the transformer and the ngram embeddings
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer
                       if ('bert.' not in n and 'roberta.' not in n) and 'ngram_processor.' not in n],
            'weight_decay': 0.01
        }]
        for name, param in model_orig.named_parameters():
            if 'bert.' in name or 'roberta.' in name or 'ngram_processor.' in name:
                param.requires_grad = False
    else:
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]

    num_train_examples = input_processor.get_number_of_train_examples_per_epoch()
    avg_contexts_per_word = (args.smin + args.smax) / 2
    avg_examples_per_batch = args.train_batch_size / avg_contexts_per_word
    num_train_optimization_steps = int(
        num_train_examples / avg_examples_per_batch / args.gradient_accumulation_steps) * args.num_train_epochs

    if args.mode == bertram.MODE_FORM:
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_proportion * num_train_optimization_steps,
                                     t_total=num_train_optimization_steps)

    global_step = 0

    if vis is not None:
        loss_window = vis.line(Y=np.array([0]), X=np.array([0]),
                               opts=dict(xlabel='Step', ylabel='Loss',
                                         title='Training loss (' + args.output_dir + ')', legend=['Loss']))
    else:
        loss_window = None

    model.train()
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        summed_loss = 0
        step = 0
        while True:
            try:
                nr_of_parallel_batches = max(1, n_gpu)
                batches = []

                # fix a number of chunks for the current set of batches to make parallel processing feasible
                num_chunks = -1
                for i in range(nr_of_parallel_batches):
                    batch = input_processor.generate_batch_from_buffer(args.train_batch_size, num_chunks=num_chunks)
                    batches.append(batch)
                    num_chunks = len(batch.nrs_of_contexts)

                input_ids = torch.stack([batch.input_ids for batch in batches]).to(device)
                segment_ids = torch.stack([batch.segment_ids for batch in batches]).to(device)
                nrs_of_contexts = torch.stack([batch.nrs_of_contexts for batch in batches]).to(device)
                mask_positions = torch.stack([batch.mask_positions for batch in batches]).to(device)
                input_mask = torch.stack([batch.input_mask for batch in batches]).to(device)

                # pad ngram ids so that all batches share the same number of ngrams per word
                if len(batches) > 1:
                    max_nr_of_ngrams = max([batch.ngram_features.ngram_ids.shape[1] for batch in batches])
                    for batch in batches:
                        batch_nr_of_ngrams = batch.ngram_features.ngram_ids.shape[1]
                        nr_of_words = batch.ngram_features.ngram_ids.shape[0]
                        padding = torch.zeros((nr_of_words, max_nr_of_ngrams - batch_nr_of_ngrams), dtype=torch.long)
                        batch.ngram_features.ngram_ids = torch.cat([batch.ngram_features.ngram_ids, padding], dim=1)

                ngram_ids = torch.stack([batch.ngram_features.ngram_ids for batch in batches]).to(device)
                ngram_lengths = torch.stack([batch.ngram_features.ngram_lengths for batch in batches]).to(device)
                target_vectors = torch.stack([batch.target_vectors for batch in batches]).to(device)

                loss = model(input_ids, segment_ids, nrs_of_contexts, mask_positions, input_mask,
                             ngram_ids, ngram_lengths, target_vectors)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                summed_loss += loss.item()
                logger.debug('Done with backward step')

                if step > 0 and step % 100 == 0:
                    if vis is not None:
                        vis.line(Y=np.array([summed_loss / 100]), X=np.array([global_step]),
                                 win=loss_window, update='append')
                    logger.info('Step: %d\tLoss: %.17f', step, (summed_loss / 100))
                    summed_loss = 0

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

                step += 1

            except EndOfDatasetException:
                logger.info('Done with epoch %d', epoch)
                input_processor.reset()

                if (epoch + 1) in args.save_epochs:
                    out_dir = args.output_dir + '-e' + str(epoch + 1)
                    model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
                    output_model_file = os.path.join(out_dir, WEIGHTS_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    output_config_file = os.path.join(out_dir, TRANSFORMER_CONFIG_NAME)
                    with open(output_config_file, 'w') as f:
                        f.write(model_to_save.config.to_json_string())
                    output_ip_file = os.path.join(out_dir, IP_NAME)
                    input_processor.save(output_ip_file)
                    output_bc_file = os.path.join(out_dir, CONFIG_NAME)
                    model_to_save.bertram_config.save(output_bc_file)
                break

    # Save the trained model and the associated configuration
    model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), output_model_file)
    output_config_file = os.path.join(args.output_dir, TRANSFORMER_CONFIG_NAME)
    with open(output_config_file, 'w') as f:
        f.write(model_to_save.config.to_json_string())
    output_ip_file = os.path.join(args.output_dir, IP_NAME)
    input_processor.save(output_ip_file)
    output_bc_file = os.path.join(args.output_dir, CONFIG_NAME)
    model_to_save.bertram_config.save(output_bc_file)
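# A hypothetical invocation sketch (the script name, paths, and values are
# placeholders; the flags themselves are defined by the argument parser above):
#
#   python train_bertram.py \
#       --model_cls bert \
#       --bert_model bert-base-uncased \
#       --output_dir models/bertram-add \
#       --train_dir buckets/ \
#       --vocab vocab.txt \
#       --emb_file embeddings.txt \
#       --emb_dim 300 \
#       --mode add
#
# Note that --train_dir needs a trailing slash, since the bucket file names are
# built by plain string concatenation.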
        error = desired_output - outputs[-1]
        for i in range(size, 0, -1):
            delta = error * sigmoid_derivative(outputs[i])
            adjustments[i - 1] = dot(outputs[i - 1].T, delta)
            error = dot(delta, self.weights[i - 1].T)
        for i in range(size - 1, 0, -1):
            self.weights[i] += adjustments[i] * .1


if __name__ == "__main__":
    random.seed(1)

    # a network with 7 inputs, two hidden layers of 8 and 9 neurons, and 3 outputs
    neural_network = NeuralNetwork(7, [8, 9], 3)
    input_processor = InputProcessor('data.in')
    values = input_processor.values

    training_inputs = np.array(list(map(lambda l: l[:7], values)))
    training_outputs = np.array(list(map(lambda l: l[7:], values)))
    neural_network.train(training_inputs, training_outputs)

    errors = []
    for value in values:
        out = neural_network.predict(value[:7])[-1]
        predicted = input_processor.revert(out)
        result = input_processor.revert(value[7:])
        print(result, predicted)
        errors.append(np.sum(np.abs(predicted - result) / predicted))
    print(np.average(errors))
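# The fragment above calls 'sigmoid_derivative' on layer outputs without
# defining it. A minimal sketch, assuming 'outputs[i]' already holds sigmoid
# activations s(x), in which case the derivative is s(x) * (1 - s(x)):
def sigmoid_derivative(x):
    # derivative of the sigmoid, expressed in terms of its output value
    return x * (1 - x)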