def query_pair(origin, destination, n_days=366):
    """
        Collect the flight quotes between two airports into one DataFrame.

        Scans `n_days` consecutive days starting today, but only queries the
        API for today and for every first day of a month in that window.

        Args:
            origin:         code for origin airport
            destination:    code for destination airport
            n_days:         number of consecutive days (from today) to scan

        Returns:
            The concatenated parsed quotes with a fresh index, or None (after
            logging a warning) when no queried day had any quote.
    """

    today = date.today()
    frames = []

    for offset in range(n_days):
        current_day = today + timedelta(offset)

        # Query today plus the first day of each month; skip everything else
        is_query_day = (current_day.day == 1) or (current_day == today)
        if not is_query_day:
            log.trace(f"Skiping day '{current_day}'")
            continue

        payload = query_flights(origin, destination, current_day).json()

        if payload["Quotes"]:
            frames.append(parse_data(payload))

    if not frames:
        log.warning(f"No flights from '{origin}' to '{destination}'")
        return None

    return pd.concat(frames).reset_index(drop=True)
def write_output_sequence(tar_real, predictions, step, write_output_seq):
    """Detokenize a batch, score it with ROUGE and BERT score, optionally dump it.

    Args:
        tar_real: iterable of target sequences; each item is passed through
            ``tf.squeeze`` before detokenizing.
        predictions: iterable of predicted sequences, paired with ``tar_real``.
        step: object exposing ``.numpy()``; its value is appended to
            ``config.output_sequence_write_path`` to name the output file.
        write_output_seq: when truthy, write one ``reference<TAB>hypothesis``
            line per pair to that file.

    Returns:
        Tuple ``(avg_rouge_f1, avg_bert_f1)``. Each metric falls back to 0 on
        its own if its computation raises.
    """
    ref_sents = []
    hyp_sents = []
    rouge_all = Rouge()
    for tar, ref_hyp in zip(tar_real, predictions):
        detokenized_refs, detokenized_hyp_sents = detokenize(
            target_tokenizer, tf.squeeze(tar), tf.squeeze(ref_hyp))
        ref_sents.append(detokenized_refs)
        hyp_sents.append(detokenized_hyp_sents)

    # The two metrics are computed independently so that a failure in one
    # does not discard the other. `except Exception` (instead of a bare
    # `except`) lets KeyboardInterrupt / SystemExit propagate.
    try:
        rouges = rouge_all.get_scores(ref_sents, hyp_sents)
        avg_rouge_f1 = np.mean([
            np.mean([
                rouge_scores['rouge-1']["f"],
                rouge_scores['rouge-2']["f"],
                rouge_scores['rouge-l']["f"],
            ]) for rouge_scores in rouges
        ])
    except Exception as err:
        log.warning(
            'Some problem while calculating ROUGE so setting ROUGE score to zero'
            f' ({err!r})'
        )
        avg_rouge_f1 = 0

    try:
        _, _, bert_f1 = b_score(ref_sents,
                                hyp_sents,
                                model_type=config.bert_score_model)
        avg_bert_f1 = np.mean(bert_f1.numpy())
    except Exception as err:
        log.warning(
            'Some problem while calculating BERT score so setting it to zero'
            f' ({err!r})'
        )
        avg_bert_f1 = 0

    if write_output_seq:
        output_path = config.output_sequence_write_path + str(step.numpy())
        with tf.io.gfile.GFile(output_path, 'w') as f:
            for ref, hyp in zip(ref_sents, hyp_sents):
                f.write(ref + '\t' + hyp + '\n')
    return (avg_rouge_f1, avg_bert_f1)
def monitor_run(ckpt_save_path,
                bert_score,
                rouge_score,
                train_loss,
                step,
                to_monitor=config.monitor_metric,
                copy_best_ckpt=True):
    """Track the monitored metric, keep the best checkpoint, decide whether to go on.

    Args:
        ckpt_save_path: path of the checkpoint just written; its directory is
            scanned and its base name is used to select the files to copy.
        bert_score: BERT F1 value for this evaluation.
        rouge_score: ROUGE F1 value for this evaluation.
        train_loss: current training loss, compared with
            ``config.min_train_loss``.
        step: step passed to the TensorBoard summaries.
        to_monitor: key of the metric to watch ('BERT_f1', 'ROUGE_f1' or
            'combined_metric').
        copy_best_ckpt: when False, the best-checkpoint files are not copied
            (for callers that pass a placeholder ``ckpt_save_path``).

    Returns:
        False when training should stop (minimum training loss reached, or
        tolerance exceeded with ``config.early_stop`` set), True otherwise.
    """
    ckpt_fold, ckpt_string = os.path.split(ckpt_save_path)
    if config.run_tensorboard:
        with valid_output_sequence_writer.as_default():
            tf.summary.scalar('ROUGE_f1', rouge_score, step=step)
            tf.summary.scalar('BERT_f1', bert_score, step=step)

    monitor_metrics = {'BERT_f1': bert_score, 'ROUGE_f1': rouge_score}
    # combined metric = weighted sum of (BERT_f1, ROUGE_f1)
    weighted_scores = [
        score * weight
        for score, weight in zip((bert_score, rouge_score),
                                 config.combined_metric_weights)
    ]
    monitor_metrics['combined_metric'] = round(
        tf.reduce_sum(weighted_scores).numpy(), 2)
    log.info(f"combined_metric {monitor_metrics['combined_metric']:4f}")

    if config.last_recorded_value < monitor_metrics[to_monitor]:
        # the monitored metric improved: reset the tolerance counter and
        # remember the new best value
        config.tolerance = 0
        config.last_recorded_value = monitor_metrics[to_monitor]
        if copy_best_ckpt:
            ckpt_files_tocopy = [
                name for name in os.listdir(ckpt_fold) if ckpt_string in name
            ]
            log.info(
                f'{to_monitor} is {monitor_metrics[to_monitor]:4f} so checkpoint '
                f'files {ckpt_string} will be copied to best checkpoint directory'
            )
            # copy the best checkpoints
            shutil.copy2(os.path.join(ckpt_fold, 'checkpoint'),
                         config.best_ckpt_path)
            for name in ckpt_files_tocopy:
                shutil.copy2(os.path.join(ckpt_fold, name),
                             config.best_ckpt_path)
    else:
        config.tolerance += 1

    # stop if minimum training loss is reached
    if train_loss < config.min_train_loss:
        log.info('Stop training since minimum training loss reached')
        return False

    # Warn and early stop. This check used to sit after an if/else that
    # returned on both arms, so it could never run.
    if config.tolerance > config.tolerance_threshold:
        log.warning('Tolerance exceeded')
        if config.early_stop:
            log.info(
                f'Early stopping since the {to_monitor} reached the tolerance threshold'
            )
            return False
    return True
def evaluate_bert_score(self):
    """Return the mean BERT F1 of ``self.hyp_sents`` against ``self.ref_sents``.

    ``b_score`` is run on CPU with ``config.bert_score_model``; the third
    element of its result is averaged. If the computation raises, a warning
    is logged and 0 is returned.
    """
    try:
        _, _, bert_f1 = b_score(self.ref_sents,
                                self.hyp_sents,
                                model_type=config.bert_score_model,
                                device='cpu')
        avg_bert_f1 = np.mean(bert_f1.numpy())
    # `except Exception` rather than a bare `except`, so KeyboardInterrupt
    # and SystemExit are not swallowed and turned into a zero score.
    except Exception as err:
        log.warning(
            'Some problem while calculating BERT score so setting it to zero'
            f' ({err!r})'
        )
        avg_bert_f1 = 0
    return avg_bert_f1
def evaluate_rouge(self):
    """Return the mean ROUGE F1 over all sentence pairs.

    For every score dict returned by ``self.calculate_rouge.get_scores`` the
    ``"f"`` values of 'rouge-1', 'rouge-2' and 'rouge-l' are averaged, and
    those per-pair averages are averaged again. If the computation raises,
    a warning is logged and 0 is returned.
    """
    try:
        all_rouge_scores = self.calculate_rouge.get_scores(self.ref_sents,
                                                           self.hyp_sents)
        avg_rouge_f1 = np.mean([
            np.mean([
                rouge_scores['rouge-1']["f"],
                rouge_scores['rouge-2']["f"],
                rouge_scores['rouge-l']["f"],
            ]) for rouge_scores in all_rouge_scores
        ])
    # `except Exception` rather than a bare `except`, so KeyboardInterrupt
    # and SystemExit are not swallowed and turned into a zero score.
    except Exception as err:
        log.warning(
            'Some problem while calculating ROUGE so setting it to zero'
            f' ({err!r})'
        )
        avg_rouge_f1 = 0
    return avg_rouge_f1
def check_ckpt(checkpoint_path):
    """Build a checkpoint manager and restore the latest checkpoint if one exists.

    Args:
        checkpoint_path: directory handed to ``tf.train.CheckpointManager``
            and probed with ``tf.train.latest_checkpoint``.

    Returns:
        The ``tf.train.CheckpointManager`` (keeps at most 10 checkpoints),
        whether or not anything was restored.
    """
    checkpoint = tf.train.Checkpoint(Model=Model, optimizer=optimizer)
    manager = tf.train.CheckpointManager(checkpoint,
                                         checkpoint_path,
                                         max_to_keep=10)

    # Nothing on disk yet: keep the freshly initialised weights
    if not tf.train.latest_checkpoint(checkpoint_path):
        log.warning('No checkpoint found so using the initialized_weights')
        return manager

    checkpoint.restore(manager.latest_checkpoint)
    log.info(manager.latest_checkpoint + ' restored')
    return manager
def query_flights(
    origin,
    destination,
    day,
    max_attempts=20,
    seconds_sleep=1,
    country="ES",
    currency="EUR",
    locale="en-US",
):
    """
        Request the flights for one day, retrying while the API is rate limiting

        Args:
            origin:         code for origin airport
            destination:    code for destination airport
            day:            day for the flights [date]
            max_attempts:   maximum number of requests to send
            seconds_sleep:  pause after a successful (200) response, before
                            it is returned
            country:        code for country (default: ES)
            currency:       code for currency (default: EUR)
            locale:         code for output info (default: en-US)

        Returns:
            The first response whose status code is 200.

        Raises:
            TimeoutError:   when no attempt produced a 200 response
            Whatever `response.raise_for_status()` raises for other statuses
    """

    url = f"{BASE_URL}{country}/{currency}/{locale}/{origin}/{destination}/{day:%Y-%m-%d}"

    for attempt in range(max_attempts):
        log.debug(
            f"Quering {origin}-{destination} for date '{day}' (attempt {attempt})"
        )

        response = requests.get(url, headers=HEADERS)
        status = response.status_code

        # 'Too many requests': back off a bit longer on every attempt
        if status == 429:
            log.warning(f"API limit reached at attempt {attempt + 1}")
            sleep(2 * attempt + 1)
            continue

        # Any other non-200 status: let requests raise if it is an error,
        # otherwise just try again
        if status != 200:
            response.raise_for_status()
            continue

        sleep(seconds_sleep)
        return response

    log.error(f"Number max of attempts reached ({max_attempts})")
    raise TimeoutError("TimeOut")
def training_loop(dataset, check_model_capacity, detokenize_samples=None):
    """Run ``train_step`` over ``dataset`` and report the loss periodically.

    Every ``config.steps_to_print_training_info`` steps the loss is fetched
    with ``batch_run_check``. When gradients are accumulated
    (``config.accumulate_gradients``) this only happens on steps that are
    also a multiple of ``config.gradient_accumulation_steps``, and
    ``train_sanity_check`` is run first.

    Args:
        dataset: iterable of ``(input_ids, target_ids)`` batches; must offer
            ``.repeat()`` when ``check_model_capacity`` is truthy.
        check_model_capacity: when truthy, the dataset is repeated 670 times,
            each reported loss is compared with the lowest seen so far, and a
            summary is logged after the loop.
        detokenize_samples: unused; kept for interface compatibility.
    """
    # NOTE(review): this block's nesting was reconstructed from a
    # whitespace-collapsed source; confirm the branch structure below
    # against the project history.
    min_loss = 10000000
    # Bound up-front so the summary after the loop cannot hit an unbound
    # local when no reporting step was reached or the dataset was empty.
    train_loss = None
    predicted_ids = None
    target_ids = None

    if check_model_capacity:
        dataset = dataset.repeat(670)

    for step, (input_ids, target_ids) in tqdm(enumerate(dataset, 1), initial=1):
        start = time.time()
        # None -> no gradient accumulation; otherwise True on update steps
        if config.accumulate_gradients:
            grad_accum_flag = step % config.gradient_accumulation_steps == 0
        else:
            grad_accum_flag = None

        predictions = train_step(input_ids, target_ids, grad_accum_flag)

        is_report_step = step % config.steps_to_print_training_info == 0
        # While accumulating, only steps that apply the gradients report
        if not is_report_step or grad_accum_flag is False:
            continue

        if grad_accum_flag:
            predicted_ids = train_sanity_check(target_tokenizer,
                                               predictions,
                                               target_ids)
        train_loss = batch_run_check(step, start)

        if check_model_capacity:
            if min_loss > train_loss:
                min_loss = train_loss
            else:
                log.warning('Loss not decreasing watch out')
                # Return value was never used by the original code
                monitor_run('not saving', 0, 0, 0.0, 1, copy_best_ckpt=False)

    if check_model_capacity:
        log.info(f'target_ids are {target_ids}')
        log.info(f'predicted ids are {predicted_ids}')
        if train_loss is not None and train_loss < config.min_train_loss:
            log.info('Minimum training loss reached')
        else:
            log.info("Loss didn't reach upto the min_train_loss specified, "
                     "try to increase the parameters of the model or number "
                     "of train steps")
def evaluate_bleu_score(self, case_sensitive=False):
    """Return the BLEU score of ``self.hyp_sents`` against ``self.ref_sents``.

    The sentences are written one per line to two temporary files, which
    are handed to ``compute_bleu.bleu_wrapper`` and removed afterwards.

    Args:
        case_sensitive: forwarded to ``compute_bleu.bleu_wrapper``.

    Returns:
        The value returned by ``bleu_wrapper``, or 0 (after a warning) if it
        raises.
    """
    # Local import so the block does not rely on a module-level `import os`
    import os

    ref_file = tempfile.NamedTemporaryFile(delete=False)
    hyp_file = tempfile.NamedTemporaryFile(delete=False)
    # Only the generated names are needed; close the handles right away so
    # they do not leak and the files can be reopened by name.
    ref_file.close()
    hyp_file.close()

    try:
        with tf.io.gfile.GFile(ref_file.name, 'w') as f_ref:
            with tf.io.gfile.GFile(hyp_file.name, 'w') as f_hyp:
                for reference, hypothesis in zip(self.ref_sents,
                                                 self.hyp_sents):
                    f_hyp.write(hypothesis + '\n')
                    f_ref.write(reference + '\n')
        try:
            bleu_score = compute_bleu.bleu_wrapper(
                ref_filename=ref_file.name,
                hyp_filename=hyp_file.name,
                case_sensitive=case_sensitive)
        # `except Exception` rather than a bare `except`, so
        # KeyboardInterrupt and SystemExit are not swallowed.
        except Exception as err:
            log.warning(
                'Some problem while calculating BLEU score so setting it to zero'
                f' ({err!r})'
            )
            bleu_score = 0
    finally:
        # delete=False means nobody else cleans these up
        for path in (ref_file.name, hyp_file.name):
            try:
                os.remove(path)
            except OSError:
                pass
    return bleu_score
def create_vocab(tokenizer_path, tok_type):
    """Load the subword tokenizer at ``tokenizer_path`` or build and save it.

    Args:
        tokenizer_path: file prefix the tokenizer is loaded from and, when it
            has to be built, saved to.
        tok_type: 'source' builds from the first element of each training
            pair and is checked against ``config.input_vocab_size``; any
            other value uses the second element and
            ``config.output_vocab_size``.

    Returns:
        The loaded or newly built ``SubwordTextEncoder``.

    Raises:
        FileNotFoundError: the vocab file is missing and ``config.use_tfds``
            is false, so there is no corpus to build it from.
        AssertionError: ``tokenizer.vocab_size + 2`` does not match the
            configured vocabulary size.
    """
    encoder_cls = tfds.features.text.SubwordTextEncoder
    try:
        # Use the path the caller passed in; reading config.tokenizer_path
        # here would make source and target share a single vocab file.
        tokenizer = encoder_cls.load_from_file(tokenizer_path)
    except FileNotFoundError as err:
        log.warning(f'Vocab files not available in {tokenizer_path} so building it from the training set')
        if not config.use_tfds:
            # Previously fell through to an unbound `tokenizer`
            raise FileNotFoundError(
                f'Vocab files not available in {tokenizer_path} and '
                'config.use_tfds is disabled, so the vocab cannot be built'
            ) from err
        examples, _ = tfds.load(config.tfds_name,
                                with_info=True,
                                as_supervised=True)
        train_examples = examples['train']
        if tok_type == 'source':
            corpus = (ip_seq.numpy() for ip_seq, _ in train_examples)
        else:
            corpus = (op_seq.numpy() for _, op_seq in train_examples)
        tokenizer = encoder_cls.build_from_corpus(corpus,
                                                  target_vocab_size=2**13)
        tokenizer.save_to_file(tokenizer_path)

    if tok_type == 'source':
        expected_vocab_size = config.input_vocab_size
    else:
        expected_vocab_size = config.output_vocab_size
    # +2 mirrors the original check — presumably two extra special tokens;
    # confirm against the model's vocabulary handling.
    assert (tokenizer.vocab_size + 2 == expected_vocab_size), f'{tok_type}vocab size in configuration script should be {tokenizer.vocab_size+2}'
    log.info(f'{tok_type} vocab file created and saved to {tokenizer_path}')
    return tokenizer