def __save_references(
        project: str,
        issue_summary: Tuple[str, int, Set[str], Set[str], Set[str], Set[str],
                             Set[str], Set[str], List[str], List[str]]) -> None:
    """
    Save references for an issue in JSON format.

    :param project: Project to write references for
    :param issue_summary: Data type describing necessary data
    :return: None
    """
    summary_dir = os.path.join("Projects", project, "Summary")
    if not os.path.isdir(summary_dir):
        os.mkdir(summary_dir)
    issue_dict = {
        "issue_key": issue_summary[0],
        "issue_id": issue_summary[1],
        "urls": list(issue_summary[2]),
        "revisions": list(issue_summary[3]),
        "mailing_lists": list(issue_summary[4]),
        "pdf_documents": list(issue_summary[5]),
        "archives": list(issue_summary[6]),
        "other_issues": list(issue_summary[7]),
        "commits": issue_summary[8],
        "pull_requests": issue_summary[9]
    }
    path = os.path.join(summary_dir, issue_summary[0] + ".json")
    utils.save_as_json(issue_dict, path)

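# All of the snippets in this listing delegate the actual write to a project-local
# save_as_json helper whose definition is not shown. The argument order varies between
# projects (some pass the data first, some the path first, and one snippet below even
# passes a directory and filename separately), so each repository evidently ships its
# own variant. A minimal sketch, assuming the (data, path) order used in the
# issue-parser snippets:
import json

def save_as_json(data, path):
    # Hypothetical helper: serialize `data` to `path` as pretty-printed JSON.
    with open(path, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)
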
def preprocess_seq_galaxy_clades(fasta_file, samples_clades, LEN_AA):
    encoded_samples = list()
    aa_chars = utils.get_all_possible_words(amino_acid_codes)
    f_word_dictionaries, r_word_dictionaries = utils.get_words_indices(aa_chars)
    all_sample_names = list(samples_clades.keys())
    for sequence_obj in SeqIO.parse(fasta_file, "fasta"):
        row = list()
        seq_id = sequence_obj.id
        sequence = str(sequence_obj.seq)
        sequence = sequence.replace("*", '')
        if "X" not in sequence and all_sample_names.count(seq_id) > 0 and len(sequence) == LEN_AA:
            row.append(seq_id)
            clade_name = samples_clades[seq_id]
            clade_name = utils.format_clade_name(clade_name)
            row.append(clade_name)
            seq_chars = list(sequence)
            indices_chars = [str(r_word_dictionaries[i]) for i in seq_chars]
            joined_indices_kmers = ','.join(indices_chars)
            row.append(joined_indices_kmers)
            encoded_samples.append(row)
    sample_clade_sequence_df = pd.DataFrame(encoded_samples, columns=["SampleName", "Clade", "Sequence"])
    sample_clade_sequence_df.to_csv(PATH_SAMPLES_CLADES, index=None)
    utils.save_as_json(PATH_F_DICT, f_word_dictionaries)
    utils.save_as_json(PATH_R_DICT, r_word_dictionaries)
    return sample_clade_sequence_df, f_word_dictionaries, r_word_dictionaries

def parse_issues(self, issues_raw: List[dict] = None) -> List[dict]:
    """
    For each raw issue, create a JSON file containing necessary information:
        1. Issue key
        2. Project information
            2.1 Project key
            2.2 Project name
        3. Author
        4. Date of creation
        5. Date of update
        6. Current status
        7. Summary
        8. Description
        9. List of attachments
            9.1 File name
            9.2 URL to attachment
        10. List of issue links
            10.1 Type of link
            10.2 Issue key
        11. List of remote links
            11.1 Title of link
            11.2 URL
        12. List of comments
            12.1 Author
            12.2 Date of creation
            12.3 Date of update
            12.4 Comment body

    The parsed data for each issue is stored in
    "Projects/<project_name>/Issues/<issue_key>.json".

    :param issues_raw: List of dictionaries representing raw issues. If none is
        specified, then they are loaded from the cache
    :return: List of dictionaries of parsed issues
    """
    print("{}: parsing issues. This may take a while".format(self.project))
    count = 0
    issues_dir = self.issues_dir
    utils.create_dir_if_necessary(issues_dir)
    if not issues_raw:
        issues_raw = self.load_issues_raw()
    issues = []
    for count, issue in enumerate(issues_raw, start=1):
        filename = issue["key"] + ".json"
        path = os.path.join(issues_dir, filename)
        json_object = self.__prepare_json_object(issue)
        utils.save_as_json(json_object, path)
        issues.append(json_object)
        if count % 100 == 0:
            print("{}: Parsed {} issues".format(self.project, count))
    print("{}: Finished parsing issues! Totally parsed: {}".format(self.project, count))
    return issues

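# For orientation, a hypothetical shape of the per-issue JSON written by parse_issues,
# reconstructed only from the field list in the docstring above; the actual key names are
# chosen by __prepare_json_object, which is not shown here.
example_issue = {
    "key": "PROJ-123",
    "project": {"key": "PROJ", "name": "Example Project"},
    "author": "jdoe",
    "created": "2021-01-01T10:00:00",
    "updated": "2021-01-02T12:00:00",
    "status": "Resolved",
    "summary": "Example summary",
    "description": "Example description",
    "attachments": [{"filename": "log.txt", "url": "https://example.org/attachments/log.txt"}],
    "issue_links": [{"type": "relates to", "key": "PROJ-124"}],
    "remote_links": [{"title": "Design document", "url": "https://example.org/design"}],
    "comments": [{"author": "jdoe", "created": "2021-01-01T11:00:00",
                  "updated": "2021-01-01T11:05:00", "body": "Example comment"}],
}
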
def get_galaxy_samples_clades(path_seq_clades):
    ncov_global_df = pd.read_csv(path_seq_clades, sep="\t")
    samples_clades = dict()
    for idx in range(len(ncov_global_df)):
        sample_row = ncov_global_df.take([idx])
        s_name = sample_row["seqName"].values[0]
        clade_name = sample_row["clade"].values[0]
        if sample_row["qc.overallStatus"].values[0] and sample_row["qc.overallStatus"].values[0] == "good":
            clade_name = utils.format_clade_name(clade_name)
            samples_clades[s_name] = clade_name
    utils.save_as_json(PATH_ALL_SAMPLES_CLADES, samples_clades)
    return samples_clades

def __save_issues_raw(self, issues: List[dict]) -> None:
    """
    Persist raw issues in the corresponding folder.

    :param issues: List of dictionaries describing unparsed issues
    :return: None
    """
    directory = self.issues_raw_dir
    utils.create_dir_if_necessary(directory)
    for issue in issues:
        key = issue["key"]
        filename = key + ".json"
        path = os.path.join(directory, filename)
        utils.save_as_json(issue, path)
    # report success only after every issue has actually been written
    print("\t{}: Successfully saved!".format(self.project))

def find_pred_mut():
    mut_tr = utils.read_json(results_path + "tr_parent_child_pos_{}_{}.json".format(clade_parent, clade_child))
    mut_te = utils.read_json(results_path + "te_parent_child_pos_{}_{}.json".format(clade_parent, clade_child))
    mut_future_true = utils.read_json(results_path + "parent_child_pos_{}_{}.json".format(clade_child, clade_future))
    mut_future_gen = utils.read_json(results_path + "parent_gen_pos_{}_{}.json".format(clade_child, clade_future))
    novel_mut = list()
    novel_mut_orig = list()
    present_in_tr_mut = list()
    for key in mut_future_gen:
        if key not in mut_tr and key not in mut_te and key in mut_future_true:
            print(key, mut_future_gen[key], mut_future_true[key])
            s_key = key.split(">")
            s_key = "".join(s_key)
            novel_mut_orig.append(s_key)
            novel_mut.append(key)
    print("novel mut share: {}, {}, {}".format(str(len(novel_mut) / float(len(mut_future_true))), str(len(novel_mut)), str(len(mut_future_true))))
    utils.save_as_json(results_path + "predicted_novel_mutations_in_c_{}.json".format(clade_child), novel_mut)
    utils.save_as_json(results_path + "predicted_novel_mutations_in_c_{}_original.json".format(clade_child), novel_mut_orig)
    print("---")
    for key in mut_future_gen:
        if key in mut_future_true and key in mut_tr:
            print(key, mut_future_gen[key], mut_future_true[key], mut_tr[key])
            present_in_tr_mut.append(key)
    print("--")
    tr_pos = get_POS(mut_tr)
    print(tr_pos)
    true_pos = get_POS(mut_future_true)
    print()
    print(true_pos)
    gen_pos = get_POS(mut_future_gen)
    print()
    print(gen_pos)
    novel_pos = list()
    present_in_tr_pos = list()
    for pos in gen_pos:
        if pos in true_pos and pos not in tr_pos:
            novel_pos.append(pos)
        if pos in tr_pos:
            present_in_tr_pos.append(pos)
    print()
    print("% gen mut present in tr: {}".format(str(float(len(present_in_tr_pos)) / len(tr_pos))))
    print()
    print("% novel mut pos: {}".format(str(float(len(novel_pos)) / len(true_pos))))
    print()
    print(novel_pos)

def __save_json(json_list: List[dict], directory: str, issue_key: str = None) -> None:
    """
    Save target list of dictionaries to the desired directory in a file in JSON
    format. If issue_key is specified, then the file is named
    "<issue_key>.json", otherwise "all.json".

    :param json_list: List of dictionaries to save as JSON file
    :param directory: Directory where to save the file
    :param issue_key: Target issue key
    :return: None
    """
    utils.create_dir_if_necessary(directory)
    if issue_key:
        filename = issue_key + ".json"
    else:
        filename = "all.json"
    path = os.path.join(directory, filename)
    utils.save_as_json(json_list, path)

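# The issue-parser snippets also rely on utils.create_dir_if_necessary, which is not shown.
# A minimal sketch of what the single-argument call implies; the makedirs/exist_ok body is
# an assumption, only the helper's name appears in the source:
import os

def create_dir_if_necessary(directory):
    # Hypothetical helper: create `directory` (including parents) if it does not exist yet.
    os.makedirs(directory, exist_ok=True)
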
def parse_issue(self, issue_key: str) -> dict:
    """
    Parse a raw issue and store it in
    "Projects/<project_name>/Issues/<issue_key>.json". If the issue is not
    cached, then it is fetched first.

    :param issue_key: Key of the issue to parse
    :return: Dictionary representing the issue
    """
    filename = issue_key + ".json"
    utils.create_dir_if_necessary(self.issues_dir)
    path_raw = os.path.join(self.issues_raw_dir, filename)
    if not os.path.isfile(path_raw):
        issue_raw = self.fetch_issue_raw(issue_key, save=True)
    else:
        issue_raw = utils.load_json(path_raw)
    json_object = self.__prepare_json_object(issue_raw)
    path = os.path.join(self.issues_dir, filename)
    utils.save_as_json(json_object, path)
    return json_object

def scrape_deaths():
    # Load content from Terviseamet's Covid dashboard and parse it
    log_status("Scraping data on deaths from " + TERVISEAMET_COVID_DASHBOARD)
    html = requests.get(TERVISEAMET_COVID_DASHBOARD).text
    soup = BeautifulSoup(html, "html.parser")

    # Extract number of deaths from page content and update JSON data on deaths
    deaths_container = soup.select(DEATHS_SELECTOR)
    if len(deaths_container) > 0:
        try:
            # Get number of deaths and the current date
            deaths_count = int(deaths_container[0].text.strip())
            current_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")

            # Load existing deaths data
            json_deaths = read_json_from_file(DEATHS_PATH)

            # Add new entry to deaths data for current date
            deaths_output = {}
            if len(json_deaths):
                deaths_output = json_deaths
            deaths_output[current_date] = deaths_count

            # Save data on deaths
            save_as_json(DEATHS_PATH + ".tmp", deaths_output)

            # Log status
            log_status("Successfully scraped deaths. Total deaths: " + str(deaths_count))
        except:
            # Log error
            error_message = "Error when scraping data on deaths"
            log_status(error_message + ":")
            log_status(traceback.format_exc())
            raise Exception(error_message)
    else:
        # Log error
        error_message = "Error: could not find page element with data on deaths"
        log_status(error_message)
        raise Exception(error_message)

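# scrape_deaths writes to DEATHS_PATH + ".tmp" rather than to DEATHS_PATH itself, which
# suggests a write-then-swap step elsewhere in the pipeline. A minimal sketch of how such
# a swap could look; the promote_tmp_file name and its placement are assumptions, not part
# of the original code:
import os

def promote_tmp_file(path):
    # Hypothetical step: replace `path` with the freshly written `path + ".tmp"` in one rename.
    tmp_path = path + ".tmp"
    if os.path.isfile(tmp_path):
        os.replace(tmp_path, path)
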
        'pageid': article['pageid'],
        'text': corpus.strip_mediawiki_markup(article['text'])
    }

# limit = 0
article_sentences = {}
all_articles = []
for title, article in semi_cleaned_articles.items():
    all_articles.append(Article(title, article['pageid'],
                                corpus.get_sentences_and_citations(article['text'])))
    # limit += 1
    # if limit == 3:
    #     break

train, dev, test = corpus.get_corpus_splits(all_articles)

# print(get_corpus_stats(all_articles))
# print(get_corpus_stats(train))
# print(get_corpus_stats(dev))
# print(get_corpus_stats(test))

save_as_json(all_articles, 'all_articles.json')
save_as_json(train, 'train.json')
save_as_json(dev, 'dev.json')
save_as_json(test, 'test.json')
save_stats_to_md(all_articles, train, dev, test)

# with open('article_sentences.json', 'w') as json_file:
#     json.dump(article_sentences, json_file, sort_keys=True, indent=4)

def start_training(forward_dict, rev_dict, gen_encoder=None, gen_decoder=None):
    # note: these two dicts are re-used (and filled) by the GAN-training phase below,
    # so they are initialised here rather than left commented out
    pos_variations = dict()
    pos_variations_count = dict()
    start_time = time.time()
    print("Loading datasets...")
    #pretr_clade_files = glob.glob('data/pretrain/*.csv')
    tr_clade_files = glob.glob('data/train/*.csv')
    te_clade_files = glob.glob('data/test/*.csv')
    pretr_combined_X = list()
    pretr_combined_y = list()
    '''print("Loading pre-training datasets...")
    for name in pretr_clade_files:
        pretr_clade_df = pd.read_csv(name, sep="\t")
        pretr_X = pretr_clade_df["X"].tolist()
        pretr_y = pretr_clade_df["Y"].tolist()
        pretr_combined_X.extend(pretr_X)
        pretr_combined_y.extend(pretr_y)'''
    combined_X = list()
    combined_y = list()
    # load train data
    print("Loading training datasets...")
    for name in tr_clade_files:
        tr_clade_df = pd.read_csv(name, sep="\t")
        X = tr_clade_df["X"].tolist()
        y = tr_clade_df["Y"].tolist()
        combined_X.extend(X)
        combined_y.extend(y)
    combined_te_X = list()
    combined_te_y = list()
    # load test data
    print("Loading test datasets...")
    for te_name in te_clade_files:
        te_clade_df = pd.read_csv(te_name, sep="\t")
        te_X = te_clade_df["X"].tolist()
        te_y = te_clade_df["Y"].tolist()
        combined_te_X.extend(te_X)
        combined_te_y.extend(te_y)
        print(len(te_X), len(te_y))
    print()
    tr_unrelated_files = glob.glob("data/tr_unrelated/*.csv")
    print("Loading unrelated datasets...")
    unrelated_X = list()
    unrelated_y = list()
    for tr_unrelated in tr_unrelated_files:
        unrelated_clade_df = pd.read_csv(tr_unrelated, sep="\t")
        un_X = unrelated_clade_df["X"].tolist()
        un_y = unrelated_clade_df["Y"].tolist()
        unrelated_X.extend(un_X)
        unrelated_y.extend(un_y)
        print(len(un_X), len(un_y))
    unrelated_X = np.array(unrelated_X)
    unrelated_y = np.array(unrelated_y)
    print("Unrelated data sizes")
    print(len(unrelated_X), len(unrelated_y))
    print("train and test data sizes")
    print(len(combined_X), len(combined_y), len(combined_te_X), len(combined_te_y))
    kmer_f_dict = utils.read_json(PATH_KMER_F_DICT)
    kmer_r_dict = utils.read_json(PATH_KMER_R_DICT)
    vocab_size = len(kmer_f_dict) + 1
    print("Number of kmers: {}".format(str(len(kmer_f_dict))))
    print("Vocab size: {}".format(str(len(kmer_f_dict) + 1)))
    combined_X = np.array(combined_X)
    combined_y = np.array(combined_y)
    X_train = combined_X
    y_train = combined_y
    test_dataset_in = np.array(combined_te_X)
    test_dataset_out = np.array(combined_te_y)
    if gen_encoder is None or gen_decoder is None:
        encoder, decoder = neural_network.make_generator_model(
            len_final_aa_padding, vocab_size, embedding_dim, enc_units,
            batch_size, size_stateful)
    else:
        encoder = gen_encoder
        decoder = gen_decoder
    #print(len(pretr_combined_X))
    '''if len(pretr_combined_X) == 0:
        X_pretrain, X_train, y_pretrain, y_train = train_test_split(combined_X, combined_y, test_size=pretrain_train_size)
        X_pretrain = np.array(X_pretrain)
        y_pretrain = np.array(y_pretrain)
        pre_train_cluster_indices, pre_train_cluster_indices_dict = utils.find_cluster_indices(y_pretrain, batch_size)
        df_pretrain = pd.DataFrame(list(zip(X_pretrain, y_pretrain)), columns=["X", "Y"])
        df_pretrain.to_csv(PRETRAIN_DATA, sep="\t", index=None)
        # save update train dataset
        df_train = pd.DataFrame(list(zip(X_train, y_train)), columns=["X", "Y"])
        df_train.to_csv(tr_clade_files[0], sep="\t", index=None)
    else:
    '''
    #X_pretrain = np.array(pretr_combined_X)
    #y_pretrain = np.array(pretr_combined_y)
    #print("Pretrain data sizes")
    #print(X_pretrain.shape, y_pretrain.shape)
    # divide into pretrain and train
    print("Train data sizes")
    print(X_train.shape, y_train.shape)
    X_train = np.array(X_train)
    y_train = np.array(y_train)

    # pretrain generator
    if to_pretrain is True:
        utils.create_dirs("data/generated_files/pre_train")
        pretrain_gen_train_loss = list()
        pretrain_gen_test_loss = list()
        pretrain_gen_test_seq_var = list()
        pretrain_gen_train_seq_var = list()
        pretrain_gen_batch_test_loss = list()
        pretrain_gen_batch_test_seq_var = list()
        print("Pretraining generator...")
        # balance tr data by mutations
        x_pretr_parent_child_mut_indices, x_pos_variations, x_pos_variations_count = utils.get_mutation_tr_indices(
            X_train, kmer_f_dict, kmer_r_dict, forward_dict, rev_dict, "x")
        #print(x_pos_variations)
        #print()
        #print(x_pos_variations_count)
        y_pretr_parent_child_mut_indices, y_pos_variations, y_pos_variations_count = utils.get_mutation_tr_indices(
            y_train, kmer_f_dict, kmer_r_dict, forward_dict, rev_dict, "y")
        #print(y_pos_variations)
        #print()
        print(y_pos_variations_count)
        #sys.exit()
        #print("Creating training data generator balanced by sample weights...")
        #pre_train_cluster_indices, pre_train_cluster_indices_dict, scatter_df = utils.find_cluster_indices(y_train, batch_size)
        print()
        print("Creating training data generator balanced by sample weights...")
        #training_generator = utils.calculate_sample_weights(y_train, batch_size, pos_variations_count)
        inputs_tokens_weights = utils.calculate_input_sample_weights(X_train, x_pos_variations_count)
        #pre_train_cluster_indices, pre_train_cluster_indices_dict = utils.find_cluster_indices(y_train, batch_size)
        pre_train_cluster_indices_dict = dict()
        mut_pattern, mut_pattern_dist, mut_pattern_dist_freq, mut_buckets = utils.create_mut_balanced_dataset(
            X_train, y_train, kmer_f_dict, len_final_aa_padding, batch_size)
        #sys.exit()
        #utils.save_as_json(PRETR_MUT_INDICES, pretr_parent_child_mut_indices)
        utils.save_as_json(PRETR_MUT_INDICES, y_pretr_parent_child_mut_indices)
        # get pretraining dataset as sliced tensors
        n_pretrain_batches = int(X_train.shape[0] / float(batch_size))
        print("Num of pretrain batches: {}".format(str(n_pretrain_batches)))
        #updated_lr = pretr_lr
        for i in range(retrain_pretrain_start_index, pretrain_epochs):
            #pretrain_generator_optimizer = tf.keras.optimizers.Adam(learning_rate=pretr_lr)
            print("Pre training epoch {}/{}...".format(str(i + 1), str(pretrain_epochs)))
            pretrain_gen_tr_loss, bat_te_gen_loss, bat_te_seq_var, bat_tr_seq_var, encoder, decoder = train_model.pretrain_generator(
                [X_train, y_train, test_dataset_in, test_dataset_out, te_batch_size, n_te_batches],
                i, encoder, decoder, enc_units, vocab_size, n_pretrain_batches,
                batch_size, pretrain_epochs, size_stateful, forward_dict, rev_dict,
                kmer_f_dict, kmer_r_dict, y_pos_variations_count)
            print("Pre training loss at epoch {}/{}: Generator loss: {}, variation score: {}".format(
                str(i + 1), str(pretrain_epochs), str(pretrain_gen_tr_loss), str(np.mean(bat_tr_seq_var))))
            pretrain_gen_train_loss.append(pretrain_gen_tr_loss)
            pretrain_gen_batch_test_loss.append(bat_te_gen_loss)
            pretrain_gen_batch_test_seq_var.append(bat_te_seq_var)
            pretrain_gen_train_seq_var.append(bat_tr_seq_var)
            print()
            print("Pretrain: predicting on test datasets...")
            with tf.device('/device:cpu:0'):
                pretrain_gen_te_loss, pretrain_gen_te_seq_var = utils.predict_sequence(
                    i, 0, test_dataset_in, test_dataset_out, te_batch_size,
                    n_te_batches, len_final_aa_padding, vocab_size, enc_units,
                    encoder, decoder, size_stateful, "pretrain")
                pretrain_gen_test_loss.append(pretrain_gen_te_loss)
                pretrain_gen_test_seq_var.append(pretrain_gen_te_seq_var)
            print("Pre-training epoch {} finished".format(str(i + 1)))
            print()
            epoch_type_name = "pretrain_epoch_{}".format(str(i + 1))
            utils.save_predicted_test_data(test_dataset_in, test_dataset_out,
                                           te_batch_size, enc_units, vocab_size,
                                           len_final_aa_padding, size_stateful,
                                           epoch_type_name,
                                           PRETRAIN_GEN_ENC_MODEL,
                                           PRETRAIN_GEN_DEC_MODEL)
            # np.savetxt(PRETRAIN_GEN_LOSS, pretrain_gen_train_loss)
            np.savetxt(PRETRAIN_GEN_TEST_LOSS, pretrain_gen_test_loss)
            np.savetxt("data/generated_files/pretrain_gen_test_seq_var.txt", pretrain_gen_test_seq_var)
            np.savetxt("data/generated_files/pretrain_gen_batch_test_loss.txt", pretrain_gen_batch_test_loss)
            np.savetxt("data/generated_files/pretrain_gen_batch_test_seq_var.txt", pretrain_gen_batch_test_seq_var)
            np.savetxt("data/generated_files/pretrain_gen_batch_train_seq_var.txt", pretrain_gen_train_seq_var)
        print("Pre-training finished")
        print()
        end_time = time.time()
        print("Pretraining finished in {} seconds".format(str(np.round(end_time - start_time, 2))))

    if gan_train is False:
        sys.exit()

    # GAN training
    # create discriminator model
    utils.create_dirs("data/generated_files/gan_train")
    train_cluster_indices, train_cluster_indices_dict = utils.find_cluster_indices(y_train, batch_size)
    disc_parent_encoder_model, disc_gen_encoder_model = neural_network.make_disc_par_gen_model(
        len_final_aa_padding, vocab_size, embedding_dim, enc_units, batch_size, size_stateful)
    discriminator = neural_network.make_discriminator_model(enc_units)
    # use the pretrained generator and train it along with discriminator
    print("Training Generator and Discriminator...")
    train_gen_total_loss = list()
    train_gen_true_loss = list()
    train_gen_fake_loss = list()
    train_disc_total_loss = list()
    train_disc_true_loss = list()
    train_disc_fake_loss = list()
    train_te_loss = list()
    train_gen_test_seq_var = list()
    train_gen_batch_test_loss = list()
    train_gen_batch_test_seq_var = list()
    n_train_batches = int(X_train.shape[0] / float(batch_size))
    print("Num of train batches: {}".format(str(n_train_batches)))
    # balance tr data by mutations
    tr_parent_child_mut_indices, pos_variations, pos_variations_count = utils.get_mutation_tr_indices(
        X_train, y_train, kmer_f_dict, kmer_r_dict, forward_dict, rev_dict,
        pos_variations, pos_variations_count)
    print(pos_variations)
    print()
    print(pos_variations_count)
    utils.save_as_json(TR_MUT_INDICES, tr_parent_child_mut_indices)
    for n in range(epochs):
        print("Training epoch {}/{}...".format(str(n + 1), str(epochs)))
        epo_gen_true_loss, epo_gen_fake_loss, epo_total_gen_loss, epo_disc_true_loss, epo_disc_fake_loss, epo_total_disc_loss, epo_bat_te_loss, epo_bat_gen_seq_var, encoder, decoder = train_model.start_training_mut_balanced(
            [X_train, y_train, unrelated_X, unrelated_y, test_dataset_in, test_dataset_out, te_batch_size, n_te_batches],
            n, encoder, decoder, disc_parent_encoder_model, disc_gen_encoder_model,
            discriminator, enc_units, vocab_size, n_train_batches, batch_size,
            tr_parent_child_mut_indices, epochs, size_stateful, forward_dict,
            rev_dict, kmer_f_dict, kmer_r_dict, pos_variations,
            pos_variations_count, train_cluster_indices_dict)
        print("Training loss at epoch {}/{}, G true loss: {}, G fake loss: {}, Total G loss: {}, D true loss: {}, D fake loss: {}, Total D loss: {}".format(
            str(n + 1), str(epochs), str(epo_gen_true_loss), str(epo_gen_fake_loss),
            str(epo_total_gen_loss), str(epo_disc_true_loss), str(epo_disc_fake_loss),
            str(epo_total_disc_loss)))
        train_gen_total_loss.append(epo_total_gen_loss)
        train_gen_true_loss.append(epo_gen_true_loss)
        train_gen_fake_loss.append(epo_gen_fake_loss)
        train_disc_total_loss.append(epo_total_disc_loss)
        train_disc_true_loss.append(epo_disc_true_loss)
        train_disc_fake_loss.append(epo_disc_fake_loss)
        train_gen_batch_test_loss.append(epo_bat_te_loss)
        train_gen_batch_test_seq_var.append(epo_bat_gen_seq_var)
        # predict seq on test data
        print("Prediction on test data...")
        with tf.device('/device:cpu:0'):
            epo_tr_gen_te_loss, epo_tr_gen_seq_var = utils.predict_sequence(
                n, 0, test_dataset_in, test_dataset_out, te_batch_size,
                n_te_batches, len_final_aa_padding, vocab_size, enc_units,
                encoder, decoder, size_stateful, "gan_train")
            train_te_loss.append(epo_tr_gen_te_loss)
            train_gen_test_seq_var.append(epo_tr_gen_seq_var)
        print()
        epoch_type_name = "gan_train_epoch_{}".format(str(n + 1))
        utils.save_predicted_test_data(test_dataset_in, test_dataset_out,
                                       te_batch_size, enc_units, vocab_size,
                                       len_final_aa_padding, size_stateful,
                                       epoch_type_name, TRAIN_GEN_ENC_MODEL,
                                       TRAIN_GEN_DEC_MODEL)
    print("Training finished")
    # save loss files
    np.savetxt(TRAIN_GEN_TOTAL_LOSS, train_gen_total_loss)
    np.savetxt(TRAIN_GEN_FAKE_LOSS, train_gen_fake_loss)
    np.savetxt(TRAIN_GEN_TRUE_LOSS, train_gen_true_loss)
    np.savetxt(TRAIN_DISC_FAKE_LOSS, train_disc_fake_loss)
    np.savetxt(TRAIN_DISC_TRUE_LOSS, train_disc_true_loss)
    np.savetxt(TRAIN_DISC_TOTAL_LOSS, train_disc_total_loss)
    np.savetxt(TEST_LOSS, train_te_loss)
    np.savetxt("data/generated_files/train_gen_batch_test_loss.txt", train_gen_batch_test_loss)
    np.savetxt("data/generated_files/train_gen_batch_test_seq_var.txt", train_gen_batch_test_seq_var)
    np.savetxt("data/generated_files/train_gen_test_seq_var.txt", train_gen_test_seq_var)
    end_time = time.time()
    print("Program finished in {} seconds".format(str(np.round(end_time - start_time, 2))))

import utils


def main(suas):
    reading_levels = utils.get_reading_levels(suas)
    utils.enrich_collection(suas, reading_levels)


if __name__ == '__main__':
    suas = utils.get_suas_1970()
    main(suas)
    utils.save_as_json(utils.reading_levels_fname, suas)

def start_training_mut_balanced(inputs, epo_step, encoder, decoder, disc_par_enc,
                                disc_gen_enc, discriminator, enc_units, vocab_size,
                                n_train_batches, batch_size, parent_child_mut_indices,
                                epochs, size_stateful, forward_dict, rev_dict,
                                kmer_f_dict, kmer_r_dict, pos_variations,
                                pos_variations_count, train_cluster_indices_dict):
    """
    Training sequences balanced by mutation type
    """
    X_train, y_train, unrelated_X, unrelated_y, test_dataset_in, test_dataset_out, te_batch_size, n_te_batches = inputs
    epo_avg_total_gen_loss = list()
    epo_ave_gen_true_loss = list()
    epo_avg_gen_fake_loss = list()
    epo_avg_total_disc_loss = list()
    epo_avg_disc_fake_loss = list()
    epo_avg_disc_real_loss = list()
    disc_real_loss = tf.constant(0)
    disc_fake_loss = tf.constant(0)
    total_disc_loss = tf.constant(0)
    gen_fake_loss = tf.constant(0)
    gen_true_loss = tf.constant(0)
    total_gen_loss = tf.constant(0)
    batch_mut_distribution = dict()
    epo_te_gen_loss = list()
    epo_te_seq_var = list()
    pos_size = dict()  #get_mut_size(parent_child_mut_indices)
    mut_keys = list(parent_child_mut_indices.keys())
    epo_train_save_folder = "data/generated_files/gan_train/{}".format(str(epo_step + 1))
    enc_train_save_folder = "data/generated_files/gan_train/{}/enc".format(str(epo_step + 1))
    dec_train_save_folder = "data/generated_files/gan_train/{}/dec".format(str(epo_step + 1))
    utils.create_dirs(epo_train_save_folder)
    utils.create_dirs(enc_train_save_folder)
    utils.create_dirs(dec_train_save_folder)
    for step in range(n_train_batches):
        #unrolled_x, unrolled_y, batch_mut_distribution = sample_true_x_y(parent_child_mut_indices, batch_size, X_train, y_train, batch_mut_distribution)
        unrolled_x, unrolled_y = sample_true_x_y(batch_size, X_train, y_train, train_cluster_indices_dict)
        un_X, un_y = utils.sample_unrelated_x_y(unrelated_X, unrelated_y, batch_size)
        seq_len = unrolled_x.shape[1]
        disc_gen = step % n_disc_step
        if disc_gen in list(range(0, n_disc_step - n_gen_step)):
            # train discriminator
            _, _, disc_par_enc, disc_gen_enc, discriminator, disc_real_loss, disc_fake_loss, total_disc_loss = d_loop(
                seq_len, batch_size, vocab_size, enc_units, unrolled_x, unrolled_y,
                un_X, un_y, encoder, decoder, disc_par_enc, disc_gen_enc,
                discriminator, size_stateful, pos_size, pos_variations,
                pos_variations_count, step)
            # share weights with generator's encoder
            disc_par_enc.load_weights(GEN_ENC_WEIGHTS)
            disc_gen_enc.load_weights(GEN_ENC_WEIGHTS)
            #disc_gen_enc.layers[1].set_weights(disc_par_enc.layers[1].get_weights())
            print("Training epoch {}/{}, batch {}/{}, D true loss: {}, D fake loss: {}, Total D loss: {}".format(
                str(epo_step + 1), str(epochs), str(step + 1), str(n_train_batches),
                str(disc_real_loss.numpy()), str(disc_fake_loss.numpy()),
                str(total_disc_loss.numpy())))
        else:
            # train generator with unrolled discriminator
            # save disc weights to reset after unrolling
            discriminator.save_weights(DISC_WEIGHTS)
            disc_par_enc.save_weights(DISC_PAR_ENC_WEIGHTS)
            disc_gen_enc.save_weights(DISC_GEN_ENC_WEIGHTS)
            print("Applying unrolled steps...")
            # unrolling steps
            for i in range(unrolled_steps):
                print("Unrolled step: {}/{}".format(str(i + 1), str(unrolled_steps)))
                # sample data for unrolling
                #unroll_x, unroll_y, _ = sample_true_x_y(parent_child_mut_indices, batch_size, X_train, y_train, batch_mut_distribution)
                unroll_x, unroll_y = sample_true_x_y(batch_size, X_train, y_train, train_cluster_indices_dict)
                un_unroll_X, un_unroll_y = utils.sample_unrelated_x_y(unrelated_X, unrelated_y, batch_size)
                # train discriminator
                _, _, disc_par_enc, disc_gen_enc, discriminator, d_r_l, d_f_l, d_t_l = d_loop(
                    seq_len, batch_size, vocab_size, enc_units, unroll_x, unroll_y,
                    un_unroll_X, un_unroll_y, encoder, decoder, disc_par_enc,
                    disc_gen_enc, discriminator, size_stateful, pos_size,
                    pos_variations, pos_variations_count, step)
                print("Unrolled disc losses: real {}, fake {}, total {}".format(
                    str(d_r_l.numpy()), str(d_f_l.numpy()), str(d_t_l.numpy())))
            # finish unrolling
            # train generator with unrolled discriminator
            encoder, decoder, _, _, _, gen_true_loss, gen_fake_loss, total_gen_loss = g_loop(
                seq_len, batch_size, vocab_size, enc_units, unrolled_x, unrolled_y,
                un_X, un_y, encoder, decoder, disc_par_enc, disc_gen_enc,
                discriminator, size_stateful, pos_size, pos_variations,
                pos_variations_count, step)
            print("Training epoch {}/{}, batch {}/{}, G true loss: {}, G fake loss: {}, Total G loss: {}".format(
                str(epo_step + 1), str(epochs), str(step + 1), str(n_train_batches),
                str(gen_true_loss.numpy()), str(gen_fake_loss.numpy()),
                str(total_gen_loss.numpy())))
            encoder.save_weights(GEN_ENC_WEIGHTS)
            # reset weights of discriminator, disc_par_enc and disc_gen_enc after unrolling
            discriminator.load_weights(DISC_WEIGHTS)
            disc_par_enc.load_weights(DISC_PAR_ENC_WEIGHTS)
            disc_gen_enc.load_weights(DISC_GEN_ENC_WEIGHTS)
        # intermediate prediction on test data while training
        if (step + 1) % test_log_step == 0 and step > 0:
            print("Training: prediction on test data...")
            with tf.device('/device:cpu:0'):
                _, _ = utils.predict_sequence(epo_step, step, test_dataset_in,
                                              test_dataset_out, te_batch_size,
                                              n_te_batches, seq_len, vocab_size,
                                              enc_units, encoder, decoder,
                                              size_stateful, "gan_train", True)
            print("Training epoch {}/{}, batch {}/{}, G true loss: {}, G fake loss: {}, Total G loss: {}, D true loss: {}, D fake loss: {}, Total D loss: {}".format(
                str(epo_step + 1), str(epochs), str(step + 1), str(n_train_batches),
                str(gen_true_loss.numpy()), str(gen_fake_loss.numpy()),
                str(total_gen_loss.numpy()), str(disc_real_loss.numpy()),
                str(disc_fake_loss.numpy()), str(total_disc_loss.numpy())))
        # write off results
        epo_ave_gen_true_loss.append(gen_true_loss.numpy())
        epo_avg_gen_fake_loss.append(gen_fake_loss.numpy())
        epo_avg_total_gen_loss.append(total_gen_loss.numpy())
        epo_avg_disc_fake_loss.append(disc_fake_loss.numpy())
        epo_avg_disc_real_loss.append(disc_real_loss.numpy())
        epo_avg_total_disc_loss.append(total_disc_loss.numpy())
    # save model
    print("Training epoch {} finished, Saving model...".format(str(epo_step + 1)))
    print()
    tf.keras.models.save_model(encoder, TRAIN_GEN_ENC_MODEL)
    tf.keras.models.save_model(decoder, TRAIN_GEN_DEC_MODEL)
    # save trained models per epoch
    tf.keras.models.save_model(encoder, enc_train_save_folder)
    tf.keras.models.save_model(decoder, dec_train_save_folder)
    encoder.save_weights(GEN_ENC_WEIGHTS)
    decoder.save_weights(GEN_DEC_WEIGHTS)
    utils.save_as_json("data/generated_files/ave_batch_x_y_mut_epo_{}.json".format(str(epo_step)), batch_mut_distribution)
    return np.mean(epo_ave_gen_true_loss), np.mean(epo_avg_gen_fake_loss), np.mean(epo_avg_total_gen_loss), np.mean(epo_avg_disc_real_loss), np.mean(epo_avg_disc_fake_loss), np.mean(epo_avg_total_disc_loss), np.mean(epo_te_gen_loss), np.mean(epo_te_seq_var), encoder, decoder

def main():
    # Log status
    log_status("Starting to generate chart data at " + str(TODAY_DMYHM))

    # 1. Create date ranges for charts
    log_status("Creating date ranges for charts")
    case_dates = pd.date_range(start=DATE_SETTINGS["first_case_date"], end=YESTERDAY_YMD)
    vaccination_dates = pd.date_range(start=DATE_SETTINGS["vaccination_start_date"], end=YESTERDAY_YMD)

    # 2. Calculate data related to deaths
    try:
        deaths = read_json_from_file(DEATHS_PATH)
        manual_data = read_json_from_file(MANUAL_DATA_PATH)
    except:
        # Log error
        log_status('Error when loading local data:')
        log_status(traceback.format_exc())
        exit()

    log_status("Calculating data related to deaths")
    manual_data["deceased"].update(deaths)
    deceased = list(manual_data["deceased"].values())
    n_deaths = deceased[-1]
    n_deaths_change = int(deceased[-1]) - int(deceased[-2])

    # 3. Calculate data related to test results
    # Define columns to import
    column_list = ['Gender', 'AgeGroup', 'County', 'ResultValue', 'StatisticsDate']
    test_results = get_json_from_csv_file(TEST_RESULTS_PATH, column_list)

    log_status("Calculating data related to test results")

    # Find count of confirmed cases
    n_confirmed_cases = np.sum([res["ResultValue"] == "P" for res in test_results])

    # Find total number of tests
    n_tests_administered = len(test_results)
    log_status("Total number of tests: " + str(n_tests_administered))

    infections_by_county = get_infection_count_by_county(test_results, county_mapping)
    county_by_day = get_county_by_day(test_results, case_dates, county_mapping, county_sizes)
    confirmed_cases_by_county = get_confirmed_cases_by_county(test_results, county_mapping)
    tests_per_day_chart_data = get_tests_per_day_chart_data(test_results, case_dates)
    cumulative_cases_chart_data = get_cumulative_cases_chart_data(
        test_results, case_dates, tests_per_day_chart_data)
    cumulative_tests_chart_data = get_cumulative_tests_chart_data(test_results, case_dates)
    positive_test_by_age_chart_data = get_positive_tests_by_age_chart_data(test_results)
    positive_negative_chart_data = get_positive_negative_chart_data(test_results, county_mapping)
    county_daily_active = get_county_daily_active(test_results, case_dates, county_mapping, county_sizes)

    # Delete test result data from memory
    del test_results

    infections_by_county_10000 = get_infections_data_by_count_10000(infections_by_county, county_sizes)
    tests_pop_ratio = get_test_data_pop_ratio(infections_by_county_10000)
    new_cases_per_day_chart_data = get_new_cases_per_day_chart_data(cumulative_cases_chart_data)
    n_active_cases = cumulative_cases_chart_data["active"][-1]
    n_active_cases_change = (cumulative_cases_chart_data["active"][-1] -
                             cumulative_cases_chart_data["active"][-2])
    per_100k = cumulative_cases_chart_data["active100k"][-1]
    active_infections_by_county = [
        {"MNIMI": k, "sequence": v, "drilldown": k}
        for k, v in county_daily_active["countyByDayActive"].items()
    ]
    active_infections_by_county_100k = [
        [k, round(v[-1] / county_sizes[k] * 100000, 2)]
        for k, v in county_daily_active["countyByDayActive"].items()
    ]

    # 4. Calculate data related to test locations
    test_locations = read_json_from_file(TEST_LOCATIONS_PATH)
    municipalities_data = get_municipality_data(test_locations, county_mapping)

    # Delete test location data from memory
    del test_locations

    # 5. Calculate data related to hospitalisation
    hospitalization = read_json_from_file(HOSPITALIZATION_PATH)
    log_status("Calculating data related to hospitalisation")

    # Set hospitalised and ICU time-series
    hospital = get_hospital_data(hospitalization, DATE_SETTINGS["first_case_date"])
    # TODO: Based on cross-checking with the hospitalisation data published by TEHIK, the data listed
    #       in the manual_data.json file with the field name "intensive" appears to show the number
    #       of patients on ventilation. We should fix the terminology and make sure that the intensive
    #       and on ventilation statistics are being calculated correctly.
    intensive = list(get_in_intensive_data(hospitalization, manual_data["intensive"]).values())
    on_ventilation = list(get_on_ventilation_data(hospitalization).values())

    # Delete hospitalization data from memory
    del hospitalization

    hospitalised = hospital["activehospitalizations"]
    n_on_ventilation = on_ventilation[-1]
    n_on_ventilation_change = int(on_ventilation[-1]) - int(on_ventilation[-2])

    # 6. Calculate data related to vaccination
    vaccination = read_json_from_file(VACCINATIONS_PATH)
    log_status("Calculating data related to vaccination")

    vaccinated_people_chart_data = get_vaccinated_people_chart_data(vaccination, vaccination_dates)
    last_day_vaccination_data = [x for x in vaccination
                                 if x["MeasurementType"] == "Vaccinated"
                                 and x["VaccinationSeries"] == 1][-1]
    last_day_completed_vaccination_data = [x for x in vaccination
                                           if x["MeasurementType"] == "FullyVaccinated"
                                           and x["VaccinationSeries"] == 1][-1]
    last_day_doses_administered_data = [x for x in vaccination
                                        if x["MeasurementType"] == "DosesAdministered"
                                        and x["VaccinationSeries"] == 1][-1]

    # Delete vaccination data from memory
    del vaccination

    n_fully_vaccinated = last_day_completed_vaccination_data["TotalCount"]
    n_fully_vaccinated_change = last_day_completed_vaccination_data["DailyCount"]
    n_fully_vaccinated_percentage = last_day_completed_vaccination_data["PopulationCoverage"]
    n_vaccinated_at_least_one_dose = last_day_vaccination_data["TotalCount"]
    n_vaccinated_at_least_one_dose_change = last_day_vaccination_data["DailyCount"]
    n_vaccinated_at_least_one_dose_percentage = last_day_vaccination_data["PopulationCoverage"]
    # vaccination_number_total = (n_vaccinated_at_least_one_dose - n_fully_vaccinated)
    # vaccination_number_last_day = (n_vaccinated_at_least_one_dose_change - n_fully_vaccinated_change)

    # 7. Create and save final JSON
    log_status("Compiling final JSON")
    final_json = {
        "updatedOn": TODAY_DMYHM,
        "confirmedCasesNumber": str(n_confirmed_cases),
        # TODO: For consistency, we should include the change in the number of confirmed cases as well.
        "hospitalisedNumber": str(hospital["activehospitalizations"][-1]),
        "hospitalChanged": str(hospital["activehospitalizations"][-1] - hospital["activehospitalizations"][-2]),
        "onVentilation": on_ventilation,
        "onVentilationNumber": n_on_ventilation,
        "onVentilationChanged": n_on_ventilation_change,
        "deceased": deceased,
        "deceasedNumber": str(n_deaths),
        "deceasedChanged": str(n_deaths_change),
        "testsAdministeredNumber": str(n_tests_administered),
        # TODO: For consistency, we should include the change in the number of tests as well.
        "activeCasesNumber": str(n_active_cases),
        "activeChanged": str(n_active_cases_change),
        "perHundred": str(per_100k),  # TODO: This should be given a clearer name.
        "dates2": [str(x.date()) for x in case_dates],  # TODO: Change key to "caseDates"
        "dates3": [str(x.date()) for x in vaccination_dates],  # TODO: Change key to "vaccinationDates"
        "counties": counties,
        "age_groups": age_groups,
        "dataInfectionsByCounty": infections_by_county,
        "dataInfectionsByCounty10000": infections_by_county_10000,
        "dataActiveInfectionsByCounty100k": active_infections_by_county_100k,
        "dataActiveInfectionsByCounty": active_infections_by_county,
        "dataTestsPopRatio": tests_pop_ratio,
        "countyByDay": county_by_day,
        "dataCountyDailyActive": county_daily_active,
        "dataConfirmedCasesByCounty": confirmed_cases_by_county,
        "dataCumulativeCasesChart": cumulative_cases_chart_data,
        "dataNewCasesPerDayChart": new_cases_per_day_chart_data,
        "dataCumulativeTestsChart": cumulative_tests_chart_data,
        "dataTestsPerDayChart": tests_per_day_chart_data,
        "dataPositiveTestsByAgeChart": positive_test_by_age_chart_data,
        "dataPositiveNegativeChart": positive_negative_chart_data,
        "dataVaccinatedPeopleChart": vaccinated_people_chart_data,
        "dataMunicipalities": municipalities_data,
        "hospital": hospital,  # TODO: Rename this to make it clearer what data it contains.
        # "vaccinationNumberTotal": vaccination_number_total,
        # "vaccinationNumberLastDay": vaccination_number_last_day,
        "fullyVaccinatedNumber": n_fully_vaccinated,
        "fullyVaccinatedNumberChange": n_fully_vaccinated_change,
        "fullyVaccinatedNumberPercentage": n_fully_vaccinated_percentage,
        "vaccinatedAtLeastOneDoseNumber": n_vaccinated_at_least_one_dose,
        "vaccinatedAtLeastOneDoseChange": n_vaccinated_at_least_one_dose_change,
        "vaccinatedAtLeastOneDosePercentage": n_vaccinated_at_least_one_dose_percentage,
    }

    # Dump JSON output
    log_status("Dumping JSON output")
    save_as_json(OUTPUT_FILE_LOCATION, final_json)

    # Log finish time
    finish = datetime.today().astimezone(ESTONIA_TIMEZONE).strftime("%d/%m/%Y, %H:%M")
    log_status("Finished update process at " + finish)

def main():
    # Log status
    log_status("Starting data update process at " + str(today))

    # Get current number of deaths from Terviseamet's Covid dashboard
    try:
        scrape_deaths()
    except:
        log_status("Aborting data update.")
        exit()

    # Load data from external services
    log_status("Downloading data from TEHIK: Test results")
    json_testing = get_json_data(TESTING_ENDPOINT)
    log_status("Downloading data from TEHIK: Location data")
    json_test_location = get_json_data(TEST_LOCATION_ENDPOINT)
    log_status("Downloading data from TEHIK: Hospitalisation data")
    json_hospitalisation = get_json_data(HOSPITALISATION_ENDPOINT)
    log_status("Downloading data from TEHIK: Vaccination data")
    json_vaccination = get_json_data(VACCINATION_ENDPOINT)

    # Validate data from remote endpoints
    # TODO: Add checks that the testing and vaccination data are up to date. We will need to adopt
    #       a different approach than for the test location and hospitalisation data due to the fact
    #       that the data structure of the JSON is different. Checking the "Last-Modified" header of the
    #       response may be the way to go and would handle the possibility that there are no tests or
    #       vaccinations on a particular day.
    ok = True
    if json_testing is None:
        log_status("Unable to retrieve testing data")
        ok = False
    if json_test_location is None:
        log_status("Unable to retrieve location data")
        ok = False
    elif not is_up_to_date(json_test_location, "LastStatisticsDate"):
        log_status("Location data is not up-to-date")
        ok = False
    if json_hospitalisation is None:
        log_status("Unable to retrieve hospitalisation data")
        ok = False
    elif not is_up_to_date(json_hospitalisation, "LastLoadStatisticsDate"):
        log_status("Hospitalisation data is not up-to-date")
        ok = False
    if json_vaccination is None:
        log_status("Unable to retrieve vaccination data")
        ok = False
    # TODO: Review whether this check is needed. I have commented it out for now.
    # if not is_header_last_modified_up_to_date(TEST_LOCATION_ENDPOINT):
    #     log_status("Location data last modified is not up-to-date")
    #     ok = False
    if not ok:
        log_status("One or more of the TEHIK APIs has not been updated or could not be retrieved.")
        log_status("Aborting data update.")
        exit()

    # Load locally-stored data
    log_status("Loading local data files")
    try:
        json_deaths = read_json_from_file(DEATHS_FILE_LOCATION)
        json_manual = read_json_from_file(MANUAL_DATA_FILE_LOCATION)
    except:
        # Log error
        log_status('Error when loading local data:')
        log_status(traceback.format_exc())
        exit()

    # Log status
    log_status("Calculating main statistics")

    # Statsbar
    # Find count of confirmed cases
    n_confirmed_cases = np.sum([res["ResultValue"] == "P" for res in json_testing])

    # Find total number of tests
    n_tests_administered = len(json_testing)

    # Create date ranges for charts
    # dates1 = pd.date_range(start=DATE_SETTINGS["dates1_start"], end=yesterday)
    dates2 = pd.date_range(start=DATE_SETTINGS["dates2_start"], end=yesterday)
    dates3 = pd.date_range(start=DATE_SETTINGS["dates3_start"], end=yesterday)

    # Set recovered, deceased, hospitalised and ICU time-series
    hospital = get_hospital_data(json_hospitalisation, DATE_SETTINGS["dates2_start"])
    recovered = hospital["discharged"]
    json_manual["deceased"].update(json_deaths)
    deceased = list(json_manual["deceased"].values())
    hospitalised = hospital["activehospitalizations"]
    # TODO: Based on cross-checking with the hospitalisation data published by TEHIK, the data listed
    #       in the manual_data.json file with the field name "intensive" appears to show the number
    #       of patients on ventilation. We should fix the terminology and make sure that the intensive
    #       and on ventilation statistics are being calculated correctly.
    intensive = list(get_in_intensive_data(json_hospitalisation, json_manual["intensive"]).values())
    on_ventilation = list(get_on_ventilation_data(json_hospitalisation).values())
    n_deaths = deceased[-1]
    n_deaths_change = int(deceased[-1]) - int(deceased[-2])

    # Get data for each chart
    log_status("Calculating data for charts")
    infections_by_county = get_infection_count_by_county(json_testing, county_mapping)
    infections_by_county_10000 = get_infections_data_by_count_10000(infections_by_county, county_sizes)
    tests_pop_ratio = get_test_data_pop_ratio(infections_by_county_10000)
    county_by_day = get_county_by_day(json_testing, dates2, county_mapping, county_sizes)
    confirmed_cases_by_county = get_confirmed_cases_by_county(json_testing, county_mapping)
    cumulative_cases_chart_data = get_cumulative_cases_chart_data(
        json_testing, recovered, deceased, hospitalised, intensive, on_ventilation, dates2)
    new_cases_per_day_chart_data = get_new_cases_per_day_chart_data(cumulative_cases_chart_data)
    cumulative_tests_chart_data = get_cumulative_tests_chart_data(json_testing, dates2)
    tests_per_day_chart_data = get_tests_per_day_chart_data(json_testing, dates2)
    positive_test_by_age_chart_data = get_positive_tests_by_age_chart_data(json_testing)
    positive_negative_chart_data = get_positive_negative_chart_data(json_testing, county_mapping)
    vaccinated_people_chart_data = get_vaccinated_people_chart_data(json_vaccination, dates3)
    county_daily_active = get_county_daily_active(json_testing, dates2, county_mapping, county_sizes)
    n_active_cases = cumulative_cases_chart_data["active"][-1]
    n_active_cases_change = (cumulative_cases_chart_data["active"][-1] -
                             cumulative_cases_chart_data["active"][-2])
    active_infections_by_county = [
        {"MNIMI": k, "sequence": v, "drilldown": k}
        for k, v in county_daily_active["countyByDayActive"].items()
    ]
    active_infections_by_county_100k = [
        [k, round(v[-1] / county_sizes[k] * 100000, 2)]
        for k, v in county_daily_active["countyByDayActive"].items()
    ]
    municipalities_data = get_municipality_data(json_test_location, county_mapping)
    per_100k = cumulative_cases_chart_data["active100k"][-1]

    # Calculate vaccination data
    log_status("Calculating vaccination data")
    last_day_vaccination_data = [x for x in json_vaccination if x["MeasurementType"] == "Vaccinated"][-1]
    last_day_completed_vaccination_data = [x for x in json_vaccination if x["MeasurementType"] == "FullyVaccinated"][-1]
    # TODO: Doses administered
    # last_day_doses_administered_data = [x for x in json_vaccination if x['MeasurementType'] == 'DosesAdministered'][-1]
    completed_vaccination_number_total = last_day_completed_vaccination_data["TotalCount"]
    completed_vaccination_number_last_day = last_day_completed_vaccination_data["DailyCount"]
    all_vaccination_number_total = last_day_vaccination_data["TotalCount"]
    all_vaccination_number_last_day = last_day_vaccination_data["DailyCount"]
    vaccination_number_total = (all_vaccination_number_total - completed_vaccination_number_total)
    vaccination_number_last_day = (all_vaccination_number_last_day - completed_vaccination_number_last_day)
    fully_vaccinated_from_total_vaccinated_percentage = round(
        completed_vaccination_number_total * 100 / all_vaccination_number_total, 2)

    # Create dictionary for final JSON
    log_status("Compiling final JSON")
    final_json = {
        "updatedOn": today,
        "confirmedCasesNumber": str(n_confirmed_cases),
        # TODO: For consistency, we should include the change in the number of confirmed cases as well.
        "hospitalisedNumber": str(hospital["activehospitalizations"][-1]),
        "hospitalChanged": str(hospital["activehospitalizations"][-1] - hospital["activehospitalizations"][-2]),
        "deceasedNumber": str(n_deaths),
        "deceasedChanged": str(n_deaths_change),
        "recoveredNumber": str(hospital["discharged"][-1]),
        "recoveredChanged": str(hospital["discharged"][-1] - hospital["discharged"][-2]),
        "testsAdministeredNumber": str(n_tests_administered),
        # TODO: For consistency, we should include the change in the number of tests as well.
        "activeCasesNumber": str(n_active_cases),
        "activeChanged": str(n_active_cases_change),
        "perHundred": str(per_100k),  # TODO: This should be given a clearer name.
        # TODO: I can't find anywhere in the app where "dates1" is used. Is it needed? Commented out for now.
        # "dates1": [str(x.date()) for x in dates1],
        "dates2": [str(x.date()) for x in dates2],
        "dates3": [str(x.date()) for x in dates3],
        "counties": counties,
        "age_groups": age_groups,
        "dataInfectionsByCounty": infections_by_county,
        "dataInfectionsByCounty10000": infections_by_county_10000,
        "dataActiveInfectionsByCounty100k": active_infections_by_county_100k,
        "dataActiveInfectionsByCounty": active_infections_by_county,
        "dataTestsPopRatio": tests_pop_ratio,
        "countyByDay": county_by_day,
        "dataCountyDailyActive": county_daily_active,
        "dataConfirmedCasesByCounties": confirmed_cases_by_county,
        "dataCumulativeCasesChart": cumulative_cases_chart_data,
        "dataNewCasesPerDayChart": new_cases_per_day_chart_data,
        "dataCumulativeTestsChart": cumulative_tests_chart_data,
        "dataTestsPerDayChart": tests_per_day_chart_data,
        "dataPositiveTestsByAgeChart": positive_test_by_age_chart_data,
        "dataPositiveNegativeChart": positive_negative_chart_data,
        "dataVaccinatedPeopleChart": vaccinated_people_chart_data,
        "dataMunicipalities": municipalities_data,
        "hospital": hospital,  # TODO: Rename this to make it clearer what data it contains.
        "vaccinationNumberTotal": vaccination_number_total,
        "vaccinationNumberLastDay": vaccination_number_last_day,
        "completedVaccinationNumberTotal": completed_vaccination_number_total,
        "completedVaccinationNumberLastDay": completed_vaccination_number_last_day,
        "allVaccinationNumberTotal": all_vaccination_number_total,
        "allVaccinationNumberLastDay": all_vaccination_number_last_day,
        "allVaccinationFromPopulationPercentage": last_day_vaccination_data["PopulationCoverage"],
        "completelyVaccinatedFromTotalVaccinatedPercentage": fully_vaccinated_from_total_vaccinated_percentage,
    }

    # Dump JSON output
    log_status("Dumping JSON output")
    save_as_json(OUTPUT_FILE_LOCATION, final_json)

    # Log finish time
    finish = datetime.today().astimezone(estonian_timezone).strftime("%d/%m/%Y, %H:%M")
    log_status("Finished update process at " + finish)

def pretrain_generator(inputs, epo_step, gen_encoder, gen_decoder, enc_units,
                       vocab_size, n_batches, batch_size, epochs, size_stateful,
                       forward_dict, rev_dict, kmer_f_dict, kmer_r_dict,
                       pos_variations_count):
    #train_model.pretrain_generator([X_train, y_train, test_dataset_in, test_dataset_out, te_batch_size, n_te_batches], i, encoder, decoder, enc_units, vocab_size, n_pretrain_batches, batch_size, pretrain_epochs, size_stateful, forward_dict, rev_dict, kmer_f_dict, kmer_r_dict, y_pos_variations_count)
    X_train, y_train, test_dataset_in, test_dataset_out, te_batch_size, n_te_batches = inputs
    epo_avg_tr_gen_loss = list()
    epo_te_gen_loss = list()
    epo_tr_seq_var = list()
    epo_te_seq_var = list()
    batch_mut_distribution = dict()
    #pos_variations_count = dict()
    pos_size = dict()  #get_mut_size(pretr_parent_child_mut_indices)
    #for step, (unrolled_x, unrolled_y) in enumerate(zip(X_train, y_train)):
    epo_pre_train_save_folder = "data/generated_files/pre_train/{}".format(str(epo_step + 1))
    enc_pre_train_save_folder = "data/generated_files/pre_train/{}/enc".format(str(epo_step + 1))
    dec_pre_train_save_folder = "data/generated_files/pre_train/{}/dec".format(str(epo_step + 1))
    utils.create_dirs(epo_pre_train_save_folder)
    utils.create_dirs(enc_pre_train_save_folder)
    utils.create_dirs(dec_pre_train_save_folder)
    for step in range(n_batches):
        unrolled_x, unrolled_y = sample_true_x_y(batch_size, X_train, y_train)
        '''print("Batch {} x and y:".format(str(step+1)))
        print(unrolled_x[:5, :])
        print(unrolled_y[:5, :])
        print()'''
        '''str_x = [",".join(str(pos) for pos in item) for item in unrolled_x]
        str_y = [",".join(str(pos) for pos in item) for item in unrolled_y]
        #print(str_x, str_y)
        muts = utils.get_mutation_tr_indices(str_x, str_y, kmer_f_dict, kmer_r_dict, forward_dict, rev_dict)
        print(kmer_f_dict)
        print(muts)
        print(unrolled_x)
        print()
        print(unrolled_y)
        print()'''
        seq_len = unrolled_x.shape[1]
        # verify levenshtein distance
        '''for i in range(len(unrolled_x)):
            re_x = utils.reconstruct_seq([kmer_f_dict[str(pos)] for pos in unrolled_x[i][1:]])
            re_y = utils.reconstruct_seq([kmer_f_dict[str(pos)] for pos in unrolled_y[i][1:]])
            #l_dist = utils.compute_Levenshtein_dist(re_x, re_y)
            print(re_x)
            print(re_y)
            #print(l_dist)
            print("---")'''
        '''import sys
        sys.exit()'''
        #print(pos_size)
        with tf.GradientTape() as gen_tape:
            #print(unrolled_x.shape, unrolled_y.shape)
            pred_logits, gen_encoder, gen_decoder, gen_loss = utils.loop_encode_decode_stateful(
                seq_len, batch_size, vocab_size, unrolled_x, unrolled_y,
                gen_encoder, gen_decoder, enc_units, teacher_forcing_ratio,
                True, size_stateful, pos_size, pos_variations_count, step)
            #print("Training: true input seq")
            #print(unrolled_x[:5, 1:], unrolled_x.shape)
            #print()
            print("Training: true output seq")
            print(unrolled_y[:batch_size, ], unrolled_y.shape)
            print()
            print(tf.argmax(pred_logits, axis=-1)[:batch_size, :], pred_logits.shape)
            # compute generated sequence variation
            variation_score = utils.get_sequence_variation_percentage(unrolled_x, pred_logits)
            print("Pretr: generation variation score: {}".format(str(variation_score)))
            epo_tr_seq_var.append(variation_score)
            #print("Pretr: teacher forcing ratio: {}".format(str(teacher_forcing_ratio)))
            '''with tf.GradientTape() as pf_tape:
                # train pf model
                true_y = unrolled_y[:, 1:]
                pred_y = tf.argmax(pred_logits, axis=-1)
                true_o = pf_model(true_y)
                fake_o = pf_model(pred_y)
                true_pf_disc_loss, fake_pf_disc_loss = discriminator_loss(true_o, fake_o)
                total_pf_disc_loss = true_pf_disc_loss + fake_pf_disc_loss
                fake_pf_gen_loss = generator_loss(fake_o)
                print(true_pf_disc_loss, fake_pf_disc_loss, fake_pf_gen_loss)'''
            ##########################
            #gen_loss = gen_loss + fake_pf_gen_loss
        print("Pretrain epoch {}/{}, batch {}/{}, gen true loss: {}".format(
            str(epo_step + 1), str(epochs), str(step + 1), str(n_batches), str(gen_loss.numpy())))
        print()
        gen_trainable_vars = gen_encoder.trainable_variables + gen_decoder.trainable_variables
        gradients_of_generator = gen_tape.gradient(gen_loss, gen_trainable_vars)
        #print("Pretrain gradient norm before clipping: ", [tf.norm(gd) for gd in gradients_of_generator])
        gradients_of_generator = [tf.clip_by_norm(grad, clip_norm=pretrain_clip_norm) for grad in gradients_of_generator]
        #print("Pretrain gradient norm after clipping: ", [tf.norm(gd) for gd in gradients_of_generator])
        pretrain_generator_optimizer.apply_gradients(zip(gradients_of_generator, gen_trainable_vars))
        # optimize pf discriminator
        #pf_disc_trainable_vars = pf_model.trainable_variables
        #pf_gradients = pf_tape.gradient(total_pf_disc_loss, pf_disc_trainable_vars)
        #pf_discriminator_optimizer.apply_gradients(zip(pf_gradients, pf_disc_trainable_vars))
        if (step + 1) % test_log_step == 0 and step > 0:
            print("-------")
            print("Pretr: Prediction on test data at epoch {}/{}, batch {}/{}...".format(
                str(epo_step + 1), str(epochs), str(step + 1), str(n_batches)))
            print()
            gen_te_loss, gen_te_seq_var = utils.predict_sequence(
                epo_step, step, test_dataset_in, test_dataset_out, te_batch_size,
                n_te_batches, seq_len, vocab_size, enc_units, gen_encoder,
                gen_decoder, size_stateful, "pretrain", True)
            epo_te_gen_loss.append(gen_te_loss)
            epo_te_seq_var.append(gen_te_seq_var)
            print("-------")
            print()
        epo_avg_tr_gen_loss.append(gen_loss)
    # save model
    gen_encoder.save_weights(GEN_ENC_WEIGHTS)
    tf.keras.models.save_model(gen_encoder, PRETRAIN_GEN_ENC_MODEL)
    tf.keras.models.save_model(gen_decoder, PRETRAIN_GEN_DEC_MODEL)
    gen_encoder.save_weights(PRE_TR_GEN_ENC_WEIGHTS)
    gen_decoder.save_weights(PRE_TR_GEN_DEC_WEIGHTS)
    tf.keras.models.save_model(gen_encoder, enc_pre_train_save_folder)
    tf.keras.models.save_model(gen_decoder, dec_pre_train_save_folder)
    utils.save_as_json("data/generated_files/pretr_ave_batch_x_y_mut_epo_{}.json".format(str(epo_step)), batch_mut_distribution)
    # pretrain_gen_tr_loss, bat_te_gen_loss, bat_te_seq_var, bat_tr_seq_var, encoder, decoder, _
    return np.mean(epo_avg_tr_gen_loss), np.mean(epo_te_gen_loss), np.mean(epo_te_seq_var), np.mean(epo_tr_seq_var), gen_encoder, gen_decoder

if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding('utf-8')
    devs, log = utils.setup()

    # Use Google to find potential LinkedIn matches
    for dev in devs:
        try:
            print '\rGoogling for matches for %s...' % dev.get('name')
            dev['li_matches'] = google_for_li_matches(dev)
        except:
            print "%s occurred while processing: %s" % (sys.exc_info()[0].__name__, dev['name'])
            continue
    utils.save_as_json(devs, 'googlesearchresults')
    print "Done.\n"

    # Compare the LinkedIn profiles to the GitHub profile, score them and sort them. Return best at index[0]
    devs = utils.load_json('googlesearchresults')
    for dev in devs:
        print '\rEvaluating matches for %s...' % dev.get('name')
        dev['li_matches'] = evaluate_li_matches(dev)
    utils.save_as_json(devs, 'scoredresults')
    print "Done.\n"

    """
    # Use Pipl to match remainder
    for dev in [dev for dev in devs if dev['email']]:

import utils
import elections_csv


def gen_elections_by_year(elections):
    elections_by_year = {e['year']: e for e in elections}
    for e in elections:
        e.pop('year')
        e.pop('president')
    return elections_by_year


def main(suas):
    elections = elections_csv.get_elections()
    elections_by_year = gen_elections_by_year(elections)
    year_per_sua = [elections_by_year[year] for year in utils.get_years(suas)]
    utils.enrich_collection(suas, year_per_sua)


if __name__ == '__main__':
    suas = utils.load_json_as_dict(utils.with_presidents_fname)
    main(suas)
    utils.save_as_json(utils.with_elections_fname, suas)

import utils


def main(inaugurals):
    reading_levels = utils.get_reading_levels(inaugurals)
    utils.enrich_collection(inaugurals, reading_levels)


if __name__ == '__main__':
    inaugurals = utils.get_inaugurals()
    main(inaugurals)
    utils.save_as_json(utils.inaugurals_reading_levels_fname, inaugurals)

import pickle
from utils import save_as_json, read_from_json
import pdb

out_dir = 'data/output/blocs/'
out_filename = 'dictionary_blocs.json'
blocs = ['investment_blocs_2020.json']

dict_blocs = {}
for bloc in blocs:
    bloc_name = bloc.replace('.json', '')
    this_bloc_dict = read_from_json(out_dir, bloc)
    dict_blocs[bloc_name] = this_bloc_dict

save_as_json(dict_blocs, out_dir, out_filename)

# check
parsed_dict_blocs = read_from_json(out_dir, out_filename)
pdb.set_trace()

def save_as_json(self):
    save_as_json('vults_items.json', self.items)

def ui_save(expenses):
    path = ui_input_path()
    serialized = expenses.do('get_serialized', keep=False)
    save_as_json(serialized, path)

def save_as_json(self):
    save_as_json('digital_items.json', self.items)

import utils
import presidents_csv


def gen_pres_by_name(presidents):
    pres_by_name = {p['president']: p for p in presidents}
    for p in presidents:
        p.pop('president')
    return pres_by_name


def main(suas):
    presidents = presidents_csv.get_presidents()
    pres_by_name = gen_pres_by_name(presidents)
    president_per_sua = [pres_by_name[pres] for pres in utils.get_presidents(suas)]
    utils.enrich_collection(suas, president_per_sua)


if __name__ == '__main__':
    suas = utils.load_json_as_dict(utils.reading_levels_fname)
    main(suas)
    utils.save_as_json(utils.with_presidents_fname, suas)

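# The suas/inaugurals scripts above share one pattern: compute a list of per-item extras and
# merge it into the collection with utils.enrich_collection before saving. The helper itself
# is not shown; a minimal sketch of what the call pattern implies, with the zip-and-update
# body being an assumption:
def enrich_collection(collection, extras):
    # Hypothetical helper: merge each extras dict into the matching item, by position.
    for item, extra in zip(collection, extras):
        item.update(extra)
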
            print '\rGoogling for matches for %s...' % dev.get('name')
            dev['li_matches'] = get_matching_li_profiles(dev)
        except:
            print "%s occurred while processing: %s" % (sys.exc_info()[0].__name__, dev['name'])
            continue
    utils.save_as_json(devs, 'googlesearchresults')
    print "Done.\n"
    #raw_input('Continue?')

    """
    # Compare the LinkedIn profiles to the GitHub, score them and sort them. Return best at index[0]
    devs = utils.load_json('googlesearchresults')
    for dev in devs:
        print '\rEvaluating matches for %s...' % dev.get('name')
        dev['li_matches'] = evaluate_li_matches(dev)
    utils.save_as_json(devs, 'scoredresults')
    print "Done.\n"
    #raw_input('Continue?')
    """

    # Use Pipl to match remainder
    for dev in [dev for dev in devs if dev['email']]:
        if dev['li_matches']:
            if dev['li_matches'][0]['score'] < 75:
                print "Trying piplsearch for %s..." % dev.get('name')
                result = try_piplsearch(dev)
                if result:
                    dev['li_matches'].append(result)
                    # re-sort from highest scoring match to lowest
                    dev['li_matches'] = sorted(dev['li_matches'], key=lambda k: k['score'], reverse=True)
        else:
