def file_extension_changes(framework, projects):
    print("Computing file extension changes")
    samples = get_samples(projects)
    configuration_files = create_configuration_dict()
    extension_files = create_extension_dict()
    action_in_files = {"A": 0, "M": 0, "D": 0}
    write_header(action_in_files, configuration_files, extension_files,
                 framework, "file_extension_changes")
    for sample in samples:
        extract_changes_log("repositories/", "file_extension_changes", sample)
        # Reset the counters for each sample
        configuration_files = create_configuration_dict()
        extension_files = create_extension_dict()
        action_in_files = {"A": 0, "M": 0, "D": 0}
        with open("file_extension_changes/" + sample + ".txt") as logs:
            for log in logs:
                log = remove_next_line(log)
                if ";" in log:
                    continue
                if ("A" in log) or ("M" in log) or ("D" in log):
                    try:
                        log = log.split("\t")
                        action = log[0]
                        file = get_file_name(log[1])
                        calculate_configuration_files(configuration_files, file)
                        calculate_extension_files(extension_files, file)
                        action_in_files[action] += 1
                    except (IndexError, KeyError):
                        # Skip malformed log lines
                        continue
        write_content(action_in_files, configuration_files, extension_files,
                      framework, sample, "file_extension_changes")
def delay(framework, projects, githubtoken):
    print("Computing delay to update")
    path_dos_repositorios = 'repositories'
    measure = "delay"
    output_write(framework, measure, measure,
                 "framework,path,current_version,next_version,"
                 "framework_release_date (YYYY-DD-MM),"
                 "sample_update_date (YYYY-DD-MM),delay_in_days", True)
    framework_release_data = buscar_dados_de_lancamento_de_versoes(framework, githubtoken)
    configuration_file = define_arquivo_de_configuracao(framework)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        sample_path = path_dos_repositorios + "/" + sample
        paths_configuration_file = find_paths(configuration_file, sample_path)
        repository = Repo(sample_path)
        reversed_commits = get_commits(repository)
        for path in paths_configuration_file:
            current_version, reversed_commits = get_first_version(
                framework, path, repository, reversed_commits)
            if current_version == {}:
                continue
            for commit in reversed_commits:
                repository.git.checkout(commit, '-f')
                next_version = buscar_versao_do_framework(framework, path)
                if (current_version != next_version and next_version != ''
                        and current_version != '' and current_version is not None
                        and next_version is not None):
                    sample_update_date = get_commit_date(commit)
                    framework_release_date = framework_release_data[next_version]
                    delay_in_days = calculate_delay(framework_release_date,
                                                    sample_update_date)
                    output_write(framework, measure, measure,
                                 create_output(current_version, delay_in_days,
                                               framework, framework_release_date,
                                               next_version, path,
                                               sample_update_date),
                                 False)
                    current_version = next_version
        # Restore the default branch before moving on to the next sample
        repository.git.checkout('master', '-f')
def stackoverflow(framework, projects):
    global api
    api = StackAPI("stackoverflow")
    samples = get_samples(projects)
    output_write(framework, directory, "questions_and_answers", get_header(), True)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        questions = get_questions_when_body_has(sample)
        for indx, question in enumerate(questions["items"]):
            print("{0}% questions analysed of {1}".format(
                (indx + 1) / len(questions["items"]) * 100, sample))
            try:
                answer = api.fetch("answers/{ids}",
                                   ids=[question["accepted_answer_id"]])["items"][0]
                answer_owner = get_owner_by_user_id(api, answer["owner"]["user_id"])
            except KeyError:
                # The question has no accepted answer, or the owner is missing
                answer = {"answer_id": "", "score": "", "creation_date": ""}
                answer_owner = {"user_id": "", "reputation": "",
                                "creation_date": "", "tags": []}
            question_owner = get_owner_by_user_id(api, question["owner"]["user_id"])
            output = create_output(framework, sample, question, answer,
                                   question_owner, answer_owner)
            output_write(framework, directory, "questions_and_answers", output, False)
def allanswers(framework, projects):
    global api
    api = StackAPI("stackoverflow")
    samples = get_samples(projects)
    output_write(framework, directory, "all_answers", get_header(), True)
    with open("stackoverflow/" + framework + "_questions_and_answers_output.csv") as questions:
        for index, question in enumerate(questions):
            if index == 0:
                continue  # skip the CSV header row
            print("Questions from sample " + question.split(",")[1])
            question = question.replace("\n", "")
            question_id = question.split(",")[2]
            answers = api.fetch("questions/" + question_id + "/answers")["items"]
            print(len(answers))
            for indx, answer in enumerate(answers):
                print("{0}% answers analysed of question {1}".format(
                    (indx + 1) / len(answers) * 100, question_id))
                try:
                    answer_owner = get_owner_by_user_id(api, answer["owner"]["user_id"])
                except KeyError:
                    # Deleted users have no "user_id"
                    answer_owner = {"user_id": "", "reputation": "",
                                    "creation_date": "", "tags": []}
                output = create_output(framework, question.split(",")[1],
                                       question_id, answer, answer_owner)
                output_write(framework, directory, "all_answers", output, False)
def create_samples(argv, dataset, vocab_word, test):
    n_prev_sents = argv.n_prev_sents  # number of previous sentences, default: 5
    max_n_words = argv.max_n_words  # default: 20

    cands = dataset[0][0][3:-1]  # [cand_res1, cand_res2, label]
    n_cands = len(cands)

    print('\n\nTASK SETTING')
    print('\n\tn_cands:%d  n_prev_sents:%d  max_n_words:%d\n' %
          (n_cands, n_prev_sents, max_n_words))

    # print('\n\nConverting words into ids...')
    # # samples: 1D: n_docs, 2D: n_utterances, 3D: (time, speaker_id, addressee_id, response, ..., label)
    # samples = doc_to_id(dataset, vocab_word)

    print('\n\nCreating samples...')
    # samples: 1D: n_samples; 2D: Sample
    samples = get_samples(dataset=dataset, n_prev_sents=n_prev_sents,
                          max_n_words=max_n_words, pad=False)
    print("num of samples: %d" % len(samples))

    return samples
def fit(self, successes, trials, n_samples=1000, baseline=0.0,
        values=None, smoothing=1.0):
    '''
    Generate the weights for each arm based on bandit history.

    Parameters:
        successes (array): A 1 x n array with total successes for each arm
        trials (array): A 1 x n array with total trials for each arm
        n_samples (int): The number of samples to pull from each arm's
            distribution for Thompson Sampling
        baseline (float): The minimum weight to give each arm
        values (array): A 1 x n array with the reward value for each arm, or None
        smoothing (float): The constant factor by which to divide all trials
            and successes

    Updates:
        self.weights (array): A 1 x n array with normalized weights for each arm
    '''
    self.values = utils.set_values(values, len(trials))
    self.samples = utils.get_samples(trials, successes, n_samples,
                                     smoothing, self.values)
    self._raw_weights = utils.get_weights(self.samples)
    self.weights = utils.normalize_weights(self._raw_weights, baseline)
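# Usage sketch (illustrative, not from this codebase): `Bandit` stands in for
# whatever class defines fit() above; the arm counts are made-up, and it is
# assumed that utils.normalize_weights applies `baseline` as a minimum weight.
import numpy as np

bandit = Bandit()

successes = np.array([10, 20, 5])   # wins per arm
trials = np.array([100, 100, 100])  # pulls per arm

# Thompson Sampling: draw 1000 samples from each arm's posterior and weight
# each arm by how often its draws win, with a floor of 0.05 per arm.
bandit.fit(successes, trials, n_samples=1000, baseline=0.05)
print(bandit.weights)  # one normalized weight per arm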
def main(args):
    logger.info('Getting samples from input file')
    samples = get_samples(args.samp)
    logger.info('Getting inferred ancestry from PCA and writing to output')
    parse_pca(args.pca, samples, args.out)
def file_extension_changes_forks(framework, projects, githubtoken):
    samples = get_samples(projects)
    g = get_py_github_instance(githubtoken)
    action_in_files = {"A": 0, "M": 0, "D": 0}
    extension_files = create_extension_dict()
    configuration_files = create_configuration_dict()
    write_header(action_in_files, configuration_files, extension_files,
                 framework, "file_extension_changes_forks")
    for sample in samples:
        manage_limit_rate(len(samples))
        print(sample)
        repository = g.get_repo(sample)
        forks = repository.get_forks()
        for fork in forks:
            manage_limit_rate(forks.totalCount)
            try:
                comparation = repository.compare(
                    repository.default_branch,
                    fork.owner.login + ":" + fork.default_branch)
                if comparation.ahead_by > 0:
                    print("Downloading " + fork.full_name)
                    Repo.clone_from(fork.clone_url,
                                    "forks_repositories/" + fork.full_name)
                    print("Downloaded " + fork.full_name)
                    extract_changes_log("forks_repositories/",
                                        "file_extension_changes_forks",
                                        fork.full_name, comparation)
                    # Reset the counters for each fork
                    configuration_files = create_configuration_dict()
                    extension_files = create_extension_dict()
                    action_in_files = {"A": 0, "M": 0, "D": 0}
                    with open("file_extension_changes_forks/" + fork.full_name + ".txt") as logs:
                        for log in logs:
                            log = remove_next_line(log)
                            if ";" in log:
                                continue
                            if ("A" in log) or ("M" in log) or ("D" in log):
                                try:
                                    log = log.split("\t")
                                    action = log[0]
                                    file = get_file_name(log[1])
                                    calculate_configuration_files(configuration_files, file)
                                    calculate_extension_files(extension_files, file)
                                    action_in_files[action] += 1
                                except (IndexError, KeyError):
                                    # Skip malformed log lines
                                    continue
                    write_content(action_in_files, configuration_files,
                                  extension_files, framework, fork.full_name,
                                  "file_extension_changes_forks")
                    shutil.rmtree("forks_repositories/" + fork.full_name.split("/")[0])
                    shutil.rmtree("file_extension_changes_forks/" + fork.full_name.split("/")[0])
                    print("{0} deleted".format(fork.full_name))
            except Exception as e:
                print(e)
def main(args):
    logger.info('Getting individual IDs from sample list')
    samples = get_samples(args.samp)
    logger.info('Getting variants from seqr')
    variants = parse_var(args.var, samples)
    logger.info('Writing variants to output')
    write_bigquery_tsv(variants, args.out)
def setup(self, stage=0):
    samples = get_samples(self.hparams["image_path"], self.hparams["mask_path"])

    num_train = int((1 - self.hparams["val_split"]) * len(samples))

    self.train_samples = samples[:num_train]
    self.val_samples = samples[num_train:]

    print("Len train samples = ", len(self.train_samples))
    print("Len val samples = ", len(self.val_samples))
def main(args):
    logger.info('Getting samples from file')
    samples = get_samples(args.samp)
    logger.info('Getting reported sex for each sample')
    reported = ped_sex(args.ped, samples)
    logger.info('Comparing inferred and reported sex and writing to output')
    compare(args.infer, samples, reported, args.out)
def githubmetadata(framework, projects, githubtoken):
    print("Computing github metadata")
    measure = "githubmetadata"
    output_write(framework, measure, measure,
                 "framework,repository,forks,stargazers,watchers,openedIssues,"
                 "closedIssues,commits,openedPullRequests,closedPullRequests,"
                 "updatedAt,projects,lifetime,lifetime per commit", True)
    g = Github(githubtoken)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        repo = g.get_repo(sample)
        output = create_output(framework, repo, sample)
        output_write(framework, measure, measure, output, False)
def generalprojects(projects):
    samples = get_samples(projects)
    output_write("", "generalprojects", "projects",
                 "path,stars,language,framework", False)
    for repository in samples:
        clone(repository)
        print("{0} downloaded".format(repository))
        framework = get_framework(repository)
        print("{0} classified as {1}".format(repository, framework))
        shutil.rmtree("generalprojects/repositories/" + repository.split("/")[0])
        print("{0} deleted".format(repository))
        output_write("", "generalprojects", "projects",
                     "{0},{1}".format(repository, framework), False)
def repositoriesdownload(framework, projects):
    print('framework: ' + framework)
    samples = get_samples(projects)
    for sample in samples:
        sample = remove_especial_caracters(sample)
        git_url = create_url_from_github(sample)
        print("Downloading %s" % sample)
        repo_dir = "repositories/"
        if os.path.isdir(repo_dir + sample):
            print("Project " + sample + " already downloaded")
            continue
        download(git_url, repo_dir, sample)
        print("%s downloaded" % sample)
def generator(samples, batch_size=CONST.GENERATOR_BATCH_SIZE, filter=CONST.SKIP_FILTER):
    num_samples = len(samples)
    while True:  # loop forever so the generator never terminates
        shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset + batch_size]
            images = []
            angles = []
            for batch_sample in batch_samples:
                utils.get_samples(images, angles, batch_sample,
                                  CONST.ANGLE_CORRECTION, filter)
            X_train = np.array(images)
            y_train = np.array(angles)
            # When using augmentation, it will yield batches of different size:
            yield shuffle(X_train, y_train)
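# Usage sketch (illustrative assumptions): `model` is a compiled Keras model,
# and `train_samples` / `validation_samples` are sample lists such as the
# train_test_split in the training script produces; epochs is arbitrary.
train_gen = generator(train_samples)
valid_gen = generator(validation_samples)
model.fit_generator(
    train_gen,
    steps_per_epoch=len(train_samples) // CONST.GENERATOR_BATCH_SIZE,
    validation_data=valid_gen,
    validation_steps=len(validation_samples) // CONST.GENERATOR_BATCH_SIZE,
    epochs=5)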
def importcount(framework, projects):
    print("Computing imports")
    measure = "importcount"
    output_write(framework, measure, measure,
                 "framework,path,imports,javaFiles,imports/java_files", True)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        deal_with_empty_repo(sample)
        java_files_path = find_paths("*.java", "repositories/" + sample)
        imports = get_imports(framework, java_files_path)
        relative = calculate_relative(imports, java_files_path)
        output_write(
            framework, measure, measure,
            create_output(framework, imports, java_files_path, relative, sample),
            False)
def forksahead(framework, projects, githubtoken):
    print("Computing forks ahead data")
    g = Github(githubtoken)
    output_write(framework, "forksahead", "forks_ahead_by_projects",
                 "framework,path,number_of_forks,forks_ahead,ratio", True)
    output_write(framework, "forksahead", "forks_ahead",
                 "framework,path,number_of_forks,forks_ahead,ratio", True)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        manage_limit_rate(len(samples))
        print_status_samples(index + 1, len(samples))
        repository = g.get_repo(sample)
        forks = repository.get_forks()
        forks_ahead = count_forks_ahead(framework, forks, repository)
        number_of_forks = repository.forks_count
        # Guard against repositories without forks
        ratio_forks_ahead = forks_ahead / number_of_forks if number_of_forks else 0
        output = create_output(sample, framework, number_of_forks,
                               forks_ahead, ratio_forks_ahead)
        output_write(framework, "forksahead", "forks_ahead_by_projects",
                     output, False)
def numberofextensionfile(framework, projects):
    print("Computing extension files")
    extensions = create_extension_files()
    measure = "numberofextensionfile"
    output_write(framework, measure, measure,
                 'framework,project,java,properties,jar,build.gradle,pom.xml,'
                 'manifest.xml,xml,bat,md,adoc,README,yaml,txt,sh,travis.yml,'
                 'yml,cmd,kt,json,numberOfFiles,others', True)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        deal_with_empty_repo(sample)
        count_extension_files(extensions, sample)
        others = count_others(extensions)
        output = concat_output(extensions) + str(others)
        output_write(framework, measure, measure,
                     framework + "," + sample + "," + output, False)
def currentframeworkversion(framework, projects):
    print("Computing current framework version")
    configuration_file = find_config_file(framework)
    configuration_file_key_words = get_key_words(framework)
    write_output_header(configuration_file_key_words, framework)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        checkout_default_branch_repository(sample)
        deal_with_empty_repo(sample)
        configuration_files_paths = find_paths(configuration_file,
                                               "repositories/" + sample)
        for path in configuration_files_paths:
            output = framework + "," + path
            for key, value in configuration_file_key_words.items():
                version = get_framework_version(framework, path, key)
                output = output + "," + version
            if ",,," not in output and (framework != "spring" or "RELEASE" in output):
                output_write(framework, "currentframeworkversion",
                             "currentframeworkversion", output, False)
def understandmetrics(framework, projects):
    samples = get_samples(projects)
    owner = samples[0].split("/")[0]
    create_output_directory("understandmetrics", owner)
    output_write(
        framework, "understandmetrics", "understandmetrics",
        "framework,projeto,AvgCyclomatic,AvgCyclomaticModified,AvgCyclomaticStrict,AvgEssential,AvgLine,AvgLineBlank,AvgLineCode,AvgLineComment,CountClassBase,CountClassCoupled,CountClassCoupledModified,CountClassDerived,CountDeclClass,CountDeclClassMethod,CountDeclClassVariable,CountDeclExecutableUnit,CountDeclFile,CountDeclFunction,CountDeclInstanceMethod,CountDeclInstanceVariable,CountDeclMethod,CountDeclMethodAll,CountDeclMethodDefault,CountDeclMethodPrivate,CountDeclMethodProtected,CountDeclMethodPublic,CountInput,CountLine,CountLineBlank,CountLineCode,CountLineCodeDecl,CountLineCodeExe,CountLineComment,CountOutput,CountPath,CountPathLog,CountSemicolon,CountStmt,CountStmtDecl,CountStmtExe,Cyclomatic,CyclomaticModified,CyclomaticStrict,Essential,Knots,MaxCyclomatic,MaxCyclomaticModified,MaxCyclomaticStrict,MaxEssential,MaxEssentialKnots,MaxInheritanceTree,MaxNesting,MinEssentialKnots,PercentLackOfCohesion,PercentLackOfCohesionModified,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential,?,numberOfJavaFiles",
        True)
    # NOTE: hardcoded local path to the cloned repositories
    repositories_path = "/home/gabriel/Documentos/gabrielsmenezes/pesquisamestrado/repositories/"
    for sample in samples:
        sample_path = repositories_path + sample
        udb_path = "understandmetrics/" + sample
        deal_with_empty_repo(sample)
        metrics = get_understand_metrics(framework, sample, udb_path, sample_path)
        output = create_output(metrics)
        output_write(framework, "understandmetrics", "understandmetrics",
                     output, False)
def maintainers(framework, projects, githubtoken):
    print("Computing maintainers data")
    output_write(
        framework, "maintainers", "maintainers",
        "framework,path,framework_contributors,sample_contributors,"
        "commom_contributors,commom/framework,commom/sample", True)
    framework_repository = get_repository_name(framework)
    framework_contributors = get_contributors(framework_repository, githubtoken)
    framework_contributors.totalCount  # force PyGithub to resolve the paginated list
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        sample_contributors = get_contributors(sample, githubtoken)
        commmom_contributors = get_commom_contributors(framework_contributors,
                                                       sample_contributors)
        output_write(
            framework, "maintainers", "maintainers",
            create_output(framework, sample, framework_contributors,
                          sample_contributors, commmom_contributors),
            False)
def metrics_by_commits(framework, projects):
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        owner = sample.split("/")[0]
        create_output_directory("metricsbycommits", owner)
        output_write(sample, "metricsbycommits", "",
                     "framework,path,commits,date,numberOfJavaFiles,"
                     "countLineCode/numberOfJavaFiles,"
                     "SumCyclomaticStrict/CountDeclMethod,readability", True)
        # NOTE: hardcoded local path to the cloned repositories
        repositories_path = "/home/gabriel/Documentos/gabrielsmenezes/pesquisamestrado/repositories/"
        sample_path = repositories_path + sample
        udb_path = "metricsbycommits/" + sample
        commits = get_commits_from(sample)
        commits.reverse()
        # To resume a run, just uncomment below: the hash is the last commit
        # processed, so the run restarts at the next one.
        # for index, commit in enumerate(commits):
        #     if commit.hexsha == "dfe62cb3e72c7a9cfd759dc7411197d9a629f813":
        #         position = index
        # commits = commits[position+1:]
        for index, commit in enumerate(commits):
            checkout_to(sample, commit.hexsha)
            print("commit ======= " + commit.hexsha)
            metrics = get_metrics(commit, framework, sample, sample_path, udb_path)
            output_write(sample, "metricsbycommits", "", create_output(metrics), False)
            delete_unused_files(sample)
            print("{0}% of commits completed from sample {1}".format(
                (index / len(commits) * 100), sample))
import sys

import numpy as np
from PIL import Image

# using command-line arguments
input_filename = sys.argv[1]
output_name = sys.argv[2]

# using a hardcoded file path
# NOTE: if the command-line arguments fail, uncomment these and replace them
# with the desired file paths
# input_filename = "data/in/raptors.jpg"
# output_name = "data/in/raptors.png"

print("Opening {}".format(input_filename))
img = Image.open(input_filename)
img_data_rgb = np.asarray(img)

# get mosaic of image
img_data = mosaic(img_data_rgb.T)
r_data = get_samples(img_data, img_data.shape, "r", "data")
g_data = get_samples(img_data, img_data.shape, "g", "data")
b_data = get_samples(img_data, img_data.shape, "b", "data")

imgHeight = img_data.shape[0]
imgWidth = img_data.shape[1]

# interpolating red
print("interpolating Red ... ")
init = 0
dim = [7, 7]
for i in range(0, img_data.shape[0], 2):
    for j in range(1, img_data.shape[1], 2):
        block = np.full([8, 8], None)
        if (i - 2 < 0 and j - 3 < 0):
            # missing samples corner case 1
            r = abs(i - 2)
from keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
import numpy as np
from itertools import chain

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

from functools import reduce
from utils import (generator, get_samples, get_sample_image,
                   augment_brightness_camera_images, split_camera_angles,
                   trans_image)

# change the path to point to the location of the data
root_path = './data/bend_1'
train_samples, validation_samples = train_test_split(get_samples(root_path),
                                                     test_size=0.2)

# visualization of augmentations
# sample_image = get_sample_image(root_path, train_samples)
# plt.imshow(sample_image)
# plt.savefig('sample')
# plt.imshow(augment_brightness_camera_images(sample_image))
# plt.savefig('augmented')
# plt.imshow(np.fliplr(sample_image))
# plt.savefig('flipped')
# sample_translated, steering = trans_image(sample_image, 0, 100)
id_train = [id for id in img_id_list if id not in id_test]

#%% Generate samples
crop_size_train = (160, 160)
overlap_train = (80, 80)
class_offset_train = (40, 40)
class_area_train = (80, 80)

crop_size_test = (1024, 1024)
overlap_test = None
class_offset_test = (0, 0)
class_area_test = crop_size_test

df_train = get_samples(id_train,
                       path_to_img, path_to_mask,
                       crop_size_train, overlap_train,
                       order_dict,
                       class_offset_train, class_area_train,
                       verbose=True)
df_test = get_samples(id_test,
                      path_to_img, path_to_mask,
                      crop_size_test, overlap_test,
                      order_dict,
                      class_offset_test, class_area_test,
                      verbose=True)
# the test set cut like the train set, for performance estimation
df_test_small = get_samples(id_test,
                            path_to_img, path_to_mask,
                            crop_size_train, overlap_train,
                            order_dict,
        total_iters += params.batch_size
        epoch_iters += params.batch_size

        model.set_input(data)  # input data
        model.step()           # train the model

        print('\rTotal iters : {}'.format(total_iters), end="")
        if total_iters % params.print_freq == 0:
            model.print_lr()   # print current learning rate
            model.save_loss()
            print('\nepoch : ', epoch, end=" ")
            model.print_loss(model.losses)
        if total_iters % params.save_lasted_freq == 0:
            print('\nSaving the latest model')
            # save the model and other necessary states for resuming training later
            model.save_model(epoch, 'latest')
            # save the samples generated by Generator_A or Generator_B
            samples_real, samples_fake = get_samples(model, data, epoch, params)
            print('Latest model saved')

    if epoch % params.save_epoch_freq == 0:
        model.save_model(epoch, 'latest')
        # save the model and other necessary states for resuming training later
        model.save_model(epoch)
        # save the samples generated by Generator_A or Generator_B
        samples_real, samples_fake = get_samples(model, data, epoch, params)

    end = time.time()
    print('\n{}/{} is Done! Time Taken: {:.4f}s'.format(
        epoch, params.n_epoch, end - start))
    model.print_loss(model.losses)
    model.update_learning_rate(epoch)  # update learning rate
def prepare_data(self):
    self.train_samples = get_samples(
        Path(self.hparams["data_path"]) / "images",
        Path(self.hparams["data_path"]) / "masks",
    )
def train(args):
    # Device Configuration #
    device = torch.device(
        f'cuda:{args.gpu_num}' if torch.cuda.is_available() else 'cpu')

    # Fix Seed for Reproducibility #
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # Samples, Plots, Weights and CSV Paths #
    paths = [args.samples_path, args.plots_path, args.weights_path, args.csv_path]
    for path in paths:
        make_dirs(path)

    # Prepare Data #
    data = pd.read_csv(args.data_path)[args.column]

    # Pre-processing #
    scaler_1 = StandardScaler()
    scaler_2 = StandardScaler()
    preprocessed_data = pre_processing(data, scaler_1, scaler_2, args.delta)

    X = moving_windows(preprocessed_data, args.ts_dim)
    label = moving_windows(data.to_numpy(), args.ts_dim)

    # Prepare Networks #
    D = Discriminator(args.ts_dim).to(device)
    G = Generator(args.latent_dim, args.ts_dim, args.conditional_dim).to(device)

    # Loss Function #
    if args.criterion == 'l2':
        criterion = nn.MSELoss()
    elif args.criterion == 'wgangp':
        pass
    else:
        raise NotImplementedError

    # Optimizers #
    D_optim = torch.optim.Adam(D.parameters(), lr=args.lr, betas=(0.5, 0.9))
    G_optim = torch.optim.Adam(G.parameters(), lr=args.lr, betas=(0.5, 0.9))

    D_optim_scheduler = get_lr_scheduler(D_optim, args)
    G_optim_scheduler = get_lr_scheduler(G_optim, args)

    # Lists #
    D_losses, G_losses = list(), list()

    # Train #
    print("Training Time Series GAN started with total epoch of {}.".format(
        args.num_epochs))
    for epoch in range(args.num_epochs):

        # Initialize Optimizers #
        G_optim.zero_grad()
        D_optim.zero_grad()

        if args.criterion == 'l2':
            n_critics = 1
        elif args.criterion == 'wgangp':
            n_critics = 5

        #######################
        # Train Discriminator #
        #######################

        for j in range(n_critics):
            series, start_dates = get_samples(X, label, args.batch_size)

            # Data Preparation #
            series = series.to(device)
            noise = torch.randn(args.batch_size, 1, args.latent_dim).to(device)

            # Adversarial Loss using Real Series #
            prob_real = D(series.float())

            if args.criterion == 'l2':
                real_labels = torch.ones(prob_real.size()).to(device)
                D_real_loss = criterion(prob_real, real_labels)
            elif args.criterion == 'wgangp':
                D_real_loss = -torch.mean(prob_real)

            # Adversarial Loss using Fake Series #
            fake_series = G(noise)
            fake_series = torch.cat(
                (series[:, :, :args.conditional_dim].float(),
                 fake_series.float()), dim=2)

            prob_fake = D(fake_series.detach())

            if args.criterion == 'l2':
                fake_labels = torch.zeros(prob_fake.size()).to(device)
                D_fake_loss = criterion(prob_fake, fake_labels)
            elif args.criterion == 'wgangp':
                D_fake_loss = torch.mean(prob_fake)
                # Gradient penalty, already scaled by lambda_gp
                D_gp_loss = args.lambda_gp * get_gradient_penalty(
                    D, series.float(), fake_series.float(), device)

            # Calculate Total Discriminator Loss #
            D_loss = D_fake_loss + D_real_loss
            if args.criterion == 'wgangp':
                D_loss += D_gp_loss

            # Back Propagation and Update #
            D_loss.backward()
            D_optim.step()

        ###################
        # Train Generator #
        ###################

        # Adversarial Loss #
        fake_series = G(noise)
        fake_series = torch.cat(
            (series[:, :, :args.conditional_dim].float(),
             fake_series.float()), dim=2)
        prob_fake = D(fake_series)

        # Calculate Total Generator Loss #
        if args.criterion == 'l2':
            real_labels = torch.ones(prob_fake.size()).to(device)
            G_loss = criterion(prob_fake, real_labels)
        elif args.criterion == 'wgangp':
            G_loss = -torch.mean(prob_fake)

        # Back Propagation and Update #
        G_loss.backward()
        G_optim.step()

        # Add items to Lists #
        D_losses.append(D_loss.item())
        G_losses.append(G_loss.item())

        ####################
        # Print Statistics #
        ####################

        print("Epochs [{}/{}] | D Loss {:.4f} | G Loss {:.4f}".format(
            epoch + 1, args.num_epochs, np.average(D_losses), np.average(G_losses)))

        # Adjust Learning Rate #
        D_optim_scheduler.step()
        G_optim_scheduler.step()

        # Save Model Weights and Series #
        if (epoch + 1) % args.save_every == 0:
            torch.save(
                G.state_dict(),
                os.path.join(
                    args.weights_path,
                    'TimeSeries_Generator_using{}_Epoch_{}.pkl'.format(
                        args.criterion.upper(), epoch + 1)))

            series, fake_series = generate_fake_samples(
                X, label, G, scaler_1, scaler_2, args, device)
            plot_sample(series, fake_series, epoch, args)
            make_csv(series, fake_series, epoch, args)

    print("Training finished.")
def main(args):
    # Device Configuration #
    device = torch.device(
        f'cuda:{args.gpu_num}' if torch.cuda.is_available() else 'cpu')

    # Fix Seed for Reproducibility #
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # Samples, Weights, CSV and Inference Paths #
    paths = [args.samples_path, args.weights_path, args.csv_path,
             args.inference_path]
    for path in paths:
        make_dirs(path)

    # Prepare Data #
    data = pd.read_csv(args.data_path)[args.column]

    # Pre-processing #
    scaler_1 = StandardScaler()
    scaler_2 = StandardScaler()
    preprocessed_data = pre_processing(data, scaler_1, scaler_2,
                                       args.constant, args.delta)

    train_X, train_Y, test_X, test_Y = prepare_data(data, preprocessed_data, args)

    train_X = moving_windows(train_X, args.ts_dim)
    train_Y = moving_windows(train_Y, args.ts_dim)

    test_X = moving_windows(test_X, args.ts_dim)
    test_Y = moving_windows(test_Y, args.ts_dim)

    # Prepare Networks #
    if args.model == 'conv':
        D = ConvDiscriminator(args.ts_dim).to(device)
        G = ConvGenerator(args.latent_dim, args.ts_dim).to(device)
    elif args.model == 'lstm':
        D = LSTMDiscriminator(args.ts_dim).to(device)
        G = LSTMGenerator(args.latent_dim, args.ts_dim).to(device)
    else:
        raise NotImplementedError

    #########
    # Train #
    #########

    if args.mode == 'train':

        # Loss Function #
        if args.criterion == 'l2':
            criterion = nn.MSELoss()
        elif args.criterion == 'wgangp':
            pass
        else:
            raise NotImplementedError

        # Optimizers #
        if args.optim == 'sgd':
            D_optim = torch.optim.SGD(D.parameters(), lr=args.lr, momentum=0.9)
            G_optim = torch.optim.SGD(G.parameters(), lr=args.lr, momentum=0.9)
        elif args.optim == 'adam':
            D_optim = torch.optim.Adam(D.parameters(), lr=args.lr, betas=(0., 0.9))
            G_optim = torch.optim.Adam(G.parameters(), lr=args.lr, betas=(0., 0.9))
        else:
            raise NotImplementedError

        D_optim_scheduler = get_lr_scheduler(D_optim, args)
        G_optim_scheduler = get_lr_scheduler(G_optim, args)

        # Lists #
        D_losses, G_losses = list(), list()

        # Train #
        print("Training Time Series GAN started with total epoch of {}.".format(
            args.num_epochs))

        for epoch in range(args.num_epochs):

            # Initialize Optimizers #
            G_optim.zero_grad()
            D_optim.zero_grad()

            #######################
            # Train Discriminator #
            #######################

            if args.criterion == 'l2':
                n_critics = 1
            elif args.criterion == 'wgangp':
                n_critics = 5

            for j in range(n_critics):
                series, start_dates = get_samples(train_X, train_Y, args.batch_size)

                # Data Preparation #
                series = series.to(device)
                noise = torch.randn(args.batch_size, 1, args.latent_dim).to(device)

                # Adversarial Loss using Real Series #
                prob_real = D(series.float())

                if args.criterion == 'l2':
                    real_labels = torch.ones(prob_real.size()).to(device)
                    D_real_loss = criterion(prob_real, real_labels)
                elif args.criterion == 'wgangp':
                    D_real_loss = -torch.mean(prob_real)

                # Adversarial Loss using Fake Series #
                fake_series = G(noise)
                prob_fake = D(fake_series.detach())

                if args.criterion == 'l2':
                    fake_labels = torch.zeros(prob_fake.size()).to(device)
                    D_fake_loss = criterion(prob_fake, fake_labels)
                elif args.criterion == 'wgangp':
                    D_fake_loss = torch.mean(prob_fake)
                    # Gradient penalty, already scaled by lambda_gp
                    D_gp_loss = args.lambda_gp * get_gradient_penalty(
                        D, series.float(), fake_series.float(), device)

                # Calculate Total Discriminator Loss #
                D_loss = D_fake_loss + D_real_loss
                if args.criterion == 'wgangp':
                    D_loss += D_gp_loss

                # Back Propagation and Update #
                D_loss.backward()
                D_optim.step()

            ###################
            # Train Generator #
            ###################

            # Adversarial Loss #
            fake_series = G(noise)
            prob_fake = D(fake_series)

            # Calculate Total Generator Loss #
            if args.criterion == 'l2':
                real_labels = torch.ones(prob_fake.size()).to(device)
                G_loss = criterion(prob_fake, real_labels)
            elif args.criterion == 'wgangp':
                G_loss = -torch.mean(prob_fake)

            # Back Propagation and Update #
            G_loss.backward()
            G_optim.step()

            # Add items to Lists #
            D_losses.append(D_loss.item())
            G_losses.append(G_loss.item())

            # Adjust Learning Rate #
            D_optim_scheduler.step()
            G_optim_scheduler.step()

            # Print Statistics, Save Model Weights and Series #
            if (epoch + 1) % args.log_every == 0:

                # Print Statistics and Save Model #
                print("Epochs [{}/{}] | D Loss {:.4f} | G Loss {:.4f}".format(
                    epoch + 1, args.num_epochs,
                    np.average(D_losses), np.average(G_losses)))
                torch.save(
                    G.state_dict(),
                    os.path.join(
                        args.weights_path,
                        'TS_using{}_and_{}_Epoch_{}.pkl'.format(
                            G.__class__.__name__, args.criterion.upper(),
                            epoch + 1)))

                # Generate Samples and Save Plots and CSVs #
                series, fake_series = generate_fake_samples(
                    test_X, test_Y, G, scaler_1, scaler_2, args, device)
                plot_series(series, fake_series, G, epoch, args, args.samples_path)
                make_csv(series, fake_series, G, epoch, args, args.csv_path)

    ########
    # Test #
    ########

    elif args.mode == 'test':

        # Load Model Weights #
        G.load_state_dict(
            torch.load(
                os.path.join(
                    args.weights_path,
                    'TS_using{}_and_{}_Epoch_{}.pkl'.format(
                        G.__class__.__name__, args.criterion.upper(),
                        args.num_epochs))))

        # Lists #
        real, fake = list(), list()

        # Inference #
        for idx in range(0, test_X.shape[0], args.ts_dim):

            # Do not plot if the remaining data is shorter than the time dimension #
            end_ix = idx + args.ts_dim
            if end_ix > len(test_X) - 1:
                break

            # Prepare Data #
            test_data = test_X[idx, :]
            test_data = np.expand_dims(test_data, axis=0)
            test_data = np.expand_dims(test_data, axis=1)
            test_data = torch.from_numpy(test_data).to(device)

            start = test_Y[idx, 0]

            noise = torch.randn(args.val_batch_size, 1, args.latent_dim).to(device)

            # Generate Fake Data #
            with torch.no_grad():
                fake_series = G(noise)

            # Convert to NumPy format for Saving #
            test_data = np.squeeze(test_data.cpu().data.numpy())
            fake_series = np.squeeze(fake_series.cpu().data.numpy())

            test_data = post_processing(test_data, start, scaler_1, scaler_2,
                                        args.delta)
            fake_series = post_processing(fake_series, start, scaler_1, scaler_2,
                                          args.delta)

            real += test_data.tolist()
            fake += fake_series.tolist()

        # Plot, Save to CSV file and Derive Metrics #
        plot_series(real, fake, G, args.num_epochs - 1, args, args.inference_path)
        make_csv(real, fake, G, args.num_epochs - 1, args, args.inference_path)
        derive_metrics(real, fake, args)

    else:
        raise NotImplementedError
def main():
    # define the command line arguments
    g_help = "teacher + student activation function: 'erf' or 'relu'"
    M_help = "number of teacher hidden nodes"
    K_help = "number of student hidden nodes"
    device_help = "which device to run on: 'cuda' or 'cpu'"
    generator_help = ("Generator of the inputs: dcgan_rand, dcgan_cifar10, "
                      "dcgan_cifar100_grey, nvp_cifar10.")
    transform_help = "Transform: identity, scattering, ..."
    steps_help = "training steps as multiples of N"
    seed_help = "random number generator seed."

    parser = argparse.ArgumentParser()
    parser.add_argument("-g", "--g", default="erf", help=g_help)
    parser.add_argument("-M", "--M", type=int, default=2, help=M_help)
    parser.add_argument("-K", "--K", type=int, default=2, help=K_help)
    parser.add_argument("--generator", help=generator_help, default="rand")
    parser.add_argument("--transform", help=transform_help)
    parser.add_argument("--device", "-d", help=device_help)
    parser.add_argument("--lr", type=float, default=0.2, help="learning rate")
    parser.add_argument("--bs", type=int, default=1, help="mini-batch size")
    parser.add_argument("--steps", type=int, default=10000, help=steps_help)
    parser.add_argument("-q", "--quiet", help="be quiet", action="store_true")
    parser.add_argument("-s", "--seed", type=int, default=0, help=seed_help)
    parser.add_argument("--store", action="store_true",
                        help="store initial conditions")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(args.device)

    (M, K, lr) = (args.M, args.K, args.lr)

    # Find the right generator for the given scenario
    generator = utils.get_generator(args.generator, device)

    # transformation of the inputs
    transformation = utils.get_transformation(args.transform, generator, device)

    model_desc = generator.name()
    if transformation is not None:
        model_desc += "_" + transformation.name()

    # Define the dimensions of the problem
    D = generator.N_in
    N = generator.N_out if transformation is None else transformation.N_out

    # get the moments of the generator to center its outputs
    try:
        generator_mean_vec = torch.load("moments/%s_mean_x.pt" % generator.name(),
                                        map_location=device)
        generator_cov = torch.load("moments/%s_omega.pt" % generator.name(),
                                   map_location=device)
    except FileNotFoundError:
        print("Could not find moments of generator %s. Will exit now!"
              % generator.name())
        exit()

    # define the scalar moments of the generator's output distribution
    generator_mean, generator_std = utils.get_scalar_mean_std(
        generator_mean_vec, generator_cov)

    # Now get the moments of the inputs that come out of the transformation
    transformation_mean = None
    transformation_std = None
    # Either load pre-computed Omega and Phi, or generate them from the test set
    Omega = None  # the student input - input covariance
    Phi = None  # the generator input - student input covariance
    try:
        mean_x = torch.load("moments/%s_mean_x.pt" % model_desc,
                            map_location=device)
        Omega = torch.load("moments/%s_Omega.pt" % model_desc,
                           map_location=device)
        Phi = torch.load("moments/%s_phi.pt" % model_desc,
                         map_location=device)
        transformation_mean, transformation_std = utils.get_scalar_mean_std(
            mean_x, Omega)
    except FileNotFoundError:
        pass

    # networks and loss
    g = erfscaled if args.g == "erf" else F.relu
    gs = (g, identity)
    student = TwoLayer(gs, N, args.K, 1, normalise1=True, std0=1e-2)
    student.to(device)
    teacher = TwoLayer(gs, D, args.M, 1, normalise1=True, std0=1)
    nn.init.constant_(teacher.fc2.weight, 1)
    teacher.freeze()
    teacher.to(device)
    B = teacher.fc1.weight.data
    A = teacher.fc2.weight.data

    # collect the parameters that are going to be optimised by SGD
    params = []
    params += [{"params": student.fc1.parameters()}]
    # If we train the last layer, ensure its learning rate scales correctly
    params += [{"params": student.fc2.parameters(), "lr": lr / N}]
    optimizer = optim.SGD(params, lr=lr)
    criterion = HalfMSELoss()

    # when to print?
    end = torch.log10(torch.tensor([1.0 * args.steps])).item()
    times_to_print = list(torch.logspace(-1, end, steps=200))

    # generate the test set
    test_cs, test_xs, test_ys = utils.get_samples(
        device,
        NUM_TESTSAMPLES,
        generator,
        generator_mean,
        teacher,
        transformation,
        transformation_mean,
    )

    # If we didn't find a pre-computed Omega and Phi (which we need to store
    # the initial conditions), we can compute them from the test set
    if Omega is None:
        Omega = 1 / NUM_TESTSAMPLES * test_xs.T @ test_xs
        Phi = 1 / NUM_TESTSAMPLES * test_xs.T @ test_cs

    nus = B.mm(test_cs.T) / math.sqrt(D)

    # output file + welcome message
    log_fname = "transform_online_%s_D%d_N%d_%s_M%d_K%d_lr%g_i2_s%d.dat" % (
        model_desc,
        D,
        N,
        args.g,
        M,
        K,
        lr,
        args.seed,
    )
    logfile = open(log_fname, "w", buffering=1)
    welcome = "# Two-layer nets on inputs from generator %s" % generator.name()
    if transformation is None:
        welcome += "\n"
    else:
        welcome += " with transformation %s\n" % transformation.name()
    welcome += "# M=%d, K=%d, lr=%g, batch size=%d, seed=%d\n" % (
        M,
        K,
        lr,
        args.bs,
        args.seed,
    )
    welcome += "# Using device:" + str(device)
    log(welcome, logfile)

    print("# Generator, Teacher and Student: ")
    for net in [generator, teacher, student]:
        msg = "# " + str(net).replace("\n", "\n# ")
        log(msg, logfile)
    msg = "# test xs: mean=%g, std=%g; test ys: std=%g" % (
        torch.mean(test_xs),
        torch.std(test_xs),
        torch.std(test_ys),
    )
    log(msg, logfile)

    T = 1.0 / B.shape[1] * B @ B.T
    rotation = Phi.T @ Phi
    tildeT = 1 / N * B @ rotation @ B.T

    if args.store:
        with torch.no_grad():
            # compute the exact densities of r and q
            exq = torch.zeros((K, K, N), device=device)
            exr = torch.zeros((K, M, N), device=device)
            extildet = torch.zeros((M, M, N), device=device)
            sqrtN = math.sqrt(N)
            w = student.fc1.weight.data
            v = student.fc2.weight.data

            rhos, psis = torch.symeig(Omega, eigenvectors=True)
            rhos = rhos.to(device)
            psis = psis.to(device)
            # make sure to normalise, orient eigenvectors according to the note
            psis = sqrtN * psis.T

            GammaB = 1.0 / sqrtN * B @ Phi.T @ psis.T
            GammaW = 1.0 / sqrtN * w @ psis.T

            for k in range(K):
                for l in range(K):
                    exq[k, l] = GammaW[k, :] * GammaW[l, :]
                for n in range(M):
                    exr[k, n] = GammaW[k, :] * GammaB[n, :]
            for n in range(M):
                for m in range(M):
                    extildet[n, m] = GammaB[n, :] * GammaB[m, :]

            root_name = log_fname[:-4]
            np.savetxt(root_name + "_T.dat", T.cpu().numpy(), delimiter=",")
            np.savetxt(root_name + "_rhos.dat", rhos.cpu().numpy(), delimiter=",")
            np.savetxt(root_name + "_A.dat", A[0].cpu().numpy(), delimiter=",")
            np.savetxt(root_name + "_v0.dat", v[0].cpu().numpy(), delimiter=",")
            write_density(root_name + "_q0.dat", exq)
            write_density(root_name + "_r0.dat", exr)
            write_density(root_name + "_tildet.dat", extildet)

    time = 0
    dt = 1 / N

    msg = eval_student(time, student, test_xs, test_ys, nus, T, tildeT, A, criterion)
    log(msg, logfile)

    while len(times_to_print) > 0:
        # get the inputs
        cs, inputs, targets = utils.get_samples(
            device,
            args.bs,
            generator,
            generator_mean,
            teacher,
            transformation,
            transformation_mean,
        )

        for i in range(args.bs):
            student.train()
            preds = student(inputs[i])
            loss = criterion(preds, targets[i])

            # TRAINING
            student.zero_grad()
            loss.backward()
            optimizer.step()

            time += dt

            if time >= times_to_print[0].item() or time == 0:
                msg = eval_student(time, student, test_xs, test_ys, nus, T,
                                   tildeT, A, criterion)
                log(msg, logfile)
                times_to_print.pop(0)

    print("Bye-bye")