def file_extension_changes(framework, projects):
    print("Computing file extension changes")
    samples = get_samples(projects)
    configuration_files = create_configuration_dict()
    extension_files = create_extension_dict()
    action_in_files = {"A": 0, "M": 0, "D": 0}
    write_header(action_in_files, configuration_files, extension_files,
                 framework, "file_extension_changes")
    for sample in samples:
        extract_changes_log("repositories/", "file_extension_changes", sample)
        configuration_files = create_configuration_dict()
        extension_files = create_extension_dict()
        action_in_files = {"A": 0, "M": 0, "D": 0}
        with open("file_extension_changes/" + sample + ".txt") as logs:
            for log in logs:
                log = remove_next_line(log)
                if ";" not in log:
                    if ("A" in log) or ("M" in log) or ("D" in log):
                        try:
                            log = log.split("\t")
                            action = log[0]
                            file = get_file_name(log[1])
                            calculate_configuration_files(
                                configuration_files, file)
                            calculate_extension_files(extension_files, file)
                            action_in_files[action] += 1
                        except (IndexError, KeyError):
                            # skip malformed lines and unexpected change actions
                            continue
        write_content(action_in_files, configuration_files, extension_files,
                      framework, sample, "file_extension_changes")
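
For reference, a minimal sketch of the log-line format this parsing loop expects (an assumption based on git log --name-status style output, where each relevant line is a change action, a tab, and a file path):

# Hypothetical contents of file_extension_changes/<sample>.txt:
#   A\tsrc/main/java/App.java
#   M\tpom.xml
#   D\tdocs/README.md
line = "M\tpom.xml"
action, path = line.split("\t")  # action == "M", path == "pom.xml"
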
Example 2
def delay(framework, projects, githubtoken):
    print("Computing delay to update")
    path_dos_repositorios = 'repositories'
    measure = "delay"
    output_write(framework, measure, measure, "framework,path,current_version,next_version,framework_release_date (YYYY-DD-MM),sample_update_date (YYYY-DD-MM),delay_in_days", True)
    framework_release_data = buscar_dados_de_lancamento_de_versoes(framework, githubtoken)
    configuration_file = define_arquivo_de_configuracao(framework)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index+1, len(samples))
        sample_path = path_dos_repositorios + "/" + sample
        paths_configuration_file = find_paths(configuration_file, sample_path)
        repository = Repo(sample_path)
        reversed_commits = get_commits(repository)
        for path in paths_configuration_file:
            current_version, reversed_commits = get_first_version(framework, path, repository, reversed_commits)
            if current_version == {}:
                continue
            for commit in reversed_commits:
                repository.git.checkout(commit, '-f')
                next_version = buscar_versao_do_framework(framework, path)
                if (current_version != next_version and next_version != ''
                        and current_version != '' and current_version is not None
                        and next_version is not None):
                    sample_update_date = get_commit_date(commit)
                    framework_release_date = framework_release_data[next_version]
                    delay_in_days = calculate_delay(framework_release_date, sample_update_date)
                    output_write(framework, measure, measure, create_output(current_version, delay_in_days, framework, framework_release_date, next_version, path, sample_update_date), False)
                    current_version = next_version
        repository.git.checkout('master', '-f')
def stackoverflow(framework, projects):
    global api
    api = StackAPI("stackoverflow")
    samples = get_samples(projects)
    output_write(framework, directory, "questions_and_answers", get_header(), True)
    for index, sample in enumerate(samples):
        print_status_samples(index+1, len(samples))
        questions = get_questions_when_body_has(sample)
        for indx, question in enumerate(questions["items"]):
            print("{0}% questions analysed of {1}".format((indx + 1) / len(questions["items"]) * 100, sample))
            try:
                answer = api.fetch("answers/{ids}", ids=[question["accepted_answer_id"]])["items"][0]
                answer_owner = get_owner_by_user_id(api, answer["owner"]["user_id"])
            except KeyError:
                answer = {
                    "answer_id": "",
                    "score": "",
                    "creation_date": ""
                }
                answer_owner = {
                    "user_id": "",
                    "reputation": "",
                    "creation_date": "",
                    "tags": []
                }
            question_owner = get_owner_by_user_id(api, question["owner"]["user_id"])
            output = create_output(framework, sample, question, answer, question_owner, answer_owner)
            output_write(framework, directory, "questions_and_answers", output, False)
Example 4
def allanswers(framework, projects):
    global api
    api = StackAPI("stackoverflow")
    samples = get_samples(projects)
    output_write(framework, directory, "all_answers", get_header(), True)
    with open("stackoverflow/" + framework +
              "_questions_and_answers_output.csv") as questions:
        for index, question in enumerate(questions):
            if index == 0: continue
            print("Questions from sample " + question.split(",")[1])
            question = question.replace("\n", "")
            question_id = question.split(",")[2]
            answers = api.fetch("questions/" + question_id +
                                "/answers")["items"]
            print(len(answers))
            for indx, answer in enumerate(answers):
                print("{0}% answers analysed of question {1}".format(
                    (indx + 1) / len(answers) * 100, question_id))
                try:
                    answer_owner = get_owner_by_user_id(
                        api, answer["owner"]["user_id"])
                except KeyError:
                    answer_owner = {
                        "user_id": "",
                        "reputation": "",
                        "creation_date": "",
                        "tags": []
                    }

                output = create_output(framework,
                                       question.split(",")[1], question_id,
                                       answer, answer_owner)
                output_write(framework, directory, "all_answers", output,
                             False)
Example 5
def create_samples(argv, dataset, vocab_word, test):
    n_prev_sents = argv.n_prev_sents  # number of previous sentences, default:5
    max_n_words = argv.max_n_words  # default:20

    cands = dataset[0][0][3:-1]  # [cand_res1, cand_res2, label]
    n_cands = len(cands)

    print('\n\nTASK  SETTING')
    print('\n\tn_cands:%d  n_prev_sents:%d  max_n_words:%d\n' %
          (n_cands, n_prev_sents, max_n_words))

    # print('\n\nConverting words into ids...')
    # # samples: 1D: n_docs, 2D: n_utterances, 3D: (time, speaker_id, addressee_id, response, ..., label)
    # samples = doc_to_id(dataset, vocab_word)

    print('\n\nCreating samples...')
    # samples: 1D: n_samples; 2D: Sample
    samples = get_samples(dataset=dataset,
                          n_prev_sents=n_prev_sents,
                          max_n_words=max_n_words,
                          pad=False)

    print("num of samples: %d" % len(samples))

    return samples
Example 6
    def fit(self,
            successes,
            trials,
            n_samples=1000,
            baseline=0.0,
            values=None,
            smoothing=1.0):
        '''
        Generate the weights for each arm based on bandit history.

        Parameters:
            successes (array): A 1 x n array with total successes for each arm
               trials (array): A 1 x n array with total trials for each arm
              n_samples (int): The number of samples to pull from each arm's distribution
                               for Thompson Sampling.
             baseline (float): The minimum weight to give each arm
               values (array): A 1 x n array with the reward value for each arm, or None
            smoothing (float): The constant factor by which to divide all trials and successes

        Updates:
            self.weights (array): A 1 x n array with normalized weights for each arm
        '''

        self.values = utils.set_values(values, len(trials))
        self.samples = utils.get_samples(trials, successes, n_samples,
                                         smoothing, self.values)
        self._raw_weights = utils.get_weights(self.samples)
        self.weights = utils.normalize_weights(self._raw_weights, baseline)
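
A minimal usage sketch for this method, assuming bandit is an instance of the class it belongs to and that the utils helpers referenced above are importable:

# Assumption: bandit is an instance of the Thompson-sampling class defined above.
successes = [12, 30, 7]     # e.g. conversions recorded per arm
trials = [100, 110, 95]     # e.g. impressions recorded per arm
bandit.fit(successes, trials, n_samples=2000, baseline=0.05)
print(bandit.weights)       # one normalized weight per arm
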
Example 7
def main(args):

    logger.info('Getting samples from input file')
    samples = get_samples(args.samp)

    logger.info('Getting inferred ancestry from PCA and writing to output')
    parse_pca(args.pca, samples, args.out)
Example 8
def file_extension_changes_forks(framework, projects, githubtoken):
    samples = get_samples(projects)
    g = get_py_github_instance(githubtoken)
    action_in_files = {"A": 0, "M": 0, "D": 0}
    extension_files = create_extension_dict()
    configuration_files = create_configuration_dict()
    write_header(action_in_files, configuration_files, extension_files,
                 framework, "file_extension_changes_forks")
    for sample in samples:
        manage_limit_rate(len(samples))
        print(sample)
        repository = g.get_repo(sample)
        forks = repository.get_forks()
        for fork in forks:
            manage_limit_rate(forks.totalCount)
            try:
                comparation = repository.compare(
                    repository.default_branch,
                    fork.owner.login + ":" + fork.default_branch)
                if comparation.ahead_by > 0:
                    print("Downloading " + fork.full_name)
                    Repo.clone_from(fork.clone_url,
                                    "forks_repositories/" + fork.full_name)
                    print("Downloaded " + fork.full_name)
                    extract_changes_log("forks_repositories/",
                                        "file_extension_changes_forks",
                                        fork.full_name, comparation)
                    configuration_files = create_configuration_dict()
                    extension_files = create_extension_dict()
                    action_in_files = {"A": 0, "M": 0, "D": 0}
                    with open("file_extension_changes_forks/" +
                              fork.full_name + ".txt") as logs:
                        for log in logs:
                            log = remove_next_line(log)
                            if ";" not in log:
                                if ("A" in log) or ("M" in log) or ("D"
                                                                    in log):
                                    try:
                                        log = log.split("\t")
                                        action = log[0]
                                        file = get_file_name(log[1])
                                        calculate_configuration_files(
                                            configuration_files, file)
                                        calculate_extension_files(
                                            extension_files, file)
                                        action_in_files[action] += 1
                                    except (IndexError, KeyError):
                                        # skip malformed lines and unexpected change actions
                                        continue
                    write_content(action_in_files, configuration_files,
                                  extension_files, framework, fork.full_name,
                                  "file_extension_changes_forks")
                    shutil.rmtree("forks_repositories/" +
                                  fork.full_name.split("/")[0])
                    shutil.rmtree("file_extension_changes_forks/" +
                                  fork.full_name.split("/")[0])
                    print("{0} deleted".format(fork.full_name))
            except Exception as exception:
                print(exception)
Example 9
def main(args):

    logger.info('Getting individual IDs from sample list')
    samples = get_samples(args.samp)

    logger.info('Getting variants from seqr')
    variants = parse_var(args.var, samples)

    logger.info('Writing variants to output')
    write_bigquery_tsv(variants, args.out)
Example 10
    def setup(self, stage=0):
        samples = get_samples(self.hparams["image_path"], self.hparams["mask_path"])

        num_train = int((1 - self.hparams["val_split"]) * len(samples))

        self.train_samples = samples[:num_train]
        self.val_samples = samples[num_train:]

        print("Len train samples = ", len(self.train_samples))
        print("Len val samples = ", len(self.val_samples))
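
If this setup method belongs to a PyTorch Lightning data module (an assumption suggested by the hparams dictionary and the stage argument), the train/val lists are typically wrapped in datasets and served through dataloaders; a sketch with a hypothetical SegmentationDataset wrapper:

from torch.utils.data import DataLoader, Dataset

class SegmentationDataset(Dataset):
    # Hypothetical wrapper; the real project presumably has its own dataset class.
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

# Assumption: dm is the data module instance after dm.setup() has run.
train_loader = DataLoader(SegmentationDataset(dm.train_samples), batch_size=8, shuffle=True)
val_loader = DataLoader(SegmentationDataset(dm.val_samples), batch_size=8, shuffle=False)
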
Example 11
def main(args):

    logger.info('Getting samples from file')
    samples = get_samples(args.samp)

    logger.info('Getting reported sex for each sample')
    reported = ped_sex(args.ped, samples)

    logger.info('Comparing inferred and reported sex and writing to output')
    compare(args.infer, samples, reported, args.out)
def githubmetadata(framework, projects, githubtoken):
    print("Computing github metadata")
    measure = "githubmetadata"
    output_write(framework, measure, measure, "framework,repository,forks,stargazers,watchers,openedIssues,closedIssues,commits,openedPullRequests,closedPullRequests,updatedAt,projects,lifetime,lifetime per commit", True)
    g = Github(githubtoken)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index+1, len(samples))
        repo = g.get_repo(sample)
        output = create_output(framework, repo, sample)
        output_write(framework, measure, measure, output, False)
def generalprojects(projects):
    samples = get_samples(projects)
    output_write("", "generalprojects", "projects",
                 "path,stars,language,framework", False)
    for repository in samples:
        clone(repository)
        print("{0} downloaded".format(repository))
        framework = get_framework(repository)
        print("{0} classified as {1}".format(repository, framework))
        shutil.rmtree("generalprojects/repositories/" +
                      repository.split("/")[0])
        print("{0} deleted".format(repository))
        output_write("", "generalprojects", "projects",
                     "{0},{1}".format(repository, framework), False)
def repositoriesdownload(framework, projects):
    print('framework: ' + framework)
    samples = get_samples(projects)
    for sample in samples:
        sample = remove_especial_caracters(sample)
        git_url = create_url_from_github(sample)
        print("Downloading %s" % sample)
        repo_dir = "repositories/"
        isdir = os.path.isdir(repo_dir + sample)
        if isdir:
            print("Project " + sample + " downloaded")
            continue
        download(git_url, repo_dir, sample)
        print("%s downloaded" % sample)
def generator(samples,
              batch_size=CONST.GENERATOR_BATCH_SIZE,
              filter=CONST.SKIP_FILTER):
    num_samples = len(samples)

    while True:  # Loop forever so the generator never terminates

        shuffle(samples)

        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset + batch_size]

            images = []
            angles = []

            for batch_sample in batch_samples:
                utils.get_samples(images, angles, batch_sample,
                                  CONST.ANGLE_CORRECTION, filter)

            X_train = np.array(images)
            y_train = np.array(angles)

            # When using augmentation, it will yield batches of different size:
            yield shuffle(X_train, y_train)
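
A minimal sketch of how such a generator is typically consumed, assuming a compiled Keras model and a train/validation split as in a later example (fit_generator is the classic Keras API for Python generators):

# Assumption: model, train_samples and validation_samples are defined elsewhere.
train_gen = generator(train_samples, batch_size=32)
valid_gen = generator(validation_samples, batch_size=32)
model.fit_generator(train_gen,
                    steps_per_epoch=len(train_samples) // 32,
                    validation_data=valid_gen,
                    validation_steps=len(validation_samples) // 32,
                    epochs=5)
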
Example 16
def importcount(framework, projects):
    print("Computing imports")
    measure = "importcount"
    output_write(framework, measure, measure,
                 "framework,path,imports,javaFiles,imports/java_files", True)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        deal_with_empty_repo(sample)
        java_files_path = find_paths("*.java", "repositories/" + sample)
        imports = get_imports(framework, java_files_path)
        relative = calculate_relative(imports, java_files_path)
        output_write(
            framework, measure, measure,
            create_output(framework, imports, java_files_path, relative,
                          sample), False)
Example 17
def forksahead(framework, projects, githubtoken):
    print("Computing forks ahead data")
    g = Github(githubtoken)
    output_write(framework, "forksahead", "forks_ahead_by_projects", "framework,path,number_of_forks,forks_ahead,ratio", True)
    output_write(framework, "forksahead", "forks_ahead", "framework,path,number_of_forks,forks_ahead,ratio", True)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        manage_limit_rate(len(samples))
        print_status_samples(index+1, len(samples))
        repository = g.get_repo(sample)
        forks = repository.get_forks()
        forks_ahead = count_forks_ahead(framework, forks, repository)
        number_of_forks = repository.forks_count
        ratio_forks_ahead = forks_ahead / number_of_forks if number_of_forks else 0
        output = create_output(sample, framework, number_of_forks, forks_ahead, ratio_forks_ahead)
        output_write(framework, "forksahead", "forks_ahead_by_projects", output, False)
Example 18
def numberofextensionfile(framework, projects):
    print("Computing extension files")
    extensions = create_extension_files()
    measure = "numberofextensionfile"
    output_write(
        framework, measure, measure,
        'framework,project,java,properties,jar,build.gradle,pom.xml,manifest.xml,xml,bat,md,adoc,README,yaml,txt,sh,travis.yml,yml,cmd,kt,json,numberOfFiles,others',
        True)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        deal_with_empty_repo(sample)
        count_extension_files(extensions, sample)
        others = count_others(extensions)
        output = concat_output(extensions) + str(others)
        output_write(framework, measure, measure,
                     framework + "," + sample + "," + output, False)
def currentframeworkversion(framework, projects):
    print("Computing current framework version")
    configuration_file = find_config_file(framework)
    configuration_file_key_words = get_key_words(framework)
    write_output_header(configuration_file_key_words, framework)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index+1, len(samples))
        checkout_default_branch_repository(sample)
        deal_with_empty_repo(sample)
        configuration_files_paths = find_paths(configuration_file, "repositories/" + sample)
        for path in configuration_files_paths:
            output = framework + "," + path
            for key, value in configuration_file_key_words.items():
                version = get_framework_version(framework, path, key)
                output = output + "," + version
            if ",,," not in output and (framework != "spring" or "RELEASE" in output):
                output_write(framework, "currentframeworkversion", "currentframeworkversion", output, False)
Example 20
def understandmetrics(framework, projects):
    samples = get_samples(projects)
    owner = samples[0].split("/")[0]
    create_output_directory("understandmetrics", owner)
    output_write(
        framework, "understandmetrics", "understandmetrics",
        "framework,projeto,AvgCyclomatic,AvgCyclomaticModified,AvgCyclomaticStrict,AvgEssential,AvgLine,AvgLineBlank,AvgLineCode,AvgLineComment,CountClassBase,CountClassCoupled,CountClassCoupledModified,CountClassDerived,CountDeclClass,CountDeclClassMethod,CountDeclClassVariable,CountDeclExecutableUnit,CountDeclFile,CountDeclFunction,CountDeclInstanceMethod,CountDeclInstanceVariable,CountDeclMethod,CountDeclMethodAll,CountDeclMethodDefault,CountDeclMethodPrivate,CountDeclMethodProtected,CountDeclMethodPublic,CountInput,CountLine,CountLineBlank,CountLineCode,CountLineCodeDecl,CountLineCodeExe,CountLineComment,CountOutput,CountPath,CountPathLog,CountSemicolon,CountStmt,CountStmtDecl,CountStmtExe,Cyclomatic,CyclomaticModified,CyclomaticStrict,Essential,Knots,MaxCyclomatic,MaxCyclomaticModified,MaxCyclomaticStrict,MaxEssential,MaxEssentialKnots,MaxInheritanceTree,MaxNesting,MinEssentialKnots,PercentLackOfCohesion,PercentLackOfCohesionModified,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential,?,numberOfJavaFiles",
        True)

    for sample in samples:
        repositories_path = "/home/gabriel/Documentos/gabrielsmenezes/pesquisamestrado/repositories/"
        sample_path = repositories_path + sample
        udb_path = "understandmetrics/" + sample
        deal_with_empty_repo(sample)
        metrics = get_understand_metrics(framework, sample, udb_path,
                                         sample_path)
        output = create_output(metrics)
        output_write(framework, "understandmetrics", "understandmetrics",
                     output, False)
Example 21
def maintainers(framework, projects, githubtoken):
    print("Computing maintainers data")
    output_write(
        framework, "maintainers", "maintainers",
        "framework,path,framework_contributors,sample_contributors,commom_contributors,commom/framework,commom/sample",
        True)
    framework_repository = get_repository_name(framework)
    framework_contributors = get_contributors(framework_repository,
                                              githubtoken)
    framework_contributors.totalCount
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        sample_contributors = get_contributors(sample, githubtoken)
        commmom_contributors = get_commom_contributors(framework_contributors,
                                                       sample_contributors)
        output_write(
            framework, "maintainers", "maintainers",
            create_output(framework, sample, framework_contributors,
                          sample_contributors, commmom_contributors), False)
Example 22
    def fit(self, successes, trials, n_samples=1000, baseline=0.0, values=None, smoothing=1.0):
        '''
        Generate the weights for each arm based on bandit history.

        Parameters:
            successes (array): A 1 x n array with total successes for each arm
               trials (array): A 1 x n array with total trials for each arm
              n_samples (int): The number of samples to pull from each arm's distribution
                               for Thompson Sampling.
             baseline (float): The minimum weight to give each arm
               values (array): A 1 x n array with the reward value for each arm, or None
            smoothing (float): The constant factor by which to divide all trials and successes

        Updates:
            self.weights (array): A 1 x n array with normalized weights for each arm
        '''

        self.values = utils.set_values(values, len(trials))
        self.samples = utils.get_samples(trials, successes, n_samples, smoothing, self.values)
        self._raw_weights = utils.get_weights(self.samples)
        self.weights = utils.normalize_weights(self._raw_weights, baseline)
def metrics_by_commits(framework, projects):
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index+1, len(samples))
        owner = sample.split("/")[0]
        create_output_directory("metricsbycommits", owner)
        output_write(sample, "metricsbycommits", "", "framework,path,commits,date,numberOfJavaFiles,countLineCode/numberOfJavaFiles,SumCyclomaticStrict/CountDeclMethod,readability",True)
        repositories_path = "/home/gabriel/Documentos/gabrielsmenezes/pesquisamestrado/repositories/"
        sample_path = repositories_path + sample
        udb_path = "metricsbycommits/" + sample
        commits = get_commits_from(sample)
        commits.reverse()
        ########## just run it, it already has the right hash for the next one
        # for index, commit in enumerate(commits):
        #     if commit.hexsha == "dfe62cb3e72c7a9cfd759dc7411197d9a629f813":
        #         position = index
        # commits = commits[position+1:]
        for index, commit in enumerate(commits):
            checkout_to(sample, commit.hexsha)
            print("commit ======= " + commit.hexsha)
            metrics = get_metrics(commit, framework, sample, sample_path, udb_path)
            output_write(sample, "metricsbycommits", "", create_output(metrics), False)
            delete_unused_files(sample)
            print("{0}% of commits completed from sample {1}".format((index/len(commits) * 100), sample))
Example 24
# using command terminal arguments
input_filename = sys.argv[1]
output_name = sys.argv[2]
# using hardcoded file path
# NOTE: if the command-line arguments fail, just uncomment these lines and replace them with the desired file paths
# input_filename = "data/in/raptors.jpg"
# output_name = "data/in/raptors.png"

print("Opening {}".format(input_filename))
img = Image.open(input_filename)
img_data_rgb = np.asarray(img)

## get mosaic of image
img_data = mosaic(img_data_rgb.T)

r_data = get_samples(img_data, img_data.shape, "r", "data")
g_data = get_samples(img_data, img_data.shape, "g", "data")
b_data = get_samples(img_data, img_data.shape, "b", "data")
imgHeight = img_data.shape[0]
imgWidth = img_data.shape[1]

#interpolating red
print("interpolating Red ... ")
init = 0
dim = [7, 7]
for i in range(0, img_data.shape[0], 2):
    for j in range(1, img_data.shape[1], 2):
        block = np.full([8, 8], None)
        if (i - 2 < 0 and j - 3 < 0):
            # missing samples corner case 1
            r = abs(i - 2)
Example 25
from keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
import numpy as np
from itertools import chain

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
from functools import reduce

from utils import generator, get_samples, get_sample_image, augment_brightness_camera_images, split_camera_angles, trans_image

# change the path to point to the location of the data
root_path = './data/bend_1'

train_samples, validation_samples = train_test_split(get_samples(root_path),
                                                     test_size=0.2)

# visual of augmentations
# sample_image = get_sample_image(root_path, train_samples)

# plt.imshow(sample_image)
# plt.savefig('sample')

# plt.imshow(augment_brightness_camera_images(sample_image))
# plt.savefig('augmented')

# plt.imshow(np.fliplr(sample_image))
# plt.savefig('flipped')

# sample_translated, steering = trans_image(sample_image, 0, 100)
id_train = [id for id in img_id_list if id not in id_test]

#%% Generate samples
crop_size_train = (160, 160)
overlap_train = (80, 80)
class_offset_train = (40, 40)
class_area_train = (80, 80)

crop_size_test = (1024, 1024)
overlap_test = None
class_offset_test = (0, 0)
class_area_test = crop_size_test

df_train = get_samples(id_train, \
                       path_to_img, path_to_mask, \
                       crop_size_train, overlap_train, \
                       order_dict, \
                       class_offset_train, class_area_train, \
                       verbose=True)

df_test = get_samples(id_test, \
                       path_to_img, path_to_mask, \
                       crop_size_test, overlap_test, \
                       order_dict, \
                       class_offset_test, class_area_test, \
                       verbose=True)

# the test set cut similarly to the train for performance estimations
df_test_small = get_samples(id_test, \
                       path_to_img, path_to_mask, \
                       crop_size_train, overlap_train, \
                       order_dict, \
Example 27
     total_iters += params.batch_size
     epoch_iters += params.batch_size
     model.set_input(data)  #input data
     model.step()  #train the model
     print('\rTotal iters : {}'.format(total_iters), end="")
     if total_iters % params.print_freq == 0:
         model.print_lr()  #print current learning rate
         model.save_loss()
         print('\nepoch : ', epoch, end=" ")
         model.print_loss(model.losses)
     if total_iters % params.save_lasted_freq == 0:
         print('\nSaving the latest model')
         model.save_model(epoch, 'latest')
         #save the model and other necessary states for training later.
         samples_real, samples_fake = get_samples(
             model, data, epoch, params
         )  #save the samples generated by Generator_A or Generator_B
         print('Latest model saved')
 if epoch % params.save_epoch_freq == 0:
     model.save_model(epoch, 'latest')
     model.save_model(
         epoch
     )  #save the model and other necessary states for training later.
     samples_real, samples_fake = get_samples(
         model, data, epoch, params
     )  #save the samples generated by Generator_A or Generator_B
 end = time.time()
 print('\n{}/{} is Done! Time Taken: {:.4f}s'.format(
     epoch, params.n_epoch, end - start))
 model.print_loss(model.losses)
 model.update_learning_rate(epoch)  #update learning rate
Example 28
 def prepare_data(self):
     self.train_samples = get_samples(
         Path(self.hparams["data_path"]) / "images",
         Path(self.hparams["data_path"]) / "masks",
     )
Example 29
def train(args):

    # Device Configuration #
    device = torch.device(
        f'cuda:{args.gpu_num}' if torch.cuda.is_available() else 'cpu')

    # Fix Seed for Reproducibility #
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # Samples, Plots, Weights and CSV Path #
    paths = [
        args.samples_path, args.plots_path, args.weights_path, args.csv_path
    ]
    for path in paths:
        make_dirs(path)

    # Prepare Data #
    data = pd.read_csv(args.data_path)[args.column]

    # Pre-processing #
    scaler_1 = StandardScaler()
    scaler_2 = StandardScaler()
    preprocessed_data = pre_processing(data, scaler_1, scaler_2, args.delta)

    X = moving_windows(preprocessed_data, args.ts_dim)
    label = moving_windows(data.to_numpy(), args.ts_dim)

    # Prepare Networks #
    D = Discriminator(args.ts_dim).to(device)
    G = Generator(args.latent_dim, args.ts_dim,
                  args.conditional_dim).to(device)

    # Loss Function #
    if args.criterion == 'l2':
        criterion = nn.MSELoss()
    elif args.criterion == 'wgangp':
        pass
    else:
        raise NotImplementedError

    # Optimizers #
    D_optim = torch.optim.Adam(D.parameters(), lr=args.lr, betas=(0.5, 0.9))
    G_optim = torch.optim.Adam(G.parameters(), lr=args.lr, betas=(0.5, 0.9))

    D_optim_scheduler = get_lr_scheduler(D_optim, args)
    G_optim_scheduler = get_lr_scheduler(G_optim, args)

    # Lists #
    D_losses, G_losses = list(), list()

    # Train #
    print("Training Time Series GAN started with total epoch of {}.".format(
        args.num_epochs))

    for epoch in range(args.num_epochs):

        # Initialize Optimizers #
        G_optim.zero_grad()
        D_optim.zero_grad()

        if args.criterion == 'l2':
            n_critics = 1
        elif args.criterion == 'wgangp':
            n_critics = 5

        #######################
        # Train Discriminator #
        #######################

        for j in range(n_critics):
            series, start_dates = get_samples(X, label, args.batch_size)

            # Data Preparation #
            series = series.to(device)
            noise = torch.randn(args.batch_size, 1, args.latent_dim).to(device)

            # Adversarial Loss using Real Image #
            prob_real = D(series.float())

            if args.criterion == 'l2':
                real_labels = torch.ones(prob_real.size()).to(device)
                D_real_loss = criterion(prob_real, real_labels)

            elif args.criterion == 'wgangp':
                D_real_loss = -torch.mean(prob_real)

            # Adversarial Loss using Fake Image #
            fake_series = G(noise)
            fake_series = torch.cat(
                (series[:, :, :args.conditional_dim].float(),
                 fake_series.float()),
                dim=2)

            prob_fake = D(fake_series.detach())

            if args.criterion == 'l2':
                fake_labels = torch.zeros(prob_fake.size()).to(device)
                D_fake_loss = criterion(prob_fake, fake_labels)

            elif args.criterion == 'wgangp':
                D_fake_loss = torch.mean(prob_fake)
                D_gp_loss = args.lambda_gp * get_gradient_penalty(
                    D, series.float(), fake_series.float(), device)

            # Calculate Total Discriminator Loss #
            D_loss = D_fake_loss + D_real_loss

            if args.criterion == 'wgangp':
                # D_gp_loss is already scaled by lambda_gp above
                D_loss += D_gp_loss

            # Back Propagation and Update #
            D_loss.backward()
            D_optim.step()

        ###################
        # Train Generator #
        ###################

        # Adversarial Loss #
        fake_series = G(noise)
        fake_series = torch.cat(
            (series[:, :, :args.conditional_dim].float(), fake_series.float()),
            dim=2)
        prob_fake = D(fake_series)

        # Calculate Total Generator Loss #
        if args.criterion == 'l2':
            real_labels = torch.ones(prob_fake.size()).to(device)
            G_loss = criterion(prob_fake, real_labels)

        elif args.criterion == 'wgangp':
            G_loss = -torch.mean(prob_fake)

        # Back Propagation and Update #
        G_loss.backward()
        G_optim.step()

        # Add items to Lists #
        D_losses.append(D_loss.item())
        G_losses.append(G_loss.item())

        ####################
        # Print Statistics #
        ####################

        print("Epochs [{}/{}] | D Loss {:.4f} | G Loss {:.4f}".format(
            epoch + 1, args.num_epochs, np.average(D_losses),
            np.average(G_losses)))

        # Adjust Learning Rate #
        D_optim_scheduler.step()
        G_optim_scheduler.step()

        # Save Model Weights and Series #
        if (epoch + 1) % args.save_every == 0:
            torch.save(
                G.state_dict(),
                os.path.join(
                    args.weights_path,
                    'TimeSeries_Generator_using{}_Epoch_{}.pkl'.format(
                        args.criterion.upper(), epoch + 1)))

            series, fake_series = generate_fake_samples(
                X, label, G, scaler_1, scaler_2, args, device)
            plot_sample(series, fake_series, epoch, args)
            make_csv(series, fake_series, epoch, args)

    print("Training finished.")
Example 30
def main(args):

    # Device Configuration #
    device = torch.device(
        f'cuda:{args.gpu_num}' if torch.cuda.is_available() else 'cpu')

    # Fix Seed for Reproducibility #
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # Samples, Plots, Weights and CSV Path #
    paths = [
        args.samples_path, args.weights_path, args.csv_path,
        args.inference_path
    ]
    for path in paths:
        make_dirs(path)

    # Prepare Data #
    data = pd.read_csv(args.data_path)[args.column]

    # Prepare Data #
    scaler_1 = StandardScaler()
    scaler_2 = StandardScaler()
    preprocessed_data = pre_processing(data, scaler_1, scaler_2, args.constant,
                                       args.delta)

    train_X, train_Y, test_X, test_Y = prepare_data(data, preprocessed_data,
                                                    args)

    train_X = moving_windows(train_X, args.ts_dim)
    train_Y = moving_windows(train_Y, args.ts_dim)

    test_X = moving_windows(test_X, args.ts_dim)
    test_Y = moving_windows(test_Y, args.ts_dim)

    # Prepare Networks #
    if args.model == 'conv':
        D = ConvDiscriminator(args.ts_dim).to(device)
        G = ConvGenerator(args.latent_dim, args.ts_dim).to(device)

    elif args.model == 'lstm':
        D = LSTMDiscriminator(args.ts_dim).to(device)
        G = LSTMGenerator(args.latent_dim, args.ts_dim).to(device)

    else:
        raise NotImplementedError

    #########
    # Train #
    #########

    if args.mode == 'train':

        # Loss Function #
        if args.criterion == 'l2':
            criterion = nn.MSELoss()

        elif args.criterion == 'wgangp':
            pass

        else:
            raise NotImplementedError

        # Optimizers #
        if args.optim == 'sgd':
            D_optim = torch.optim.SGD(D.parameters(), lr=args.lr, momentum=0.9)
            G_optim = torch.optim.SGD(G.parameters(), lr=args.lr, momentum=0.9)

        elif args.optim == 'adam':
            D_optim = torch.optim.Adam(D.parameters(),
                                       lr=args.lr,
                                       betas=(0., 0.9))
            G_optim = torch.optim.Adam(G.parameters(),
                                       lr=args.lr,
                                       betas=(0., 0.9))

        else:
            raise NotImplementedError

        D_optim_scheduler = get_lr_scheduler(D_optim, args)
        G_optim_scheduler = get_lr_scheduler(G_optim, args)

        # Lists #
        D_losses, G_losses = list(), list()

        # Train #
        print(
            "Training Time Series GAN started with total epoch of {}.".format(
                args.num_epochs))

        for epoch in range(args.num_epochs):

            # Initialize Optimizers #
            G_optim.zero_grad()
            D_optim.zero_grad()

            #######################
            # Train Discriminator #
            #######################

            if args.criterion == 'l2':
                n_critics = 1
            elif args.criterion == 'wgangp':
                n_critics = 5

            for j in range(n_critics):
                series, start_dates = get_samples(train_X, train_Y,
                                                  args.batch_size)

                # Data Preparation #
                series = series.to(device)
                noise = torch.randn(args.batch_size, 1,
                                    args.latent_dim).to(device)

                # Adversarial Loss using Real Image #
                prob_real = D(series.float())

                if args.criterion == 'l2':
                    real_labels = torch.ones(prob_real.size()).to(device)
                    D_real_loss = criterion(prob_real, real_labels)

                elif args.criterion == 'wgangp':
                    D_real_loss = -torch.mean(prob_real)

                # Adversarial Loss using Fake Image #
                fake_series = G(noise)
                prob_fake = D(fake_series.detach())

                if args.criterion == 'l2':
                    fake_labels = torch.zeros(prob_fake.size()).to(device)
                    D_fake_loss = criterion(prob_fake, fake_labels)

                elif args.criterion == 'wgangp':
                    D_fake_loss = torch.mean(prob_fake)
                    D_gp_loss = args.lambda_gp * get_gradient_penalty(
                        D, series.float(), fake_series.float(), device)

                # Calculate Total Discriminator Loss #
                D_loss = D_fake_loss + D_real_loss

                if args.criterion == 'wgangp':
                    # D_gp_loss is already scaled by lambda_gp above
                    D_loss += D_gp_loss

                # Back Propagation and Update #
                D_loss.backward()
                D_optim.step()

            ###################
            # Train Generator #
            ###################

            # Adversarial Loss #
            fake_series = G(noise)
            prob_fake = D(fake_series)

            # Calculate Total Generator Loss #
            if args.criterion == 'l2':
                real_labels = torch.ones(prob_fake.size()).to(device)
                G_loss = criterion(prob_fake, real_labels)

            elif args.criterion == 'wgangp':
                G_loss = -torch.mean(prob_fake)

            # Back Propagation and Update #
            G_loss.backward()
            G_optim.step()

            # Add items to Lists #
            D_losses.append(D_loss.item())
            G_losses.append(G_loss.item())

            # Adjust Learning Rate #
            D_optim_scheduler.step()
            G_optim_scheduler.step()

            # Print Statistics, Save Model Weights and Series #
            if (epoch + 1) % args.log_every == 0:

                # Print Statistics and Save Model #
                print("Epochs [{}/{}] | D Loss {:.4f} | G Loss {:.4f}".format(
                    epoch + 1, args.num_epochs, np.average(D_losses),
                    np.average(G_losses)))
                torch.save(
                    G.state_dict(),
                    os.path.join(
                        args.weights_path,
                        'TS_using{}_and_{}_Epoch_{}.pkl'.format(
                            G.__class__.__name__, args.criterion.upper(),
                            epoch + 1)))

                # Generate Samples and Save Plots and CSVs #
                series, fake_series = generate_fake_samples(
                    test_X, test_Y, G, scaler_1, scaler_2, args, device)
                plot_series(series, fake_series, G, epoch, args,
                            args.samples_path)
                make_csv(series, fake_series, G, epoch, args, args.csv_path)

    ########
    # Test #
    ########

    elif args.mode == 'test':

        # Load Model Weights #
        G.load_state_dict(
            torch.load(
                os.path.join(
                    args.weights_path, 'TS_using{}_and_{}_Epoch_{}.pkl'.format(
                        G.__class__.__name__, args.criterion.upper(),
                        args.num_epochs))))

        # Lists #
        real, fake = list(), list()

        # Inference #
        for idx in range(0, test_X.shape[0], args.ts_dim):

            # Do not plot if the remaining data is less than time dimension #
            end_ix = idx + args.ts_dim

            if end_ix > len(test_X) - 1:
                break

            # Prepare Data #
            test_data = test_X[idx, :]
            test_data = np.expand_dims(test_data, axis=0)
            test_data = np.expand_dims(test_data, axis=1)
            test_data = torch.from_numpy(test_data).to(device)

            start = test_Y[idx, 0]

            noise = torch.randn(args.val_batch_size, 1,
                                args.latent_dim).to(device)

            # Generate Fake Data #
            with torch.no_grad():
                fake_series = G(noise)

            # Convert to Numpy format for Saving #
            test_data = np.squeeze(test_data.cpu().data.numpy())
            fake_series = np.squeeze(fake_series.cpu().data.numpy())

            test_data = post_processing(test_data, start, scaler_1, scaler_2,
                                        args.delta)
            fake_series = post_processing(fake_series, start, scaler_1,
                                          scaler_2, args.delta)

            real += test_data.tolist()
            fake += fake_series.tolist()

        # Plot, Save to CSV file and Derive Metrics #
        plot_series(real, fake, G, args.num_epochs - 1, args,
                    args.inference_path)
        make_csv(real, fake, G, args.num_epochs - 1, args, args.inference_path)
        derive_metrics(real, fake, args)

    else:
        raise NotImplementedError
Example 31
def main():
    # define the command line arguments
    g_help = "teacher + student activation function: 'erf' or 'relu'"
    M_help = "number of teacher hidden nodes"
    K_help = "number of student hidden nodes"
    device_help = "which device to run on: 'cuda' or 'cpu'"
    generator_help = "Generator of the inputs: dcgan_rand, dcgan_cifar10, dcgan_cifar100_grey, nvp_cifar10."
    transform_help = "Transform: identity, scattering, ..."
    steps_help = "training steps as multiples of N"
    seed_help = "random number generator seed."
    parser = argparse.ArgumentParser()
    parser.add_argument("-g", "--g", default="erf", help=g_help)
    parser.add_argument("-M", "--M", type=int, default=2, help=M_help)
    parser.add_argument("-K", "--K", type=int, default=2, help=K_help)
    parser.add_argument("--generator", help=generator_help, default="rand")
    parser.add_argument("--transform", help=transform_help)
    parser.add_argument("--device", "-d", help=device_help)
    parser.add_argument("--lr", type=float, default=0.2, help="learning rate")
    parser.add_argument("--bs", type=int, default=1, help="mini-batch size")
    parser.add_argument("--steps", type=int, default=10000, help=steps_help)
    parser.add_argument("-q", "--quiet", help="be quiet", action="store_true")
    parser.add_argument("-s", "--seed", type=int, default=0, help=seed_help)
    parser.add_argument("--store",
                        action="store_true",
                        help="store initial conditions")
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    if args.device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(args.device)

    (M, K, lr) = (args.M, args.K, args.lr)

    # Find the right generator for the given scenario
    generator = utils.get_generator(args.generator, device)
    # transformation of the inputs
    transformation = utils.get_transformation(args.transform, generator,
                                              device)

    model_desc = generator.name()
    if transformation is not None:
        model_desc += "_" + transformation.name()

    # Define the dimensions of the problem
    D = generator.N_in
    N = generator.N_out if transformation is None else transformation.N_out

    # get the moments of the generator to center its outputs
    try:
        generator_mean_vec = torch.load("moments/%s_mean_x.pt" %
                                        generator.name(),
                                        map_location=device)
        generator_cov = torch.load("moments/%s_omega.pt" % generator.name(),
                                   map_location=device)
    except FileNotFoundError:
        print("Could not find moments of generator %s. Will exit now!" %
              generator.name())
        exit()
    # define the scalar moments of the generator's output distribution
    generator_mean, generator_std = utils.get_scalar_mean_std(
        generator_mean_vec, generator_cov)

    # Now get the moments of the inputs that come out of the transformation
    transformation_mean = None
    transformation_std = None
    # Either load pre-computed Omega and Phi, or generate from the test set
    Omega = None  # the student input - input covariance
    Phi = None  # the generator input - student input covariance
    try:
        mean_x = torch.load(
            "moments/%s_mean_x.pt" % model_desc,
            map_location=device,
        )
        Omega = torch.load(
            "moments/%s_Omega.pt" % model_desc,
            map_location=device,
        )
        Phi = torch.load(
            "moments/%s_phi.pt" % model_desc,
            map_location=device,
        )

        transformation_mean, transformation_std = utils.get_scalar_mean_std(
            mean_x, Omega)
    except FileNotFoundError:
        pass

    # networks and loss
    g = erfscaled if args.g == "erf" else F.relu
    gs = (g, identity)
    student = TwoLayer(gs, N, args.K, 1, normalise1=True, std0=1e-2)
    student.to(device)

    teacher = TwoLayer(gs, D, args.M, 1, normalise1=True, std0=1)
    nn.init.constant_(teacher.fc2.weight, 1)
    teacher.freeze()
    teacher.to(device)
    B = teacher.fc1.weight.data
    A = teacher.fc2.weight.data

    # collect the parameters that are going to be optimised by SGD
    params = []
    params += [{"params": student.fc1.parameters()}]
    # If we train the last layer, ensure its learning rate scales correctly
    params += [{"params": student.fc2.parameters(), "lr": lr / N}]
    optimizer = optim.SGD(params, lr=lr)
    criterion = HalfMSELoss()

    # when to print?
    end = torch.log10(torch.tensor([1.0 * args.steps])).item()
    times_to_print = list(torch.logspace(-1, end, steps=200))

    # generate the test set
    test_cs, test_xs, test_ys = utils.get_samples(
        device,
        NUM_TESTSAMPLES,
        generator,
        generator_mean,
        teacher,
        transformation,
        transformation_mean,
    )

    # If we didn't find pre-computed Omega and Phi (which we need to store the
    # initial conditions), we can compute them from the test set
    if Omega is None:
        Omega = 1 / NUM_TESTSAMPLES * test_xs.T @ test_xs
        Phi = 1 / NUM_TESTSAMPLES * test_xs.T @ test_cs

    nus = B.mm(test_cs.T) / math.sqrt(D)

    # output file + welcome message
    log_fname = "transform_online_%s_D%d_N%d_%s_M%d_K%d_lr%g_i2_s%d.dat" % (
        model_desc,
        D,
        N,
        args.g,
        M,
        K,
        lr,
        args.seed,
    )
    logfile = open(log_fname, "w", buffering=1)
    welcome = "# Two-layer nets on inputs from generator %s" % generator.name()
    if transformation is None:
        welcome += "\n"
    else:
        welcome += " with transformation %s\n" % transformation.name()
    welcome += "# M=%d, K=%d, lr=%g, batch size=%d, seed=%d\n" % (
        M,
        K,
        lr,
        args.bs,
        args.seed,
    )
    welcome += "# Using device:" + str(device)
    log(welcome, logfile)

    print("# Generator, Teacher and Student: ")
    for net in [generator, teacher, student]:
        msg = "# " + str(net).replace("\n", "\n# ")
        log(msg, logfile)

    msg = "# test xs: mean=%g, std=%g; test ys: std=%g" % (
        torch.mean(test_xs),
        torch.std(test_xs),
        torch.std(test_ys),
    )
    log(msg, logfile)

    T = 1.0 / B.shape[1] * B @ B.T
    rotation = Phi.T @ Phi
    tildeT = 1 / N * B @ rotation @ B.T
    if args.store:
        with torch.no_grad():
            # compute the exact densities of r and q
            exq = torch.zeros((K, K, N), device=device)
            exr = torch.zeros((K, M, N), device=device)
            extildet = torch.zeros((M, M, N), device=device)
            sqrtN = math.sqrt(N)
            w = student.fc1.weight.data
            v = student.fc2.weight.data

            rhos, psis = torch.symeig(Omega, eigenvectors=True)
            rhos.to(device)
            psis.to(device)
            #  make sure to normalise and orient the eigenvectors according to the note
            psis = sqrtN * psis.T

            GammaB = 1.0 / sqrtN * B @ Phi.T @ psis.T
            GammaW = 1.0 / sqrtN * w @ psis.T

            for k in range(K):
                for l in range(K):
                    exq[k, l] = GammaW[k, :] * GammaW[l, :]
                for n in range(M):
                    exr[k, n] = GammaW[k, :] * GammaB[n, :]
            for n in range(M):
                for m in range(M):
                    extildet[n, m] = GammaB[n, :] * GammaB[m, :]

            root_name = log_fname[:-4]
            np.savetxt(root_name + "_T.dat", T.cpu().numpy(), delimiter=",")
            np.savetxt(root_name + "_rhos.dat",
                       rhos.cpu().numpy(),
                       delimiter=",")
            np.savetxt(root_name + "_T.dat", T.cpu().numpy(), delimiter=",")
            np.savetxt(root_name + "_A.dat", A[0].cpu().numpy(), delimiter=",")
            np.savetxt(root_name + "_v0.dat",
                       v[0].cpu().numpy(),
                       delimiter=",")

            write_density(root_name + "_q0.dat", exq)
            write_density(root_name + "_r0.dat", exr)
            write_density(root_name + "_tildet.dat", extildet)

    time = 0
    dt = 1 / N

    msg = eval_student(time, student, test_xs, test_ys, nus, T, tildeT, A,
                       criterion)
    log(msg, logfile)
    while len(times_to_print) > 0:
        # get the inputs
        cs, inputs, targets = utils.get_samples(
            device,
            args.bs,
            generator,
            generator_mean,
            teacher,
            transformation,
            transformation_mean,
        )

        for i in range(args.bs):
            student.train()
            preds = student(inputs[i])
            loss = criterion(preds, targets[i])

            # TRAINING
            student.zero_grad()
            loss.backward()
            optimizer.step()

            time += dt

            if time >= times_to_print[0].item() or time == 0:
                msg = eval_student(time, student, test_xs, test_ys, nus, T,
                                   tildeT, A, criterion)
                log(msg, logfile)
                times_to_print.pop(0)

    print("Bye-bye")