Ejemplo n.º 1
0
def initiate_data():
    """Load the data dictionary, pickle it, and pickle each root-level subtree.

    The dictionary handed to ``load_data`` is written to ``data_dict.obj``.
    A tree is then built from it with ``build_tree`` and every entry of the
    root's ``children`` mapping is pickled to
    ``subtrees/sub_tree_<key>.obj``.  ``open`` does not create directories,
    so ``subtrees/`` has to exist beforehand.
    """
    records = {}
    # The return value is ignored; presumably ``load_data`` fills the dict
    # in place -- confirm against its definition.
    load_data(records)

    with open('data_dict.obj', 'wb') as dict_file:
        pickle.dump(records, dict_file)

    root = build_tree(records)

    for key, subtree in root.children.items():
        subtree_path = 'subtrees/sub_tree_' + key + '.obj'
        with open(subtree_path, 'wb') as subtree_file:
            pickle.dump(subtree, subtree_file)
 def __init__(self):
     """Load ``Data/MonthlyDurham.csv`` into ``self.weather_data``.

     The path is resolved relative to the directory of the launched script
     (``sys.argv[0]``), not the current working directory.
     """
     script_dir = os.path.abspath(os.path.dirname(sys.argv[0]))
     csv_path = script_dir + "/Data/MonthlyDurham.csv"
     raw_rows = data_parser.load_data(csv_path)
     self.weather_data = np.array(raw_rows)
Ejemplo n.º 3
0
def main():
    """Rank features with a chi-squared test and grid-search k-NN classifiers.

    The function loads the per-class datasets, prints class statistics,
    scores every feature against the class labels with ``chi2``, reorders
    the feature columns by descending score, and then evaluates a
    ``KNeighborsClassifier`` for every combination of feature count,
    ``n_neighbors`` and ``metric`` using repeated stratified 2-fold
    cross-validation (5 repeats).  The per-configuration results are
    printed, compared and plotted by the helper functions called at the end.
    """
    datasets = load_all_files()
    features_names = load_features_names()
    diseases_names = load_diseases_names()

    data = load_data(datasets)
    data.print_classes_strength()
    data.print_combined_strength()

    show_distribution(datasets, features_names)

    # Stack the per-class datasets and label each row with the name of the
    # dataset (class) it came from.
    features = np.concatenate([*datasets])
    classes = np.concatenate([
        np.full(dataset.shape[0], diseases_names[index])
        for (index, dataset) in enumerate(datasets)
    ])

    scores, p_values = chi2(features, classes)

    features_with_values = pd.concat([
        pd.DataFrame(features_names, columns=['Features']),
        pd.DataFrame(scores, columns=['Scores']),
        pd.DataFrame(p_values, columns=['P_values'])
    ],
                                     axis=1)

    # Sort once and reuse the ranking for both the printout and the column
    # reordering.
    ranked_features = features_with_values.sort_values('Scores',
                                                       ascending=False)
    print(ranked_features.round(3))
    ordered_features = features.copy()
    for idx, feature_idx in enumerate(ranked_features.index):
        ordered_features[:,
                         idx:idx + 1] = features[:,
                                                 feature_idx:feature_idx + 1]

    print('==============================================================')

    # Keep only the features whose p-value does not exceed alpha.  A boolean
    # mask is used instead of dropping rows from the frame while iterating
    # over it with iterrows(), which pandas does not guarantee to work.
    # Rows with a NaN p-value are kept, exactly as the `>` test kept them.
    alpha = 0.05
    not_significant = features_with_values['P_values'] > alpha
    features_with_values = features_with_values[~not_significant]

    print(features_with_values.sort_values('Scores', ascending=False).round(3))

    rskf = RepeatedStratifiedKFold(n_repeats=5, n_splits=2, random_state=1)

    n_neighbors_variants = [1, 5, 10]
    metric_variants = ['manhattan', 'euclidean']

    df_columns = [
        'n_features', 'n_neighbors', 'metric', 'scores', 'mean_accuracy',
        'mean_confusion_matrix'
    ]
    results_df = pd.DataFrame(columns=df_columns)
    # Exclusive upper bound of the feature-count loop below (or set to 8).
    number_of_features = ordered_features.shape[1] + 1

    # Size the confusion-matrix accumulator from the labels actually present
    # and pass them explicitly, so every fold yields a matrix of the same
    # shape even if a class is missing from a fold or the number of classes
    # is not 5.
    class_labels = np.unique(classes)
    n_classes = len(class_labels)

    print('Training models. Please wait...')
    for n_features in range(1, number_of_features):
        # View of the n_features best-ranked columns; it is the same for
        # every classifier configuration tried below.
        selected = ordered_features[:, 0:n_features]
        for n_neighbors in n_neighbors_variants:
            for metric in metric_variants:
                knn = KNeighborsClassifier(n_neighbors=n_neighbors,
                                           metric=metric)
                current_iteration_scores = []
                confusion_sum = np.zeros(shape=(n_classes, n_classes))

                for train, test in rskf.split(selected, classes):
                    knn.fit(selected[train], classes[train])
                    current_iteration_scores.append(
                        knn.score(selected[test], classes[test]))

                    y_pred = knn.predict(selected[test])
                    confusion_sum += confusion_matrix(classes[test],
                                                      y_pred=y_pred,
                                                      labels=class_labels)

                # One score was appended per fold, so the number of scores
                # is the number of folds the matrices were summed over.
                results_df.loc[len(results_df)] = [
                    n_features, n_neighbors, metric, current_iteration_scores,
                    np.array(current_iteration_scores).mean().round(3),
                    confusion_sum / len(current_iteration_scores)
                ]

    # Ascending sort: the printed rank counts down to [0] on the last
    # (highest mean accuracy) row.
    results_df = results_df.sort_values('mean_accuracy')
    print('Best mean models scoreboard:')
    for position, (_, row) in enumerate(results_df.iterrows(), start=1):
        print(
            f'[{len(results_df) - position}] Mean score for n_neighbors={row["n_neighbors"]}, metric={row["metric"]}, '
            f'n_features={row["n_features"]}: {row["mean_accuracy"]}')

    # compare_every_model_paired(results_df)

    # Compare two best models (indexed from 0 - best model)
    print('Compare two best models:')

    compare_two_models(0, 1, results_df)

    print('Best statistically significant model:')
    find_best_statistically_significant_model(results_df)

    best_model_params = results_df.sort_values('mean_accuracy',
                                               ascending=False).iloc[0]
    print(f'\nBest score: {best_model_params["mean_accuracy"]}')
    print(
        f'Best parameters: metric - {best_model_params["metric"]}, n_neighbors - {best_model_params["n_neighbors"]}, '
        f'number of features - {best_model_params["n_features"]}')

    show_best_score_confusion_matrix(
        best_model_params['mean_confusion_matrix'], diseases_names)

    show_summarising_plots(number_of_features=number_of_features,
                           results_df=results_df,
                           metric_variants=metric_variants,
                           n_neighbors_variants=n_neighbors_variants)
Ejemplo n.º 4
0
file_name += '_latent_size_' + str(hyper_params['latent_size'])

# Path to store the log file and the model file
log_file_root = "saved_logs/"
model_file_root = "saved_models/"
# makedirs(exist_ok=True) creates the directory only when it is missing,
# without the check-then-create race of an isdir() test followed by mkdir().
os.makedirs(log_file_root, exist_ok=True)
os.makedirs(model_file_root, exist_ok=True)
hyper_params['log_file'] = log_file_root + hyper_params[
    'project_name'] + '_log' + file_name + '.txt'
hyper_params['model_file_name'] = model_file_root + hyper_params[
    'project_name'] + '_model' + file_name + '.pt'

# Load the processed data and get the reader classes for training, test, and validation sets
train_reader, val_reader, test_reader, total_items = load_data(hyper_params)
hyper_params['total_items'] = total_items
hyper_params['testing_batch_limit'] = test_reader.num_b

# Record the run header and the dataset sizes in the log file.
log_file = hyper_params['log_file']
file_write(log_file,
           "\n\nSimulation run on: " + str(dt.datetime.now()) + "\n\n")
file_write(log_file, "Data reading complete!")
file_write(log_file,
           "Number of train batches: {:4d}".format(train_reader.num_b))
file_write(log_file,
           "Number of validation batches: {:4d}".format(val_reader.num_b))
file_write(log_file,
           "Number of test batches: {:4d}".format(test_reader.num_b))
file_write(log_file, "Total Items: " + str(total_items) + "\n")

# Instantiate the model
Ejemplo n.º 5
0
def main(args):
    """Train, evaluate and/or run prediction with a model, as selected by ``args``.

    Flow, as implemented below:

    * Load the data and build one ``DataLoader`` per partition (only the
      ``'train'`` partition is batched with ``args.batch_size`` and shuffled).
    * If ``args.eval_model`` is ``None``, train one model per seed in
      ``range(args.seed, args.seed + args.n_seeds)`` and keep the model file
      of the seed with the highest validation score; otherwise load and
      evaluate the model file given in ``args.eval_model``.
    * If ``args.predict`` is set, write test-partition predictions to
      ``args.paths['predict']``.
    * If ``args.save`` is set, reload the data with
      ``apply_segmentation=False`` and write predictions for every partition
      to ``args.paths['save']``.
    * If neither ``args.save`` nor ``args.eval_model`` is set, delete the
      selected model file.

    Side effects on ``args``: ``args.d_in`` and ``args.n_targets`` are set.
    """
    # ensure reproducibility
    numpy.random.seed(10)
    random.seed(10)

    print('Loading data ...')
    data = load_data(args.task, args.paths, args.feature_set, args.emo_dim, args.normalize, args.norm_opts,
                     args.win_len, args.hop_len, save=args.cache)
    data_loader = {}
    for partition in data.keys():  # one DataLoader for each partition
        # Named ``dataset`` rather than ``set`` so the builtin is not shadowed.
        dataset = MuSeDataset(data, partition)
        is_train = partition == 'train'
        batch_size = args.batch_size if is_train else 1
        # shuffle only for train partition
        data_loader[partition] = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=is_train,
                                                             num_workers=4, worker_init_fn=seed_worker)

    args.d_in = data_loader['train'].dataset.get_feature_dim()
    if args.task == 'sent':
        args.n_targets = max(x[0, 0] for x in data['train']['label']) + 1  # number of classes
        criterion = CrossEntropyLoss()
        score_str = 'Macro-F1'
    else:
        args.n_targets = 1
        criterion = CCCLoss()
        score_str = 'CCC'

    if args.eval_model is None:  # Train and validate for each seed
        seeds = range(args.seed, args.seed + args.n_seeds)
        val_scores, best_model_files, test_scores = [], [], []

        for seed in seeds:
            torch.manual_seed(seed)

            model = Model(args)

            print('=' * 50)
            print('Training model... [seed {}]'.format(seed))

            # The validation loss is returned but not used below.
            _, val_score, best_model_file = train_model(args.task, model, data_loader, args.epochs,
                                                        args.lr, args.paths['model'], seed, args.use_gpu,
                                                        criterion, regularization=args.regularization)
            if not args.predict:  # run evaluation only if test labels are available
                _, test_score = evaluate(args.task, model, data_loader['test'], criterion, args.use_gpu)
                test_scores.append(test_score)
                if args.task in ['physio', 'stress', 'wilder']:
                    print(f'[Test CCC]:  {test_score:7.4f}')
            val_scores.append(val_score)
            best_model_files.append(best_model_file)

        best_idx = val_scores.index(max(val_scores))  # find best performing seed

        print('=' * 50)
        print(f'Best {score_str} on [Val] for seed {seeds[best_idx]}: '
              f'[Val {score_str}]: {val_scores[best_idx]:7.4f}'
              f"{f' | [Test {score_str}]: {test_scores[best_idx]:7.4f}' if not args.predict else ''}")
        print('=' * 50)

        model_file = best_model_files[best_idx]  # best model of all of the seeds

    else:  # Evaluate existing model (No training)
        model_file = args.eval_model
        model = torch.load(model_file)
        _, valid_score = evaluate(args.task, model, data_loader['devel'], criterion, args.use_gpu)
        print(f'Evaluating {model_file}:')
        print(f'[Val {score_str}]: {valid_score:7.4f}')
        if not args.predict:
            _, test_score = evaluate(args.task, model, data_loader['test'], criterion, args.use_gpu)
            print(f'[Test {score_str}]: {test_score:7.4f}')

    if args.predict:  # Make predictions for the test partition; this option is set if there are no test labels
        print('Predicting test samples...')
        best_model = torch.load(model_file)
        evaluate(args.task, best_model, data_loader['test'], criterion, args.use_gpu, predict=True,
                 prediction_path=args.paths['predict'])

    if args.save:  # Save predictions for all partitions (needed to subsequently do late fusion)
        print('Save all predictions...')
        # The seed is recovered from the model file name: the text between
        # the last '_' and the following '.' must be an integer.
        seed = int(model_file.split('_')[-1].split('.')[0])
        torch.manual_seed(seed)
        best_model = torch.load(model_file)
        # Load data again without any segmentation
        data = load_data(args.task, args.paths, args.feature_set, args.emo_dim, args.normalize, args.norm_opts,
                         args.win_len, args.hop_len, save=args.cache, apply_segmentation=False)
        for partition in data.keys():
            dl = torch.utils.data.DataLoader(MuSeDataset(data, partition), batch_size=1, shuffle=False,
                                             worker_init_fn=seed_worker)
            evaluate(args.task, best_model, dl, criterion, args.use_gpu, predict=True,
                     prediction_path=args.paths['save'])

    # Delete model if save option is not set.
    if not args.save and not args.eval_model:
        if os.path.exists(model_file):
            os.remove(model_file)

    print('Done.')
Ejemplo n.º 6
0
def main(args):
    """Train, evaluate and/or run prediction with ``MuseModelWithSelfAttention``.

    Flow, as implemented below:

    * Load the data with position encoding enabled and build one
      ``DataLoader`` per partition (only ``'train'`` is batched with
      ``args.batch_size`` and shuffled).
    * If ``args.eval_model`` is ``None``, train one model per seed in
      ``range(args.seed, args.seed + args.n_seeds)`` and keep the model file
      of the seed with the highest validation score; otherwise load and
      evaluate the model file given in ``args.eval_model``.
    * If ``args.predict`` is set, write test-partition predictions to
      ``args.paths['predict']``.
    * If ``args.save`` is set, reload the data with
      ``apply_segmentation=False`` and write predictions for every partition
      to ``args.paths['save']``.
    * If neither ``args.save`` nor ``args.eval_model`` is set, delete the
      selected model file.

    Side effects on ``args``: ``n_targets`` is set, and inside the training
    loop the model hyper-parameters (``d_in``, ``fea_dr``, ``tcn_*``,
    ``num_dilations``, ``d_rnn_in``, ``d_rnn``, ``rnn_dr``, ``attn_layer``,
    ``n_heads``, ``attn_dr``) are set from the per-feature tables below,
    keyed by ``args.feature_set[0]``.
    """
    print("Save prediction:"+ str(args.save))
    # ensure reproducibility
    numpy.random.seed(10)
    random.seed(10)
    print('Loading data ...')
    encoding_position = True
    data = load_data(args.task, args.paths, args.feature_set, args.emo_dim, args.normalize, args.norm_opts,
                     args.win_len, args.hop_len, save=args.cache, encoding_position=encoding_position)
    data_loader = {}
    for partition in data.keys():  # one DataLoader for each partition
        # Named ``dataset`` rather than ``set`` so the builtin is not shadowed.
        dataset = MuSeDataset(data, partition)
        is_train = partition == 'train'
        batch_size = args.batch_size if is_train else 1
        # shuffle only for train partition
        data_loader[partition] = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=is_train,
                                                             num_workers=4, worker_init_fn=seed_worker)

    args.n_targets = 1
    criterion = CCCLoss()
    score_str = 'CCC'

    # Per-feature-set hyper-parameter tables, indexed by args.feature_set[0].
    d_in = {'vggface': 512, "egemaps": 88, "bert-4": 768, 'fau_intensity': 17, 'deepspectrum': 4096, 'vggish': 128}
    d_rnn_in = {'vggface': 256, "egemaps": 128, "bert-4": 512, 'fau_intensity': 16, 'deepspectrum': 512,
                'vggish': 128}
    tcn_channels = {'vggface': (256,), 'egemaps': (128,), 'bert-4': (512,), 'fau_intensity': (16,),
                    'deepspectrum': (512,), 'vggish': (128,)}
    d_rnn = {'vggface': 128, 'egemaps': 64, 'bert-4': 128, 'fau_intensity': 16, 'deepspectrum': 512, 'vggish': 128}

    if args.eval_model is None:  # Train and validate for each seed
        seeds = range(args.seed, args.seed + args.n_seeds)
        val_scores, best_model_files, test_scores = [], [], []
        for seed in seeds:
            torch.manual_seed(seed)

            feature = args.feature_set[0]

            # Input width: the table value plus one extra column when
            # position encoding is enabled.
            args.d_in = d_in[feature] + (1 if encoding_position else 0)

            args.fea_dr = 0.

            # TCN parameters
            args.tcn_layer = 1
            args.tcn_channels = tcn_channels[feature]
            args.num_dilations = 4
            args.tcn_kernel_size = 3
            args.tcn_dr = 0.1
            args.tcn_norm = True

            # RNN parameters
            args.d_rnn_in = d_rnn_in[feature]
            args.d_rnn = d_rnn[feature]
            args.rnn_dr = 0.2

            # Attention parameters
            args.attn_layer = 1
            args.n_heads = 1
            args.attn_dr = 0.2

            model = MuseModelWithSelfAttention(args, num_outputs=1, num_last_regress=64)
            print('=' * 50)
            print('Training model... [seed {}]'.format(seed))

            # The validation loss is returned but not used below.
            _, val_score, best_model_file = train_model(args.task, model, data_loader, args.epochs,
                                                        args.lr, args.paths['model'], seed, args.use_gpu,
                                                        criterion, regularization=args.regularization)
            if not args.predict:  # run evaluation only if test labels are available
                _, test_score = evaluate(args.task, model, data_loader['test'], criterion, args.use_gpu)
                test_scores.append(test_score)
                if args.task in ['physio', 'stress', 'wilder']:
                    print(f'[Test CCC]:  {test_score:7.4f}')
            val_scores.append(val_score)
            best_model_files.append(best_model_file)

        best_idx = val_scores.index(max(val_scores))  # find best performing seed
        print('=' * 50)
        print(f'Best {score_str} on [Val] for seed {seeds[best_idx]}: '
              f'[Val {score_str}]: {val_scores[best_idx]:7.4f}'
              f"{f' | [Test {score_str}]: {test_scores[best_idx]:7.4f}' if not args.predict else ''}")
        print('=' * 50)

        model_file = best_model_files[best_idx]  # best model of all of the seeds

    else:  # Evaluate existing model (No training)
        model_file = args.eval_model
        model = torch.load(model_file)
        _, valid_score = evaluate(args.task, model, data_loader['devel'], criterion, args.use_gpu)
        print(f'Evaluating {model_file}:')
        print(f'[Val {score_str}]: {valid_score:7.4f}')
        if not args.predict:
            _, test_score = evaluate(args.task, model, data_loader['test'], criterion, args.use_gpu)
            print(f'[Test {score_str}]: {test_score:7.4f}')

    if args.predict:  # Make predictions for the test partition; this option is set if there are no test labels
        print('Predicting test samples...')
        best_model = torch.load(model_file)
        evaluate(args.task, best_model, data_loader['test'], criterion, args.use_gpu, predict=True,
                 prediction_path=args.paths['predict'])
    if args.save:  # Save predictions for all partitions (needed to subsequently do late fusion)
        print('Save all predictions...')
        # The seed is the third-from-last '_'-separated field of the model
        # file name and must be an integer.
        seed = int(model_file.split('_')[-3])
        torch.manual_seed(seed)
        best_model = torch.load(model_file)
        # Load data again without any segmentation.  The position-encoding
        # flag is passed again so the reloaded features are built the same
        # way as the ones the model was sized for (args.d_in above).
        data = load_data(args.task, args.paths, args.feature_set, args.emo_dim, args.normalize, args.norm_opts,
                         args.win_len, args.hop_len, save=args.cache, apply_segmentation=False,
                         encoding_position=encoding_position)
        for partition in data.keys():
            dl = torch.utils.data.DataLoader(MuSeDataset(data, partition), batch_size=1, shuffle=False,
                                             worker_init_fn=seed_worker)
            evaluate(args.task, best_model, dl, criterion, args.use_gpu, predict=True,
                     prediction_path=args.paths['save'])

    # Delete model if save option is not set.
    if not args.save and not args.eval_model:
        if os.path.exists(model_file):
            os.remove(model_file)

    print('Done.')
Ejemplo n.º 7
0
        help='If data set already pre-processed by Prober lab Perl script.')
    args = parser.parse_args()

    # Specify output
    bokeh.io.output_file(args.html_file, title='fish sleep explorer')

    # Parse data Frames
    if args.tidy:
        df = pd.read_csv(args.activity_file)
    elif args.perl_processed:
        df_gt = data_parser.load_gtype(args.gtype_file)
        df = data_parser.load_perl_processed_activity(args.activity_file,
                                                      df_gt)
    else:
        df = data_parser.load_data(args.activity_file, args.gtype_file,
                                   args.lights_on, args.lights_off,
                                   int(args.day_in_the_life))

    # Resample the data
    df_resampled = data_parser.resample(df, int(args.ind_win))

    # Get approximate time interval of averages
    inds = df_resampled.fish == df_resampled.fish.unique()[0]
    zeit = np.sort(df_resampled.loc[inds, 'zeit'].values)
    dt = np.mean(np.diff(zeit)) * 60

    # Make y-axis label
    y_axis_label = 'sec. of act. in {0:.1f} min.'.format(dt)

    # Get summary statistic
    if args.summary_trace in ['none', 'None']:
Ejemplo n.º 8
0
#Run Script
#Myles Scholz

import data_parser as dp
import Network as nw

# Generate the data set, then load its three splits.
dp.make_data()
# Presumably (training, test, validation), judging by the original
# tr_d / te_d / va_d names -- confirm against data_parser.load_data.
training_set, test_set, validation_set = dp.load_data()
print(len(training_set))

# Network built from a list of four sizes: 28, 25, 25, 28.
layer_sizes = [28, 25, 25, 28]
net = nw.Network(layer_sizes)
# NOTE(review): 30, 10 and 1.0 look like epochs, mini-batch size and
# learning rate -- confirm against Network.SGD's signature.
net.SGD(training_set, 30, 10, 1.0, test_data=test_set)
Ejemplo n.º 9
0
    def __init__(self, scheduleType="Fixed", wake_up=7.0, sleep=11.0, typical=1, num_outings=4, variation_weekday=1, variation_weekend=1, abrupt_change_per_year=2, test_id=0, progression_rate = 0):
        """Store the schedule settings and draw base outing times from the data files.

        The constructor arguments are stored unchanged as attributes of the
        same name.  Two CSV files are read from the ``Data`` directory next
        to the launched script (``sys.argv[0]``):
        ``not_at_home_activities.csv`` (row 0 plus rows 50001-75001 are kept)
        and ``clusters.csv`` (appended column-wise).  Rows whose column 7 is
        not ``'0'`` form ``self.clustered_data``.

        For weekdays and then for weekends, ``num_outings`` rows with
        pairwise different column-7 labels are drawn at random; column 4 of
        each drawn row is stored as a start time and column 3 as a duration
        (both converted to ``float``).

        Raises:
            ValueError: if ``num_outings`` is larger than the number of
                distinct column-7 labels that can be drawn.  Without this
                check the rejection loop below would never terminate.
        """
        self.scheduleType = scheduleType
        self.wake_up = wake_up
        self.sleep = sleep
        self.typical = typical
        self.variation_weekday = variation_weekday
        self.variation_weekend = variation_weekend
        self.abrupt_change_per_year = abrupt_change_per_year
        self.progression_rate = progression_rate
        # Dispatch table from schedule type name to the bound method.
        self.functions = {
            'Test': self.getTimesTest,
            'Fixed': self.getTimesFixed,
            'Progressing': self.getTimesProgressing,
        }

        # Both CSV files live in the same directory; resolve it once.
        data_dir = os.path.abspath(os.path.dirname(sys.argv[0])) + "/Data/"

        self.activities_data = np.array(data_parser.load_data(data_dir + "not_at_home_activities.csv"))
        # Keep row 0 and rows 50001..75001 only.
        idx = np.r_[0:1, 50001:75002]
        self.activities_data = self.activities_data[idx]

        self.activities_data = np.append(self.activities_data, np.array(data_parser.load_data(data_dir + "clusters.csv")), axis=1)

        # Rows whose column 7 is anything other than the string '0'.
        self.clustered_data = [row for row in self.activities_data if row[7] != '0']

        self.test_id = test_id
        self.rand_id = np.random.randint(1, max(self.activities_data.shape))
        self.num_outings = num_outings

        # Labels that the sampling below can actually pick: index 0 is never
        # drawn (randint starts at 1) and anything equal to the initial
        # sentinel 0 in past_clust_id is always rejected.
        drawable_labels = {row[7] for row in self.clustered_data[1:]}
        drawable_labels.discard(0)
        if num_outings > len(drawable_labels):
            raise ValueError(
                "num_outings=%d exceeds the %d distinct clusters available"
                % (num_outings, len(drawable_labels)))

        def draw_outings():
            """Draw num_outings rows with distinct column-7 labels.

            Returns the (start_times, durations) lists.  Updates
            self.clust_id and self.past_clust_id as it goes, in the same
            order of random draws as two separate copies of this loop.
            """
            start_times = []
            durations = []
            self.clust_id = np.random.randint(1, len(self.clustered_data))
            self.past_clust_id = [0]
            for _ in range(num_outings):
                # Redraw until the row's label has not been used yet.
                while self.clustered_data[self.clust_id][7] in self.past_clust_id:
                    self.clust_id = np.random.randint(1, len(self.clustered_data))
                chosen = self.clustered_data[self.clust_id]
                self.past_clust_id.append(chosen[7])

                start_times.append(float(chosen[4]))
                durations.append(float(chosen[3]))
            return start_times, durations

        # Weekday draw first, then weekend, each starting from a fresh
        # random row and a fresh list of used labels.
        self.base_start_times, self.base_durations = draw_outings()
        self.base_weekend_start_times, self.base_weekend_durations = draw_outings()