Example #1
    def invoke_data_summary(self):
        # Instantiate classes

        # Load Data from pickle files
        ld = LoadData(self.my_variables.training_file,
                      self.my_variables.testing_file,
                      self.my_variables.validation_file)
        train_test_valid_data = ld.get_data()

        #########################################################################################################
        self.x_train, self.y_train = train_test_valid_data[0], train_test_valid_data[1]
        self.x_test, self.y_test = train_test_valid_data[2], train_test_valid_data[3]
        self.x_valid, self.y_valid = train_test_valid_data[4], train_test_valid_data[5]

        #########################################################################################################
        # Basic Summary of dataset
        self.bs.summary_report(self.x_train, self.y_train, self.x_test,
                               self.y_test, self.x_valid, self.y_valid)

        #########################################################################################################
        # Exploratory visualization for train data
        self.vz.bar_chart(self.y_train, "train_data")
        # Exploratory visualization for test data
        self.vz.bar_chart(self.y_test, "test_data")
        # Exploratory visualization for validation data
        self.vz.bar_chart(self.y_valid, "validation_data")

        #########################################################################################################
        self.vz.read_sign_names_from_csv(self.my_variables)
        self.vz.display_random_images(self.x_train, self.y_train,
                                      self.my_variables, "train")
Example #2
def show_no_temperature_difference():
    """
    Plot the estimate of the phase at different temperatures.
    """
    l_var = []
    l_var_var = []
    l_temp = [34,37,40]
    for temperature in l_temp:
        path =  "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass=LoadData(path, 10000000, temperature = temperature,
                            division = False)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak, \
        ll_idx_cell_cycle_start, T_theta, T_phi) = \
                                            dataClass.load(load_annotation=True)
        var,var_var = compute_phase_variance_with_confidence(ll_peak)
        l_var.append(var)
        l_var_var.append(var_var)

    plt.errorbar(l_temp, l_var, yerr = l_var_var, fmt='o')
    plt.xlim([33,41])
    plt.xlabel("Temperature")
    plt.ylabel("Phase diffusion variance mean and deviation")
    plt.savefig('Results/RawData/var_diffusion.pdf')
    plt.show()
    plt.close()
Example #3
    def train_n_classes(self):
        l = LoadData()
        stopWords = l.loadStopWords()
        train_sizes = [100, 200, 300]  # size per class
        for train_size in train_sizes:
            print('Training size:',
                  math.floor(train_size * 0.75) * 5, 'Test size:',
                  math.ceil(train_size * 0.25) * 5)
            self.loadData(train_size)
            vect = TfidfVectorizer(stop_words=stopWords)
            self.train_and_test_split(0.75)
            classes = {}
            x = 0
            for i in self.data['class_name']:
                if i not in classes:
                    classes[i] = x
                    x += 1
            X_train = vect.fit_transform(self.train_data['data'])
            Y_train = [classes[i] for i in self.train_data['class_name']]
            X_test = vect.transform(self.test_data['data'])
            Y_test = [classes[i] for i in self.test_data['class_name']]
            nb = MultinomialNB()
            Y_pred = nb.fit(X_train, Y_train).predict(X_test)
            self.metric(Y_test, Y_pred)
            print('---------------------------------------------------')
Example #4
def main():
    """
    This is the main program of the project. It calls all functions to get the result and shows it to the user.
    """
    try:
        yelp = LoadData()
        user = UserChoice()
        choice = user.get_user_input()
        plots = PlotVisualization(yelp.get_data())
        h = Html()
        # Output result to html
        if choice == 'quit':
            print("Quitting...")
        elif choice == "overview":
            plots.plot_overview()
            print("Overview only.")
            h.output_to_file(False)
        else:
            plots.plot_search_results(choice)
            print('Your choice of restaurants received.')
            h.output_to_file(True)

    except ValueError:
        print("Found value error.")
        sys.exit()
    except KeyboardInterrupt:
        print("Interrupted!")
        sys.exit()
    except MemoryError:
        print("Memory Error")
        sys.exit()
Example #5
def estimate_cycle_dev(cell, temperature):
    """
    Final estimate for the cell-cycle phase deviation.

    Parameters
    ----------
    cell : string
        Cell condition.
    temperature : int
        Temperature condition.

    Returns
    -------
    The standard deviation for the phase progression, and the periods.
    """
    ##################### LOAD DATA ##################
    if cell == 'NIH3T3':
        path =  "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
    elif cell == 'U2OS':
        path = "Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
    dataClass=LoadData(path, 10000000, temperature = temperature,
                        division = True)
    (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak, \
    ll_idx_cell_cycle_start, T_theta, T_phi) = \
                                            dataClass.load(load_annotation=True)
    std, std_T = estimate_phase_dev_from_div(ll_idx_cell_cycle_start)

    return std, std_T
Example #6
    def predict(self):
        """
        Returns the forecasts of the run function
        """
        #import pdb; pdb.set_trace()
        #classifier = self.optimization()[1]
        self.optimization()
        classifier = pickle.load(open('best_model3.pkl', 'rb'))
        predict_model = theano.function(
            inputs=[classifier.input],
            outputs=classifier.LinearRegression.y_pred)

        # We can test it on some examples from the test set
        data = LoadData(self.link)
        datasets = data.load_data()
        #import pdb; pdb.set_trace()
        x_test, y_test = datasets[2]


        predicted_values = predict_model(x_test.get_value())
        fig = figure()
        _ = plt.scatter(x_test.get_value(), predicted_values, c = 'red', label='Predicted Values')
        _ = plt.scatter(x_test.get_value(), y_test.get_value(), facecolors='none',
                    edgecolors='r', label='Sample Points')
        _ = plt.legend()
        #plt.show()
        return fig
Example #7
    def __init__(self, src, dst, city, dimension, adjacency_list, heuristic_matrices):
        self.__src = src
        self.__dst = dst
        self.__city = city

        self.__open_list = []
        self.__closed_list = []
        self.__dimension = self.__dimension_index(dimension)

        self.__org_node = LoadData.load_org_node(city)
        self.__org_path = LoadData.load_org_path(city, 6)
        self.__link_table = adjacency_list
        self.__heuristic = heuristic_matrices
        # self.__heuristic = self.__calculate_heuristic()

        self.__dimension_cost, self.__heuristic_cost, self.__total_cost = ({} for _ in range(3))

        self.__dimension_cost = {node: float("inf") for node in self.__org_node.keys()}
        self.__dimension_cost[self.__src] = 0
        self.__heuristic_cost[self.__src] = self.__heuristic.loc[self.__src, self.__dst]
        self.__total_cost[self.__src] = \
            self.__dimension_cost[self.__src] + self.__heuristic_cost[self.__src]

        self.__previous_node = {self.__src: None}

        heapq.heappush(self.__open_list, (self.__total_cost[self.__src], self.__src))
Example #8
    def predict(self):
        """
        Returns the forecasts of the run function
        """
        #import pdb; pdb.set_trace()
        #classifier = self.optimization()[1]
        self.optimization()
        classifier = pickle.load(open('best_model3.pkl', 'rb'))
        predict_model = theano.function(
            inputs=[classifier.input],
            outputs=classifier.LinearRegression.y_pred)

        # We can test it on some examples from the test set
        data = LoadData(self.link)
        datasets = data.load_data()
        #import pdb; pdb.set_trace()
        x_test, y_test = datasets[2]

        predicted_values = predict_model(x_test.get_value())
        fig = figure()
        _ = plt.scatter(x_test.get_value(),
                        predicted_values,
                        c='red',
                        label='Predicted Values')
        _ = plt.scatter(x_test.get_value(),
                        y_test.get_value(),
                        facecolors='none',
                        edgecolors='r',
                        label='Sample Points')
        _ = plt.legend()
        #plt.show()
        return fig
Example #9
def compute_likelihood_sigma():
    """
    Compute and plot the likelihood of the phase diffusion parameter,
    depending on the temperature.
    """
    l_T = [34,37,40]
    l_likelihood_T = []
    mean_IG = 24
    domain_sigma = np.linspace(0.05, 0.3, 100)

    for T in l_T:

        path =  "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass=LoadData(path, 10000000, temperature = T, division = False)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
        ll_idx_cell_cycle_start, T_theta, T_phi) = \
                                            dataClass.load(load_annotation=True)

        l_T_clock = []
        for l_peak in ll_peak:
            l_idx_peak = [idx for idx, i in enumerate(l_peak) if i == 1]
            for t_peak_1, t_peak_2 in zip(l_idx_peak[:-1], l_idx_peak[1:]):
                # use T_clock (not T) so the temperature loop variable is not shadowed
                T_clock = (t_peak_2 - t_peak_1) / 2
                if T_clock < 12 or T_clock > 38:
                    # outlier: double or missing annotation
                    pass
                else:
                    l_T_clock.append(T_clock)

        l_likelihood = []
        for sigma_theta in domain_sigma:
            lpx = np.log(1 / sigma_theta)
            for T_clock in l_T_clock:
                lpx = lpx + np.log(invgauss(T_clock, mean_IG,
                                            4 * np.pi**2 / sigma_theta**2))
            l_likelihood.append(lpx / len(l_T_clock))
        l_likelihood_T.append(l_likelihood)


    plt.plot(domain_sigma,l_likelihood_T[0], c = 'red', label = '34' )
    plt.plot(domain_sigma,l_likelihood_T[1], c = 'blue',label = '37' )
    plt.plot(domain_sigma,l_likelihood_T[2], c = 'orange',label = '40' )
    plt.axvline(domain_sigma[np.argmax(l_likelihood_T[0])], c= 'red')
    plt.axvline(domain_sigma[np.argmax(l_likelihood_T[1])], c= 'blue')
    plt.axvline(domain_sigma[np.argmax(l_likelihood_T[2])], c= 'orange')


    plt.ylabel(r'$log(L(\sigma_\theta))$')
    plt.xlabel(r'$\sigma_\theta$' )
    plt.legend()
    plt.savefig('Results/RawData/likelihood_sigma_theta.pdf')
    plt.show()
    plt.close()
Example #10
    def __init__(self, tickers, start='2014-01-01', end='2018-01-01', interval='1d', n_series=20, T_pred=10, n_cols=30, n_rows=30, T_space=10, train=True):

        self.folder = './' + ''.join(tickers) + '_start' + start + '_end' + end + '_int' + interval + \
                      '/case' + str(n_series) + '_' + str(T_pred) + '_' + str(n_cols) + '_' + str(n_rows) + '_' + str(T_space)

        try:
            self.original = np.load(self.folder + '/original.npy')
            if train:
                self.x = np.load(self.folder + '/Xtrain.npy')
                self.y = np.load(self.folder + '/Ytrain.npy')
            else:
                self.x = np.load(self.folder + '/Xtest.npy')
                self.y = np.load(self.folder + '/Ytest.npy')
        except OSError:
            ld = LoadData(tickers, start, end, interval)
            try:
                ld.unprocessed = pd.read_csv('./' + ''.join(tickers) + '_start' + start + '_end' + end + '_int' + interval + '/UnprocessedData.csv')
            except OSError:
                print('DOWNLOADING DATA')
                ld.download()
            print('PROCESSING DATA')
            ld.process(n_series, T_pred, n_cols, n_rows, T_space, plot=True)
            ld.cut_and_shuffle()

            if train:
                self.x = ld.Xtrain
                self.y = ld.Ytrain
            else:
                self.x = ld.Xtest
                self.y = ld.Ytest
            self.original = ld.original

        # Shape of X: (Number of datasamples, Number of tickers, Number of rows, Number of columns)
        # Shape of Y: (Number of datasamples, Number of tickers)
        self.len = self.x.shape[0]
Example #11
def train():
    batch_size = 10
    epochs = 50
    bestloss = 1e10
    learning_rate = 5e-4
    Trainer = VGG16Trainer().cuda()

    path = './train'
    trainLabel = getLabel(path)
    traindata = LoadData(path, Label=trainLabel)
    dataloader = DataLoader(traindata, batch_size, shuffle=True)
    valLabel = getLabel('./val')
    valdata = LoadData('./val', Label=valLabel)
    valdataloader = DataLoader(valdata, batch_size, shuffle=True)
    count = 0
    for epoch in range(epochs):

        if count == 5:
            learning_rate *= 0.5
            for param_group in Trainer.optimizer.param_groups:
                param_group['lr'] = learning_rate

        if count == 10:
            break

        Trainer.train()
        totalloss = 0
        for i_batch, batch_data in enumerate(dataloader):
            image = batch_data['image']
            label = batch_data['label'].cuda()
            image = image.cuda().float() / 255.
            loss = Trainer.train_step(image, label)
            totalloss += loss
        print('train loss:')
        print(totalloss / len(dataloader))

        Trainer.eval()
        valloss = 0
        with torch.no_grad():
            for i_batch, batch_data in enumerate(valdataloader):
                image = batch_data['image']
                label = batch_data['label'].cuda()
                image = image.cuda().float() / 255.
                valloss += Trainer.forward(image, label)
        print('val loss:')
        valloss_a = valloss / len(valdataloader)
        print(valloss_a)
        if valloss_a < bestloss:
            bestloss = valloss_a
            print('saved')
            Trainer.save('VGG.pkl')
            count = 0
        else:
            count += 1
Example #12
    def __init__(self, src, dst, dtype, location, dimension):
        self.src = int(src)
        self.dst = int(dst)

        self.org_node = LoadData.load_org_node(location)
        self.org_path = LoadData.load_org_path(location, dimension)
        self.link_table = LoadData.load_linking_table(location)
        # print('org_node', self.org_node)
        # print('org_path', self.org_path)
        # print('self.link_table', self.link_table)

        self.heap = Heap(dtype)
        self.quadrant = self.__get_quadrant(self.src, self.dst)
Example #13
    def train(self):
        l = LoadData()
        stopWords = l.loadStopWords()
        self.loadDataCSV('bbc-text.csv')
        vect = TfidfVectorizer(stop_words=stopWords)
        self.train_and_test_split(0.75)
        X_train = vect.fit_transform(self.train_data['data'])
        Y_train = self.train_data['class_name']
        X_test = vect.transform(self.test_data['data'])
        Y_test = self.test_data['class_name']
        nb = MultinomialNB()
        Y_pred = nb.fit(X_train, Y_train).predict(X_test)
        self.metric(Y_test, Y_pred)
Example #14
    def __init__(self):

        self.rawdata = LoadData(r'.\ex1data1.txt')
        self.X, self.y, self.batch_size = self.rawdata.loadTXT()
        self.theta = np.array([[0.], [0.]])
        self.alpha = 0.01
        self.costlst = []
        self.thetalst = []

        print(self.theta.shape)
        print(self.batch_size)
        print(np.sum(self.X[:, 1]))
        print(np.sum(self.y))
Example #15
    def __init__(self):
        
        self.rawdata = LoadData(r'.\ex2data1.txt')
        self.X, self.y, self.batch_size, rawdata = self.rawdata.loadTXT()
        self.theta = np.array([[0.],[0.],[0.]])
        self.alpha = 0.01
        self.costlst = []
        self.thetalst = []
        self.rawdata_p = rawdata[np.where(rawdata[:,2]==1)]
        self.rawdata_n = rawdata[np.where(rawdata[:,2]==0)]
#         print(self.theta.shape)
#         print(self.batch_size)
#         print(self.rawdata_p)
#         print(self.rawdata_n)
        self.y = self.y[0]
Example #16
def main():
    # Data loading
    arg_parser = CSACCMArgs()
    args = arg_parser.args
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    data = LoadData(args.path, args.dataset, label=args.label, sep=args.sep, append_id=True, include_id=False)
    if args.verbose > 0:
        print(args)

    # Training
    t1 = time()
    model = CSACCM(feature_num=len(data.features), feature_dims=data.feature_dims, f_vector_size=args.f_vector_size,
                   user_feature_num=len(data.user_features), item_feature_num=len(data.item_features),
                   user_num=data.user_num, item_num=data.item_num, ui_vector_size=args.ui_vector_size,
                   warm_ratio=args.warm_ratio,
                   cb_hidden_layers=eval(args.cb_hidden_layers), attention_size=args.attention_size,
                   optimizer=args.optimizer, learning_rate=args.lr, batch_size=args.batch_size, epoch=args.epoch,
                   dropout_keep=args.dropout_keep, l2_regularize=args.l2,
                   verbose=args.verbose, random_seed=args.random_seed, model_path=args.model)
    if args.load == 1:
        model.load_model()
    # model.run_debug(data.test_data)

    # train
    model.train(data.train_data, data.validation_data, data.test_data, args.load == 1)
    model.print_result(t1)

    # test
    model.load_model()
    model.predict(data.test_data)
Example #17
def feature_extraction_NFM(X, y_binary, i_Pos_sample_set, i_Neg_sample_set,
                           args):

    #the labeled data will come from both pos and neg
    X_label = [X[i] for i in set.union(i_Pos_sample_set, i_Neg_sample_set)]
    y_label = [
        y_binary[i] for i in set.union(i_Pos_sample_set, i_Neg_sample_set)
    ]

    X_train = np.asarray(X_label)
    Y_train = np.asarray(y_label)
    X_validation = np.asarray(X_label)
    Y_validation = np.asarray(y_label)
    # set the whole dataset for test, so that we will get the new features
    X_test = copy.deepcopy(X)
    Y_test = copy.deepcopy(y_binary)

    data = LoadData(args.loss_type, X_train, Y_train, X_validation,
                    Y_validation, X_test, Y_test)

    # Training
    model = NeuralFM(data.features_M, args.hidden_factor, args.layers,
                     args.loss_type, args.pretrain, args.epoch,
                     args.batch_size, args.lr, args.lamda, args.keep_prob,
                     args.optimizer, args.batch_norm, args.activation_function,
                     args.verbose, args.early_stop)
    model.train(data.Train_data, data.Validation_data, data.Test_data)

    # alternative: model.get_bi_feature(data.Test_data)
    features = model.get_deep_feature(data.Test_data)
    return features
Example #18
def MeanAndVarMapper(fileName):
    inputData = LoadData(fileName)[0]
    inputMat = numpy.mat(inputData)
    mean = numpy.mean(inputMat)
    var = numpy.var(inputData)
    count = len(inputData)
    return mean, var, count
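
The mapper above returns per-file statistics, which suggests a map/reduce-style aggregation. A minimal reducer sketch (hypothetical; not part of the original project) that merges the (mean, var, count) triples into global statistics via the law of total variance:

def MeanAndVarReducer(stats):
    # stats: list of (mean, var, count) triples produced by MeanAndVarMapper
    total = sum(count for _, _, count in stats)
    global_mean = sum(mean * count for mean, _, count in stats) / total
    # law of total variance: within-group variance plus between-group spread
    global_var = sum((var + (mean - global_mean) ** 2) * count
                     for mean, var, count in stats) / total
    return global_mean, global_var, total

# stats = [MeanAndVarMapper(f) for f in file_names]  # file_names is hypothetical
# print(MeanAndVarReducer(stats))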
Example #19
def job(data, city, dimension, heuristic_matrices, pname):
    """
    :param data: 起點陣列
    :param heuristic_matrices:
    :param pname: 多線程的編號
    :return:
    """
    print('<---Processing ' + str(pname) + ' start--->')

    adjacency_list = LoadData.load_linking_table(city)
    count = 0
    byte_limit = 1024 * 20
    full_path = []

    for (src, dst) in data:
        if src != dst:
            algorithm = Astar(src, dst, city, dimension, adjacency_list,
                              heuristic_matrices)
            algorithm.query()
            path = algorithm.get_shortest_path()
            if bool(path):
                full_path.append(tuple(path))

            # Write results to file, splitting into separate files by size
            if sys.getsizeof(full_path) >= byte_limit:
                Export.export_a_star(full_path, dimension, city, count)
                count += 1
                full_path = []

    print('<---End with ' + str(pname), '--->')
Example #20
def main():
    # Data loading
    arg_parser = BaseArgs()
    args = arg_parser.args

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    data = LoadData(args.path, args.dataset, label=args.label, append_id=False, include_id=True)
    if args.verbose > 0:
        print(args)

    # Training
    t1 = time()
    model = BaseModel(feature_num=len(data.features), optimizer=args.optimizer,
                      learning_rate=args.lr, batch_size=args.batch_size, epoch=args.epoch,
                      dropout_keep=args.dropout_keep, l2_regularize=args.l2,
                      verbose=args.verbose, random_seed=args.random_seed, model_path=args.model)
    if args.load == 1:
        model.load_model()
    # model.run_debug(data.test_data)

    # train
    model.train(data.train_data, data.validation_data, data.test_data, args.load == 1)
    model.print_result(t1)

    # test
    model.load_model()
    model.predict(data.test_data)
Example #21
def plot_hist_periods(cell, temperature, division):
    """
    Given a cell condition, compute and plot a histogram of periods.

    Parameters
    ----------
    cell : string
        Cell condition.
    temperature : int
        Temperature condition.
    division : bool
        Whether the traces include cell division.
    """
    ##################### LOAD DATA ##################
    if cell == 'NIH3T3':
        path = "../Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
    else:
        path = "../Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
    dataClass = LoadData(path,
                         10000000,
                         temperature=temperature,
                         division=division)
    (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
     ll_idx_cell_cycle_start, T_theta, T_phi) \
                                        = dataClass.load(load_annotation=True)

    if division:
        ##################### COMPUTE CELL CYCLE DISTRIBUTION ##################
        l_T_cell_cycle = []
        for l_div_index in ll_idx_cell_cycle_start:
            for t1, t2 in zip(l_div_index[:-1], l_div_index[1:]):
                l_T_cell_cycle.append((t2 - t1) / 2)
    ##################### COMPUTE CIRCADIAN CLOCK DISTRIBUTION #################
    l_T_clock = []
    for l_peak in ll_peak:
        l_idx_peak = [idx for idx, i in enumerate(l_peak) if i == 1]
        for t_peak_1, t_peak_2 in zip(l_idx_peak[:-1], l_idx_peak[1:]):
            l_T_clock.append((t_peak_2 - t_peak_1) / 2)
    ##################### PLOT BOTH DISTRIBUTIONS ##################
    bins = np.linspace(8, 38, 40)
    if division:
        plt.hist(l_T_cell_cycle, bins, alpha=0.5, label='cell-cycle')
    plt.hist(l_T_clock, bins, alpha=0.5, label='clock')
    plt.legend(loc='upper right')
    if division:
        plt.savefig('../Results/RawData/Distributions_div_'+str(temperature)\
                    +"_"+cell+'.pdf', bbox_inches='tight')
    else:
        plt.savefig('../Results/RawData/Distributions_nodiv_'+str(temperature)\
                    +"_"+cell+'.pdf', bbox_inches='tight')
    plt.close()
Example #22
    def __init__(self):

        self.headerFiles={
            "abslogout": "conf/wifi_abslogout_header",
            "login": "******",
            "auth": "conf/wifi_auth_header",
            "logout": "conf/wifi_logout_header"
        }
        self.paramFiles={
            "abslogout": "conf/wifi_abslogout_params",
            "login": "******",
            "auth": "conf/wifi_auth_params",
            "logout": "conf/wifi_logout_params"
        }

        self.outFiles = {
            "abslogout": "out/abslogout.txt",
            "login": "******",
            "auth": "out/auth.txt",
            "logout": "out/logout.txt"
        }

        ld = LoadData()
        urls = ld.loadUrls('conf/urls')
        self.urls={
            "abslogout":urls['abslogout'],
            "login": urls['login'],
            "logout": urls['logout'],
            "auth": urls['auth']
        }

        edatas = ld.loadParams('conf/email',split="=")
        self.eml = MyEmail()
        self.eml.setUser(edatas['msg_from'],edatas['msg_to'],edatas['passwd'])

        # self.absLogoutUrl = "http://ipgw.neu.edu.cn/include/auth_action.php"
        # self.loginUrl = "http://ipgw.neu.edu.cn/srun_portal_pc.php?ac_id=1&"
        # self.logoutUrl = "http://ipgw.neu.edu.cn/srun_portal_pc.php?ac_id=1&"
        # self.authUrl = "http://ipgw.neu.edu.cn/include/auth_action.php?"

        self.ld = ld
        if not os.path.exists("out"):
            os.mkdir('out')
        if not os.path.exists('conf'):
            print("Configuration files are damaged and the program cannot run; please check the code and fix it yourself (it's easy).")
Example #23
    def optimize(self):
        """
        Does the actual optimization
        """
        batch_size, finetune_lr = self.batch_size, self.finetune_lr
        batch_size, n_epochs = self.batch_size, self.training_epochs

        data = LoadData(self.link)
        datasets = data.load_data()
        train_set_x = datasets[0][0]
        n_train_batches = train_set_x.get_value(
            borrow=True).shape[0] // batch_size

        # numpy random generator
        numpy_rng = numpy.random.RandomState(123)

        # construct the Deep Belief Network
        dbn = DBN(numpy_rng=numpy_rng,
                  output_layer=LinearRegression,
                  n_ins=1,
                  hidden_layers_sizes=[3, 3],
                  n_outs=1)

        # Pretraining
        k = 1  # number of CD-k Gibbs steps; assumed value, since `k` is not defined in this snippet
        pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                    batch_size=batch_size,
                                                    k=k)
        self.__pretraining(dbn.n_layers, pretraining_fns, n_train_batches)

        # Backpropagation
        train_fn, validate_model, test_model = dbn.build_finetune_functions(
            datasets=datasets,
            batch_size=batch_size,
            learning_rate=finetune_lr)
        models = (train_fn, validate_model, test_model)
        finetuning = Optimization(batch_size=batch_size, n_epochs=n_epochs)
        finetuning.Backpropagation(models, datasets)

        test = theano.function(inputs=[dbn.x], outputs=dbn.output_layer.y_pred)
        prediction = test(datasets[2][0].get_value())
        import pdb
        pdb.set_trace()
        return dbn
Example #24
    def loadData(self):
        l = LoadData()
        data_cluster = l.loadData('wine-clustering.csv')
        self.std_scaler = StandardScaler()
        self.min_max_scaler = MinMaxScaler()
        data_cluster[data_cluster.columns] = self.std_scaler.fit_transform(
            data_cluster)
        data_cluster[data_cluster.columns] = self.min_max_scaler.fit_transform(
            data_cluster)
        # print(data_cluster.mean())
        # data = data_cluster.to_numpy()
        # np.savetxt('data.txt', data, fmt='%.1f')
        # covariance_matrix = np.dot(np.transpose(data),
        #                            data) / (data.shape[1] - 1)
        # np.savetxt('matrix.txt', covariance_matrix, fmt='%.1f')
        # pca
        self.pca_2 = PCA(2)
        self.pca_2_result = self.pca_2.fit_transform(data_cluster)
        self.data = data_cluster
Example #25
    def optimize(self):
        """
        Does the actual optimization
        """
        batch_size, finetune_lr = self.batch_size, self.finetune_lr
        batch_size, n_epochs = self.batch_size, self.training_epochs

        data = LoadData(self.link)
        datasets = data.load_data()
        train_set_x = datasets[0][0]
        n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

        # numpy random generator
        numpy_rng = numpy.random.RandomState(123)

        # construct the Deep Belief Network
        dbn = DBN(numpy_rng=numpy_rng, output_layer = LinearRegression, n_ins=1,
                  hidden_layers_sizes=[3, 3],
                  n_outs=1)

        # Pretraining
        k = 1  # number of CD-k Gibbs steps; assumed value, since `k` is not defined in this snippet
        pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                    batch_size=batch_size,
                                                    k=k)
        self.__pretraining(dbn.n_layers, pretraining_fns, n_train_batches)

        # Backpropagation
        train_fn, validate_model, test_model = dbn.build_finetune_functions(
            datasets = datasets,
            batch_size = batch_size,
            learning_rate = finetune_lr
        )
        models = (train_fn, validate_model, test_model)
        finetuning = Optimization(batch_size = batch_size, n_epochs = n_epochs)
        finetuning.Backpropagation(models, datasets)

        test  = theano.function(inputs = [dbn.x], outputs = dbn.output_layer.y_pred)
        prediction = test(datasets[2][0].get_value())
        import pdb; pdb.set_trace()
        return dbn
Example #26
def calculate_heuristic(city):
    """
    Compute the heuristic matrices
    :param city:
    :return:
    """
    org_node = LoadData.load_org_node(city)
    data = np.array(list(org_node.values()))
    result = euclidean_distances(data, data)
    header = [x for x in list(org_node.keys())]
    pd_data = pd.DataFrame(result, columns=header)
    # print(pd_data.shape)
    return pd_data
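
A short usage sketch (all values below are placeholders) that mirrors how job() in Example #19 wires the pieces together: the DataFrame returned here is what gets passed to Astar as heuristic_matrices.

city, dimension, src, dst = 'some_city', 6, 0, 1  # placeholder values
heuristic_matrices = calculate_heuristic(city)
adjacency_list = LoadData.load_linking_table(city)
algorithm = Astar(src, dst, city, dimension, adjacency_list, heuristic_matrices)
algorithm.query()
print(algorithm.get_shortest_path())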
Example #27
class GradientDescent():
    
    #def __init__(self, X, y, theta, alpha, batch_size):
    def __init__(self):
        
        self.rawdata = LoadData(r'.\ex2data1.txt')
        self.X, self.y, self.batch_size, rawdata = self.rawdata.loadTXT()
        self.theta = np.array([[0.],[0.],[0.]])
        self.alpha = 0.01
        self.costlst = []
        self.thetalst = []
        self.rawdata_p = rawdata[np.where(rawdata[:,2]==1)]
        self.rawdata_n = rawdata[np.where(rawdata[:,2]==0)]
#         print(self.theta.shape)
#         print(self.batch_size)
#         print(self.rawdata_p)
#         print(self.rawdata_n)
        self.y = self.y[0]

    
    def sigmoid(self,inputs):
        inputs = np.array(inputs)
        sigmoid_scores = [1/float(1 + np.exp(-x)) for x in inputs[0]]
        return sigmoid_scores
        
    def costFunction(self):
        h = np.matrix(self.sigmoid(self.theta.transpose()*self.X))
        print(-self.y.transpose()*np.log(h))
        print((1-self.y.transpose())*np.log(1-h))
        return 1/(self.batch_size) * np.sum(-self.y.transpose()*np.log(h) - (1-self.y.transpose())*np.log(1-h))
        
    def gradientDescent(self):
        
        diff = self.theta.transpose()*self.X - self.y
        self.theta -= self.alpha*(1/self.batch_size) * self.X*diff.transpose()  
        self.thetalst.append(self.theta)  
        self.costlst.append(self.costFunction())
    
    def plotCostJ(self):
        x = [i for i in range(len(self.costlst))]
        plt.plot(x, self.costlst)
        
    def plotCostJTheta(self):
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        x = self.thetalst[0]
        y = self.thetalst[1]
        z = self.costlst
        ax.plot(x, y, z, label='parametric curve')
        ax.legend()
        plt.show()
Example #28
def estimate_OU_par(cell, temperature, W=None, gamma_A=0.03, gamma_B=0.03):
    """
    Estimate mean and variance of OU processes given a set of conditions,
    according to which a set of traces is filtered.

    Parameters
    ----------
    cell : string
        Cell type.
    temperature : integer
        Temperature condition.
    W : list
        Waveform.
    gamma_A : float
        Regression parameter for the amplitude.
    gamma_B : float
        Regression parameter for the background.

    Returns
    -------
    The mean and standard deviations of the amplitude and the background.
    """
    ######### CORRECTION BECAUSE NOT ENOUGH TRACES AT 34°C AND 40°C #########
    print('CAUTION: Parameters for None temperature selected since not enough '
          'traces at 34°C and 40°C')
    temperature = None

    ##################### LOAD DATA ################
    if cell == 'NIH3T3':
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass = LoadData(path,
                             10000000,
                             temperature=temperature,
                             division=False)
    elif cell == 'U2OS':
        path = "Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
        dataClass = LoadData(path,
                             10000000,
                             temperature=temperature,
                             division=True)

    try:
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
        ll_idx_cell_cycle_start, T_theta, T_phi) = \
                                            dataClass.load(load_annotation=True)
    except FileNotFoundError:
        dataClass.path = '../' + dataClass.path
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
        ll_idx_cell_cycle_start, T_theta, T_phi) = \
                                            dataClass.load(load_annotation=True)

    return estimate_OU_par_from_signal(ll_signal, W, gamma_A, gamma_B)
Example #29
def estimate_phase_dev(cell, temperature):
    """
    Final estimate for the circadian phase deviation.

    Parameters
    ----------
    cell : string
        Cell condition.
    temperature : int
        Temperature condition.

    Returns
    -------
    The standard deviation for the phase progression, and the periods.
    """

    ######### CORRECTION BECAUSE NOT ENOUGH TRACES AT 34°C AND 40°C #########
    print('CAUTION: Parameters for None temperature selected since not enough '
          'traces at 34°C and 40°C')
    temperature = None

    ##################### LOAD DATA ##################
    if cell == 'NIH3T3':
        path =  "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass=LoadData(path, 10000000, temperature = temperature,
                            division = False)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
        ll_idx_cell_cycle_start, T_theta, T_phi) = \
                                            dataClass.load(load_annotation=True)
        #print(len(ll_area))
        std, std_T = estimate_phase_dev_from_signal(ll_peak)

    elif cell == 'U2OS':
        path = "Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
        dataClass=LoadData(path, 10000000, temperature = temperature,
                            division = True)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
        ll_idx_cell_cycle_start, T_theta, T_phi) = \
                                            dataClass.load(load_annotation=True)
        std, std_T = estimate_phase_dev_from_signal(ll_peak)
        # correction for the neglected coupling, since these traces are dividing
        std = std * 0.65
        std_T = std_T * 0.65

    else:
        raise ValueError("Cell type doesn't exist")

    '''
    for (idx, l_signal), l_peak in zip(enumerate(ll_signal), ll_peak):
        plt.plot(l_signal)
        plt.plot(l_peak)
        plt.show()
        plt.close()
        if idx>17:
            break
    '''
    return std, std_T
Example #30
    def train_1_class(self):
        l = LoadData()
        stopWords = l.loadStopWords()
        train_sizes = [100, 200]  # size per class
        for train_size in train_sizes:
            print('Training size:',
                  math.floor(train_size * 0.75) * 2, 'Test size:',
                  math.ceil(train_size * 0.25) * 2)
            self.loadData(train_size)
            vect = TfidfVectorizer(stop_words=stopWords)

            # balance classes
            temp_class = self.data['class_name'][train_size:]
            temp_data = self.data['data'][train_size:]
            idx = random.choices(range(len(temp_class)), k=train_size)
            temp_class = [temp_class[i] for i in idx]
            temp_data = [temp_data[i] for i in idx]
            del self.data['data'][train_size:]
            del self.data['class_name'][train_size:]
            self.data['class_name'].extend(temp_class)
            self.data['data'].extend(temp_data)

            self.train_and_test_split(0.75)
            X_train = vect.fit_transform(self.train_data['data'])
            Y_train = [
                1 if i == 'business' else 0
                for i in self.train_data['class_name']
            ]
            X_test = vect.transform(self.test_data['data'])
            Y_test = [
                1 if i == 'business' else 0
                for i in self.test_data['class_name']
            ]
            nb = MultinomialNB()
            Y_pred = nb.fit(X_train, Y_train).predict(X_test)
            self.metric(Y_test, Y_pred)
            print('---------------------------------------------------')
Example #31
class GradientDescent():

    #def __init__(self, X, y, theta, alpha, batch_size):
    def __init__(self):

        self.rawdata = LoadData(r'.\ex1data1.txt')
        self.X, self.y, self.batch_size = self.rawdata.loadTXT()
        self.theta = np.array([[0.], [0.]])
        self.alpha = 0.01
        self.costlst = []
        self.thetalst = []

        print(self.theta.shape)
        print(self.batch_size)
        print(np.sum(self.X[:, 1]))
        print(np.sum(self.y))

    def costFunction(self):

        return 1 / (2 * self.batch_size) * np.sum(
            np.square(self.theta.transpose() * self.X - self.y))

    def gradientDescent(self):

        diff = self.theta.transpose() * self.X - self.y
        self.theta -= self.alpha * (
            1 / self.batch_size) * self.X * diff.transpose()
        self.thetalst.append(self.theta)
        self.costlst.append(self.costFunction())

    def plotCostJ(self):
        x = [i for i in range(len(self.costlst))]
        plt.plot(x, self.costlst)

    def plotCostJTheta(self):
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        x = self.thetalst[0]
        y = self.thetalst[1]
        z = self.costlst
        ax.plot(x, y, z, label='parametric curve')
        ax.legend()
        plt.show()
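
A minimal driver sketch for the class above (the iteration count is an arbitrary choice; assumes ex1data1.txt is present and matplotlib.pyplot is imported as plt, as the plotting methods already require):

if __name__ == '__main__':
    gd = GradientDescent()
    for _ in range(1500):  # number of gradient-descent steps (arbitrary)
        gd.gradientDescent()
    print('final theta:', gd.theta.ravel())
    gd.plotCostJ()
    plt.show()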
Example #32
def load_model_and_predict(data=None):

    transaction_classifier = TransactionClassifier()
    transaction_classifier.load_model()

    if data is None:
        data = LoadData(x_input_features=[5, 6, 7])
        data.load_processed_data()

    val_data = data.load_validation_data()
    test_data = data.load_test_data()

    print(transaction_classifier.get_confusion_matrix(val_data))
    print(transaction_classifier.get_confusion_matrix(test_data))
Example #33
    def __init__(self):
        ld = LoadData()
        ld.load_csv()
        self.df = ld.df
Example #34
def sgd_optimization(learning_rate=0.13, n_epochs=1000,
                     dataset='mnist.pkl.gz',
                     batch_size=20):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """

    link = '/home/fabian/Documents/DeepLearningTutorials/data/DeepQ.npy'
    data = LoadData(link)
    datasets = data.load_data()
    x_train, y_train = datasets[0]
    x_valid, y_valid = datasets[1]

    x = T.matrix('x')
    index = T.lscalar('index')
    y = T.vector('y')

    n_in = 1
    n_out = 1
    batch_size = 20
    import pdb; pdb.set_trace()

    # compute number of minibatches for training, validation and testing
    n_train_batches = x_train.get_value().shape[0] // batch_size
    n_valid_batches = x_valid.get_value().shape[0] // batch_size

    print('... building the model')

    classifier = PiecewiseLinear_Reinforcement(n_in = 1, input = x, n_out = n_out)


    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.loss(y)
    error = classifier.error(y)

    validate_model = theano.function(
        inputs=[index],
        outputs=[cost, error],
        givens={
            x: x_valid[index * batch_size: (index + 1) * batch_size],
            y: y_valid[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=[cost, error],
        updates=updates,
        givens={
            x: x_train[index * batch_size: (index + 1) * batch_size],
            y: y_train[index * batch_size: (index + 1) * batch_size]
        }
    )
    test = [train_model(i) for i in range(n_train_batches)]


    print('... training the model')
    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.005  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience // 2)

    best_expected_utility =  -1 * numpy.inf
    test_score = 0.
    #start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)[0]
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                expected_utility = [-1*validate_model(i)[0]
                                     for i in range(n_valid_batches)]
                validation_error = [validate_model(i)[1]
                                    for i in range(n_valid_batches)]
                this_expected_utility = numpy.mean(expected_utility)
                this_validation_error = numpy.mean(validation_error)

                print(
                    'epoch %i, minibatch %i/%i, expected Utility %f validation error  %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_expected_utility,
                        this_validation_error * 100.
                    )
                )

                # if we got the best validation score until now
                if this_expected_utility > best_expected_utility:
                    #improve patience if loss improvement is good enough
                    if this_expected_utility > best_expected_utility *  ( 1 + improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    best_expected_utility = this_expected_utility
                    # test it on the test set

                    # test_losses = [test_model(i)
                    #                for i in range(n_test_batches)]
                    # test_score = numpy.mean(test_losses)
                    #
                    # print(
                    #     (
                    #         '     epoch %i, minibatch %i/%i, test error of'
                    #         ' best model %f %%'
                    #     ) %
                    #     (
                    #         epoch,
                    #         minibatch_index + 1,
                    #         n_train_batches,
                    #         test_score * 100.
                    #     )
                    # )

                    # save the best model
                    # with open('best_model.pkl', 'wb') as f:
                    #
                    #     pickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
Example #35
    def optimization(self):
        """
        Does the optimization
        """
        batch_size, n_epochs = self.batch_size, self.n_epochs
        data = LoadData(self.link)
        datasets = data.load_data()
        x_train, y_train = datasets[0]
        x_validate, y_validate = datasets[1]


        classifier, train_model, validate_model = self.__models(datasets)

        n_train_batches = x_train.get_value(borrow=True).shape[0] // batch_size
        n_valid_batches = x_validate.get_value(borrow=True).shape[0] // batch_size
        done_looping = False
        epoch = 0
        train_loss, validation_loss = [], []
        patience = 500000  # look at this many examples regardless
        patience_increase = 2  # wait this much longer when a new best is found
        improvement_threshold = 0.995  # a relative improvement of this much is considered significant
        validation_frequency = min(n_train_batches, patience // 2)
        best_validation_loss = np.inf
        test_score = 0.

        while (epoch < n_epochs) and (not done_looping):
            epoch = epoch + 1
            for minibatch_index in range(n_train_batches):
                #import pdb; pdb.set_trace()
                minibatch_avg_cost = train_model(minibatch_index)
                iter = (epoch - 1) * n_train_batches + minibatch_index
                if (iter + 1) % validation_frequency == 0:
                    validation_losses = [validate_model(i)
                                         for i in range(n_valid_batches)]
                    this_validation_loss = np.mean(validation_losses)

                    print(
                        'epoch %i, minibatch %i/%i, validation error %f %%' %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            this_validation_loss * 100.
                        )
                    )

                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:
                        #improve patience if loss improvement is good enough
                        if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                            patience = max(patience, iter * patience_increase)

                        best_validation_loss = this_validation_loss
                        # test it on the test set

                        # test_losses = [test_model(i)
                        #                for i in range(n_test_batches)]
                        # test_score = np.mean(test_losses)
                        #
                        # print(
                        #     (
                        #         '     epoch %i, minibatch %i/%i, test error of'
                        #         ' best model %f %%'
                        #     ) %
                        #     (
                        #         epoch,
                        #         minibatch_index + 1,
                        #         n_train_batches,
                        #         test_score * 100.
                        #     )
                        # )

                        # save the best model
                        with open('best_model3.pkl', 'wb') as f:
                            pickle.dump(classifier, f)

                if patience <= iter:
                    done_looping = True
                    break