Example #1
def main(argv=sys.argv):
    if len(argv) < 2:
        usage(argv)
    config_uri = argv[1]
    options = parse_vars(argv[2:])
    setup_logging(config_uri)
    settings = get_appsettings(config_uri, options=options)
    engine = engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    Base.metadata.drop_all(engine)
    Base.metadata.create_all(engine)

    load_data('genproc/scripts/data/genproc_checkplan2014_data.csv')
Example #2
def dbscan_demo():
    print 'starting up'
    df = load_data()
    print 'load done'
    df = extract_features(df)
    print 'features done'
    dbscan(df, 0.2, 10)
Example #3
def write_svm_data(num_features, num_blocks):
	svm_train_file = open("svm_train_file.dat", "w")
	svm_test_file = open("svm_test_file.dat", "w")
	data = load_data.load_data()
	#data.genes_to_rank = 3
	train_samples = 40
	features = dimension_reduction.choose_features(data, num_blocks, num_features)
	for s in range(0, data.samples):
		for g in range(0, data.genes_to_rank):
			line = ""
			line += str(int(data.ranking[g, s]))
			line += " qid:"
			line += str(int(s))
			for f in range(0, num_features):
				line += " "
				feature_index = features[g, f]
				line += str(int(feature_index))
				line += ":"
				if feature_index >= data.expression_genes:
					copynumber_index = feature_index - data.expression_genes
					line += str(data.copynumber[copynumber_index, s])
				elif feature_index < data.expression_genes:
					line += str(data.expression[feature_index, s])
			line += "\n"
			if s <= train_samples:
				svm_train_file.write(line)
			else:
				svm_test_file.write(line)
			
	svm_train_file.close()
	svm_test_file.close()					
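For reference, each line written above follows the SVMlight/SVMrank input format: a rank target, a "qid:<sample index>" pair, then space-separated "feature_index:value" pairs. A minimal invocation sketch (argument values are illustrative and assume the load_data and dimension_reduction modules used above are importable):

write_svm_data(num_features=10, num_blocks=5)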
Example #4
  def start(self):
    data = load_data('formatted_veltman_pbp_small.pkl', False)
    self.train_set_x, self.train_set_y = data[0]
    self.test_set_x, self.test_set_y = data[1]

    # Opening prompt
    print('\nTry your luck as an NFL coach! Guess the play call based on each '
      '(admittedly simple) game situation.')
    inpt = raw_input('Type \'q\' at any time to stop. '
      'Press enter to begin...\n')

    n_correct = 0
    n_incorrect = 0

    # Game loop
    if inpt != 'q':
      response = ''
      while response != 'q':
        response, answer = self.ask_question()
        if response == 'q':
          self.end_game(n_correct, n_incorrect)
          continue

        response = int(response) - 1

        if response == answer:
          print('Good call, coach!\n')
          n_correct += 1
        else:
          action = self.format_action(answer)
          print('Whoops, that\'s not what your NFL counterpart decided.'
            ' He {0}.\n'.format(action))
          n_incorrect += 1
    else:
      self.end_game(n_correct, n_incorrect)
Example #5
def init(m, seed):
    if m == -1:
        m = None
    gc, mt, track = load_data(m, seed)

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)

    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))

    sequences = np.concatenate((sequences, -1 * sequences))

    # tie positive and negative expression sequences
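    # index i refers to the original sequence; i + labels.size points to its
    # negated copy appended by the concatenation above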
    tied = {}
    for i, label in enumerate(labels):
        tied[label] = [i, i+labels.size]

    labels = np.concatenate(((labels + '+'), (labels + '-')))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts,
                                          name='noise')
    noise.freeze_distributions()

    return sequences, labels, tied, noise
Example #6
def stn_eval(model_file):
    print("model: %s" % (model_file))
    data = load.load_data(mnist_cluttered, DIM)
    
    values = pickle.load(open(model_file, 'r'))
    network_model, l_transform = model(DIM, DIM, NUM_CLASSES)
    lasagne.layers.set_all_param_values(network_model, values)
   
    X = T.tensor4()
    y = T.ivector()

    output_eval, transform_eval = lasagne.layers.get_output([network_model, l_transform], X, deterministic=True)

    # create function
    eval = theano.function([X], [output_eval, transform_eval])

    # evaluation function
    def eval_func(X, y):
        output_eval, transform_eval = eval(X)
        preds = np.argmax(output_eval, axis=-1)
        acc = np.mean(preds == y)
        return acc, transform_eval
  
    test_acc, test_transform = eval_func(data['X_test'], data['y_test'])
    transpose_visualization(data, test_transform)
    print("test acc: %f" % (test_acc))
Example #7
def run():
    config_dict = yaml.load(open(sys.argv[1], 'r'))
    print config_dict
    data_location = config_dict['data_location']
    uniq_map_file = config_dict['uniq_map_file']
    runiq_map_file = config_dict['runiq_map_file']
    vertices_map, runiq_map = load_data(data_location)
    broken, unequal = fix_similarity_symmetry(vertices_map)
    print "* Fixed similarity relation symmetry (%d unidirected, %d unequal)" % (broken, unequal)

    print "* Vertices map generated"
    _, deleted = purge_invalid_vertices(vertices_map, runiq_map, uniq_map_file, runiq_map_file)
    print "* Cleaned up vertices map (deleted %d isolated vertices)" % (deleted)
 
    if 'min_elems' in config_dict:
        forest = Forest(vertices_map, min_graph_elems=config_dict['min_elems'])
    else:
        forest = Forest(vertices_map)
    ccs = forest.build_connected_components()
    print "* Built connected components"
    forest.build_forest(ccs)
    print "* Built graphs out of connected components"
    forest.reduce()
    print "* Forest reduced!"

    for graph in forest.elements:
        print graph.distance_matrix()

    print len(forest.elements)
    print forest.elements_size_hist()
    forest.pickle(config_dict['pickle_dir'])
Example #8
def init():
    m = 1000  # restricts number of genes, used for local testing
    gc, mt, track = load_data(m)
    state_range = [5, 10, 25, 50, 100]
    z_range = [3, 5, 10, 20]

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)

    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))

    sequences = np.concatenate((sequences, -1 * sequences))

    # tie positive and negative expression sequences
    tied = {}
    for i, label in enumerate(labels):
        tied[label] = [i, i+labels.size]

    state_labels = np.concatenate(((labels + '+'), (labels + '-')))
    labels = np.concatenate((labels, labels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts)
    noise.freeze_distributions()
    return gc, mt, sequences, labels, state_labels, tied, noise, z_range, \
        state_range
Example #9
def sgd_predict(dataset=DataHome, batch_size=28):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """

    logistic_regression_model_pkl = open(train_model_route, "r")
    logistic_regression_model_state = cPickle.load(logistic_regression_model_pkl)
    W, b = logistic_regression_model_state

    datasets = load_data.load_data(dataset)

    test_set_x, test_set_y = datasets[2]

    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    # print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix("x")  # the data is presented as rasterized images
    y = T.ivector("y")  # the labels are presented as 1D vector of
    # [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10, W=W, b=b)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch

    test_results = theano.function(
        inputs=[index], outputs=classifier.y_pred, givens={x: test_set_x[index * batch_size : (index + 1) * batch_size]}
    )

    test_res = [test_results(i) for i in xrange(n_test_batches)]
    print test_res
Example #10
def gmm_demo():
    print 'starting up'
    df = load_data()
    print 'load done'
    df = extract_features(df)
    print 'extract done'
    features_list = list(df.columns.values)[1:]
    print 'features done'
    gmm(df, features_list)
Example #11
def factorize_and_save():
    """ 1. Loads the original data.
        2. Factorizes the resulting dataframe
        3. Saves it as a CSV-file.
    """ 
    data = ld.load_data()
    data = ld.factorize_data(data)
    del data["status_group"]
    data.to_csv(FACTORIZED_PATH)
Example #12
def counts(config, cut='llh', bintype='logdist', weight=False, zcorrect=False):

    dataList = getDataList(config, bintype)
    bins = getEbins(reco=True)

    # Build histograms of desired information
    N, Err = {},{}
    for cfg, date in dataList[:2]:

        d = load_data(cfg, date, bintype)
        eList = getComps(d)
        c0 = d['cuts'][cut]
        r = np.log10(d['ML_energy'])
        if zcorrect:
            r -= zfix(d['zenith'], bintype=bintype)

        # Total counts
        w = d['weights'][c0] if weight else None
        w2 = d['weights'][c0]**2 if weight else None
        counts = np.histogram(r[c0], bins=bins, weights=w)[0]
        errors = np.sqrt(np.histogram(r[c0], bins=bins, weights=w2)[0])
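        # the first configuration initializes the totals; later ones accumulate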
        try:
            N['All']   += counts
            Err['All'] += errors
        except KeyError:
            N['All']   = counts
            Err['All'] = errors

        # Counts by composition
        for e in eList:
            ecut = d['llh_comp'] == e
            c1 = c0 * ecut
            w = d['weights'][c1] if weight else None
            w2 = d['weights'][c1]**2 if weight else None
            counts = np.histogram(r[c1], bins=bins, weights=w)[0]
            errors = np.sqrt(np.histogram(r[c1], bins=bins, weights=w2)[0])
            try:
                N[e]   += counts
                Err[e] += errors
            except KeyError:
                N[e]   = counts
                Err[e] = errors

    fig, ax = plt.subplots()
    ax.set_xlabel(r'$\log_{10}(E/\mathrm{GeV})$')
    ax.set_ylabel('Counts')

    # Plot reconstructions
    for e in eList + ['All']:
        pnt = getColor(e) + '.'
        ax.errorbar(getMids(bins), N[e], yerr=Err[e], fmt=pnt, label=e)

    ax.set_yscale('log')
    ax.legend(loc='lower left')

    plt.show()
Example #13
def draw_scatter(filename,start,end):
    datav = load_data(filename,5)[start:end]
    dataj = load_data(filename,6)[start:end]
    datac = load_data(filename,4)[start:end]
    mp = [dataj[i]/datav[i] for i in range(len(datav))]
    
    label = []
    for i in range(len(datac)):
        if i == 0:
            label.append(0)
        else:
            if datac[i]>datac[i-1]:
                label.append(1)
            else:
                label.append(0)
    datac = [i**1 for i in datac]
    mp = [i**1 for i in mp]
    plt.scatter(datac,mp,c=label)
    plt.show()
Example #14
def fit_model(formula, model_file):
    """
    Saves a model
    :param formula: formula for the model
    :param model_file: name of file to save the model to
    """
    data = load_data()
    model = logit(formula=formula, data=data)
    fitted = model.fit()
    fitted.save(model_file)
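A minimal usage sketch; the formula columns and output file name are illustrative (not taken from the original project) and assume load_data() returns a dataframe containing them:

fit_model(formula='outcome ~ age + C(group)', model_file='logit_model.pkl')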
Example #15
 def test_df_columns(self):
     """
     Test for output dataframe column count in load_data module.
     """
     df = load_data.load_data()
     cols = df.columns.tolist()
     num = len(cols)
     num_assert = len(['kingdom', 'phylum', 'class', 'order',
                       'family', 'genus', 'length', 'oxygen',
                       'replicate', 'week', 'abundance'])
     self.assertEqual(num, num_assert)
Example #16
	def _get_number_of_participants(self):
		"""
		Returns the number of participants in the dataset found in the specified 
		data directory.
		"""
		sys.path.insert(0, self.args.data_dir)
		print os.getcwd()
		print sys.path
		from load_data import load_data
		dataset = load_data(self.args.data_dir)
		return len(dataset['data']['Y'])
Example #17
    def initialize_chair(self):

        self.trX, self.trY, self.teX, self.teY = load_data()
        self.trX = self.trX.reshape(-1, 1, 48, 64)
        self.teX = self.teX.reshape(-1, 1, 48, 64)

        self.w1 = self.init_weights((32, 1, 3, 3))
        self.w2 = self.init_weights((64, 32, 3, 3))
        self.w3 = self.init_weights((128, 64, 3, 3))
        self.w4 = self.init_weights((128 * 5 * 7, 625))
        self.wo = self.init_weights((625, 2))
Example #18
def select_sample(oxygen, replicate):
    dataframe = load_data.load_data()
    
    if (oxygen == "Low") or (oxygen == 'low'):
        dataframe = dataframe[dataframe['oxygen'] == 'Low']
    if (oxygen == "High") or (oxygen == "high"):
        dataframe = dataframe[dataframe['oxygen'] == 'High']

    dataframe = dataframe[dataframe['replicate'] == int(replicate)]

    return dataframe
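A minimal usage sketch, with an illustrative oxygen level and replicate number (the column names match the dataframe described in the test example above):

low_replicate_1 = select_sample('low', 1)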
Example #19
 def __init__(self, fn, median=True):
     self.t, self.f, self.fe, self.truth = load_data(fn, median)
     self.ivar = 1.0 / self.fe ** 2
     self.central = transit.Central(q1=self.truth["q1"],
                                    q2=self.truth["q2"])
     self.system = transit.System(self.central)
     self.body = transit.Body(period=self.truth["period"],
                              r=self.truth["r"],
                              b=self.truth["b"],
                              t0=self.truth["t0"])
     self.system.add_body(self.body)
Example #20
def classify_and_compare(data_path_1, data_path_2):
    data_1 = ld.load_data(data_path_1)
    data_2 = ld.load_data(data_path_2)    
    
    y = data_1["status_group"].tolist()
    
    del data_1["status_group"]
    del data_1["date_recorded"]
    del data_2["status_group"]
    del data_2["date_recorded"]
    
    x_1 = data_1.as_matrix()
    x_2 = data_2.as_matrix()
    
    frac_test = 0.2 
    len_test = int(frac_test * len(y))
    indices = np.random.choice(range(0,len(y)), len_test)
    
    test_set_1 = [x_1[i] for i in indices]
    train_set_1 = [x_1[i] for i in range(0,len(y)) if i not in indices]
    
    test_y = [y[i] for i in indices]
    train_y = [y[i] for i in range(0,len(y)) if i not in indices]
    
    test_set_2 = [x_2[i] for i in indices]
    train_set_2 = [x_2[i] for i in range(0,len(y)) if i not in indices]
   
    classifier_1 = ensm.RandomForestClassifier(n_estimators = 100)
    classifier_2 = ensm.RandomForestClassifier(n_estimators = 100)
    
    classifier_1.fit(train_set_1, train_y)
    classifier_2.fit(train_set_2, train_y)
    
    prediction_1 = classifier_1.predict(test_set_1)
    prediction_2 = classifier_2.predict(test_set_2)
    
    print "accuracy classifier 1 =", met.accuracy_score(test_y, prediction_1)
    print "accuracy classifier 2 =", met.accuracy_score(test_y, prediction_2)
    
    
    
Example #21
def main():
    """
    Entry point for all code
    """
    print "starting up"
    df = load_data()
    df_vectorized = extract_features(df, column_list=FEATURES_TO_EXTRACT,
                                     fillna=True, debug=False)
    target_correlation = calculate_features_target_correlation(df_vectorized, df_vectorized.columns.tolist(),
                                                               PREDICTION_TARGET, PCA_METHOD)
    pca = pca_bacteria(df_vectorized, PCA_COMPONENTS)
    return target_correlation, pca
Example #22
def riseperiod(filename,start,end):
    data =  load_data(filename,4)[start:end]
    tmp = 0
    p=[]
    for i in range(1,len(data)):
        if data[i]<data[i-1]:
            tmp += 1
        else:
            p.append(tmp)
            tmp = 0
    print p
    p=filter(lambda x:x!=0, p)
    print p
    return np.mean(p), np.std(p)
Example #23
	def initialize_chair(self):

		self.trX, self.trY, self.teX, self.teY = load_data()
		self.trX = self.trX.reshape(-1, 1, self.w, self.h)
		self.teX = self.teX.reshape(-1, 1, self.w, self.h)

		w = math.ceil((float(self.w)+4)/2)
		w = ((w -2)/2-2)/2
		h = math.ceil((float(self.h)+4)/2)
		h = ((h -2)/2-2)/2

		self.w1 = self.init_weights((32, 1, 3, 3))
		self.w2 = self.init_weights((64, 32, 3, 3))
		self.w3 = self.init_weights((128, 64, 3, 3))
		self.w4 = self.init_weights((128 * w * h, 625))
		self.wo = self.init_weights((625, 10))
Example #24
def group_factorize_and_save():
    """ 1. Loads the original data.
        2. Finds the best splits of all categorical variables.
        3. Factorizes the resulting dataframe 
        4. Saves it as a CSV-file.
    """
    data = ld.load_data()
    
    for var in VARS_TO_GROUP:
        print "\nfinding best split of variable \"", var, "\""
        data = cg.group_categories(data, var) 
    
    #data = ld.factorize_data(data)        
    del data["status_group"]  
    
    data.to_csv(GROUPED_PATH_NAMED)    
Example #25
    def load_dataset(self):
        if self.verbose:
            print 'loading data ... '
            start_time = time.time()

        self.xs_train, self.xs_test, self.ys_train, self.ys_test, self.categories = load_data(self.data_dir, self.sample_size, self.train_test_split_percentage, self.verbose)
        self.inv_categories = {v: k for k, v in self.categories.items()}

        num_val = len(self.xs_train)/10
        self.xs_val = self.xs_train[-num_val:]
        self.ys_val = self.ys_train[-num_val:]
        self.xs_train = self.xs_train[:-num_val]
        self.ys_train = self.ys_train[:-num_val]

        if self.verbose:
            end_time = time.time()
            self.print_time(start_time,end_time,'loading data')
Example #26
def load_epo_data(data_cat, n_before=-3, n_len=100, subjects=None):
    # loading 'data_cat' data
    data, channels, markers = load_data(FS, folder_name, data_cat, subjects)

    # converting plain data to continuous Data object
    cnt = convert_mushu_data(data, markers, FS, channels)

    # Define the markers belonging to class 1 and 2
    markers_definitions = None
    if data_cat == 'train':
        markers_definitions = {'class 1': (train_labels.query('Prediction == 0', engine='python')['IdFeedBack']).tolist(),
                           'class 2': (train_labels.query('Prediction == 1', engine='python')['IdFeedBack']).tolist()}
    else:
        # marker classes don't matter for test data
        markers_definitions = {'class 1': [m[1] for m in markers], 'class 2': []}

    # segmenting continuous Data object into epoched data
    # Epoch the data -25ms(5 rows) and +500ms(100 rows) around the markers defined in markers_definitions
    return segment_dat(cnt, markers_definitions, [n_before*5, (n_before + n_len)*5])
Example #27
def distro(config, bintype='logdist', cut='llh', xaxis='energy', weight=False):

    # General setup
    labelDict = {'energy':r'$\log_{10}(E/\mathrm{GeV})$',
                 'zenith':r'$\cos(\theta)$',
                 'core':'Distance from center (m)'}
    binDict = {'energy':getEbins(),
               'zenith':np.linspace(0.8, 1, 41),
               'core':np.linspace(0, 700, 71)}

    dataList = getDataList(config, bintype)
    bins = binDict[xaxis]
    xlabel = labelDict[xaxis]
    #fbins = fineBins(bins)

    # Build histograms of desired information
    for cfg, date in dataList[:1]:

        d = load_data(cfg, date, bintype)
        c0 = d['cuts'][cut]
        w = d['weights'][c0] if weight else None
        if xaxis == 'energy':
            y = np.log10(d['ML_energy'])
        if xaxis == 'zenith':
            y = np.cos(d['zenith'])
        if xaxis == 'core':
            y = np.sqrt(d['ML_x']**2 + d['ML_y']**2)

        counts = np.histogram(y[c0], bins=bins, weights=w)[0]
        try: h += counts
        except NameError:
            h = counts

    # Plot
    fig, ax = plt.subplots()
    x = getMids(bins)
    width = bins[1] - bins[0]
    ax.plot(x, h, ls='steps')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Counts')
    ax.set_yscale('log')

    plt.show()
Example #28
def compute_feature_matrix(data_dir, functions, labels,
                           save_file=None, verbose=False):
    """
    For each .mat EEG data file in data_dir, compute the features given
    by functions and labels and return a 2D array where each row contains
    the index of the hour the segment belongs to, the segment type
    ('preictal': 1, 'interictal': 0, 'test': -1), and its features.
    Save the resulting feature matrix if the save_file keyword is set.
    """
    X = np.zeros(len(labels) + 2) # add 2 columns for hour and type
    data_files = []
    
    for f in os.listdir(data_dir):
        if f.split('.')[-1] == 'mat':
            data_files.append(f)
            if verbose:
                print f
            data = load_data.load_data(os.path.join(data_dir, f))
            new_features = compute_features(data, functions)
            if data['type'] == 'preictal':
                seg_type = 1
            elif data['type'] == 'interictal':
                seg_type = 0
            elif data['type'] == 'test':
                seg_type = -1
            else:
                seg_type = np.nan
            new_features = np.hstack(([data['hour'], seg_type],
                                      new_features))
            X = np.vstack((X, new_features))

    X = X[1:,:]

    if save_file is not None:
        columns = ['hour', 'type'] + labels
        np.savetxt(save_file, X, fmt='%.4e',
                   header='Data directory: ' + data_dir + \
                        '\nColumns:\n ' + '\n '.join(columns))
        data_list_file = '.'.join(save_file.split('.')[:-1]) + '_data_files.txt'
        with open(data_list_file, 'w') as df:
            df.writelines('\n'.join(data_files))
        
    return (X, data_files)
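A minimal invocation sketch; the directory, feature functions, labels and save path are illustrative and assume each function maps the dict returned by load_data.load_data to a scalar:

functions = [lambda d: d['data'].mean(), lambda d: d['data'].std()]
labels = ['mean', 'std']
X, data_files = compute_feature_matrix('train_subject1/', functions, labels,
                                       save_file='features_subject1.txt',
                                       verbose=True)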
Example #29
def my_model():
    xtrain, ytrain, xtest, ytest, features = load_data()

#    ytrain = transform_to_log(ytrain)
#
#    mosq_model = GradientBoostingRegressor(loss='ls', verbose=1, max_depth=7,
#                                        n_estimators=20)
#    train_nmosq_model(mosq_model, xtrain, ytrain, do_grid_search=False)

    model = GradientBoostingClassifier(verbose=1, max_depth=3, 
                                       n_estimators=100)

    train_has_wnv_model(model, xtrain, ytrain, do_grid_search=False, 
                        feature_list=features)

    prepare_submission(model, xtrain, ytrain[:, 1], xtest, ytest, 
                       feature_list=features)

    return
Example #30
    def __init__(self):
        config = get_config()
        self.data = load_data(config)
        print "%s data loaded..." %config["dataset"]
        nhidden_layers = len(config["hidden_sizes"])
        nhidden = config["hidden_sizes"][0]
        print "num_hidden_layers      :",nhidden_layers
        print "hidden_units_per_layer :",nhidden
        X = T.fmatrix()
        Y = T.ivector()
        scaling_factors = T.fvector()
        num_input = config["num_input"]
        num_output = 10

        
        w_h, b_h = init_parameters(num_input, num_output, config["hidden_sizes"],scale=0.01)
        w_m, b_m, = init_parameters(num_input, num_output, config["hidden_sizes"],scale=0.0)
        self.parameters = w_h + b_h
        self.momentum   = w_m + b_m

        Layers = [X]


        py_x = model(X, w_h, b_h, Layers)

        y_x = T.argmax(py_x, axis=1)

        individual_cost = -1.0 * (T.log(py_x)[T.arange(Y.shape[0]), Y])
        cost = T.mean(individual_cost)

        scaled_individual_cost = scaling_factors * individual_cost
        scaled_cost = T.mean(scaled_individual_cost)

        updates = sgd(scaled_cost, self.parameters, self.momentum, config["learning_rate"], config["momentum_rate"])
        squared_norm_var = compute_grad_norms(X,cost,Layers)

        accuracy = T.mean(T.eq(T.argmax(py_x, axis = 1), Y))

        self.train = theano.function(inputs=[X, Y, scaling_factors], outputs=[cost,squared_norm_var, individual_cost, accuracy], updates=updates, allow_input_downcast=True)
        self.predict = theano.function(inputs=[X], outputs=[y_x,py_x], allow_input_downcast=True)

        self.get_attributes = theano.function(inputs=[X, Y], outputs=[cost,squared_norm_var, individual_cost, accuracy], allow_input_downcast=True)
Example #31
def sgd_optimization_mnist(learning_rate=0.01, n_epochs=1000, dataset='mnist.pkl.gz', batch_size=600, optimizer='gd'):

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    tmpl = [(28 * 28, 10), 10]
    flat, (Weights, bias) = climin.util.empty_with_views(tmpl)

    cli.initialize.randomize_normal(flat, 0, 1)

    if batch_size is None:
        args = itertools.repeat(([train_set_x, train_set_y], {}))
        n_train_batches = 1
    else:
        args = cli.util.iter_minibatches([train_set_x, train_set_y], batch_size, [0, 0])
        args = ((i, {}) for i in args)
        n_train_batches = train_set_x.shape[0] // batch_size

    print('... building the model')

    x = T.matrix('x')
    y = T.ivector('y')

    classifier = LogisticRegression(
            input = x,
            n_in = 28 * 28,
            n_out = 10,
            W = theano.shared(value = Weights, name = 'W', borrow = True),
            b = theano.shared(value = bias, name = 'b', borrow = True)
            )

    gradients = theano.function(
            inputs = [x, y],
            outputs = [
                T.grad(classifier.negative_log_likelihood(y), classifier.W),
                T.grad(classifier.negative_log_likelihood(y), classifier.b)
                ],
            allow_input_downcast = True
            )

    cost = theano.function(
        inputs=[x, y],
        outputs=classifier.negative_log_likelihood(y),
        allow_input_downcast=True
    )

    def loss(parameters, input, target):
        return cost(input, target)

    def d_loss_wrt_pars(parameters, inputs, targets):
        g_W, g_b = gradients(inputs, targets)

        return np.concatenate([g_W.flatten(), g_b])

    zero_one_loss = theano.function(
            inputs = [x, y],
            outputs = classifier.errors(y),
            allow_input_downcast = True
            )

    if optimizer == 'gd':
        print('... using gradient descent')
        opt = cli.GradientDescent(flat, d_loss_wrt_pars, step_rate=learning_rate, momentum=.95, args=args)
    elif optimizer == 'rmsprop':
        print('... using rmsprop')
        opt = cli.RmsProp(flat, d_loss_wrt_pars, step_rate=1e-4, decay=0.9, args=args)
    elif optimizer == 'rprop':
        print('... using resilient propagation')
        opt = cli.Rprop(flat, d_loss_wrt_pars, args=args)
    elif optimizer == 'adam':
        print('... using adaptive momentum estimation optimizer')
        opt = cli.Adam(flat, d_loss_wrt_pars, step_rate = 0.0002, decay = 0.99999999, decay_mom1 = 0.1, decay_mom2 = 0.001, momentum = 0, offset = 1e-08, args=args)
    elif optimizer == 'adadelta':
        print('... using adadelta')
        opt = cli.Adadelta(flat, d_loss_wrt_pars, step_rate=1, decay = 0.9, momentum = .95, offset = 0.0001, args=args)
    else:
        print('unknown optimizer')
        return 1

    print('... training the model')

    # early stopping parameters
    if batch_size is None:
        patience = 250
    else:
        patience = 5000 # look at this many samples regardless

    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    best_validation_loss = np.inf
    test_loss = 0.

    valid_losses = []
    train_losses = []
    test_losses = []

    epoch = 0

    start_time = timeit.default_timer()
    for info in opt:
        iter = info['n_iter']
        epoch = iter // n_train_batches
        minibatch_index = iter % n_train_batches

        if iter % validation_frequency == 0:
            # compute zero-one loss on validation set
            validation_loss = zero_one_loss(valid_set_x, valid_set_y)
            valid_losses.append(validation_loss)
            train_losses.append(zero_one_loss(train_set_x, train_set_y))
            test_losses.append(zero_one_loss(test_set_x, test_set_y))

            print(
                    'epoch %i, minibatch %i/%i, validation error % f %%, iter/patience %i/%i' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        validation_loss * 100,
                        iter,
                        patience
                        )
                    )
            # if we got the best validation score until now
            if validation_loss < best_validation_loss:
               # improve patience if loss improvement is good enough
                if validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                best_validation_loss = validation_loss
                # test it on the test set
                test_loss = zero_one_loss(test_set_x, test_set_y)

                print(
                        '    epoch %i, minibatch %i/%i, test error of best model %f %%' %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_loss * 100
                            )
                        )

        if patience <= iter or epoch >= n_epochs:
            break

    end_time = timeit.default_timer()

    print('Optimization complete with best validation score of %f %%, with test performance %f %%' % (best_validation_loss * 100., test_loss * 100.))
    print('The code run for %d epochs, with %f epochs/sec' % (epoch, 1. * epoch / (end_time - start_time)))
    print(('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time))), file=sys.stderr)

    losses = (train_losses, valid_losses, test_losses)

    return classifier, losses
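A minimal call sketch; the argument values are illustrative, and any optimizer string handled above ('gd', 'rmsprop', 'rprop', 'adam', 'adadelta') can be passed:

classifier, losses = sgd_optimization_mnist(learning_rate=0.01, n_epochs=50,
                                            batch_size=600, optimizer='rmsprop')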
Example #32
def base_train(data_tag, to_train=False, is_bin=False):
    print("Unpacking data...")
    state_len, num_classes, x_train, y_train, x_test, y_test = load_data(
        data_tag)
    print(f"state_len: {state_len}, num_classes: {num_classes}")

    model = build_model(state_len, num_classes)
    print("Model built. ")

    time_stamp = get_time()
    print(time_stamp)
    model_save_root = f"checkpoints/{data_tag}/{MAGIC_CODE}"
    history_save_root = f"history/{data_tag}/{MAGIC_CODE}/{WORK_MAGIC_CODE}/{time_stamp}"
    os.makedirs(model_save_root, exist_ok=True)
    os.makedirs(history_save_root, exist_ok=True)

    model_basename = f"{MAGIC_CODE}-{data_tag}-{WORK_MAGIC_CODE}"
    model_save_path = f"{model_save_root}/{model_basename}-{time_stamp}.h5"
    model_universal = f"best_models/{model_basename}.h5"
    history = []
    if to_train:
        # earlystopper = EarlyStopping(patience=10, verbose=1, monitor="val_acc")
        # checkpointer = ModelCheckpoint(model_universal, verbose=1, save_best_only=True, monitor="val_acc")

        earlystopper = EarlyStopping(patience=5, verbose=1)
        checkpointer = ModelCheckpoint(model_universal,
                                       verbose=1,
                                       save_best_only=True)

        # reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, epsilon=1e-4, mode='min')
        history = model.fit(
            x_train,
            [y_train, y_train],
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(x_test, [y_test, y_test]),
            # callbacks=[earlystopper, checkpointer, reduce_lr_loss])
            callbacks=[earlystopper, checkpointer],
            class_weight=generate_class_weights(np.argmax(y_train, axis=1)),
        )

    with custom_object_scope({
            "Projection":
            Projection,
            "Proj2Prob":
            Proj2Prob,
            "EigenDist":
            EigenDist,
            "categorical_bernoulli_crossentropy":
            categorical_bernoulli_crossentropy,
            "FullConnectedNeuralNetwork":
            FullConnectedNeuralNetwork,
            "Softmax":
            Softmax,
            "categorical_crossentropy":
            categorical_crossentropy
    }):
        model.load_weights(model_universal)
        if to_train:
            model.save(model_save_path)

    score = model.evaluate(x_test, [y_test, y_test], verbose=0)
    print(score)
    # print('Test loss:', score[0])
    # print('Test loss 2:', score[1])
    # print('Test accuracy:', score[2])
    # print('Test accuracy 2:', score[3])

    dataset = [x_train, y_train, x_test, y_test]
    save_history(dataset, model, num_classes, history, data_tag,
                 WORK_MAGIC_CODE, MAGIC_CODE, history_save_root, time_stamp)
    cmp_res = compare_all(dataset,
                          num_classes,
                          model,
                          data_tag,
                          WORK_MAGIC_CODE,
                          MAGIC_CODE,
                          time_stamp,
                          is_bin=is_bin)
    # save_compare_result(cmp_res, data_tag, WORK_MAGIC_CODE, MAGIC_CODE, time_stamp)

    print("Wait Nutstore to sync...")
    import time
    time.sleep(5)

    shutil.copy(f"history_{data_tag}.txt", f"{history_save_root}/")
Example #33
import utils
import load_data
import numpy as np
import torch

adjs, attributes = load_data.load_data("DBLP_sub")
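# sum the adjacency matrices of the last two time steps into a single matrix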
adj = adjs[-2:].sum(0)
print(adj.shape)
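# count how many entries of the summed adjacency equal each value from 0 to 14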
for val in range(0, 15):
    print(val, len(np.where(adj == val)[0]))
Example #34
def train_net(num_epochs=20,
              batch_size=50,
              learning_rate=1e-4,
              unseen=False,
              update_method=''):
    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    net = vgg16.build_model(input_var, batch_size)
    network = net['prob']
    # Load the dataset
    if unseen:
        print("Loading data, unseen val/test signatories task...")
        X_train, y_train, X_val, y_val, X_test, y_test = load_data.load_data_unseen_separated(
        )
    else:
        print("Loading data, standard task...")
        X_train, y_train, X_val, y_val, X_test, y_test = load_data.load_data()
    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    all_params = lasagne.layers.get_all_params(network, trainable=True)
    # Get all the parameters we don't want to train
    fixed_params = lasagne.layers.get_all_params(net[LAST_FIXED_LAYER])
    params = [x for x in all_params if x not in fixed_params]
    loss = lasagne.objectives.categorical_crossentropy(
        prediction, target_var) + REG * lasagne.regularization.apply_penalty(
            params, lasagne.regularization.l2)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.

    # First get all the parameters
    if update_method.lower() == 'nesterov' or update_method == '':
        updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=learning_rate, momentum=0.9)
    elif update_method.lower() == 'momentum':
        updates = lasagne.updates.momentum(loss,
                                           params,
                                           learning_rate=learning_rate,
                                           momentum=0.9)
    elif update_method.lower() == 'sgd':
        updates = lasagne.updates.sgd(loss,
                                      params,
                                      learning_rate=learning_rate)
    elif update_method.lower() == 'adam':
        updates = lasagne.updates.adam(loss,
                                       params,
                                       learning_rate=learning_rate,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)
    elif update_method.lower() == 'rmsprop':  # typically better than adaGrad
        updates = lasagne.updates.rmsprop(loss,
                                          params,
                                          learning_rate=learning_rate,
                                          rho=0.9,
                                          epsilon=1e-06)
    elif update_method.lower() == 'adadelta':
        updates = lasagne.updates.adadelta(loss,
                                           params,
                                           learning_rate=learning_rate,
                                           rho=0.9,
                                           epsilon=1e-06)
    else:
        raise IOError("Not an acceptable parameter update method.")

    #updates = lasagne.updates.adam(
    #       loss, params, learning_rate=learning_rate)
    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction,
        target_var) + REG * lasagne.regularization.apply_penalty(
            params, lasagne.regularization.l2)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Hacky code to create the confusion matrix, which exists due to my
    # poor understanding of theano
    preds = T.argmax(test_prediction, axis=1)
    inv_preds = 1 - preds
    inv_target_var = 1 - target_var
    true_positives = T.sum(preds * target_var)  # Use mult as elementwise and
    true_negatives = T.sum(inv_preds * inv_target_var)
    false_positives = T.sum(preds * inv_target_var)
    false_negatives = T.sum(inv_preds * target_var)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    print("train_fn set up.")

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [
        test_loss, test_acc, true_positives, true_negatives, false_positives,
        false_negatives
    ])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    val_loss_per_epoch = []
    train_loss_per_epoch = []
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()

        for batch in iterate_minibatches(X_train, y_train, batch_size):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_far = 0
        val_frr = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, batch_size):
            inputs, targets = batch
            err, acc, t_p, t_n, f_p, f_n = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_frr += float(f_n) / (t_p + f_n)
            val_far += float(f_p) / (f_p + t_n)
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(val_acc /
                                                          val_batches * 100))
        print("  validation far:\t\t{:.2f} %".format(val_far / val_batches *
                                                     100))
        print("  validation frr:\t\t{:.2f} %".format(val_frr / val_batches *
                                                     100))

        val_loss_per_epoch.append(val_err / val_batches)
        train_loss_per_epoch.append(train_err / train_batches)

    print("Val loss per epoch:", val_loss_per_epoch)
    print("Train loss per epoch:", train_loss_per_epoch)
    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_far = 0
    test_frr = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, batch_size):
        inputs, targets = batch
        err, acc, t_p, t_n, f_p, f_n = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_frr += float(f_n) / (t_p + f_n)
        test_far += float(f_p) / (f_p + t_n)
        test_batches += 1
    print("Final results:")
    print("  test loss: withheld until final submission lolol")
    print(" test accuracy: withheld until final submission lolol")
Example #35
from theano import config
import theano.sandbox.cuda
config.floatX = 'float32'
print(config.floatX)
theano.sandbox.cuda.use("gpu0")

import load_data
import prepare_images
import rotate_image
from sklearn.utils import shuffle  # assumed source of the shuffle(..., random_state=...) call below

pizza_eng_names, pizza_imgs = prepare_images.load_photos()

channels, height, width = 3, 32, 32
batch_size = 20

labels, onehotencoder = load_data.load_data()
labels_list = []
j = 0

image_list = []
for pizza_img in pizza_imgs:
    lst = load_data.resize_rotate_flip(pizza_img, (height, width))
    print(len(lst))
    image_list.extend(lst)

    lbls = []
    for i in range(len(lst)):
        lbls.append(shuffle(labels[j], random_state=i))
    labels_list.extend(lbls)
    j += 1
Example #36
    k_size = [1, 2, 3, 4, 5, 6, 7, 8]
''' Decoder config '''
de_embed = 256
de_H = 256
de_layers = 1
de_bi = False
en_Hbi = en_H * (2 if en_bi == True else 1)
''' File path '''
th_en_ref = "th-en/ted_test_th-en.en.tok_seg"
th_vi_ref = "th-vi/ted_test_th-vi.vi.tok"

###########################
### Load Data and Dict ####
###########################

train_data1, train_target1, val_data1, val_target1, inp_dict1, tgt_dict1 = load_data(
    lang_pair1, source_type, tgt_type)
train_data2, train_target2, val_data2, val_target2, inp_dict2, tgt_dict2 = load_data(
    lang_pair2, source_type, tgt_type)

# combine dicts
raw_inp_dict = {**inp_dict2, **inp_dict1}
raw_tgt_dict = {**tgt_dict2, **tgt_dict1}

inp_dict, tgt_dict = {}, {}
count, count2 = 0, 0

for k, v in raw_inp_dict.items():
    inp_dict[k] = count
    count += 1
for k2, v2 in raw_tgt_dict.items():
    tgt_dict[k2] = count2
Example #37
### WAND Debug ###

import wandb
import logConfig, load_data, accuracy_loss, train, preprocessing, plot

train_X, train_Y, test_X, test_Y, labels = load_data.load_data()

(N, w, h), n_labels = train_X.shape, len(labels)

# Number of datapoints to train
n = 100

# Dimension of datapoints
d = w * h

# Data Preprocessing
(train_x, train_y), (val_x,
                     val_y), (test_x, test_y) = preprocessing.pre_process(
                         d, n_labels, train_X, train_Y, test_X, test_Y)


def mainDebug(config=None):
    run = wandb.init(config=config)
    config = wandb.config

    hl = [config.hidden_layer_size] * config.hidden_layers  # Hidden layers
    ol = [len(train_y[0])]  # Output layers
    n_hl = len(hl)

    logConfig.logConfig(config)
Example #38
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 17 15:15:48 2020

@author: groes
"""
import neural_network as nn
import numpy as np
import load_data
import utils

data = load_data.load_data()
X = data['data']
y = data['target']

X_train, X_test, y_train, y_test = utils.split_data(X, y, 0.3)

unittest_mod = nn.new_neural_network(0.001)
unittest_mod.create_input_layer(784)
unittest_mod.add_hidden_layer(256)
#unittest_mod.add_hidden_layer(256)
unittest_mod.add_hidden_layer(128)
#unittest_mod.add_hidden_layer(64)
unittest_mod.add_output_layer(10)
unittest_mod.new_train(X_train, y_train,5,batch_size = 32, optimiser= "Adam")

unittest_mod.accuracy_score(X_test, y_test)

y_test[0]

Example #39
                    default=1111,
                    help='torch seed for randomization')
args = parser.parse_args()

torch.manual_seed(args.seed)
np.random.seed(args.seed)

if not os.path.exists(args.save_folder):
    os.makedirs(args.save_folder)

if not os.path.exists(args.save_folder + '/imgs'):
    os.makedirs(args.save_folder + '/imgs')
# loading useful data
print('\nLOADING CORPUS')
model = load_model(args.model_path)
sentences, labels = load_data(args.tree_data, 'open_nodes')
corpus = data.Corpus(args.training_data)

if args.gated_forward:
    print('USING GATED FORWARD')
    model_values = utils.get_model_values(model)

data_load_failed = False
if args.load_data:
    try:
        print('LOADING DATA')
        hidden_states = np.load(args.hidden_location).item()
        cell_states = np.load(args.cell_location).item()
        targets = np.load(args.targets_location)

        depth_targets = np.load(args.depth_targets_location)
Example #40
Set the following three parameters.
Plotting will run subsequently.

scale: int, scales daily data. 7: one week. 30: one month. 0.5: half day.
no_periods: int, determine scaled-day periods.
measures: list, takes the desired measures from header_dict.
header_dict: dict, output from load_data.ipynb.
files: list, output from load_data.ipynb.
"""
import matplotlib.dates as mdates
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from load_data import load_data

# Init variables
df, files, header_dict = load_data()
scale = 7  # 1 = single day, 7 = week, etc.
no_periods = 52  # number of periods, e.g. scale = 7, no_periods = 10 gives 10 weeks
measures = ['no2', 'no', 'pm10']


def fix_series(series, missing, flag):
    """ Calls a series recursively and patches the minimal datetime value with the pervious one
        continues with the maxium. Et cetera.
        Not error proof, ie whole series == missing will recurse to the limit.
        Make sure to not pass an empty series.
    """
    if len(missing) == 0: return series
    if flag == 'min':
        if missing.min() < series['datetime'].min():
            pass
Example #41
	def run(_run):
		# Load configs, if parameters are unspecified, fill in a default
		config = _run.config		

		run = config.get('fit_params') 
		model_params = config.get('model_params')   
		data_params = config.get('data_params')
		batch_size = data_params.get('batch_size')
		augmentations = data_params.get('augmentations')
		buffer_size = data_params.get('buffer_size') # the buffer sizes for shuffling
		use_sampling = data_params.get('use_sampling')
		class_target_prob = 1 / model_params.get('num_classes')
		print("[!] list of parameter configurations")
		pprint(config)
		
		
		# Load data and define generators ------------------------------------------
		print("[!] loading datasets \n")
		x_train,  x_val, x_test, probs = load_data()
		
		# get a rough estimate: there are 100 files per TFRecord
		# except for one TFRecord per item, so this estimate might not be 100% correct
		num_training = len(x_train) * 100
		
		# TF parsing functions
		print("[!] Creating dataset iterators \n")
		# Load the dataset iterators
		
		train_dataset = create_training_dataset(x_train, batch_size, buffer_size, augmentations,
										  use_sampling, probs, class_target_prob,
										  **model_params)
		
		val_dataset = validate(x_val, batch_size, **model_params)
		test_dataset = validate(x_test, batch_size, **model_params)		
		
		
		# we need the actual labels from the TFRecords, but they take INCREDIBLY long to parse
		# parse through them once, and create a csv file with a list of all the labels
		# note: the tf parsing requires that there is no randomness (shuffling) in the validation/test labels

		if not os.path.exists('../datasets/data/valid/val_labels.csv'):
			print(os.path.exists('../datasets/data/valid/val_labels.csv'))
			print("[!] creating validation label file in ../datasets/data/valid/val_labels.csv")
			create_label_csv(val_dataset,'../datasets/data/valid/val_labels.csv')
		else:
			print("[!] validation labels csv exist")
			
		if not os.path.exists('../datasets/data/test/test_labels.csv'):
			print("[!] creating test label file in ../datasets/data/test/test_labels.csv")
			create_label_csv(test_dataset,'../datasets/data/test/test_labels.csv')
		else:
			print("[!] test labels csv exist")

		# load the file with validation labels
		# getting labels from a TFRecords with lots of other data is horribly slow...
		print("[!] Loading validation labels for callbacks")
		val_labels = pd.read_csv('../datasets/data/valid/val_labels.csv')
		val_labels = np.squeeze(val_labels.to_numpy())
		
		# Model definitions --------------------------------------------------------
		print("[!] compiling model and adding callbacks \n")
		# function for building the model
		model_func = model_dict[run.get('model')]

		# invoke the user function
		model = model_func(**model_params)
		model.summary()
		# compile the model with catcrossentropy: one hot encoded labels!!
		model.compile(optimizer= tf.keras.optimizers.Adam(run.get('lr')),
						loss= 'categorical_crossentropy',
						metrics=['accuracy'])
		
		# Model callbacks ----------------------------------------------------------
		
		# ReduceLRonPlateau
		if run.get('reduce_lr_on_plateau'):
			reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=10e-7, verbose=1)
		else:
			reduce_lr = Callback()

		# Model checkpoints
		now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
		aug_string = 'aug' if augmentations==True else 'noaug'
		modelcheckpoint_name= lambda x: "checkpoints/model-{}-{}-{}-{}-{}.hdf5".format(run.get('model'), 
																					x, 
																					aug_string, 
																					'ch_' + str(len(model_params.get('channels'))), 
																					now)

		modelcheckpoint = ModelCheckpoint(modelcheckpoint_name('best_loss'), 
									monitor = 'val_loss', 
									verbose=1, 
									save_best_only=True, 
									save_weights_only=True)
		
		# Model early stopping
		earlystopping = EarlyStopping(monitor='val_loss', patience=10)


		# tensorboard and metric callbacks

		log_dir = "logs/fit/{}-{}-{}-{}".format(run.get('model'), aug_string, 'ch_' + str(len(model_params.get('channels'))), now)

		file_writer = tfsum.create_file_writer(log_dir)
		tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, 
														histogram_freq=1, 
														profile_batch=0)

		f1_metric = Metrics(val_dataset, 
				            val_labels, 
				            save_best=True, 
							save_name= modelcheckpoint_name('best_f1'), 
							writer=file_writer)
		
		# Model Training and evaluation --------------------------------------------
		print("[!] fitting model \n")
		
		model.fit(
			train_dataset.repeat(), 
			epochs=run.get('epochs'), 
			steps_per_epoch= int(num_training / batch_size),
			validation_data=val_dataset, 
			validation_steps = None,
			shuffle=True,
			verbose= 1,
			callbacks = [tensorboard_cb, f1_metric, LogMetrics(), modelcheckpoint, earlystopping, reduce_lr, MemoryCallback()]
		)

		print("[!] done running, terminating program")
		'''
Example #42
    def activation(self, N):
        x = (self.location + self.posn0 - self.t0 + N) % self.num_posns
        return x == 0


def _parse(d):
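    # parses puzzle lines of the form: Disc #1 has 5 positions; at time=0, it is at position 4.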
    d = d.split(' ')
    disc_number = int(d[1][1:])
    num_posns = int(d[3])
    t0 = int(d[6][d[6].index('=') + 1:-1])
    posn0 = int(d[-1][:-1])
    return (disc_number, num_posns, t0, posn0)


if __name__ == "__main__":
    data = load_data('./input/day15.txt')

    print('Part 1')
    discs = [Disc(*_parse(d)) for d in data]
    N = 0
    while not all(d.activation(N) for d in discs):
        N += 1
    print('\tFirst N is: {}'.format(N))

    print('\nPart 2')
    discs2 = [Disc(*_parse(d)) for d in data]
    discs2.append(Disc(discs2[-1].location + 1, 11, 0, 0))
    N2 = 0
    while not all(d.activation(N2) for d in discs2):
        N2 += 1
    print('\tFirst N is: {}'.format(N2))
Example #43
from model import unet

from load_data import load_data
from show_prediction import predict
from callbacks import keras_callback


train_gen, val_gen, x_test, y_test = load_data(datapath='../data/processed/')

if __name__ == '__main__':
    model = unet()
    model.summary()

    model.fit(
        train_gen,
        epochs=100,
        validation_data=val_gen,
        callbacks=[keras_callback()]
    )
    model.save('../models/checkpoint.h5')
    predict(model, x_test, y_test)
Example #44
import torch

from load_data import load_data
from model.vgg16 import vgg16
from model.tiny import TinyClassifier2d
from model.resnet50 import resnet50
from train import train_model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

train_loader, validate_loader, _ = load_data()

# VGG16 Network
# vgg16 = vgg16()
# train_model(vgg16, 'vgg16', train_loader, validate_loader, 3, device, one_batch=True)

# Tiny Residual Network
# tiny = TinyClassifier2d()
# train_model(tiny, 'tiny', train_loader, validate_loader, 3, device, one_batch=True)

# ResNet50 Network
resnet50 = resnet50()
train_model(resnet50,
            'resnet50',
            train_loader,
            validate_loader,
            1,
            device,
            one_batch=True)
Example #45
        return self._train(state, reward, result_state)

    def predict(self, state):
        return self._predict(state)

    def q_values(self, state):
        return self._q_values(state)

    def bellman_error(self, state, reward, result_state):
        return self._bellman_error(state, reward, result_state)


if __name__ == "__main__":

    dataset = "controllerTuples.json"
    states, actions, result_states, rewards = load_data.load_data(dataset)

    classifier = NeuralNet(states, n_in=9, n_out=9)
    best_error = 10000000.0

    # print "Initial Model: " + str(classifier._model(states).shape)
    # print "Initial Model: " + str(np.max(classifier._model(states), axis=1, keepdims=True).shape)
    for i in range(20000):
        for start, end in zip(range(0, len(states), 128),
                              range(128, len(states), 128)):
            # cost = train(states[start:end], rewards[start:end], result_states[start:end])
            _states = states[start:end]
            _rewards = rewards[start:end]
            _result_states = result_states[start:end]
            """
            print _states.shape
Example #46
def train(args):
    dataset = args.dataset
    print("loading data from :{}".format(dataset))
    adjs, features = load_data.load_data(dataset)
    node_num = adjs.shape[1]
    attribute_num = features.shape[2]
    time_length = adjs.shape[0]
    print("finish loading: node_number:{} ; time_length:{}; attribute_number:{}".format(node_num, time_length, attribute_num))

    """  set parameters """
    pre_len = args.pre_len  # how many time steps to predict.
    train_len = time_length - pre_len

    """ preProcess data """
    # preserve original data
    adjs_ori = torch.from_numpy(adjs).type(torch.float) + torch.eye(node_num)
    feats_ori = torch.from_numpy(features).type(torch.float)

    # process data

    # split into training/validation/test sets
    adjs_train, val_adjs, val_adjs_negative, test_adjs, test_adjs_negative = utils.mask_adjs_test(adjs=adjs)
    fea_train, val_feas, val_feas_false, \
    test_feas, test_feas_false = utils.mask_attributes_test(features)

    adjs_train_label = torch.from_numpy(adjs_train).type(torch.float) + torch.eye(node_num)
    adjs_train = utils.preprocess_adjs(adjs_train)
    adjs_train = torch.from_numpy(adjs_train).type(torch.float)

    # node_features = torch.eye(node_num).unsqueeze(0).repeat(time_length, 1, 1)  # identity matrix as node features
    node_features = torch.from_numpy(features).type(torch.float)
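    # attribute-centric view of the same tensor: time x attribute x node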
    attributes = torch.from_numpy(features.transpose([0, 2, 1])).type(torch.float)  # batch_size = 1

    """ implement a CDN model """
    myModel = MyModel(node_num=node_num,
                      feat_num=attribute_num,
                      b_size=args.belief_size,
                      pre_hid_size=args.pre_hidden_size,
                      hid_size=args.hidden_size,
                      pre_out_size=args.pre_out_size,
                      z_size=args.emb_size,
                      hid_decoder_size=args.decoder_hidden,
                      flag=args.co_embedding)

    # Adam Optimizer
    optimizer = optim.Adam(myModel.parameters(), lr=args.lr,
                           weight_decay=args.weight_decay)

    # begin training
    print("="*30)
    print("begin training")
    for epoch in range(args.epochs):
        myModel.train()
        optimizer.zero_grad()
        myModel.forward(adjs=adjs_train[0:train_len],
                        node_features=node_features[0:train_len],
                        attr_features=attributes[0:train_len])
        # randomly choose two successive time steps t1 and t2
        t_1 = np.random.choice(train_len - 1)
        t_2 = t_1 + 1  # np.random.choice([1]) always returns 1, so t2 is simply the next step
        loss, loss_fea_rec, loss_adj_rec, kl_loss, log_loss, adj_t2_prob, feature_t2_prob \
            = myModel.calculate_loss(t_1, t_2, adjs_ori=adjs_train_label, graph_feats_ori=feats_ori)

        # evaluate on the validation set
        roc_adj, ap_adj = get_roc_score_adj(val_adjs[t_2], val_adjs_negative[t_2], adj_t2_prob, t_2, adjs_ori)
        roc_feat, ap_feat = get_roc_score_feat(val_feas[t_2], val_feas_false[t_2], feature_t2_prob, t_2, feats_ori)
        print("epoch:{} loss_train:{:.5f} "
              "t1:{:2} t2:{:2} "
              "loss_fea_rec:{:.5f} loss_adj_rec:{:.5f} "
              "kl_loss:{:.5f} log_loss:{:.5f} "
              "roc_adj:{:.5f} ap_adj:{:.5f}"
              "roc_fea:{:.5f} ap_fea:{:.5f} ".format(epoch, loss.item(),
                                                     t_1, t_2,
                                                     loss_fea_rec, loss_adj_rec,
                                                     kl_loss, log_loss,
                                                     roc_adj, ap_adj,
                                                     roc_feat, ap_feat
                                                     ))

        # periodically evaluate prediction quality during training
        if epoch % 300 == 0:
            print("=" * 30)
            print("begin testing")
            print(" time_length : {} train_length : {} predict_length : {}".format(time_length, train_len, pre_len))

            # predict the future observations
            adjs_pre, features_pre, adj_last, features_last = myModel.predict(t_final=-1, pre_len=pre_len)

            # calculate the scores
            for t in range(train_len, time_length):
                adj_t_prob = adjs_pre[t - train_len]
                feature_t_prob = features_pre[t - train_len]
                roc_adj, ap_adj = get_roc_score_adj(val_adjs[t], val_adjs_negative[t], adj_t_prob, t, adjs_ori)
                roc_feat, ap_feat = get_roc_score_feat(val_feas[t], val_feas_false[t], feature_t_prob, t, feats_ori)
                print(" roc_adj:{:.5f} ap_adj:{:.5f}"
                      " roc_fea:{:.5f} ap_adj:{:.5f} ".format(roc_adj, ap_adj, roc_feat, ap_feat))

            # using the last embedding to reconstruct and predict the links and associations
            print("using the last time")
            for t in range(train_len-1, time_length):
                adj_t_prob = adj_last
                feature_t_prob = features_last
                roc_adj, ap_adj = get_roc_score_adj(val_adjs[t], val_adjs_negative[t], adj_t_prob, t, adjs_ori)
                roc_feat, ap_feat = get_roc_score_feat(val_feas[t], val_feas_false[t], feature_t_prob, t, feats_ori)
                print(" roc_adj:{:.5f} ap_adj:{:.5f}"
                      " roc_fea:{:.5f} ap_adj:{:.5f} ".format(roc_adj, ap_adj, roc_feat, ap_feat))
            print("finish testing")

            print("=" * 30)

        # update parameters
        loss.backward()
        optimizer.step()
    print("="*30)
    print("finish training")

    print("="*30)
    print("begin testing")
    print(" time_length : {} train_length : {} predict_length : {}".format(time_length, train_len, pre_len))

    # predict the future observations
    adjs_pre, features_pre, adj_last, features_last = myModel.predict(t_final=-1, pre_len=pre_len)

    # calculate the scores
    print("using the delta way to predict")
    for t in range(train_len, time_length):
        adj_t_prob = adjs_pre[t - train_len]
        feature_t_prob = features_pre[t - train_len]
        roc_adj, ap_adj = get_roc_score_adj(test_adjs[t], test_adjs_negative[t], adj_t_prob, t, adjs_ori)
        roc_feat, ap_feat = get_roc_score_feat(test_feas[t], test_feas_false[t], feature_t_prob, t, feats_ori)
        print(" roc_adj:{:.5f} ap_adj:{:.5f}"
              " roc_fea:{:.5f} ap_adj:{:.5f} ".format(roc_adj, ap_adj, roc_feat, ap_feat))

    print("using the latest embeddings to predict")
    for t in range(train_len-1, time_length):
        adj_t_prob = adj_last
        feature_t_prob = features_last
        roc_adj, ap_adj = get_roc_score_adj(test_adjs[t], test_adjs_negative[t], adj_t_prob, t, adjs_ori)
        roc_feat, ap_feat = get_roc_score_feat(test_feas[t], test_feas_false[t], feature_t_prob, t, feats_ori)
        print(" roc_adj:{:.5f} ap_adj:{:.5f}"
              " roc_fea:{:.5f} ap_adj:{:.5f} ".format(roc_adj, ap_adj, roc_feat, ap_feat))
    print("finish testing")
    print("="*30)

    return 0
Ejemplo n.º 47
0
# Parameters
data_directory = '../../data/generated-data-r-10-n-02/'
booking_file = '../../data/booking.csv'
users_file = '../../data/user.csv'
rating_thresholds = []
true_objects_indexes = [0, 1, 2, 3, 4, 5, 6, 7]
false_objects_indexes = [8, 9]

file_names = os.listdir(data_directory)
ids_vector = [int(name.split('-')[0]) for name in file_names]
categories_vector = [name.split('-')[1] for name in file_names]
ratings_vector = [int(name.split('.')[0].split('-')[2]) for name in file_names]
name_vector = [data_directory + name for name in file_names]

ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
    data_directory, booking_file, users_file, rating_thresholds)

features, new_ratings_vector, new_categories_vector, new_ids_vector, new_paths_vector, text_indexes = divide_texts(
    name_vector, ratings_vector, categories_vector, ids_vector, n=10)
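# divide_texts presumably splits each document into n=10 chunks and expands the rating/category/id vectors to match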

ratings_vector = new_ratings_vector
ids_vector = new_ids_vector

scores_auc = []
scores_rmse = []
for i in range(10):
    cv_results_file = '../results/cv-generated-data-r-10-n-02-z-random-' + str(
        i) + '.csv'
    selection = ObjectSelection(show_selection_results=False,
                                selection_algorithm='random')
    selection.transform(ids=ids_vector,
Ejemplo n.º 48
0
"""
@author: Admin
"""
import torch
from load_data import load_data
from learning_function import learning_function
from torchsummary import summary
from plot import plot
from Unet import UNet
from ict import ICT
from config import config
import transform

#####################################################################################################
######################################## load data ##################################################
#####################################################################################################
l_train = load_data("data", "l_train")
u_train = load_data("data", "u_train")
test = load_data("data", "test")

#####################################################################################################
################################## transformation  ##################################################
#####################################################################################################
transform_fn = transform.transform(*config["transform"])

#####################################################################################################
#################################### student model ##################################################
#####################################################################################################
S_model = UNet(2, transform_fn)
#summary(S_model, (3, 480 ,640))

#####################################################################################################
Ejemplo n.º 49
0
for layer in base_model.layers:
    layer.trainable = False


# YOLO v1-style loss on x, y, w, h
def loss(y_true, y_pred):
    a = K.abs(y_pred[:, 0] - y_true[:, 0]) + K.abs(y_pred[:, 1] - y_true[:, 1])
    b = K.abs(K.sqrt(y_pred[:, 2]) - K.sqrt(y_true[:, 2])) + K.abs(K.sqrt(y_pred[:, 3]) - K.sqrt(y_true[:, 3]))
    value = K.mean(a + b, axis=-1)
    return value
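# Note: unlike YOLO v1's squared-error terms, this loss uses absolute errors, and K.sqrt of a negative width/height prediction yields NaN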


model.compile(optimizer='rmsprop', loss=loss)

# Fine-tune the fully connected layers
x_train, y_train = load_data()
print(x_train.shape, y_train.shape)
model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=1)

# Second round of fine-tuning
# make_data_set()
# x_train, y_train = load_data()
# for layer in model.layers[:11]:
#     layer.trainable = True
# for layer in model.layers[11:]:
#     layer.trainable = True
for layer in model.layers:
    layer.trainable = True
model.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss=loss)
model.fit(x_train, y_train, epochs=4, batch_size=16, verbose=1)
Ejemplo n.º 50
0
def main():
    # camera matrix
    fx = 3551.342810
    fy = 3522.689669
    cx = 2033.513326
    cy = 1455.489194
    

    # fx = 718.8560
    # fy = 718.8560
    # cx = 607.1928
    # cy = 185.2157

    K = np.float64([[fx, 0, cx], 
                    [0, fy, cy], 
                    [0, 0, 1]])
    
    D = np.float64([-0.276796, 0.113400, -0.000349, -0.000469])
    
    #load images
    # dataset1_dir = '/home/linjian/datasets/Data_trajectory/2018-08-21/22_47_20_load/'
    # dataset2_dir = '/home/linjian/datasets/Data_trajectory/2018-08-22/'
    # dataset1_dir ='/home/linjian/dataset/docking_dataset/image/Data_trajectory/2018-08-21/22_47_20_load/'
    dataset1_dir = '/home/linjian/dataset/docking_dataset/image/Data_trajectory/2018-08-22/16h-26m-42s load/'
    filelist1 = glob.glob(dataset1_dir+'*.jpg')
    filelist1 = sorted(filelist1)
    img_num = len(filelist1)

    #load the scale (wheel speed) for each frame
    loaded_data = load_data(dataset1_dir)
    scale = loaded_data.get_speed()

    #initialization
    rotation_array =[]
    transformation_array =[]
    pose_array =[]
    R = np.eye(3)
    t = np.zeros((1, 3))
    rotation_array.append(R)
    transformation_array.append(t)
    pose_array.append(t)
    
    #bag of virtual words init
    detector = cv2.ORB_create()
    bovw_class = bovw(detector)

    #init loop closure class
    loopclosure_class = loopclosure()

    #init the relative scale list
    relative_scale_list = []

    #init keypoints and descriptors
    keypoints_list = []
    descriptors_list = []

    #keyframe flags


    #initialize input images 
    img1 = cv2.imread(filelist1[0])
    img2 = cv2.imread(filelist1[1]) 
    keyframe_index = 1
    # for i in range(0,50): 
    for i in range(1,img_num):
   
        #initialize matching class with camera parameters
        matching_class = matching(K,D)
        #insert images 
        matching_class.load_image(img1,img2)
        #create a detector
        detector = cv2.ORB_create()

        #scan matching
        enough_match,matches = matching_class.match_images(detector)
        if matches == -1 or scale[i-1] < 0.01:
            print('not a good keyframe')
            keyframe_index = keyframe_index+1
            if keyframe_index >img_num-1:
                break
            img2 = cv2.imread(filelist1[keyframe_index])
            continue
        kp1_match,kp2_match = matches
        keypoints_list.append(matching_class.kp1)
        descriptors_list.append(matching_class.des1)

        #calculate the relative scale
        try:
            relative_scale = comput_relative_scale(kp1_match,kp2_match)
            relative_scale_list.append(relative_scale)
            print("for the ",i,"image relative scale is ",relative_scale)
            print("for the ",i,"image absolute scale is ",scale[i-1])
            print("for the ",i,"image calculated absolute scale is ",scale[i-2]/relative_scale_list[i-1]*relative_scale)      
        except:
            print("An exception occurred")
        #add into bovw
        bovw_class.add_histogram(matching_class.des1)

        #calculate rotation and transformation
        dR = matching_class.getRotation()
        rotation_array.append(dR)
        dt = np.transpose(matching_class.getTransformation())
        transformation_array.append(dt)
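        # accumulate the pose: chain the rotation and scale the translation step by the wheel-odometry speed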
        R = dR.dot(R)
        t = t+dt.dot(R)*scale[i-1]
        pose_array.append(t)

        #find loop closure
        lc_index,lc_cost = bovw_class.find_lc(matching_class.des2)
        print(lc_cost)
        if (lc_cost < 0.01):
            img_lc = cv2.imread(filelist1[lc_index])
            cv2.imshow('Loop closure matched',img_lc)
            #scale calculation: first compute the good matches, then the relative scale
            # lc_scale = comput_relative_scale(,)
            # print('lc scale is ', scale[i-2]/relative_scale_list[i-1]*lc_scale)
        cv2.waitKey(1)  
        img1 = img2
        keyframe_index = keyframe_index+1
        if keyframe_index >img_num-1:
            break
        img2 = cv2.imread(filelist1[keyframe_index])             


    bovw_class.save_bovw_lib()
    save_to_pickle(filelist1,"image_file_list")
    #convert lists to array
    rotation_array = np.asarray(rotation_array)
    transformation_array = np.asarray(transformation_array)
    pose_array=np.asarray(pose_array)
    mapmax = np.amax(pose_array) +2
    mapmin = np.amin(pose_array) -2
    #plot
    # plot_camera_pose3d(pose_array)
    # plot_camera_pose2d(pose_array)
    plot_pose(pose_array,mapmax,mapmin)
    print('there are', len(pose_array), 'camera poses')
Ejemplo n.º 51
0
    # option
    parser.add_argument('-snapshot',
                        type=str,
                        default=None,
                        help='filename of model snapshot [default: None]')
    parser.add_argument('-predict',
                        type=str,
                        default=None,
                        help='predict the sentence given')
    parser.add_argument('-test',
                        action='store_true',
                        default=False,
                        help='train or test')
    args = parser.parse_args()
    # load data
    load_data(load_path)
    '''
    '''
    print("\nLoading data...")
    issue1_field = data.Field(lower=True)
    issue2_field = data.Field(lower=True)
    label_field = data.Field(sequential=False)
    pairid_field = data.Field(lower=True)
    train_data, dev_data, test_data = mydatasets.MR.splits(
        issue1_field, issue2_field, label_field, pairid_field)

    issue1_field.build_vocab(train_data, dev_data, test_data)
    issue2_field.build_vocab(train_data, dev_data, test_data)
    label_field.build_vocab(train_data, dev_data, test_data)
    pairid_field.build_vocab(train_data, dev_data, test_data)
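    # vocabularies are built over train + dev + test so every token seen at evaluation time has an index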
    print(len(train_data), len(dev_data), len(test_data))
Ejemplo n.º 52
0
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from load_data import load_data
from moving_average import moving_average

N = 365

if __name__ == "__main__":
    _, dates, data = load_data()
    temperature = data[:, 0]
    temperature_max = data[:, 1]
    precipitation = data[:, 3]

    first_year, last_year = dates[0].year, dates[-1].year

    x = range(first_year, last_year + 1)
    y = range(0, 365)

    dt2day_of_year = lambda dt: dt.timetuple().tm_yday

    result = np.empty((len(y), len(x)))
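    # one column per year, one row per day of year; day 366 in leap years is skipped and missing days keep np.empty's uninitialized values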
    for i, val in enumerate(precipitation):
        dt = dates[i]
        x_i = dt.year - first_year
        y_i = dt2day_of_year(dt) - 1
        if y_i < 365:
            result[y_i, x_i] = val

    result = np.log10(result)
Ejemplo n.º 53
0
    return data


def getRangeList(data):
    data = sorted([_parse_range(d) for d in data])
    datarange = [Range(*d) for d in data]
    datarange = _parse_intvls(datarange)
    return datarange


def _count_allowed(data):
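    # count IPs outside the merged ranges: below the first range, in each gap between consecutive ranges, and above the last (addresses run 0 .. 2**32 - 1)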
    ctr = data[0].a - 0
    for j in range(len(data) - 1):
        ctr += data[j + 1].a - data[j].b - 1
    ctr += 2**32 - 1 - data[-1].b
    return ctr


if __name__ == "__main__":
    data = load_data('./input/day20.txt')
    data = getRangeList(data)

    print('Part 1')
    print('\tFirst IP is: {}.'.format(data[0].b + 1))
    # Answer: 32259706

    print('\nPart 2')
    ctr = _count_allowed(data)
    print('\tNumber allowed IPs: {}.'.format(ctr))
    # Answer: 113
Ejemplo n.º 54
0
def lstm_model_headline_body_combin(body_length, numb_epoch):
    fexc = Preprocessing()
    data = load_data()

    # Loading train data from files
    data.set_path(path='fnc-1-master')
    train_stance_data = data.get_headline_body_stance()
    train_bodies_data = data.get_body_id_text()
    train_headlines, train_bodies, train_stances = data.get_mapped_id_body(
        train_stance_data, train_bodies_data)

    # Removing punctuation and stop words from the headline and body of train data
    train_headlines_cl = fexc.get_clean_data(train_headlines)
    train_bodies_cl = fexc.get_clean_data(train_bodies)
    train_stances_cl = fexc.get_clean_data(train_stances)

    # Convert labels to integer
    train_stances_in = fexc.convert_lable_int(train_stances_cl)

    # Load the test data
    data.set_name("test")
    test_stance_data = data.get_headline_body_stance()
    test_bodies_data = data.get_body_id_text()
    test_headlines, test_bodies = data.get_mapped_id_body(test_stance_data,
                                                          test_bodies_data,
                                                          data_type="test")

    # Removing punctuation and stop words from the headline and body of test data
    test_headlines_cl = fexc.get_clean_data(test_headlines)
    test_bodies_cl = fexc.get_clean_data(test_bodies)

    # Remove Stop words #
    test_headlines_cl = fexc.remove_stop_words_list(test_headlines_cl)
    test_bodies_cl = fexc.remove_stop_words_list(test_bodies_cl)

    # Set the tokenizer
    alltext = train_headlines_cl + train_bodies_cl + test_headlines_cl + test_bodies_cl
    token = Tokenizer(num_words=30000)
    token.fit_on_texts(alltext)
    print('Number of Unique words: ' + str(len(token.word_index.keys())))
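    # fitting the tokenizer on train + test text keeps a single shared word index across both splits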

    # Combine the headline and bodies of training data
    train_data = fexc.combine_heading_body(train_headlines_cl, train_bodies_cl)
    word_index = token.word_index

    # Converting train data to sequence
    train_data = token.texts_to_sequences(train_data)

    # Padding train data
    train_data = pad_sequences(train_data,
                               maxlen=(MAX_HEADLINE_LENGTH + int(body_length)))

    # Converting the labels to one hot encoder
    onehotencoder = OneHotEncoder()
    train_stances_in = onehotencoder.fit_transform(train_stances_in).toarray()

    # Splitting the data in train and validation
    train_data, val_data, train_stances_final, stances_val = \
        train_test_split(train_data, train_stances_in, test_size=0.2, random_state=42)

    # Combining test data
    test_data = fexc.combine_heading_body(test_headlines_cl, test_bodies_cl)

    # Converting test data to sequence
    test_data = token.texts_to_sequences(test_data)

    # Padding test data
    test_data = pad_sequences(test_data,
                              maxlen=MAX_HEADLINE_LENGTH + int(body_length))

    # Getting embedding index
    embeddings_index = models.get_embeddings_index(GLOVE_DIR)

    print('Found %s word vectors.' % len(embeddings_index))

    # Getting embedding matrix
    embedding_matrix = models.get_embedding_matrix(
        embedding_dim=EMBEDDING_DIM,
        embeddings_index=embeddings_index,
        word_index=word_index)

    # Building the Model
    fake_nn = models.lstm_with_combine_headline_body(
        headline_length=MAX_HEADLINE_LENGTH,
        body_length=int(body_length),
        embedding_dim=EMBEDDING_DIM,
        word_index=word_index,
        embedding_matrix=embedding_matrix,
        activation='relu',
        drop_out=0.5,
        numb_layers=300,
        cells=200)

    # Early stopping and model checkpoint
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    bst_model_path = 'Fake_news_nlp.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path,
                                       save_best_only=True,
                                       save_weights_only=True)

    # Fitting the model
    fake_hist = fake_nn.fit(train_data,
                            train_stances_final,
                            batch_size=128,
                            epochs=int(numb_epoch),
                            shuffle=True,
                            validation_data=(val_data, stances_val),
                            callbacks=[early_stopping, model_checkpoint])

    # Storing the training and validation accuracy and loss in file for plot
    lstm_data = []
    with open(
            os.path.join(
                OBJECT_DUMP,
                "lstm_headline_body_combine" + str(body_length) + ".txt"),
            'wb') as bow_hist:
        lstm_data.append(fake_hist.history['acc'])
        lstm_data.append(fake_hist.history['val_acc'])
        lstm_data.append(fake_hist.history['loss'])
        lstm_data.append(fake_hist.history['val_loss'])
        pickle.dump(lstm_data, bow_hist)

    # Predict the labels for test data
    result = fake_nn.predict([test_data], batch_size=128)

    # Store the results in the result file
    result_str = fexc.convert_lable_string(result)
    with io.open(TEST_FILE, mode='r', encoding='utf8') as read_file:
        test_stance = csv.DictReader(read_file)
        with io.open(RESULT_FILE + "_" + str(body_length) + ".csv",
                     mode='w',
                     encoding='utf8') as write_file:
            writer = csv.DictWriter(
                write_file, fieldnames=['Headline', 'Body ID', 'Stance'])
            writer.writeheader()
            for sample, prediction in zip(test_stance, result_str):
                writer.writerow({
                    'Body ID': sample['Body ID'],
                    'Headline': sample['Headline'],
                    'Stance': prediction
                })

            # Print the Accuracy, competition score and confusion matrix
            print_result("fnc-1-master/competition_test_stances.csv",
                         RESULT_FILE + "_" + str(body_length) + ".csv")
Ejemplo n.º 55
0
            x = add([Lambda(slice_last)(x), x_rnn])
    return x


if __name__ == '__main__':
    # Example usage
    from keras.layers import Input, Dense, Dropout
    from keras.models import Model
    from keras.callbacks import ReduceLROnPlateau
    from keras.optimizers import SGD
    from load_data import load_data
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    import keras  # needed for keras.regularizers.l2 used below

    x, y = load_data()

    input = Input(shape=(200, 84))
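    # assumes load_data returns 200-step sequences with 84 features per step, matching this Input shape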
    output = make_residual_lstm_layers(input,
                                       rnn_width=128,
                                       rnn_depth=4,
                                       rnn_dropout=0.4)
    output = Dropout(0.4)(output)
    output = Dense(2,
                   activation='softmax',
                   kernel_regularizer=keras.regularizers.l2(0.02))(output)

    model = Model(inputs=input, outputs=output)

    model.compile(optimizer=SGD(0.01, nesterov=True),
                  loss='categorical_crossentropy',
Ejemplo n.º 56
0
    args = paras()
    args.train_path = 'data/train.csv'
    args.dev_path = 'data/dev.csv'
    args.test_path = in_path
    args.to_test_path = 'data/to_test.csv'
    args.w2v_model_path = 'data/w2v_train.save'
    args.data_path = 'data/atec_nlp_sim_train.csv'
    args.res_path = out_path
    # load data
    # text_field, label_field, train_data, train_iter,\
    #     dev_data, dev_iter = load_data(args)

    # load data
    text_field, label_field, train_data, train_iter,\
        dev_data, dev_iter, test_data, test_iter = load_data(args)

    # text_field.build_vocab(train_data, dev_data)

    args.embed_num = 7563
    args.embed_dim = 300
    args.word_Embedding = True

    embedding_dict = Word2Vec.load(args.w2v_model_path)
    word_vec_list = []
    oov = 0
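    # look up a pretrained vector for each vocabulary word; words missing from the w2v model fall into the except branch (presumably counted as OOV)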
    for idx, word in enumerate(text_field.vocab.itos):
        try:
            vector = np.array(embedding_dict[str(word.encode('utf-8'))],
                              dtype=float).reshape(1, args.embed_dim)
        except:
Ejemplo n.º 57
0
from sklearn import linear_model
from sklearn import kernel_ridge
from sklearn import svm
from load_data import load_data
from write_submission import write_submission
import numpy as np
#from matplotlib import pyplot as plt
from expand_features import expand_features


def rmse(predictions, targets):
    return np.sqrt(((predictions - targets)**2).mean())


# load data
[Xtr, Ytr, Xte, testID] = load_data()

# expand features
Xtr_expanded = expand_features(Xtr)
Xte_expanded = expand_features(Xte)

print('Xtr shape', Xtr_expanded.shape, 'Ytr shape', Ytr.shape, 'Xte shape',
      Xte.shape)

clf = linear_model.RidgeCV(alphas=[1e-3, 1e-2, 1e-1],
                           normalize=True,
                           store_cv_values=True).fit(Xtr_expanded, Ytr)
ridge_preds = clf.predict(Xtr_expanded)
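# note: these are in-sample predictions, so the RMSE below is a training error rather than a held-out estimate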
print('RMSE Ridge:', rmse(ridge_preds, Ytr))
print(clf.alpha_)
Ejemplo n.º 58
0
import numpy
from load_data import load_data
from nn import nn
from ova import ova
from pca import pca
from rfe import rfe
from tsne import tsne
# import json
# numpy.set_printoptions(threshold=1000000)

train_values, train_values_rfe, train_classes, train_classes_binary, test_values, test_values_rfe, test_classes, test_classes_binary, class_desc\
    = load_data()

all_values = numpy.concatenate((train_values, test_values))
all_classes = numpy.concatenate((train_classes, test_classes))
all_classes_binary = numpy.concatenate(
    (train_classes_binary, test_classes_binary))

all_values_rfe = numpy.concatenate((train_values_rfe, test_values_rfe))
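# train and test are concatenated here, presumably so unsupervised steps (PCA, t-SNE) can be run over the full dataset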

# print(all_values_rfe.shape)
# print(all_values_rfe)

# print(all_values_rfe[:,13])
# print(all_values[:,46])

# print(all_values)
# print(all_classes)

# t-sne on train data
# tsne(train_values, train_classes, class_desc, 0)
Ejemplo n.º 59
0
from sklearn.metrics import roc_curve, auc
import tensorflow.keras.backend as K
np.random.seed(2020)
import hyper_params as hp
from dl_models import build_model_predict
from dl_models import build_model_ae
from load_data import load_data, impute_data, calc_impute_values

if __name__ == '__main__':
    # load dataset
    hp = hp.create_hparams()
    op_mode = hp.op_mode

    print(hp.outcome)
    pdirname = os.path.dirname(__file__)
    clin_params, outcomes, patients_id = load_data(hp.outcome, pdirname + hp.dataset_path) # Load dataset
    orig_clin_params = clin_params  # imputation will change clin_params, so keep the original under a separate name

    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.30, random_state=0)  #prepare cross-validation: training/validation (70%) + test (30%)
    sss2 = StratifiedShuffleSplit(n_splits=10, test_size=0.28, random_state=0) #=0.28x0.7 (20%) Validation + (50%) Training
    print('Positives = {:.1f}'.format(np.sum(outcomes))+'   Negatives = {:.1f}'.format(np.sum(1-outcomes))+' Total = {:.1f}'.format(len(outcomes)))

    ###################################################################################################
    ## Stage 1: PRE-TRAINING of AutoEncoder
    ###################################################################################################
    if op_mode == 'pretrain':
        idx = 0
        for train_valid_index, test_index in sss1.split(clin_params, outcomes):  # dummy for-loop: only one split (test vs train/valid) is performed
            trv_params = orig_clin_params[train_valid_index]  # 70% of data; the other 30% are kept separate; not used for train or valid at all
            trv_outcomes = outcomes[train_valid_index]        # 70% of data
            for train_index, valid_index in sss2.split(trv_params, trv_outcomes): # for-loop on the 10-fold cross-validations
Ejemplo n.º 60
0
import pandas as pd
from xgboost import XGBClassifier
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder
import os
from os import path, makedirs
from azureml.logging import get_azureml_logger
from sklearn.model_selection import GridSearchCV
import feature_engineering as fe
from load_data import load_data

# load data
app_events, app_labels, events, gender_age_train, gender_age_test, label_categories, brand_model = load_data(
)

# initialize logger
run_logger = get_azureml_logger()
run_logger.log("amlrealworld.distributed-tuning.single-vm", "true")

# joblib's default temporary directory is too small, so point it elsewhere
os.environ["JOBLIB_TEMP_FOLDER"] = "/tmp"

#################################################################
# Feature engineering
#################################################################

# Create one-hot encoding of brand and model
train_brand, test_brand, train_model, test_model = fe.one_hot_brand_model(
    brand_model, gender_age_train, gender_age_test)
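# fe.one_hot_brand_model presumably returns one-hot brand/model features aligned with the train and test device ids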

# Create weekday and hour features (represented using one-hot encoding)