Example #1
0
def main():
    
    #load data
    df = load_data('../../assignment10_data/restaurants.csv', ['CAMIS','BORO','GRADE','GRADE DATE'])
    df = clean_data(df) #clean data
    
    #question 4
    sum_nyc, sum_boro = grade_sum(df) #calculate sum of test_grade in nyc and in each borough
    print 'The sum of test_grade in NYC is: {} \n'.format(sum_nyc)
    print 'The sum of test_grade in each borough is: \n {}'.format(sum_boro)
    
    #question 5
    grade_overtime_plot(df, 'nyc') #grade overtime plot for nyc
    #grade overtime plot for each borough
    for borough in ['BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND']:
        df_boro = df[df['BORO'] == borough]
        grade_overtime_plot(df_boro, borough.lower())
    
    #question 6
    df1 = load_data('../../assignment10_data/restaurants.csv', ['CAMIS','CUISINE DESCRIPTION'])
    type_name = get_top_10_nyc(df1)
    df2 = load_data('../../assignment10_data/restaurants.csv', ['CAMIS','CUISINE DESCRIPTION', 'GRADE', 'GRADE DATE'])
    df2 = clean_data(df2)
    df2 = df2[df2['CUISINE DESCRIPTION'].isin(type_name)]
    df_sum = top_10_grade_overtime(df2, type_name) #calculate score overtime for each restaurant type
    top_10_plot(df_sum) #score overtime plot
    top_10_colormap(df_sum) #plot correlation between any two restaurant types in NYC in color map
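The grade_sum helper referenced above is not shown; a minimal sketch, assuming clean_data() has already mapped the letter GRADE column to a numeric 'test_grade' column (an assumption, not part of the original example), could look like:

def grade_sum(df):
    # Hypothetical sketch of the helper described by the comment above.
    sum_nyc = df['test_grade'].sum()                     # city-wide total
    sum_boro = df.groupby('BORO')['test_grade'].sum()    # per-borough totals
    return sum_nyc, sum_boro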
Example #2
0
def main():
    print("Loading the classifier")
    classifier = utility.load_model("fullsgd_model_rev{}".format(revision))

    print("Reading in the training data")
    train = utility.load_data("training", "finalinput")
    truth = train['votes_useful_log']
    del train['votes_useful_log']

    print("Predicting the training data")
    logpred = np.ravel(classifier.predict(train.values[:,1:]))
    score = utility.rmsle_log(logpred, truth)
    print "Score:", score

    print("Reading in the test data")
    test = utility.load_data("test", "finalinput")
    del test['votes_useful_log']

    print("Predicting the test data")
    logpred = np.ravel(classifier.predict(test.values[:,1:]))
    pred = np.exp(np.array(logpred, dtype=np.float64)) - 1
    test['votes'] = pred
    
    print("Writing out a new submission file")
    utility.write_submission(test, "fullsgd_sub_rev{}.csv".format(revision))
Example #3
0
def main():
    revision = 1

    print("Loading the classifier")
    classifier = utility.load_model("train_rtext_rev{}".format(revision))
    
    print("Reading in the training data")
    train = utility.load_data("training", "rtext")

    print("Predicting the rest of the training data")
    pred = np.ravel(classifier.predict(list(train['rtext_bcat'])))
    score = utility.rmsle_log(pred, train['votes_useful_log'])
    print "Score:", score

    print("Writing out new training data")
    del train['rtext_bcat']
    train['votes_useful_log_rtextpred_sgd'] = pd.Series(pred, index=train.index)
    utility.save_data(train, "training", "rtext_sgd_rev{}".format(revision))
    
    print("Reading in the test data")
    test = utility.load_data("test", "rtext")
    tepred = np.ravel(classifier.predict(list(test['rtext_bcat'])))

    print("Writing out new test data")
    del test['rtext_bcat']
    test['votes_useful_log_rtextpred_sgd'] = pd.Series(tepred, index=test.index)
    utility.save_data(test, "test", "rtext_sgd_rev{}".format(revision))
    test['votes'] = pd.Series(np.exp(tepred) - 1, index=test.index)  # invert the log1p transform (votes_useful_log = log(votes + 1))

    print("Writing out a new submission file")
    utility.write_submission(test, "rtextsgd_sub_rev{}".format(revision))
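utility.rmsle_log is not shown in these snippets; a plausible sketch, assuming both arguments are already log1p-transformed vote counts (as the 'votes_useful_log' column suggests), is simply the RMSE of the logged values:

import numpy as np

def rmsle_log(log_pred, log_truth):
    # Hypothetical sketch: RMSLE reduces to plain RMSE when the inputs are already log1p-transformed.
    log_pred = np.asarray(log_pred, dtype=np.float64)
    log_truth = np.asarray(log_truth, dtype=np.float64)
    return np.sqrt(np.mean((log_pred - log_truth) ** 2))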
Example #4
0
def main():
    """
    This function presents the results of this assignment.
    Users are asked whether they want to see:
    1)Income distribution across all countries for a given year:
      Users need to input a year from 1800 to 2012.
      Results will be saved as a .png file.
    2)Income distribution by region in recent years:
      Users need to input the first year, the last year and a year gap for the year range, and select a plot type, boxplots or histograms.
      Results will be saved as a .pdf file.
    """

    #load countries and income data
    countries = load_data('countries.csv')
    income = load_data('indicator gapminder gdp_per_capita_ppp.csv')
    #transform income data set
    income = trans_data(income)

    try:
        while raw_input('To see income distribution across all countries? (y/n) ') == 'y':
            try:
                year = raw_input('Which year? ') #select a year
                income_distr(income, year)
            except:
                print 'Please input a year from 1800 to 2012'
        
        while raw_input('To see income distribution by region in recent years? (y/n) ') == 'y':
            try:
                from_year = int(raw_input('From which year? ')) #input the first year
                to_year = int(raw_input('To which year? ')) #input the last year
                year_gap = int(raw_input('Year gap? ')) #input a year gap
                pltype = raw_input('Plot type: boxplots or histograms? (b/h) ') #select a plot type
                if pltype == 'b':
                    pp = PdfPages('results/Income by region from {0} to {1}_boxplot.pdf'.format(from_year, to_year)) #create a pdf file to save plots
                    for i in xrange(from_year, to_year+1, year_gap):
                        fig = income_region(1,str(i))
                        pp.savefig(fig)
                elif pltype == 'h':
                    pp = PdfPages('results/Income by region from {0} to {1}_hist.pdf'.format(from_year, to_year))
                    for i in xrange(from_year, to_year+1, year_gap):
                        fig = income_region(0, str(i))
                        plt.suptitle('{}'.format(i))
                        pp.savefig(fig)
                pp.close() #close the pdf file
            except:
                print 'please input years from 1800 to 2012 and try again!'

    except(KeyboardInterrupt):
        print 'Bye!'
        sys.exit()
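The income_distr helper described in the docstring is not shown; a minimal sketch, assuming trans_data leaves the table indexed by year with one column per country and that matplotlib is available, might be:

import matplotlib.pyplot as plt

def income_distr(income, year):
    # Hypothetical sketch: histogram of GDP per capita across all countries for the
    # given year, saved as a .png as the docstring describes.
    values = income.loc[year].dropna()
    fig, ax = plt.subplots()
    values.hist(ax=ax, bins=30)
    ax.set_title('Income distribution across countries in {}'.format(year))
    ax.set_xlabel('GDP per capita (PPP)')
    fig.savefig('results/income_distribution_{}.png'.format(year))
    plt.close(fig)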
Example #5
0
    def on_load(self):
        self.savable_environment = utility.load_data("lisp_state")

        if not self.savable_environment:
            self.savable_environment = Environment(self.globals)
        else:
            self.savable_environment.parent = self.globals
Example #6
0
def test_without_changes():
    """
    Plots the rectangles on top of the image without converting to the 5-D representation.
    Useful only for debugging the 5-D transform.
    Returns:

    """
    path = "../debug_dataset"  # Only add a couple of pictures to this path
    images, pos_rectangles, neg_rectangles = load_data(path)
    df = pd.DataFrame(columns=["filenames", "p1", "p2", "p3", "p4"])
    for filename in pos_rectangles.filenames.unique():
        points = pos_rectangles[pos_rectangles["filenames"] == filename]
        points = points.loc[:, ['x', 'y']]
        for i in range(0, len(points), 4):
            x1, y1 = points.iloc[i][0], points.iloc[i][1]
            x2, y2 = points.iloc[i + 1][0], points.iloc[i + 1][1]
            x3, y3 = points.iloc[i + 2][0], points.iloc[i + 2][1]
            x4, y4 = points.iloc[i + 3][0], points.iloc[i + 3][1]
            new_row = [
                filename,
                np.asarray([int(x1), int(y1)]),
                np.asarray([int(x2), int(y2)]),
                np.asarray([int(x3), int(y3)]),
                np.asarray([int(x4), int(y4)])
            ]
            df.loc[len(df)] = new_row
    print(df)
    for i, j in images.iterrows():
        rectangles = df[df["filenames"] == j["filenames"]]
        plot(j["images"], j["filenames"], rectangles)
Example #7
0
def run(train_file, test_file, output_file):
    train, labels, test = utils.load_data(train_file, test_file)

    clf = XGBoost(max_iterations=500, max_depth=12, min_child_weight=4.9208250938262745,
                  row_subsample=.9134478530382129, min_loss_reduction=.5132278416508804,
                  column_subsample=.730128689911957, step_size=.1)
    clf.fit(train, labels)
    predictions = clf.predict_proba(test)
    utils.save_prediction(output_file, predictions)
Example #8
0
def optimal_svm(optimal_c):
    """
    This function calculates the AUC for the optimal C chosen by model selection.
    """
    
    #load datasets
    train_X, train_y = load_data('train_X.csv', 'train_y.csv')
    test_X, test_y = load_data('test_X.csv', 'test_y.csv')
    train_X_pca = data_pca(0.95, train_X, train_X)
    test_X_pca = data_pca(0.95, train_X, test_X)
    train_y = np.array(train_y).ravel()
    test_y = np.array(test_y).ravel()
    #set up model with the optimal C
    my_svm = svm.SVC(kernel='linear', C=optimal_c, class_weight='auto')
    predicted_y = my_svm.fit(train_X_pca,train_y).decision_function(test_X_pca)
    fpr, tpr, tr = roc_curve(test_y, predicted_y)
    
    print auc(fpr, tpr)
Example #9
0
def main():
    revision = 4

    print("Loading the classifier")
    classifier = utility.load_model("train_rtext_rev{}".format(revision))
    
    print("Reading in the training data")
    train = utility.load_data("training", "rtext")

    print("Predicting the rest of the training data")
    bunch = 50000
    pred = np.zeros(len(train))
    for ibunch in range(int(len(train) / bunch)) :
        beg = ibunch * bunch
        end = (ibunch + 1) * bunch
        mtrain = train.ix[beg:end - 1]
        mpred = np.ravel(classifier.predict(list(mtrain['rtext_bcat'])))
        pred[beg:end] = mpred

    beg = int(len(train) / bunch) * bunch
    mtrain = train.ix[beg:]
    mpred = np.ravel(classifier.predict(list(mtrain['rtext_bcat'])))
    pred[beg:] = mpred

    score = utility.rmsle_log(pred, train['votes_useful_log'])
    print "Score:", score

    print("Writing out new training data")
    del train['rtext_bcat']
    train['votes_useful_log_rtextpred'] = pd.Series(pred, index=train.index)
    utility.save_data(train, "training", "rtext_rev{}".format(revision))
    
    print("Reading in the test data")
    test = utility.load_data("test", "rtext")
    tepred = np.ravel(classifier.predict(list(test['rtext_bcat'])))

    print("Writing out new test data")
    del test['rtext_bcat']
    test['votes_useful_log_rtextpred'] = pd.Series(tepred, index=test.index)
    utility.save_data(test, "test", "rtext_rev{}".format(revision))
    test['votes'] = pd.Series(np.exp(tepred) - 1, index=test.index)  # invert the log1p transform (votes_useful_log = log(votes + 1))

    print("Writing out a new submission file")
    utility.write_submission(test, "rtextrf_sub_rev{}.csv".format(revision))
Example #10
0
def train_and_evaluate(output_dir, hparams):
    # Main orchestrator of training and evaluation, calling models from estimator_model.py
    shutil.rmtree(output_dir, ignore_errors = True)
    tf.compat.v1.summary.FileWriterCache.clear()

    # Set log configuration, export to local file
    date_string = datetime.datetime.now().strftime("%m%d_%H%M")
    filename = 'training log/train_estimator_log_' + date_string + '.txt'
    logging.basicConfig(filename=filename, level=20)

    ((train_text, train_label), (eval_text, eval_label)) = utility.load_data("warehouse/store/", CLASSES)


    # Create vocabulary from training corpus 
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE+1, oov_token='<unk>')
    tokenizer.fit_on_texts(train_text)

    # Save token dictionary to use during prediction time
    # saving
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Create estimator config
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=EVAL_INTERVAL,
                                        log_step_count_steps = 20,
                                        save_summary_steps = 50
                                    )
    estimator = cnn_estimator(hparams['model_version'], output_dir, run_config,
                                hparams['learning_rate'],
                                hparams['grad_clip_rate'],
                                EMBEDDING_DIM,
                                hparams['filters'],
                                hparams['dropout_rate'],
                                hparams['kernel_size'],
                                hparams['pool_size'],
                                hparams['strides'],
                                hparams['padding_type'],
                                hparams['fc_layer_nodes'],
                                hparams['growth_rate'],
                                word_index=tokenizer.word_index,
                                embedding_path=hparams['embedding_path'])

    train_steps = hparams['num_epochs'] * len(train_text) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(input_fn=input_function(train_text, train_label, tokenizer,
            hparams['batch_size'], mode=tf.estimator.ModeKeys.TRAIN), max_steps=train_steps)

    # Create exporter configuration
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(input_fn=input_function(eval_text, eval_label, tokenizer,
            hparams['batch_size'], mode=tf.estimator.ModeKeys.EVAL), 
            steps=None, 
            exporters=exporter,
            throttle_secs = 5)  # evaluate every N seconds

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    return True
Example #11
0
def predict(config_file):
    """
    Main function that runs predictions

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """
    ##################
    # configure logger
    ##################
    logger = set_logger("./log/predict.log")

    ##################
    # Load config from config file
    ##################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)

    model_path = Path(config["predict"]["model_path"])
    processed_test = config["predict"]["processed_test"]
    predicted_file = config["predict"]["predicted_file"]
    export_result = config["predict"]["export_result"]

    logger.info(f"config: {config['predict']}")

    ##################
    # Load model & test set
    ##################
    # Load model
    logger.info(
        f"-------------------Load the trained model-------------------")
    with open(model_path, "rb") as f:
        trained_model = load(f)

    # Load test set
    logger.info(f"Load the test data from {processed_test}")
    X, y, cols = load_data(processed_test)
    logger.info(f"cols: {cols}")
    logger.info(f"X: {X.shape}")
    logger.info(f"y: {y.shape}")

    ##################
    # Make prediction and evaluate
    ##################
    logger.info(f"-------------------Predict and evaluate-------------------")
    y_hat = trained_model.predict(X)
    logger.info(f"Classification report: \n {classification_report(y, y_hat)}")
    output = pd.DataFrame(y)
    output["prediction"] = y_hat
    if export_result:
        output.to_csv(predicted_file, index=False)
        logger.info(f"Export prediction to : {predicted_file}")
Example #12
0
def update_history(opt: Namespace, epoch: int, loss: float,
                   split: str) -> dict:

    train_hist = utility.load_data(os.path.join(opt.save, 'train_hist.pt'))
    val_hist = utility.load_data(os.path.join(opt.save, 'val_hist.pt'))

    if split == 'train':
        train_hist[epoch] = loss
        torch.save(train_hist, os.path.join(opt.save, 'train_hist.pt'))
        with open(os.path.join(opt.save, 'train_hist'), 'w') as f:
            f.write(utility.hist_to_str(train_hist))
        return train_hist
    elif split == 'val':
        val_hist[epoch] = loss
        torch.save(val_hist, os.path.join(opt.save, 'val_hist.pt'))
        with open(os.path.join(opt.save, 'val_hist'), 'w') as f:
            f.write(utility.hist_to_str(val_hist))
        return val_hist
    else:
        logging.error('Unknown split: ' + split)
Example #13
0
def main():
    in_arg = utility.get_input_args()
    print(in_arg)
    device = utility.set_device_type(in_arg.gpu)
    image_datasets, dataloaders = utility.load_data(in_arg.data_dir)
    cat_to_name = utility.load_category_map()
    model, criterion, optimizer = modelbuilder.build_model(in_arg, device)

    train_model(model, dataloaders['train'], dataloaders['valid'],
                in_arg.epochs, 40, criterion, optimizer, device)

    modelbuilder.save_model(in_arg, model, image_datasets['train'], optimizer)
Example #14
0
def test_load_data():
    X, y, cols = load_data("data/winequality.csv")

    # type check
    assert isinstance(X, np.ndarray)
    assert isinstance(y, pd.core.series.Series)
    assert isinstance(cols, list)

    # shape check
    assert X.shape[0] == y.shape[0], "# row of features = # rows of target"
    assert X.shape[1] >= 1, "# features >= 1"
    assert len(cols) == 12, "data should have 12 cols"
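The load_data under test is not shown; a minimal sketch that would satisfy these assertions, assuming the target is the last column of the CSV (hypothetical, not the project's actual implementation):

import pandas as pd

def load_data(path):
    df = pd.read_csv(path)
    cols = list(df.columns)
    y = df[cols[-1]]           # target: assumed to be the last column (e.g. 'quality')
    X = df[cols[:-1]].values   # features as a numpy array
    return X, y, cols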
Example #15
0
def main():
    print("Reading in the training data")
    train = utility.load_data("training", "finalinput")
    truth = np.ravel(np.array(train['votes_useful_log']))
    del train['votes_useful_log']

    print("Extracting features and training review text model")
    classifier = get_pipeline()
    classifier.fit(train.values[:,1:], np.array(truth))

    print("Saving the classifier")
    utility.save_model(classifier, "fullsgd_model_rev{}".format(revision))
Example #16
0
def save_labels(path_to_data, path_to_labels):
    _, pos_rectangles, neg_rectangles = load_data(path_to_data)
    pos_rectangles = to_five_dimensional(pos_rectangles)
    neg_rectangles = to_five_dimensional(neg_rectangles)

    saved_path = Path(path_to_labels)
    pos_label_path = saved_path / "pos_labels.csv"
    neg_label_path = saved_path / "neg_labels.csv"
    if not saved_path.exists():
        Path.mkdir(saved_path, parents=True)

    pos_rectangles.to_csv(pos_label_path)
    neg_rectangles.to_csv(neg_label_path)
Example #17
0
def get_future_trade_date_index(start: datetime, end: datetime) -> np.ndarray:
    """
    返回用于对比所有期货品种的标准化时间序列
    各品种夜盘时间不固定(有的无夜盘),在计算每日盈亏的时候就会出现时间线不齐的情况,此处以au为标准。
    """
    df = load_data(
        'AU888.SHFE',
        '1h',
        start,
        end,
    )
    df['date'] = df.index.map(lambda dt: dt.date())
    return df['date'].drop_duplicates().values
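A hypothetical usage of the function above (assumes the AU888.SHFE 1h bars are available to load_data):

from datetime import datetime

trade_dates = get_future_trade_date_index(datetime(2022, 1, 1), datetime(2022, 12, 31))
print(len(trade_dates), 'trading dates')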
Example #18
0
def main():

    dataset, labels = load_data(args.XLSX)

    features = [row[:-1] for row in dataset]

    mean_vector, e_values, e_vectors, norm_Features = calculate_eigen_of_cov_mat(features)
    scree_graph(e_values)
    k = 20
    print('eigen value 1:', e_values[0])
    print('eigen vector 1:\n', e_vectors[0])
    print('eigen value 2:', e_values[1])
    print('eigen vector 2:\n', e_vectors[1])
    choose_k_largest_eigen_transform(norm_Features, e_vectors, mean_vector, k)
Example #19
0
def main():

    trabus = utility.load_data("training", "business")
    tesbus = utility.load_data("test", "business")
    bus = pd.concat((trabus, tesbus))
    for cat in delbuscats :
        if hasattr(bus, cat) : del bus[cat]
    bus['procbcat'] = pd.Series(map(process_bcat, bus['categories']), bus.index)
    del bus['categories']

    for s in ["training", "test"] :

        rev = utility.load_data(s, "review")
        for cat in delrevcats :
            if hasattr(rev, cat) : del rev[cat]
        if hasattr(rev, 'votes_useful') :
            rev['votes_useful_log'] = np.log(rev.votes_useful + 1)
        rev = pd.merge(rev, bus, 'inner')

        rev['rtext_bcat'] = rev['text'] + rev['procbcat']
        del rev['procbcat']
        del rev['text']

        utility.save_data(rev, s, 'rtext')
Example #20
0
def main():
    revision = 4

    print("Reading in the training data")
    train = utility.load_data("training", "rtext")
    inds = random.sample(range(len(train)), 100000)
    mtrain = train.ix[inds]

    print("Extracting features and training review text model")
    classifier = get_pipeline()
    classifier.fit(list(mtrain['rtext_bcat']), 
                   list(mtrain['votes_useful_log']))

    print("Saving the classifier")
    utility.save_model(classifier, "train_rtext_rev{}".format(revision))
Example #21
0
def main():
    dataset, labels = load_data(args.XLSX)

    class_values = list(set(row[-1] for row in dataset))

    root = get_best_feature_to_split(dataset, 'gain_ratio')

    split(root,
          class_values,
          max_depth=3,
          min_size=5,
          depth=1,
          mode='gain_ratio')

    print(root)
Example #22
0
def main():
    
    #load datasets
    train_X, train_Y = load_data('train_X.csv', 'train_y.csv')
    train_X_pca = data_pca(0.95, train_X, train_X)
    train = train_X_pca
    train['Y'] = train_Y
    #set a list of hyperparameter C
    c = [10**i for i in range(-9,2)]
    #conduct X cross validation and return AUCs in each sample for each C
    aucs=xValSVM(train, 'Y', 5, c)
    #calculate the average and standard error of AUC for each C
    avg, stderr = avg_stderr(aucs, c)
    #plot the results of cross validation
    plotxValSVM(avg, stderr, c)
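data_pca is not shown in these snippets; a plausible sketch, assuming it fits a PCA on the reference set and keeps enough components to explain the requested share of variance (an assumption based on how it is called):

import pandas as pd
from sklearn.decomposition import PCA

def data_pca(var_ratio, fit_X, transform_X):
    # Hypothetical sketch: fit on fit_X, keep components covering var_ratio of the variance,
    # and return the transformed transform_X as a DataFrame.
    pca = PCA(n_components=var_ratio)
    pca.fit(fit_X)
    return pd.DataFrame(pca.transform(transform_X))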
Example #23
0
def main():

    #load data
    df = load_data('../../assignment10_data/restaurants.csv',
                   ['CAMIS', 'BORO', 'GRADE', 'GRADE DATE'])
    df = clean_data(df)  #clean data

    #question 4
    sum_nyc, sum_boro = grade_sum(df)  #calculate sum of test_grade in nyc and in each borough
    print 'The sum of test_grade in NYC is: {} \n'.format(sum_nyc)
    print 'The sum of test_grade in each borough is: \n {}'.format(sum_boro)

    #question 5
    grade_overtime_plot(df, 'nyc')  #grade overtime plot for nyc
    #grade overtime plot for each borough
    for borough in ['BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND']:
        df_boro = df[df['BORO'] == borough]
        grade_overtime_plot(df_boro, borough.lower())

    #question 6
    df1 = load_data('../../assignment10_data/restaurants.csv',
                    ['CAMIS', 'CUISINE DESCRIPTION'])
    type_name = get_top_10_nyc(df1)
    df2 = load_data('../../assignment10_data/restaurants.csv',
                    ['CAMIS', 'CUISINE DESCRIPTION', 'GRADE', 'GRADE DATE'])
    df2 = clean_data(df2)
    df2 = df2[df2['CUISINE DESCRIPTION'].isin(type_name)]
    df_sum = top_10_grade_overtime(df2, type_name)  #calculate score overtime for each restaurant type
    top_10_plot(df_sum)  #score overtime plot
    top_10_colormap(df_sum)  #plot correlation between any two restaurant types in NYC in color map
Example #24
0
def test():
    """
    Plots the rectangles on top of the image to check that the 5-D transformation works correctly.
    Returns:

    """
    path = "../debug_dataset"  # Only add a couple of pictures to this path
    images, pos_rectangles, neg_rectangles = load_data(path)
    pos_rectangles = to_five_dimensional(pos_rectangles)
    replicated_imgs = replicate_images(images, pos_rectangles)
    x_train, y_train, x_test, y_test = split_train_test_data(
        replicated_imgs, pos_rectangles)
    df = to_four_points(pos_rectangles)
    for i, j in x_test.iterrows():
        rectangles = df[df.filenames == j["filenames"]]
        plot(j["images"], j["filenames"], rectangles)
Example #25
0
def main():

    # Get command line input
    results_argparse = argparse_train()

    # Load and transform image data
    image_datasets, dataloaders = load_data(results_argparse.data_directory)

    # Create the model
    model = create_model(results_argparse.arch, results_argparse.hidden_units,
                         results_argparse.prob_dropout)

    # Train the model
    model, optimizer = train_model(model,
                                   dataloaders['train'],
                                   results_argparse.learning_rate,
                                   results_argparse.epochs,
                                   results_argparse.gpu,
                                   print_every=40,
                                   validloader=dataloaders['valid'])

    # Save model
    model.class_to_idx = image_datasets['train'].class_to_idx
    save_model(model, optimizer)
Example #26
0
	def on_load(self):
		self.notebook = utility.load_data('notes', {})
Example #27
0
import utility
import numpy as np
import pandas as pd
import multiprocessing
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

nweakgbms = 18

dftest = utility.load_data('test')
dftrain = utility.load_data('train')

dftestpreds = pd.DataFrame(dftest.id)
dftrainpreds = pd.DataFrame({'id':np.arange(len(dftrain)),
                             'ACTION':dftrain.ACTION})

y = np.array(dftrain.ACTION)
del dftrain['ACTION']
X = np.array(dftrain)

Xtest = np.array(dftest)[:,1:]

def train_weakgbm(i) :

    cols = np.ones(9)
    cols[i % X.shape[1]] = 0
    smallX = np.compress(cols, X, axis=1)

    X_cvtrain, X_cvtest, y_cvtrain, y_cvtest = train_test_split(
Example #28
0
File: stock.py Project: osund/pynik
 def on_load(self):
     self.__aliases = utility.load_data("stockaliases", {})
Example #29
0
	def on_load(self):
		self.location = utility.load_data('festern_bbq', "okänt")
Example #30
0
	def on_load(self):
		self.reminders = utility.load_data("reminders", [])
Example #31
0
 def on_load(self):
     self.places = utility.load_data("places", {})
Example #32
0
min_delta = setup["Delta"]
patience = setup["Patience"]
optimizer = setup["Optimizer"]
loss = setup["Loss"]
activation_function = setup["Activation"]
# layers for decoder(/encoder)
# if 2 the overall net will have 4 layers
layers_num = setup["Layer"]
cell_type = setup["Cell"]
start_dim = setup["Starting"]
# ---------------------------------------------------- #

# deep autoencoders for gene clustering
# ---------------------------- LOAD DATA -------------------------------- #
print "Loading the data."
dataset, _ = utility.load_data(dataset_code, cell_type, withLabel=True)
while start_dim < dataset.shape[1] / 2 and start_dim != 20000:
    dataset = dataset.loc[:, dataset.var() > dataset.var().median()]
print "Pre-filtering dimensions " + str(dataset.shape[1])
# reduce intial dimensions
exit_flag = False
while True and start_dim != 20000:
    min_var = dataset.var().min()
    for i in dataset:
        if dataset.shape[1] <= start_dim:
            exit_flag = True
            break
        if dataset[i].var() == min_var:
            del (dataset[i])
    if exit_flag:
        break
Example #33
0
 def mask_load(self):
     self.url_masks = utility.load_data("urlmasks", {})
Example #34
0
 def load_urls(self):
     self.url_list = utility.load_data("urls", [])
Example #35
0
parser.add_argument('--sample_percent', type=float, default=1.0)
parser.add_argument('--layers', type=str, default='100')
parser.add_argument('--feature_output_path', type=str, default=None)

args = parser.parse_args()
load_path = args.load_path
image_dir = args.image_dir
batch_size = args.batch_size
epochs = args.epochs
save_name = args.save_name
sample_percent = args.sample_percent
feature_output_path = args.feature_output_path
layers = list(map(int, args.layers.split(',')))

X, labels, file_names = load_data(image_dir,
                                  sample_percent=sample_percent,
                                  return_names=True)
length, width = X[0].shape
input_size = length * width
X = torch.Tensor(X).view(-1, input_size).type(torch.float32)
X /= 255

if load_path is None:
    model = AutoEncoder([input_size] + layers)
    model.train(X=X, batch_size=batch_size, epochs=epochs, verbose=True)
else:
    model = AutoEncoder.load(load_path)

if feature_output_path is not None:
    print('Saving learned features...')
    new_features = model(X)
Example #36
0
 def on_load(self):
     self.compliments = utility.load_data("compliments", [])
Example #37
0
	def on_load(self): 
		self.insults = utility.load_data("insults", [])
Example #38
0
	def on_load(self): 
		self.compliments = utility.load_data("compliments", [])
Example #39
0
	def on_load(self):
		self.places = utility.load_data("postnr_addresses", {})
Example #40
0
import time, datetime
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from constants import DEV_MODE, IMAGE_SIZE, SIMPLE_3LAYERS_FILENAME, LENET5_CNN_FILENAME
from lasagne_neuralnet import NeuralNet1, NeuralNet2, load_model_if_exists, predict, plot_neural_net, reshape_data
from utility import clean_data, data_preview, get_label_args, load_data, save_result, scale_data
from visualization import plot_images, plot_label_distribution, plot_lasagne_learning_curves

# Count running time
starttime = datetime.datetime.now()

# Load and preview data
train, test, label = load_data(dev_mode=DEV_MODE)
data_preview(train, test, label)
plot_images(train, label)

# Fill NA with average.
# TODO: remove abnormal samples
label = clean_data(label)
label_min, label_max, label_median, label_var = get_label_args(label)
plot_label_distribution(label)

# Convert dataframe to array and normalize the data
train_array, label_array, test_array = scale_data(train, test, label)

# Train and predict

# Simple 3-layers neural nets
Example #41
0
	def on_load(self):
		self.favorites = utility.load_data("favorites", {})
Example #42
0
def printPOS(pos_words):
    #pos_words is a list of (word, tag)
    s = ""
    t = ""
    for p in pos_words:
        l = len(p[0]) if len(p[0]) > len(p[1]) else len(p[1])
        s = s + p[0].rjust(l) + ' '
        t = t + p[1].rjust(l) + ' '

    print '-----------'
    print s
    print t
    print ""





if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: python showTaggedSentences.py <input file>"
        sys.exit(0)

    qaTests = load_data(sys.argv[1])

    showAllTaggedSentences(qaTests)



Example #43
0
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_tensor, hidden_tensor):
        combined = torch.cat((input_tensor, hidden_tensor), 1)  # doubt

        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)


category_lines, all_categories = load_data()
n_categories = len(all_categories)
print(n_categories)

n_hidden = 128
rnn = RNN(N_LETTERS, n_hidden, n_categories)

input_tensor = letter_to_tensor("A")
hidden_tensor = rnn.init_hidden()
output, next_hidden = rnn(input_tensor, hidden_tensor)
print(output.size())
print(next_hidden.size())

input_tensor = line_to_tensor('Albert')
hidden_tensor = rnn.init_hidden()
output, next_hidden = rnn(input_tensor[0], hidden_tensor)
Example #44
0
 def on_load(self):
     self.insults = utility.load_data("insults", [])
Example #45
0
 def load_games(self):
     self.games = utility.load_data("games", {})
Example #46
0
File: rss.py Project: dentarg/pynik
	def on_load(self):
		self.watch_list = utility.load_data("rss_watch_list", [])
Example #47
0
	def on_load(self):
		self.id_directory = utility.load_data('schema_id', {})
		self.id_presets = utility.load_data('schema_fav', {})
Example #48
0
	def load_games(self):
		self.games = utility.load_data("games", {})
Example #49
0
	def load_urls(self):
		self.url_lists = utility.load_data("urls", {})
Example #50
0
 def on_load(self):
     self.places = utility.load_data("places", {})
Example #51
0
	def mask_load(self):
		self.url_masks = utility.load_data("urlmasks", {})
Example #52
0
def train(config_file):
    """
    Main function that trains and persists the model based on the training set.

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """
    ################
    # config logger
    ################
    logger = set_logger("../log/train.log")

    ###############################
    # Load config from config file
    ###############################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)

    keypoints_csv = Path(config['common']['labels_csv_path'])
    val_split = config['common']['val_split']
    train_img_scr_path = config['common']['img_source_path']
    test_img_scr_path = config['common']['img_source_path']
    image_width = config['common']['in_image_width']
    image_height = config['common']['in_image_height']

    epochs = config['train']['epochs']
    train_batch_size = config['train']['batch_size']
    weight_path = config['common']['weight_path']
    no_of_aug = config['train']['no_of_aug']
    test_batch_size = config['test']['batch_size']

    ############
    # Load Data
    ############
    logger.info(f"----------------Load the data----------------")
    selected_img, keypoint_df = load_data(keypoints_csv)
    logger.info(f"Number of selected images are {selected_img.shape}")
    logger.info(f"Few of the selected images are {selected_img[0:5]}")

    ####################################
    # Get train and test data generators
    ####################################

    X_train, y_train, X_test, y_test = train_test_split(
        selected_img, keypoint_df, val_split)

    train_gen = Car(x_set=X_train,
                    y_set=y_train,
                    mode='Train',
                    data_path=train_img_scr_path,
                    image_width=image_width,
                    image_height=image_height,
                    batch_size=train_batch_size,
                    augmentations='Self',
                    no_of_aug=no_of_aug)
    test_gen = Car(
        x_set=X_test,
        y_set=y_test,
        mode='Test',
        data_path=test_img_scr_path,
        image_width=image_width,
        image_height=image_height,
        batch_size=test_batch_size,
    )

    #####################
    # Set and train model
    #####################

    logger.info(
        f"-------------------------Initiate Model---------------------")
    model = KeyPointModel().getModel()

    logger.info(
        f"--------------------Model Summary---------------------------")
    logger.info(f"{model.summary}")

    # compile the model
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mean_absolute_error'])

    # modelCheckPoint = ModelCheckpoint('car-{val_loss:.2f}.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True)
    earlyS = EarlyStopping(monitor='val_loss',
                           min_delta=1,
                           patience=3,
                           restore_best_weights=True)
    reducelr = ReduceLROnPlateau(monitor='val_loss',
                                 factor=0.1,
                                 patience=2,
                                 min_lr=1e-7)

    history = model.fit(x=train_gen,
                        validation_data=test_gen,
                        callbacks=[earlyS, reducelr],
                        epochs=epochs)
    logger.info(history)
    logger.info("------------Saving Weights--------------")
    model.save_weights(weight_path)
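The config file read by parse_config is not shown; based on the keys accessed above, a plausible shape of the parsed dict is (values are illustrative, not from the original project):

config = {
    "common": {
        "labels_csv_path": "data/keypoints.csv",
        "val_split": 0.2,
        "img_source_path": "data/images/",
        "in_image_width": 224,
        "in_image_height": 224,
        "weight_path": "weights/keypoint_model.h5",
    },
    "train": {"epochs": 50, "batch_size": 32, "no_of_aug": 3},
    "test": {"batch_size": 32},
}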
Example #53
0
                decoding2.decoding(j) + '\n')
    scores = model.evaluate(X, Y)
    f.write("\n%s: %.2f%% \n%s: %.2f%% \n%s: %.2f%%" %
            (model.metrics_names[1], scores[1] * 100, model.metrics_names[2],
             scores[2] * 100, model.metrics_names[3], scores[3] * 100))
    f.close()
    return print("Predictions saved in the file: " + name)


# 0. Load Data
path = '../data/prop_55.csv'  # len 464
# path = '../data/prop_65.csv'
# path = '../data/prop_75.csv'
# path = '../data/prop_85.csv'
# path = '../data/prop_95.csv'
X, Y = ut.load_data(path)
batch = len(X)

# 1. Define Model
model = Sequential()
model.add(Dense(460, input_shape=(460, ), activation='sigmoid'))

# 2. Compile model
keras_metrics = KerasMetrics()
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=[
                  keras_metrics.fbeta_score, keras_metrics.recall,
                  keras_metrics.precision
              ])
Example #54
0
 def load_refs(self):
     self.references = utility.load_data("spotify", {})
Example #55
0
 def on_load(self):
     self.id_directory = utility.load_data('schema_id', {})
     self.id_presets = utility.load_data('schema_fav', {})
Example #56
0
def train():
    batch_size = TRAIN_BATCH_SIZE
    num_labels = 10

    train_images = {}
    train_labels = {}
    test_images = {}
    test_labels = {}
    validation_images = {}
    validation_labels = {}
    for data in utility.Data:
        path = utility.generate_data_path(TRAINING_DATA_SIZE,
                                          data,
                                          label=False)
        train_images[data] = utility.load_data(path)
        path = utility.generate_data_path(TRAINING_DATA_SIZE, data, label=True)
        train_labels[data] = utility.load_data(path, label=True)
        path = utility.generate_data_path(utility.Size.TEST, data, label=False)
        test_images[data] = utility.load_data(path)
        path = utility.generate_data_path(utility.Size.TEST, data, label=True)
        test_labels[data] = utility.load_data(path, label=True)
        path = utility.generate_data_path(utility.Size.VALIDATION,
                                          data,
                                          label=False)
        validation_images[data] = utility.load_data(path)
        path = utility.generate_data_path(utility.Size.VALIDATION,
                                          data,
                                          label=True)
        validation_labels[data] = utility.load_data(path, label=True)

    train_size = train_images[utility.Data.CUSTOM].shape[0] + train_images[
        utility.Data.STREET].shape[0] + train_images[
            utility.Data.MNIST].shape[0]
    total_batch = int(train_size / batch_size)

    # Boolean for MODE of train or test
    #is_training = tf.placeholder(tf.bool, name='MODE')
    data_type = tf.placeholder(tf.string)
    # tf input
    x = tf.placeholder(tf.float32)
    y_ = tf.placeholder(tf.float32, [None, 10])  #answer
    y = cnn_model.CNN(x, data_type)

    # Get loss of model
    with tf.name_scope("LOSS"):
        loss = cnn_model.loss(
            y,
            y_,
        )
    # Define optimizer
    with tf.name_scope("ADAM"):
        # Optimizer: set up a variable that's incremented once per batch and
        # controls the learning rate decay.
        batch = tf.Variable(0)
        learning_rate = tf.train.exponential_decay(
            1e-4,  # Base learning rate.
            batch * batch_size,  # Current index into the dataset.
            train_size,  # Decay step.
            0.95,  # Decay rate.
            staircase=True)
        # Use simple momentum for the optimization.
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(
            loss, global_step=batch)

    # Get accuracy of model
    with tf.name_scope("ACC"):
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Create a summary to monitor learning_rate tensor
    tf.summary.scalar('learning_rate', learning_rate)
    # Create a summary to monitor accuracy tensor
    tf.summary.scalar('acc', accuracy)
    # Merge all summaries into a single op
    merged_summary_op = tf.summary.merge_all()
    # Add ops to save and restore all the variables
    saver = tf.train.Saver()
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer(), feed_dict={})
    # op to write logs to Tensorboard
    summary_writer = tf.summary.FileWriter(LOGS_DIRECTORY,
                                           graph=tf.get_default_graph())
    max_acc = 0.

    start_time = time.time()
    # Loop for epoch
    for epoch in range(training_epochs):
        type_order = utility.generate_random_order(train_size /
                                                   (3 * TRAIN_BATCH_SIZE))
        # Random shuffling
        for data in utility.Data:
            [validation_images[data], validation_labels[data]
             ] = utility.shuffle_data(validation_images[data],
                                      validation_labels[data])
            [train_images[data], train_labels[data]
             ] = utility.shuffle_data(train_images[data], train_labels[data])

        count = {
            utility.Data.CUSTOM: 0,
            utility.Data.MNIST: 0,
            utility.Data.STREET: 0
        }
        # Loop over all batches
        for i in range(total_batch):
            set_type = type_order[i]
            #print("HELLLLLLLLO")

            #print("GOOODBYYYYYYE")
            offset = (count[set_type] * batch_size) % (train_size)
            count[set_type] = count[set_type] + 1
            # Compute the offset of the current minibatch in the data.
            #offset = (i * batch_size) % (train_size)
            batch_xs = train_images[set_type][offset:(offset + batch_size), :]
            batch_ys = train_labels[set_type][offset:(offset + batch_size), :]

            assert set_type.value in [
                utility.Data.CUSTOM.value, utility.Data.MNIST.value,
                utility.Data.STREET.value
            ]
            # Run optimization op (backprop), loss op (to get loss value)
            # and summary nodes
            _, train_accuracy, summary = sess.run(
                [train_step, accuracy, merged_summary_op],
                feed_dict={
                    x: batch_xs,
                    y_: batch_ys,
                    data_type: set_type.value
                })

            # Write logs at every iteration
            summary_writer.add_summary(summary, epoch * total_batch + i)

            # Display logs
            if i % display_step == 0:
                print(
                    "Epoch:", '%04d,' % (epoch + 1),
                    "batch_index %4d/%4d, training accuracy %.5f" %
                    (i, total_batch, train_accuracy))

            # Get accuracy for validation data
            # need to average 3 validation data entries
            if i % validation_step == 0:
                # Calculate accuracy
                validation_accuracy_a = sess.run(
                    accuracy,
                    feed_dict={
                        x: validation_images[utility.Data.CUSTOM],
                        y_: validation_labels[utility.Data.CUSTOM],
                        data_type: utility.Data.CUSTOM.value
                    })

                #print("HIIII")
                #print(validation_images[utility.Data.STREET].size)
                validation_accuracy_b = sess.run(
                    accuracy,
                    feed_dict={
                        x: validation_images[utility.Data.STREET],
                        y_: validation_labels[utility.Data.STREET],
                        data_type: utility.Data.STREET.value
                    })

                validation_accuracy_c = sess.run(
                    accuracy,
                    feed_dict={
                        x: validation_images[utility.Data.MNIST],
                        y_: validation_labels[utility.Data.MNIST],
                        data_type: utility.Data.MNIST.value
                    })

                validation_accuracy = (validation_accuracy_a +
                                       validation_accuracy_b +
                                       validation_accuracy_c) / 3
                print(
                    "Epoch:", '%04d,' % (epoch + 1),
                    "batch_index %4d/%4d, validation accuracy %.5f" %
                    (i, total_batch, validation_accuracy))

            # Save the current model if the maximum accuracy is updated
            if validation_accuracy > max_acc:
                max_acc = validation_accuracy
                save_path = saver.save(sess, MODEL_DIRECTORY)
                print("Model updated and saved in file: %s" % save_path)

    print("Optimization Finished!")
    print("--- %s seconds ---" % (time.time() - start_time))

    # Restore variables from disk
    saver.restore(sess, MODEL_DIRECTORY)

    # TESTING
    # TODO: edit testing
    for data in utility.Data:
        labels = test_labels[data]
        images = test_images[data]
        test_size = labels.shape[0]
        batch_size = TEST_BATCH_SIZE
        total_batch = int(test_size / batch_size)

        acc_buffer = []
        for i in range(total_batch):
            offset = (i * batch_size) % (test_size)
            batch_xs = images[offset:(offset + batch_size), :]
            batch_ys = labels[offset:(offset + batch_size), :]
            y_final = sess.run(y,
                               feed_dict={
                                   x: batch_xs,
                                   y_: batch_ys,
                                   data_type: data.value
                               })
            correct_prediction = numpy.equal(numpy.argmax(y_final, 1),
                                             numpy.argmax(batch_ys, 1))
            acc_buffer.append(numpy.sum(correct_prediction) / batch_size)

        print("test accuracy for the stored model for %s images: %g" %
              (data, numpy.mean(acc_buffer)))
Example #57
0
File: fml.py Project: raek/pynik
 def on_load(self):
     self._api_key = utility.load_data("betacie_api_key", "readonly")
Example #58
0
	def load_refs(self):
		self.references = utility.load_data("spotify", {})
Example #59
0
def main():
    usr = pd.concat((utility.load_data('training', 'user'),
                     utility.load_data('test', 'user')))
    for cat in usrdelcats :
        if hasattr(usr, cat) : del usr[cat]
    usr = usr.rename(columns={'average_stars' : 'user_average_stars',
                              'review_count' : 'user_review_count'})

    bus = pd.concat((utility.load_data('training', 'business'),
                     utility.load_data('test', 'business')))
    for cat in busdelcats :
        if hasattr(bus, cat) : del bus[cat]
    bus = bus.rename(columns={'stars' : 'business_average_stars',
                              'review_count' : 'business_review_count',})

    rtxttag = 'rtext_rev{}'.format(rtext_rev)
    rtxt_tr = utility.load_data('training', rtxttag)
    rtxt_te = utility.load_data('test', rtxttag)
    rtxt_te.index = rtxt_te.index + len(rtxt_tr)
    rtxt = pd.concat((rtxt_tr, rtxt_te))

    sgdtag = 'rtext_sgd_rev{}'.format(rtext_sgd_rev)
    sgdtxt_tr = utility.load_data('training', sgdtag)
    sgdtxt_te = utility.load_data('test', sgdtag)
    sgdtxt_te.index = sgdtxt_te.index + len(sgdtxt_tr)
    sgdtxt = pd.concat((sgdtxt_tr, sgdtxt_te))

    tesrev = utility.load_data('test', 'review')
    trarev = utility.load_data('training', 'review')
    revlist = [trarev, tesrev]
    for i in range(len(revlist)) :
        revlength = revlist[i]['text'].apply(lambda t : len(t.split()))
        revlist[i]['review_length'] = revlength

        for col in revlist[i].columns :
            if not col in ['review_id', 'stars', 'date', 'review_length'] :
                del revlist[i][col]
        revlist[i] = revlist[i].rename(columns={'stars' : 'review_stars'})

        revlist[i] = pd.merge(revlist[i], rtxt, 'left')
        revlist[i] = pd.merge(revlist[i], sgdtxt, 'left')
        revlist[i] = pd.merge(revlist[i], usr, 'left')
        revlist[i] = pd.merge(revlist[i], bus, 'left')
        revlist[i] = revlist[i].fillna(-1)

    for c in normedcols : norm_col(revlist, c)

    dates = [pd.to_datetime('2013-01-19'), pd.to_datetime('2013-03-12')]
    for i in range(len(revlist)) :
        ddiff = dates[i] - revlist[i]['date']
        ddiff = ddiff.apply(lambda x: x / np.timedelta64(1, 'D'))
        revlist[i]['datediff'] = ddiff
    norm_col(revlist, 'datediff')

    for i in range(len(revlist)) :
        for c in delcols :
            if hasattr(revlist[i], c) :
                del revlist[i][c]

    utility.save_data(revlist[0], 'training', 'finalinput')
    utility.save_data(revlist[1], 'test', 'finalinput')