Ejemplo n.º 1
0
def experiment_for_submitting():
    """Fit ridge regression on preprocessed data and log the predicted class balance.

    Loads the train and test sets, preprocesses them stacked together, picks
    ``lambda_`` with ``lambda_cv``, fits ``ridge_regression`` on the training
    part and predicts labels for the test part.  One row per entry of
    ``preprocessing_options`` is appended to a results table, which is finally
    written to ``"Submitting experiment.csv"`` (``;``-separated).

    Returns
    -------
    None
        Results are printed and written to disk.
    """
    y_train, tX_train, _ = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, _ = load_csv_data(DATA_TEST_PATH)

    np.random.seed(2019)
    results = pd.DataFrame(
        columns=["Preprocessing", "Class -1 count", "Class +1 count"])

    # NOTE(review): `preprocessing_param` is only printed; the preprocessing
    # configuration below is hard-coded, so every iteration repeats the same
    # experiment.  Confirm whether `prep_param` was meant to be derived from it.
    for preprocessing_param in preprocessing_options:
        # Train and test rows are preprocessed as one matrix and split back
        # afterwards at the original train length.
        tX_stacked = np.vstack((tX_train, tX_test))
        prep_param = {
            "bias": True,
            "fill": True,
            "standardize": False,
            "degree": 11,
            "log": True,
            "root": True
        }
        tX_stacked_prep, _, desc_prep = preprocess_data(
            tX_stacked, None, prep_param)
        tX_train_prep, tX_test_prep = np.split(tX_stacked_prep,
                                               [len(tX_train)])

        lambda_ = lambda_cv(tX_train_prep, y_train)
        print(f"Best lambda: {lambda_}")
        w, _ = ridge_regression(y_train, tX_train_prep, lambda_)

        y_pred = np.asarray(predict_labels(w, tX_test_prep))

        # Count each label explicitly instead of indexing the output of
        # np.unique: if only one class is predicted, np.unique yields a single
        # count and `count[1]` would raise IndexError.
        # Assumes labels are -1 / +1, as the column names state — TODO confirm.
        n_neg = int(np.count_nonzero(y_pred == -1))
        n_pos = int(np.count_nonzero(y_pred == 1))

        print(preprocessing_param,
              f"Class -1: {n_neg}, Class +1: {n_pos}")
        results.loc[len(results)] = (desc_prep, n_neg, n_pos)

    results.to_csv("Submitting experiment.csv", sep=";")
Ejemplo n.º 2
0
def main():
    """Train one ridge model per data subset and write the test submission.

    Steps performed:

    1. Load train and test data and preprocess them stacked together.
    2. Split both sets with ``divide_data`` and train one ridge-regression
       model per subset, choosing ``lambda_`` via ``lambda_cv``.
    3. Predict labels for both sets, print the training accuracy and the
       predicted class distribution on the test set.
    4. Write the test predictions with ``create_csv_submission``.

    Returns
    -------
    None
    """
    y_train, tX_train, _ = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

    np.random.seed(2019)

    # Preprocess data together to have the same shifts while creating log or root features
    tX_stacked = np.vstack((tX_train, tX_test))
    prep_param = {
        "bias": True,
        "fill": True,
        "standardize": False,
        "degree": 8,
        "log": True,
        "root": True
    }
    tX_stacked_prep, *_ = preprocess_data(tX_stacked, None, prep_param)
    tX_train_prep, tX_test_prep = np.split(tX_stacked_prep, [len(tX_train)])

    # Split data according to PRI_jet_num value
    tX_tr_splitted, indices_tr = divide_data(tX_train_prep)
    tX_te_splitted, indices_te = divide_data(tX_test_prep)
    n_models = len(indices_tr)

    # Training labels belonging to each subset, in subset order.
    y_tr_splitted = [y_train[subset_idx] for subset_idx in indices_tr]

    # Train
    weights = []
    for i in range(n_models):
        lambda_ = lambda_cv(tX_tr_splitted[i], y_tr_splitted[i])
        print(f"Class {i}, lambda: {lambda_}")
        # ridge_regression returns a sequence whose first item is the weights.
        weights.append(
            ridge_regression(y_tr_splitted[i], tX_tr_splitted[i], lambda_)[0])

    # Predict: each model fills in the rows of its own subset.
    y_pr_tr = np.zeros(tX_train.shape[0])
    y_pr_te = np.zeros(tX_test.shape[0])
    for i in range(n_models):
        y_pr_tr[indices_tr[i]] = predict_labels(weights[i], tX_tr_splitted[i])
        y_pr_te[indices_te[i]] = predict_labels(weights[i], tX_te_splitted[i])

    acc_tr = compute_accuracy(y_train, y_pr_tr)
    print(f"Total accuracy train: {acc_tr}")

    # Count each label explicitly rather than indexing np.unique's counts,
    # which raises IndexError (or mislabels the count) when a class is absent.
    # Assumes labels are -1 / +1, as the message states — TODO confirm.
    n_neg = int(np.count_nonzero(y_pr_te == -1))
    n_pos = int(np.count_nonzero(y_pr_te == 1))
    print(
        f"Distribution on test data class -1: {n_neg}, class +1: {n_pos}"
    )

    create_csv_submission(ids_test, y_pr_te, OUTPUT_PATH)
Ejemplo n.º 3
0
def recipe_drop_down():
    """Render the recipe drop-down page.

    On GET the page is rendered with an empty image selection; on any other
    method (POST) the selection is read from the submitted form field
    ``recipe_image_list_drop_down``.  In both cases the data returned by
    ``load_csv_data()`` is passed to the template as ``image_dict``.
    """
    # Inspect the method first, then load the data, then touch the form —
    # the same order in which the original branches evaluated them.
    is_get = request.method == 'GET'
    recipes = load_csv_data()

    if is_get:
        selected_image = ''
    else:  # POST
        selected_image = request.form.get("recipe_image_list_drop_down")

    return render_template("recipe_dropdown.html",
                           headline="Pick a Recipe",
                           recipe_image=selected_image,
                           image_dict=recipes)
Ejemplo n.º 4
0
    def load_user_neg_regex(self):
        """Merge the user's negative title regex values into ``self.negative_regex``.

        Does nothing when ``self.userfiles`` is empty or has no
        ``FILEKEY_NEG_TITLE_REGEX`` entry.  Otherwise every file listed under
        that key is read with ``load_csv_data`` and each loaded value is stored
        as both key and value; the resulting total is then logged.
        """
        if not self.userfiles or FILEKEY_NEG_TITLE_REGEX not in self.userfiles:
            return

        for path in self.userfiles[FILEKEY_NEG_TITLE_REGEX].values():
            patterns = load_csv_data(path, [FILEKEY_NEG_TITLE_REGEX_FIELD])
            # Map each loaded value onto itself before merging.
            self.negative_regex.update(dict(zip(patterns, patterns)))

        total = len(self.negative_regex.keys())
        self.log(f'Loaded {total} negative title regex values for user.')
Ejemplo n.º 5
0
def run_experiments():
    """Load the training data and run the currently selected experiment.

    Only ``train_3models`` is active.  The other experiments listed below are
    disabled and can be re-enabled by calling them here.
    """
    labels, features, _ = load_csv_data(DATA_TRAIN_PATH)

    # Disabled experiments:
    #   run_preprocessing_experiment(tX, y)
    #   run_balancing_experiment(tX, y)
    #   run_filling_experiment(tX, y)
    #   experiment_for_submitting()
    #   feature_correlation_checking(tX, y)
    train_3models(features, labels)
Ejemplo n.º 6
0
    Returns
    -------
    Tuple (ndarray, ndarray)
        Sorted labels and predictions
    """
    idx = ids.argsort()
    return ids[idx], y_pred[idx]


# Locations of the train/test data and the submission files
train_fname = "data/train.csv"
test_fname = "data/test.csv"
# NOTE(review): "sumbission" looks like a typo for "submission"; the name is
# left unchanged because code outside this view may reference it.
sumbission_fname = "data/submission.csv"

# Load the train/test data
# Each result is unpacked into three values; judging by the names these are
# labels, features and row ids — TODO confirm against load_csv_data.
y_train, X_train, ids_train = load_csv_data(train_fname)
y_test, X_test, ids_test = load_csv_data(test_fname)

# Print out the shapes for convenience
print("Shapes")
print(X_train.shape, y_train.shape, ids_train.shape)
print(X_test.shape, y_test.shape, ids_test.shape)

# Split the datasets into 8 subsets
# (the subset count is taken from the original note; it depends on
# PRI_jet_num_split and combine_vals — TODO confirm)
combine_vals = False
train_subsets = PRI_jet_num_split(y_train, X_train, ids_train, combine_vals)
test_subsets = PRI_jet_num_split(y_test, X_test, ids_test, combine_vals)

# Print the number of subsets and assert that their sizes are the same
# If not, there is something wrong with the split functionality
print(f"Number of train subsets: { len(train_subsets) }")
Ejemplo n.º 7
0
def main():
    """Scratch driver exercising the nutrition helpers and recipe-text parsing.

    Prints the nutrition info returned by ``helpers.get_nutridata()``, derives
    ``n_EnkJ`` and a rounded ``serving_size``, indexes the recipe files under
    ``./static/recipe`` by recipe name, and then parses a recipe header,
    ingredient lines and yield: first from a file on disk, then from an
    in-memory sample.  Every intermediate result is printed.

    Every regex result is checked before use, so unmatched input is reported
    as ``NO MATCH`` / ``'No Match'`` instead of raising ``AttributeError``.

    Raises
    ------
    OSError
        If the hard-coded recipe file cannot be opened.
    """
    print("RUNNING TEST PEN")

    info = helpers.get_nutridata()

    for k, v in info.items():
        print(f"{k} = {v}")

    print(f"__name__ is: {__name__}")
    print(f"__file__ is: {__file__}")
    print(f"__loader__ is: {__loader__}")
    print(f"__package__ is: {__package__}")

    # 4.184 is the kcal -> kJ factor; presumably n_En is in kcal — TODO confirm.
    info['n_EnkJ'] = str(round(float(info['n_En']) * 4.184))
    info['serving_size'] = str(round(float(info['serving_size'])))

    print(f"n_EnkJ:       {info['n_EnkJ']}")
    print(f"serving_size: {info['serving_size']}")

    print("1")

    helpers.get_nutrients_per_serving()

    helpers.load_csv_data()

    print(helpers.get_ingredients_from_recipe(''))

    requested_recipe = 'mushroom rissotto'

    recipe_paths = glob.glob('./static/recipe/*.txt')
    print(recipe_paths)

    # Build {recipe name: file path} from names shaped like
    # "<8 digits>_<6 digits>_<recipe name>.txt".
    filename_pattern = re.compile(r'\d{8}_\d{6}_(.*)\.txt')
    recipe_ref = {}
    for recipe_file in recipe_paths:
        print(recipe_file)
        name_match = filename_pattern.search(recipe_file)
        if name_match is None:
            # A .txt file that does not follow the naming scheme is skipped
            # rather than crashing on `.group()` of None.
            print("recipe filename NO MATCH")
            continue
        print(name_match.group(0))
        print(name_match.group(1))
        recipe_ref[name_match.group(1)] = recipe_file

    print(f"Looking for: {requested_recipe} <")
    print(f"Found: {recipe_ref.get(requested_recipe, 'No Match')} <")

    recipe_file = "./static/recipe/20190301_145910_mushroom rissotto.txt"

    # Recipe text layout the patterns below rely on:
    #   ^-+- for the (.*) \((\d+)\)    header: 1. name, 2. portions
    #   ^(\d+)g\s+([a-zA-Z ]+)$        one ingredient per line
    #   ^\s+Total \((.*?)\)            yield
    # MULTILINE makes ^ and $ match at every line; DOTALL lets . span newlines.
    # https://www.thegeekstuff.com/2014/07/advanced-python-regex/

    with open(recipe_file) as f:
        content = f.read()

    print(type(content).__name__)
    print(' - - - recipe text')
    print(content)

    # Header of the recipe read from disk.  MULTILINE (not DOTALL) is used so
    # the header may sit on any line and the name cannot span several lines,
    # matching how the in-memory sample is parsed further down.
    r_name = 'No Match'
    r_portions = 'No Match'
    match = re.search(r'^-+- for the (.*) \((\d+)\)', content, re.MULTILINE)
    if match:
        r_name = match.group(1)
        r_portions = match.group(2)
    else:
        print("name and portions NO MATCH")

    # Ingredients and yield are placeholders for the on-disk recipe.
    r_ingredients = 'ingredients'
    r_yield = '1kg'

    print(f" - - - recipe: {r_name} <\n{r_ingredients}\nmakes {r_yield} which is {r_portions} portions")

    # In-memory sample recipe.  Tabs are written as \t so they stay visible in
    # the source; the runtime value is tab-separated text.
    this_is_multiline = \
'''
------------------ for the mushroom rissotto (3)
50g\tfennel
46g\tbutter
70g\tleek
20g\tgreen pepper
16g\tgarlic
88g\twhite mushrooms
80g\tsauteed mushrooms
100g\twhite wine
10g\tchicken stock cube
490g\twater
40g\tpeas
80g\tcream cheese
100g\tarborio rice
\t\t\t\t\t\t\t\tTotal (900g)
'''

    print(f" - - -  - - -  this_is_multiline \n{this_is_multiline}\n - - - - - ")

    r_name = 'No Match'
    r_portions = 'No Match'
    r_ingredients = []
    r_yield = 'No Match'

    # name and portions
    match = re.search(r'^-+- for the (.*) \((\d+)\)', this_is_multiline, re.MULTILINE)
    if match:
        r_name = match.group(1)
        r_portions = match.group(2)
    else:
        print("name and portions NO MATCH")

    # ingredients: findall returns a list of (grams, name) tuples
    found = re.findall(r'^(\d+)g\s+([a-zA-Z ]+)$', this_is_multiline, re.MULTILINE)
    if found:
        print(f"ingredients: {type(found).__name__}")
        for grams, ingredient in found:
            line = f"{grams}g\t{ingredient}"
            r_ingredients.append(line)
            print(line)
    else:
        print("ingredients NO MATCH")

    # yield
    match = re.search(r'^\s+Total \((.*?)\)', this_is_multiline, re.MULTILINE)
    if match:
        r_yield = match.group(1)
    else:
        print("yield NO MATCH")

    print(f" - - - recipe ML: {r_name} <\n{r_ingredients}\nmakes {r_yield} which is {r_portions} portions")

    # All-in-one pattern: group 3 is the raw text between header and total.
    match = re.search(r'^-+- for the (.*) \((\d+)\)(.*)^\s+Total \((.*?)\)',
                      this_is_multiline, re.MULTILINE | re.DOTALL)
    if match:
        r_ingredients = match.group(3).strip()
        print(type(r_ingredients).__name__)
        print(f"{r_ingredients}")
        print(f"{match.group(3)}")
    else:
        print("combined pattern NO MATCH")

    # NOTE(review): this call is unqualified while the earlier one goes through
    # `helpers.`, and it spells "risotto" differently from the file name used
    # above ("rissotto") — confirm both are intended.
    get_ingredients_from_recipe('mushroom risotto')
# Remaining required command-line options controlling the d43 / m0 plots.
# `parser` is created, and further options are presumably registered, above
# this point (outside this view).
parser.add_argument("--dyhigh", help="The max y limit on the d43 plots", type=float, required=True)
parser.add_argument("--tlim", help="The max time on the m0 and d43 plots", type=float, required=True)
parser.add_argument("--dlegendloc", help="The location of the legend on the d43 plot", type=str, required=True)

args = parser.parse_args()

# 300 evenly spaced samples on [1, 10]; not used within the lines shown here.
x = np.linspace(1, 10, 300)

# `args.case` and `args.nodes` come from options not visible in this view.
case_num = args.case
time_end = args.tlim
node_num = args.nodes

print("Building Comparison Plots for Case {}.".format(case_num))

# Read analytic data from file.
# The loader's result is used as a mapping: keys become x, values become y.
analytic_data = helpers.load_csv_data("Vanni2000_Case{}_N_RIG.csv".format(case_num))
x_analytic = list(analytic_data.keys())
y_analytic = list(analytic_data.values())

# Produce Figures a-d
plt.figure(case_num, figsize=(12,8), dpi=80)

# Produce Figure a
plt.subplot(221)

# These three plots carry no data, only a colour, line style and label
# (presumably so the legend shows one entry per curve family).
plt.plot([], color="#007F00", linestyle="-.", label="LnEQMOM N={}, $N_\\alpha$ = 20".format(node_num))
plt.plot([], color="#956363", linestyle="--", label="EQMOM each nodes")
plt.plot([], color="#0A246A", linestyle="-", label="Rigorous solution")

# Rigorous (analytic) solution curve, in the colour of its legend entry above.
plt.plot(x_analytic, y_analytic, color="#0A246A", linestyle="-")