Example #1
def process_pubmed(output_graph):
    pubmed_df = read_csv(PREPROCESSED_PUBMED_FILEPATH)
    drugs_df = read_csv(PREPROCESSED_DRUGS_FILE_PATH)

    drugs_list = drugs_df['drug'].tolist()

    for index, row in pubmed_df.iterrows():
        drugs_found = find_drugs_in_title(row['title'], drugs_list)
        for drug in drugs_found:
            output_graph.add_pubmed(drug, row['id'], row['date'])
            output_graph.add_journal(drug, row['journal'], row['date'])
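
This example and Example #2 below both call a find_drugs_in_title helper that is not shown here. A minimal sketch of what it might look like, assuming titles and drug names have already been lower-cased by the preprocess_* functions further down (the whole-word matching is an assumption):

def find_drugs_in_title(title, drugs_list):
    """Return the drugs from drugs_list mentioned in title (assumed lower-cased)."""
    if not isinstance(title, str):
        return []
    words = set(title.split())  # whole-word matching is an assumption
    return [drug for drug in drugs_list if drug in words]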
Example #2
def process_clinical_trial(output_graph):
    clinical_trial_df = read_csv(PREPROCESSED_CLINICAL_TRIALS_FILE_PATH)
    drugs_df = read_csv(PREPROCESSED_DRUGS_FILE_PATH)

    drugs_list = drugs_df['drug'].tolist()

    for index, row in clinical_trial_df.iterrows():
        drugs_found = find_drugs_in_title(row['scientific_title'], drugs_list)
        for drug in drugs_found:
            output_graph.add_clinical_trial(drug, row['id'], row['date'])
            output_graph.add_journal(drug, row['journal'], row['date'])
Example #3
def preprocess_drugs():
    """
    Loads the drugs file and preprocesses it

    :returns: a dataframe with clean data on drugs
    :rtype: pandas.DataFrame
    """
    drugs_df = read_csv(DRUGS_INPUT_FILE_PATH)
    drugs_df['drug'] = drugs_df['drug'].str.lower()
    return drugs_df
Example #4
def preprocess_pubmed_csv():
    """
    Loads the pubmed csv file and preprocesses it

    :returns: a dataframe with clean data on pubmed
    :rtype: pandas.DataFrame
    """
    pubmed_df = read_csv(PUBMED_CSV_FILE_PATH)
    pubmed_df['title'] = pubmed_df['title'].str.lower()
    return pubmed_df
Example #5
def main():
    filename = "training_data.csv"
    n_hidden_nodes = [5]
    l_rate = 0.6
    n_epochs = 800
    n_folds = 4

    print("Neural network model:\n n_hidden_nodes = {}".format(n_hidden_nodes))
    print(" l_rate = {}".format(l_rate))
    print(" n_epochs = {}".format(n_epochs))
    print(" n_folds = {}".format(n_folds))

    print("\nReading '{}'...".format(filename))
    X, y = utils.read_csv(filename)
    utils.normalize(X)
    N, d = X.shape
    n_classes = len(np.unique(y))

    print(" X.shape = {}".format(X.shape))
    print(" y.shape = {}".format(y.shape))
    print(" n_classes = {}".format(n_classes))

    idx_all = np.arange(0, N)
    idx_folds = utils.crossval_folds(N, n_folds, seed=1)

    acc_train, acc_test = list(), list()
    print("\nTraining and cross-validating...")
    for i, idx_test in enumerate(idx_folds):
        idx_train = np.delete(idx_all, idx_test)
        X_train, y_train = X[idx_train], y[idx_train]
        X_test, y_test = X[idx_test], y[idx_test]

        model = NeuralNetwork(n_input=d,
                              n_output=n_classes,
                              n_hidden_nodes=n_hidden_nodes)
        model.train(X_train, y_train, l_rate=l_rate, n_epochs=n_epochs)

        y_train_predict = model.predict(X_train)
        y_test_predict = model.predict(X_test)

        acc_train.append(100 * np.sum(y_train == y_train_predict) /
                         len(y_train))
        acc_test.append(100 * np.sum(y_test == y_test_predict) / len(y_test))

        print(
            " Fold {}/{}: train acc = {:.2f}%, test acc = {:.2f}% (n_train = {}, n_test = {})"
            .format(i + 1, n_folds, acc_train[-1], acc_test[-1], len(X_train),
                    len(X_test)))

    print("\nAvg train acc = {:.2f}%".format(
        sum(acc_train) / float(len(acc_train))))
    print("Avg test acc = {:.2f}%".format(
        sum(acc_test) / float(len(acc_test))))
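
This example (and the near-identical Example #12 below) relies on utils.normalize and utils.crossval_folds, which are not shown. Minimal sketches consistent with how they are called here; the min-max scaling and equal-size folds are assumptions, and any samples left over after dividing N by n_folds are dropped:

import numpy as np

def normalize(X):
    """Min-max scale each feature column of X in place."""
    X_min, X_max = X.min(axis=0), X.max(axis=0)
    X[:] = (X - X_min) / np.where(X_max > X_min, X_max - X_min, 1.0)

def crossval_folds(N, n_folds, seed=1):
    """Shuffle indices 0..N-1 and split them into n_folds index arrays."""
    rng = np.random.RandomState(seed)
    idx = rng.permutation(N)
    fold_size = N // n_folds
    return [idx[i * fold_size:(i + 1) * fold_size] for i in range(n_folds)]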
Example #6
def main():
    # ===================================
    # Settings
    # ===================================
    csv_filename = "data/creditcard.csv"
    hidden_layers = [5]
    eta = 0.1
    n_epochs = 500
    n_folds = 3

    X, y, n_classes = utils.read_csv(csv_filename, target_name="Class")
    N, d = X.shape
    print(" -> X.shape = {}, y.shape = {}, n_classes = {}\n".format(X.shape, y.shape, n_classes))

    print("Running")
    idx_all = np.arange(0, N)
    idx_folds = utils.crossval_folds(N, n_folds, seed=1)

    acc_train, acc_valid = list(), list()
    print("Cross-validation")
    for i, idx_valid in enumerate(idx_folds):
        idx_train = np.delete(idx_all, idx_valid)
        X_train, y_train = X[idx_train], y[idx_train]
        X_valid, y_valid = X[idx_valid], y[idx_valid]

        model = NeuralNetwork(input_dim=d, output_dim=n_classes,
                              hidden_layers=hidden_layers, seed=1)
        model.train(X_train, y_train, eta=eta, n_epochs=n_epochs)

        ypred_train = model.predict(X_train)
        ypred_valid = model.predict(X_valid)

        acc_train.append(100 * np.sum(y_train == ypred_train) / len(y_train))
        acc_valid.append(100 * np.sum(y_valid == ypred_valid) / len(y_valid))
        print("TP: " + str(np.sum((y_valid == ypred_valid) & (y_valid == 1))))
        print("TN: " + str(np.sum((y_valid == ypred_valid) & (y_valid == 0))))
        print("FP: " + str(np.sum((y_valid != ypred_valid) & (y_valid == 1))))
        print("FN: " + str(np.sum((y_valid != ypred_valid) & (y_valid == 0))))
        TP = np.sum((y_valid == ypred_valid) & (y_valid == 1))
        TN = np.sum((y_valid == ypred_valid) & (y_valid == 0))
        FP = np.sum((y_valid != ypred_valid) & (y_valid == 1))
        FN = np.sum((y_valid != ypred_valid) & (y_valid == 0))
        precision = calculate_precision(TP, FP)
        recall = calculate_recall(TP, FN)

        print(str(f1_score(recall, precision)))
        print(" Fold {}/{}: acc_train = {:.2f}%, acc_valid = {:.2f}% (n_train = {}, n_valid = {})".format(
            i + 1, n_folds, acc_train[-1], acc_valid[-1], len(X_train), len(X_valid)))

    print("  -> acc_train_avg = {:.2f}%, acc_valid_avg = {:.2f}%".format(
        sum(acc_train) / float(len(acc_train)), sum(acc_valid) / float(len(acc_valid))))
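
The calculate_precision, calculate_recall, and f1_score helpers used above are not shown. Sketches under the standard definitions (the call above passes recall before precision, so the sketch keeps that argument order):

def calculate_precision(TP, FP):
    """Precision = TP / (TP + FP); 0 when nothing was predicted positive."""
    return TP / (TP + FP) if TP + FP > 0 else 0.0

def calculate_recall(TP, FN):
    """Recall = TP / (TP + FN); 0 when there are no actual positives."""
    return TP / (TP + FN) if TP + FN > 0 else 0.0

def f1_score(recall, precision):
    """Harmonic mean of precision and recall."""
    denom = precision + recall
    return 2 * precision * recall / denom if denom > 0 else 0.0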
Example #7
def train(_start_temp, _end_temp, _eq_number, _cool_number, nodes_number,
          stride):
    """
    Trains the model by applying the optimization by simulated annealing

    """
    input = read_csv(nodes_number, stride)
    weights = np.zeros((nodes_number, nodes_number))

    start_temp = _start_temp
    end_temp = _end_temp
    T = start_temp
    energy = mse(weights, input, nodes_number, stride)
    eq_number = _eq_number
    cool_parameter = _cool_number

    energies = [energy]
    best_energies = [energy]

    best_weights = weights
    best_energy = energy

    while T >= end_temp:
        print(T)
        # stay on the same temp (in the equilibrium) for eq_number iterations
        for _ in range(eq_number):
            weights, energy = annealing(weights, energy, T, input,
                                        nodes_number, stride)
            T = decrease_temp(T, cool_parameter)

            if energy < best_energy:
                best_energy = energy
                best_weights = weights

            energies.append(energy)
            best_energies.append(best_energy)

    plt.plot(energies, label="Energy")
    plt.plot(best_energies, label="Best energy")
    plt.xlabel("Epochs")
    plt.ylabel("MSE")
    plt.legend()
    plt.show()
    return best_energy
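
The decrease_temp helper used above is not shown. A common schedule, and one plausible sketch, is geometric cooling with a factor slightly below 1:

def decrease_temp(T, cool_parameter):
    """Geometric cooling: scale the temperature by cool_parameter (assumed < 1)."""
    return T * cool_parameter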
Example #8
def preprocess_clinical_trials():
    """
    Loads the clinical_trials file and preprocesses it

    :returns: a dataframe with clean data on clinical_trials
    :rtype: pandas.DataFrame
    """
    clinical_trials_df = read_csv(CLINICAL_TRIALS_INPUT_FILE_PATH)
    clinical_trials_df['scientific_title'] = clinical_trials_df[
        'scientific_title'].str.lower()

    # Remove byte-like characters
    clinical_trials_df['scientific_title'] = clinical_trials_df[
        'scientific_title'].apply(
            lambda x: x.replace('\\xc3', '').replace('\\xb1', ''))
    clinical_trials_df['journal'] = clinical_trials_df['journal'].astype(
        str).apply(lambda x: x.replace('\\xc3', '').replace('\\x28', ''))

    return clinical_trials_df
Example #9
def main():
    # ===================================
    # Settings
    # ===================================
    csv_filename = "data/Leeds02.csv"
    hidden_layers = [5] # number of nodes in hidden layers i.e. [layer1, layer2, ...]
    eta = 0.1 # learning rate
    n_epochs = 400 # number of training epochs
    n_folds = 4 # number of folds for cross-validation
    seed_crossval = 1 # seed for cross-validation
    seed_weights = 1 # seed for NN weight initialization

    # ===================================
    # Read csv data + normalize features
    # ===================================
    print("Reading '{}'...".format(csv_filename))
    X, y, n_classes = utils.read_csv(csv_filename, target_name="y", normalize=True)
    N, d = X.shape
    print(" -> X.shape = {}, y.shape = {}, n_classes = {}\n".format(X.shape, y.shape, n_classes))

    print("Neural network model:")
    print(" input_dim = {}".format(d))
    print(" hidden_layers = {}".format(hidden_layers))
    print(" output_dim = {}".format(n_classes))
    print(" eta = {}".format(eta))
    print(" n_epochs = {}".format(n_epochs))
    print(" n_folds = {}".format(n_folds))
    print(" seed_crossval = {}".format(seed_crossval))
    print(" seed_weights = {}\n".format(seed_weights))

    # ===================================
    # Create cross-validation folds
    # ===================================
    idx_all = np.arange(0, N)
    idx_folds = utils.crossval_folds(N, n_folds, seed=seed_crossval) # list of list of fold indices

    # ===================================
    # Train/evaluate the model on each fold
    # ===================================
    acc_train, acc_valid = list(), list()  # training/validation accuracy scores
    print("Cross-validating with {} folds...".format(len(idx_folds)))
    for i, idx_valid in enumerate(idx_folds):

        # Collect training and test data from folds
        idx_train = np.delete(idx_all, idx_valid)
        X_train, y_train = X[idx_train], y[idx_train]
        X_valid, y_valid = X[idx_valid], y[idx_valid]

        # Build neural network classifier model and train
        model = NeuralNetwork(input_dim=d, output_dim=n_classes,
                              hidden_layers=hidden_layers, seed=seed_weights)
        model.train(X_train, y_train, eta=eta, n_epochs=n_epochs)

        # Make predictions for training and test data
        ypred_train = model.predict(X_train)
        ypred_valid = model.predict(X_valid)

        # Compute training/test accuracy score from predicted values
        acc_train.append(100*np.sum(y_train==ypred_train)/len(y_train))
        acc_valid.append(100*np.sum(y_valid==ypred_valid)/len(y_valid))

        # Print cross-validation result
        print(" Fold {}/{}: acc_train = {:.2f}%, acc_valid = {:.2f}% (n_train = {}, n_valid = {})".format(
            i+1, n_folds, acc_train[-1], acc_valid[-1], len(X_train), len(X_valid)))

    # ===================================
    # Print results
    # ===================================
    print("  -> acc_train_avg = {:.2f}%, acc_valid_avg = {:.2f}%".format(
        sum(acc_train)/float(len(acc_train)), sum(acc_valid)/float(len(acc_valid))))
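
Example #6 above, this example, and Example #13 below share a utils.read_csv variant that returns the feature matrix, integer labels, and class count, with optional normalization. It is not shown; a minimal sketch consistent with those call sites (the pandas-based loading and min-max scaling are assumptions):

import numpy as np
import pandas as pd

def read_csv(csv_filename, target_name="y", normalize=False):
    """Load a CSV, split off the target column, optionally min-max scale X."""
    df = pd.read_csv(csv_filename)
    y = df[target_name].values.astype(int)
    X = df.drop(columns=[target_name]).values.astype(float)
    if normalize:
        X_min, X_max = X.min(axis=0), X.max(axis=0)
        X = (X - X_min) / np.where(X_max > X_min, X_max - X_min, 1.0)
    return X, y, len(np.unique(y))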
Example #10
def main(model_name, data_file, i_days, mva, batch_size, p_days):
    """
    Args:
        model_name (string): model name
        data_file (string): data_file
        i_days (int): number of input days per test sequence
        mva (bool): whether to apply 7-day moving average
        batch_size (int): number of test sequences
        p_days (int): number of prediction days per test sequence
    """

    # initialize model
    if model_name == 'arma':
        model = ARMA()
    elif model_name == 'seird':
        model = SEIRD()
    elif model_name == 'gamma':
        model_type = 'default'  #There is no other model for now.
        delta1 = 11
        delta2 = 18
        delta3 = 14
        p = 0.02
        num_past_days = 7
        model = GAMMA(model_type, delta1, delta2, delta3, p,
                      num_past_days)  #Config file needs to be added.
    elif model_name == 'gamma_l1':
        model_type = 'default'  #There is no other model for now.
        delta1 = 11
        delta2 = 18
        delta3 = 14
        p = 0.02
        num_past_days = 7
        lbd = 1000
        model = GAMMA_L1(model_type, delta1, delta2, delta3, p, num_past_days,
                         lbd)  #Config file needs to be added.
    elif model_name == 'gamma_2':
        model_type = 'default'  #There is no other model for now.
        delta1 = 11
        delta2 = 18
        delta3 = 14
        p = 0.02
        num_past_days = 7
        lbd = 1000
        model = GAMMA_2(model_type, delta1, delta2, delta3, p,
                        num_past_days)  #Config file needs to be added.

    else:
        raise ValueError('Invalid model type: {}'.format(model_name))

    # load datafile
    data_dir = './datasets/processed/'
    data, dates, columns = utils.read_csv(data_dir + data_file)

    # apply moving average
    if mva:
        data, dates = utils.moving_average(data, dates, days=7)

    # split up data into batch_size train+test sequences
    datasplit = utils.train_test_split_multi(data,
                                             dates,
                                             train_days=i_days,
                                             test_days=p_days,
                                             batch_size=batch_size,
                                             seed=0)
    train, test, train_dates, test_dates = datasplit

    B = train.shape[0]  # number of train/test sequences (i.e. batch_size)

    # fit and predict on each sequence
    c_preds, h_preds, d_preds = [], [], []
    for i in range(B):

        # refit model with new sequence
        model.fit(train[i])

        # predict days

        c_preds.append(model.predict_cases(p_days))
        h_preds.append(model.predict_hospitalizations(p_days))
        d_preds.append(model.predict_deaths(p_days))

    # evaluate metrics
    print("Model:", model_name)

    c_true, h_true, d_true = test[..., 0], test[..., 1], test[..., 2]

    for (pred, true, name) in [(c_preds, c_true, "Cases"),
                               (h_preds, h_true, "Hospitalizations"),
                               (d_preds, d_true, "Deaths")]:

        print(f'{name}...')
        # skip if no prediction
        if pred[0] is None:
            print("%s: no predictions" % (name))
            continue

        # batch predictions
        pred_batch = np.stack(pred)

        # run metrics on batches
        rmses = utils.rmse(true, pred_batch)
        maes = utils.mae(true, pred_batch)
        mapes = utils.mape(true, pred_batch)

        # report mean and std around mean (std / sqrt(B))
        for (metric, name) in [(rmses, "RMSE"), (maes, "MAE"),
                               (mapes, "MAPE")]:
            print('%s: %f +/- %f' % (name, metric.mean(), metric.std() /
                                     (len(metric)**0.5)))

    # plotting
    utils.plotting(train, test, c_preds, h_preds, d_preds, model_name)
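
The utils.moving_average used above (and again in Example #16 below) is not shown. A sketch assuming a trailing window applied per column, with the dates trimmed so they stay aligned with the smoothed values:

import numpy as np

def moving_average(data, dates, days=7):
    """Trailing moving average over axis 0; drops the first days-1 dates."""
    kernel = np.ones(days) / days
    smoothed = np.stack([np.convolve(data[:, j], kernel, mode='valid')
                         for j in range(data.shape[1])], axis=1)
    return smoothed, dates[days - 1:]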
Example #11
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

from src.generate_data import parse_web_page
from src.utils import read_csv


def parallelize(func, iterable, use_thread=True):
    # Map func over iterable with a thread pool (I/O-bound work) or a
    # process pool (CPU-bound work).
    pool_executor = ThreadPoolExecutor if use_thread else ProcessPoolExecutor
    with pool_executor() as executor:
        return list(executor.map(func, iterable))

if __name__ == "__main__":
    data = read_csv("data/summary/common.csv")
    urls = [row["doc_url"] for row in data if row["doc_url"]]
    selected_urls = urls[:100]
    
    start = time.time()
    sequential_result = [parse_web_page(url) for url in selected_urls]
    done = time.time()
    print(f"Done, Sequential {start} -> {done} = {done - start}")
    
    start = done
    thread_result = parallelize(parse_web_page, selected_urls, use_thread=True)
    done = time.time()
    print(f"Done, Threading {start} -> {done} = {done - start}")
    
    # start = time.time()
    # parallelize(parse_web_page, selected_urls, use_thread=False)
    # # parallelize(print, selected_urls, use_thread=False)
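
For reference, the process-pool timing left commented out above would look like the following. ProcessPoolExecutor pickles the mapped callable and its arguments, so parse_web_page must be importable at module top level; that constraint, or the startup cost of worker processes for I/O-bound scraping, is a plausible reason the run was disabled:

    start = time.time()
    process_result = parallelize(parse_web_page, selected_urls, use_thread=False)
    done = time.time()
    print(f"Done, Multiprocessing {start} -> {done} = {done - start}")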
Example #12
def main():
    # ===================================
    # Settings
    # ===================================
    filename = "data/seeds_dataset.csv"
    n_hidden_nodes = [
        5
    ]  # nodes in hidden layers i.e. [n_nodes_1, n_nodes_2, ...]
    l_rate = 0.6  # learning rate
    n_epochs = 800  # number of training epochs
    n_folds = 4  # number of folds for cross-validation

    print("Neural network model:\n n_hidden_nodes = {}".format(n_hidden_nodes))
    print(" l_rate = {}".format(l_rate))
    print(" n_epochs = {}".format(n_epochs))
    print(" n_folds = {}".format(n_folds))

    # ===================================
    # Read data (X,y) and normalize X
    # ===================================
    print("\nReading '{}'...".format(filename))
    X, y = utils.read_csv(filename)  # read as matrix of floats and int
    utils.normalize(X)  # normalize
    N, d = X.shape  # extract shape of X
    n_classes = len(np.unique(y))

    print(" X.shape = {}".format(X.shape))
    print(" y.shape = {}".format(y.shape))
    print(" n_classes = {}".format(n_classes))

    # ===================================
    # Create cross-validation folds
    # These are a list of a list of indices for each fold
    # ===================================
    idx_all = np.arange(0, N)
    idx_folds = utils.crossval_folds(N, n_folds, seed=1)

    # ===================================
    # Train and evaluate the model on each fold
    # ===================================
    acc_train, acc_test = list(), list()  # training/test accuracy score
    print("\nTraining and cross-validating...")
    for i, idx_test in enumerate(idx_folds):

        # Collect training and test data from folds
        idx_train = np.delete(idx_all, idx_test)
        X_train, y_train = X[idx_train], y[idx_train]
        X_test, y_test = X[idx_test], y[idx_test]

        # Build neural network classifier model and train
        model = NeuralNetwork(n_input=d,
                              n_output=n_classes,
                              n_hidden_nodes=n_hidden_nodes)
        model.train(X_train, y_train, l_rate=l_rate, n_epochs=n_epochs)

        # Make predictions for training and test data
        y_train_predict = model.predict(X_train)
        y_test_predict = model.predict(X_test)

        # Compute training/test accuracy score from predicted values
        acc_train.append(100 * np.sum(y_train == y_train_predict) /
                         len(y_train))
        acc_test.append(100 * np.sum(y_test == y_test_predict) / len(y_test))

        # Print cross-validation result
        print(
            " Fold {}/{}: train acc = {:.2f}%, test acc = {:.2f}% (n_train = {}, n_test = {})"
            .format(i + 1, n_folds, acc_train[-1], acc_test[-1], len(X_train),
                    len(X_test)))

    # ===================================
    # Print results
    # ===================================
    print("\nAvg train acc = {:.2f}%".format(
        sum(acc_train) / float(len(acc_train))))
    print("Avg test acc = {:.2f}%".format(
        sum(acc_test) / float(len(acc_test))))
Example #13
import src.utils as utils

# Settings
csv_filename = "data/seeds_dataset.csv"
hidden_layers = [
    5
]  # number of nodes in hidden layers i.e. [layer1, layer2, ...]
eta = 0.1  # learning rate
n_epochs = 400  # number of training epochs
n_folds = 4  # number of folds for cross-validation
seed_crossval = 1  # seed for cross-validation
seed_weights = 1  # seed for NN weight initialization

# Read csv data + normalize features
print("Reading '{}'...".format(csv_filename))
X, y, n_classes = utils.read_csv(csv_filename, target_name="y", normalize=True)
print(" -> X.shape = {}, y.shape = {}, n_classes = {}\n".format(
    X.shape, y.shape, n_classes))
N, d = X.shape

print("Neural network model:")
print(" input_dim = {}".format(d))
print(" hidden_layers = {}".format(hidden_layers))
print(" output_dim = {}".format(n_classes))
print(" eta = {}".format(eta))
print(" n_epochs = {}".format(n_epochs))
print(" n_folds = {}".format(n_folds))
print(" seed_crossval = {}".format(seed_crossval))
print(" seed_weights = {}\n".format(seed_weights))

# Create cross-validation folds
idx_all = np.arange(0, N)
idx_folds = utils.crossval_folds(N, n_folds, seed=seed_crossval)  # list of fold index arrays
Example #14
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import logging

from src.domain.euromillions.rules import Rules
from src.domain.euromillions.compute import Compute

# import argparse

from src import utils

FORMAT = "%(asctime)s %(name)s %(levelname)s - %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO)
logging.getLogger("winner").setLevel(logging.INFO)

history_file = "assets/history/euromillions-1354.csv"

if __name__ == "__main__":
    logging.info("lets check for a winner")
    numbers = utils.read_csv(file=history_file, depth=5)
    for n in numbers:
        logging.info(n)

    rules = Rules()
    cmp = Compute(rules, numbers)

    cmp.compute()
Example #15
    unknown_file = os.path.join('', uc_configs['trajectory'])
    is_trajectory = True
else:
    print("Please determine a correct value for your use case")
    sys.exit()

error_handler.handle_wrong_arguments(known_file, unknown_file)
known_filename, known_file_extension = os.path.splitext(known_file)
unknown_filename, unknown_file_extension = os.path.splitext(unknown_file)

trajectory_list = []
leak_lists = []

# read trajectories and leaking source files
if known_file_extension == '.csv':
    leak_lists = utils.read_csv(known_file)
elif known_file_extension == '.json':
    leak_lists = get_matrix_geojson(known_file)
if unknown_file_extension == '.csv':
    trajectory_list = utils.read_csv(unknown_file)
elif unknown_file_extension == '.json':
    trajectory_list = get_matrix_geojson(unknown_file)

print('checking requirements of known stations...')
error_handler.check_requirements(leak_lists[0], is_random)
print('checking requirements of trajectories or unknown stations...')
error_handler.check_requirements(trajectory_list[0], is_random)

# initialize variables
unknown_stations = trajectory_list
sensors_payload = []
Example #16
import os
import sys
sys.path.append('./')
import numpy as np
import matplotlib.pyplot as plt
from src import utils

# load data from processed csv
path_loc = './datasets/processed/sf.csv'
data, dates, columns = utils.read_csv(path_loc)
print("Data columns: ", columns)
print("Max Daily Deaths: ", data[:, 2].max())

# without averaging

train, test, train_dates, test_dates = utils.train_test_split(data, dates)

plt.figure()
plt.title('SF Data w/o moving average')
plt.plot(train_dates, train)
plt.plot(test_dates, test)
plt.legend(columns)
plt.show()

# with 7-day moving average
mva_data, mva_dates = utils.moving_average(data, dates, days=7)
train, test, train_dates, test_dates = utils.train_test_split(
    mva_data, mva_dates)

plt.figure()
plt.title('SF Data w/ 7-day moving average')
plt.plot(train_dates, train)