Example #1
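# Block compression of a pickled term index: the first entry of each block of k keeps the
# full term; later entries keep only term[c:], where c = compare(previous_term, term).
# The compressed list is cached to '<name>_compressed.pickle' and reused unless force_update.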
def compress(name, k=8, force_update=False):
    result = []
    block_list = []
    elements_file_name = name + '.pickle'
    compressed_elements_file_name = name + '_compressed.pickle'
    if not force_update and os.path.exists(compressed_elements_file_name):
        return load_data(compressed_elements_file_name)
    else:
        elements = load_data(elements_file_name)
        for i in range(0, len(elements), k):
            block_list.append(
                    (0,
                     elements[i].term,
                     elements[i].count,
                     elements[i].posting_lists
                     )
            )
            last_index = len(elements) - i
            for bi in range(1, min(k, last_index)):
                c = compare(elements[i + bi - 1].term, elements[i + bi].term)
                block_list.append(
                        (c,
                         elements[i + bi].term[c:],
                         elements[i + bi].count,
                         # elements[i + bi].term,
                         elements[i + bi].posting_lists
                         )
                )
            result.append(block_list)
            block_list = []
        save_data(result, compressed_elements_file_name)
        return result
Example #2
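	# Walks the color-image folder, loads each color frame and its matching depth frame with
	# utils.load_data, background-subtracts the color frame, masks the normalized depth image
	# with that foreground mask, and yields one Hands object per frame.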
	def hand_generator(self):

		for root, directories, files in walk(self.color_folder):
			for c_filename in files:
				if self.c_file_ext in c_filename:

					try:
						c_filepath = path.join(root, c_filename)
						d_filepath = c_filepath.replace('color', 'depth').replace(self.c_file_ext, self.d_file_ext)

						_, c_data = utils.load_data(c_filepath, self.data_type)
						d_data, d_data_norm = utils.load_data(d_filepath, 'depth')


						c_fgmask = self.c_fgbg.apply(c_data)
						c_fgmask = cv2.morphologyEx(c_fgmask, cv2.MORPH_OPEN, self.kernel)
		
						img_bs = d_data_norm.copy()
						img_bs[c_fgmask==0] = 0
						img_bs = ip.smooth_image(img_bs)

						boxes = ip.parse_hands(img_bs, display=False)
						hands = Hands(c_fgmask, d_data, boxes, d_filepath, 'depth')

						yield hands
					except Exception as e:
						print(e)
Example #3
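	# Sets up color and depth MOG2 background subtractors, then primes the color model by
	# feeding it every background frame loaded with utils.load_data.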
	def initialize_detector(self, background_folder, video_folder, file_ext):

		self.c_fgbg 		= cv2.BackgroundSubtractorMOG2(300, 20, True)
		self.d_fgbg			= cv2.BackgroundSubtractorMOG2(300, 10, False)

		color_bg_folder		= background_folder + "/color"

		self.color_folder	= video_folder + "/color"
		self.depth_folder	= video_folder + "/depth"
		self.c_file_ext		= file_ext
		self.d_file_ext		= 'bin'

		count = 0
		thresh_total = 0
		for root, directories, files in walk(color_bg_folder):
			for c_filename in files:
				if self.c_file_ext in c_filename:

					try:
						c_filepath = path.join(root, c_filename)
						# d_filepath = c_filepath.replace('color', 'depth').replace(self.c_file_ext, self.d_file_ext)

						_, c_data = utils.load_data(c_filepath, self.data_type)
						# d_data, d_data_norm = utils.load_data(d_filepath, 'depth')

						c_fgmask = self.c_fgbg.apply(c_data)
						# d_fgmask = self.d_fgbg.apply(d_data)
					except Exception as e:
						print(e)

		self.initialized = True
Example #4
	def hand_generator(self):
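		# Loads each frame with utils.load_data, background-subtracts it, removes the legs
		# from the foreground mask, and yields a Hands object with the detected hand boxes.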

		for root, directories, files in walk(self.folder):
			for filename in files:
				if self.file_ext in filename:
					filepath = path.join(root, filename)

					data, data_norm = utils.load_data(filepath, self.data_type)

					fgmask = self.fgbg.apply(data_norm)
					fgmask = cv2.morphologyEx(fgmask, cv2.MORPH_OPEN, self.kernel)
		
					img_bs = data.copy()
					img_bs[fgmask==0] = 0
					img_bs = ip.smooth_image(img_bs)

					orig_mask = fgmask.copy()

					fgmask, thresh_val = self.remove_legs(img_bs)

					img_fg = data_norm.copy()
					img_fg[fgmask==0] = 0

					boxes = ip.parse_hands(fgmask, display=False)
					hands = Hands(fgmask, data, boxes, filepath, 'depth')

					yield hands
Example #5
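# Builds an inverted index: one Element per (token, doc_id) pair, sorted and merged so that
# duplicates are folded together via Element.update(); the result is pickled for reuse.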
def get_index(folder_name, force_update=False):
    index_file_name = folder_name + '.pickle'
    if not force_update and os.path.exists(index_file_name):
        return load_data(index_file_name)
    else:
        elements = []
        documents = get_file_list(folder_name)
        for doc_id in range(len(documents)):
            elements += map(lambda x: Element(x, doc_id), get_tokens(documents[doc_id]))
        elements.sort()
        result = []
        for el in elements:
            if result and result[-1] == el:
                result[-1].update(el)
            else:
                result.append(el)

    save_data(result, name=index_file_name)
    return result
Example #6
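	# Yields a background-subtracted, smoothed frame (plus its file path) for every file in
	# self.folder whose name matches self.file_ext.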
	def img_generator(self):

		for root, directories, files in walk(self.folder):
			for filename in files:
				if self.file_ext in filename:
					filepath = path.join(root, filename)

					try:

						data, data_norm = utils.load_data(filepath, self.data_type)

						fgmask = self.fgbg.apply(data_norm)
						fgmask = cv2.morphologyEx(fgmask, cv2.MORPH_OPEN, self.kernel)
			
						img_bs = data_norm.copy()
						img_bs[fgmask==0] = 0
						img_bs = ip.smooth_image(img_bs)

						yield img_bs, filepath
					except Exception as e:
						print(e)
Example #7
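	# Creates a single MOG2 background subtractor and primes it with every frame found in
	# background_folder before marking the detector as initialized.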
	def initialize_detector(self, background_folder, video_folder, file_ext):

		self.fgbg 		= cv2.BackgroundSubtractorMOG2(300,.2,False)

		self.folder		= video_folder
		self.file_ext	= file_ext

		count = 0
		thresh_total = 0
		for root, directories, files in walk(background_folder):
			for filename in files:
				if self.file_ext in filename:

					try:
						filepath = path.join(root, filename)
						data, data_norm = utils.load_data(filepath, self.data_type)

						fgmask = self.fgbg.apply(data_norm)
					except Exception as e:
						print(e)

		self.initialized = True
Example #8
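# Training entry point: fixes the random seed, picks the device, loads the dataset, builds an
# AttentionalFM model, and prepares a time-stamped output directory for the weights.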
if __name__ == "__main__":
    args = parse_args()

    # fix random seed
    set_random_seed(args.seed)

    # use cuda or not
    if args.use_cuda:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # load data
    data, n_fields, n_features = load_data(args.dataset,
                                           device=device,
                                           use_content=args.use_content,
                                           use_rating=False,
                                           print_info=True)

    # create model
    model = AttentionalFM(n_features=n_features,
                          n_fields=n_fields,
                          embedding_dim=args.embedding_dim)
    model.to(device=device)

    # output dir
    output_dir = "./results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
    output_path = output_dir + "model.weights"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
Example #9
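    # Preprocesses the training images, loads them with load_data, and sets up the model
    # path and weight directory before optionally loading existing weights.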
    ############### PREPROCESSING ###############

    classes = results.classes.replace(" ", "").split(',')

    preprocess_dir(TRAIN_DIR,
                   PREPROCESSED_DIR,
                   REORIENT_SCRIPT_PATH,
                   ROBUSTFOV_SCRIPT_PATH,
                   classes,
                   results.numcores,
                   verbose=0)

    ############### DATA IMPORT ###############

    X, y, filenames, num_classes, img_shape = load_data(
        PREPROCESSED_DIR, classes)

    print("Finished data processing")

    ############### MODEL SELECTION ###############

    LR = 1e-3
    LOAD_WEIGHTS = False
    MODEL_NAME = "phinet_model_" + "-".join(results.classes.split(","))
    MODEL_PATH = os.path.join(WEIGHT_DIR, MODEL_NAME + ".json")

    if not os.path.exists(WEIGHT_DIR):
        os.makedirs(WEIGHT_DIR)

    if LOAD_WEIGHTS:
        weight_files = os.listdir(WEIGHT_DIR)
Example #10
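# Builds a Normalizer over the continuous channels (discretizer headers without '->'), loads
# its saved parameters, and reads the train/val sets through utils.load_data with the discretizer.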
cont_channels = [
    i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
]

normalizer = Normalizer(
    fields=cont_channels)  # choose here which columns to standardize
normalizer_state = 'ihm_normalizer'
normalizer_state = os.path.join(os.path.dirname(data_path), normalizer_state)
normalizer.load_params(normalizer_state)

# %%

n_trained_chunks = 0
train_raw = utils.load_data(train_reader,
                            discretizer,
                            normalizer,
                            small_part,
                            return_names=True)
val_raw = utils.load_data(val_reader,
                          discretizer,
                          normalizer,
                          small_part,
                          return_names=True)

# %%

demographic_data = []
diagnosis_data = []
idx_list = []

demo_path = data_path + 'demographic/'
Example #11
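# Loads the train/test/extra digit splits with load_data, reshapes the labels to (N, 5, 1),
# and wraps the arrays in batched datasets via datset_gen.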
import os
import sys
from utils.utils import load_data, datset_gen, savepickle
from utils.trainer import train_with_checkpoin_tensorboard

from Models import svhn_model_simple

import tensorflow as tf
import numpy as np
import pandas as pd

root_path = "I:\\Files_ML\\Coursera\\Dl_ON_ud\\dataset"
test_path = os.path.join(root_path, "test")
train_path = os.path.join(root_path, "train")
extra_path = os.path.join(root_path, "extra")

X_test, y_test = load_data(test_path, num_only=True)
X_train, y_train = load_data(train_path, num_only=True)
X_extra, y_extra = load_data(extra_path, num_only=True)

#X_train = np.concatenate([X_train,X_extra])
#y_train = np.concatenate([y_train,y_extra])
y_train = y_train.reshape((-1, 5, 1))
y_test = y_test.reshape((-1, 5, 1))

# %%
batch_size = 32

ds_train = datset_gen(X_train, y_train, batch_size=batch_size, buffer_size=100)
ds_test = datset_gen(X_test, y_test, batch_size=batch_size, buffer_size=100)

# %%
Example #12
def initial_load():
    kwargs = request.args.to_dict()
    print(kwargs)
    return load_data(**kwargs)
Example #13
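# Loads attribute features and data splits, then trains a Generator/Discriminator/Regressor
# trio with least-squares (MSE) adversarial losses, checkpointing every 10 epochs and
# logging the averaged losses per epoch.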
def main():
    logger = logMaster.get_logger('main')
    logger.info('loading data...')
    att_feats, train_data, val_data, test_data, test_s_data, classes = load_data(
        att_path=att_path, res_path=res_path)

    logger.info('building model...')

    gen = Generator(x_dim=args.x_dim,
                    s_dim=args.s_dim,
                    z_dim=args.z_dim,
                    layers=args.dec)
    # gen.train()
    # states = torch.load(args.vae_ckpt)
    # gen.load_state_dict(states['model'])

    dis = Discriminator(x_dim=args.x_dim, s_dim=args.s_dim, layers=args.dis)
    reg = Regressor(x_dim=args.x_dim, s_dim=args.s_dim, layers=args.reg)

    gen.cuda()
    dis.cuda()
    reg.cuda()

    mse_loss = nn.MSELoss()
    l1_loss = nn.L1Loss()

    adam_betas = (0.8, 0.999)
    gen_opt = optim.Adam(gen.parameters(),
                         lr=args.learning_rate,
                         weight_decay=0.01,
                         betas=adam_betas)
    dis_opt = optim.Adam(dis.parameters(),
                         lr=args.learning_rate,
                         weight_decay=0.01,
                         betas=adam_betas)
    reg_opt = optim.Adam(reg.parameters(),
                         lr=args.learning_rate,
                         weight_decay=0.01,
                         betas=adam_betas)

    train_manager = DataManager(train_data,
                                args.epoch,
                                args.batch,
                                infinite=True)

    ones = Variable(torch.ones([args.batch, 1]),
                    requires_grad=False).float().cuda()
    zeros = Variable(torch.zeros([args.batch, 1]),
                     requires_grad=False).float().cuda()

    loss_history = []
    logger.info('start training...')
    for epoch in range(args.epoch):
        running_loss = 0
        t1 = time.time()
        d_total_loss = 0.0
        g_total_loss = 0.0
        # cyc_total_loss = 0.0
        r_total_loss = 0.0
        # rd_total_loss = 0.0
        # vae_total_loss = 0.0
        g_scores = 0.0

        if args.steps == -1:
            steps = train_manager.num_batch
        else:
            steps = args.steps

        for batch in tqdm(range(steps), leave=False, ncols=70, unit='b'):
            for i in range(args.d_iter):
                dis.zero_grad()

                # get true data
                data = train_manager.get_batch()
                X = Variable(torch.from_numpy(
                    np.asarray([item[0] for item in data]))).float().cuda()
                Y = [item[1] for item in data]
                S = Variable(torch.from_numpy(att_feats[Y])).float().cuda()
                Yc = get_negative_samples(Y, classes['train'])
                Sc = Variable(torch.from_numpy(att_feats[Yc])).float().cuda()

                # get fake data
                # Xp = gen.forward(X, S)
                # Xp = Xp.detach()  # fix the generator
                Xpp = gen.sample(S).detach()
                Sp = reg.forward(X).detach()  # fix the regressor

                # get scores
                true_scores = dis.forward(X, S)
                # fake_scores = dis.forward(Xp, S)
                fake_scores2 = dis.forward(Xpp, S)
                # reg_scores = dis.forward(X, Sp)
                # ctrl_scores = dis.forward(X, Sc)

                # calculate loss
                d_loss = mse_loss(true_scores, ones) + mse_loss(fake_scores2, zeros)
                # + args.theta3 * mse_loss(reg_scores, zeros)
                # + mse_loss(ctrl_scores, zeros)

                d_loss.backward()
                dis_opt.step()

                d_total_loss += d_loss.cpu().data.numpy()

            for i in range(args.g_iter):
                gen.zero_grad()
                reg.zero_grad()

                # get true data
                data = train_manager.get_batch()
                X = Variable(torch.from_numpy(
                    np.asarray([item[0] for item in data]))).float().cuda()
                Y = [item[1] for item in data]
                S = Variable(torch.from_numpy(att_feats[Y])).float().cuda()

                # get fake data
                # Xp, mu, log_sigma = gen.forward(X, S)
                Xp2 = gen.sample(S)
                Sp = reg.forward(X)
                # Spp = reg.forward(Xp)
                # Xpp, _, _ = gen.forward(X, Sp)

                # get scores
                # fake_scores = dis.forward(Xp, S)
                fake_scores2 = dis.forward(Xp2, S)
                # reg_scores = dis.forward(X, Sp)

                # calculate loss
                # vae_loss = gen.vae_loss(X=X, Xp=Xp, mu=mu, log_sigma=log_sigma)
                # cyc_loss = mse_loss(Spp, S) + mse_loss(Xpp, X)

                g_loss = mse_loss(fake_scores2, ones)
                r_loss = mse_loss(Sp, S)
                # rd_loss = mse_loss(reg_scores, ones)

                # total_loss = vae_loss + g_loss + args.theta1 * cyc_loss + args.theta2 * r_loss + args.theta3 * rd_loss
                total_loss = g_loss + args.theta2 * r_loss
                total_loss.backward()

                gen_opt.step()
                reg_opt.step()

                # vae_total_loss += vae_loss.cpu().data.numpy()
                g_total_loss += g_loss.cpu().data.numpy()
                # cyc_total_loss += cyc_loss.cpu().data.numpy()
                r_total_loss += r_loss.cpu().data.numpy()
                # rd_total_loss += rd_loss.cpu().data.numpy()
                g_scores += np.mean(fake_scores2.cpu().data.numpy())

        g_total_steps = steps * args.g_iter
        d_total_steps = steps * args.d_iter
        # vae_avg_loss = vae_total_loss / g_total_steps
        g_avg_loss = g_total_loss / g_total_steps
        # cyc_avg_loss = cyc_total_loss / g_total_steps
        r_avg_loss = r_total_loss / g_total_steps
        # rd_avg_loss = rd_total_loss / g_total_steps
        d_avg_loss = d_total_loss / d_total_steps
        g_avg_score = g_scores / g_total_steps
        loss_history.append(
            f'{g_avg_loss:.4}\t{d_avg_loss:.4}\t{r_avg_loss:.4}\t'
            f'{g_avg_score:.4}\n')
        elapsed = (time.time() - t1) / 60.0

        if (epoch + 1) % 10 == 0 or epoch == 0:
            filename = 'gdan_' + str(epoch + 1) + '.pkl'
            save_path = save_dir / Path(filename)
            states = dict()
            states['epoch'] = epoch + 1
            states['gen'] = gen.state_dict()
            states['dis'] = dis.state_dict()
            states['reg'] = reg.state_dict()
            # states['enc_layers'] = args.enc
            states['gen_layers'] = args.dec
            states['reg_layers'] = args.reg
            states['dis_layers'] = args.dis
            states['z_dim'] = args.z_dim
            states['x_dim'] = args.x_dim
            states['s_dim'] = args.s_dim
            states['gen_opt'] = gen_opt.state_dict()
            states['dis_opt'] = dis_opt.state_dict()
            states['reg_opt'] = reg_opt.state_dict()
            states['theta1'] = args.theta1
            states['theta2'] = args.theta2
            states['theta3'] = args.theta3

            torch.save(states, str(save_path))
            logger.info(
                f'epoch: {epoch+1:4}, g_loss: {g_avg_loss: .4}, d_loss: {d_avg_loss: .4}, \n'
                f'r_loss: {r_avg_loss: .4}, '
                f'g_score: {g_avg_score:.4}')

    with result_path.open('w') as fout:
        for s in loss_history:
            fout.write(s)

    logger.info('program finished')
Example #14
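# Counts the passwords that satisfy each of the two policy interpretations: a character-count
# range in part one, and exactly one matching position in part two.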
def partOne(data):
    valid = 0
    for n in data:
        l, u, k, p = processItem(n)
        count = utils.count_characters_in_string(p, k)
        if utils.int_in_range(count, l, u):
            valid += 1

    return valid


def partTwo(data):
    valid = 0
    for n in data:
        p1, p2, k, p = processItem(n)
        if (p[int(p1) - 1] == k) ^ (p[int(p2) - 1] == k):
            valid += 1

    return valid


if __name__ == "__main__":
    # Load Data
    data = utils.load_data("day2.txt")

    # Do puzzle
    print("---- Day 2 ----")
    print("Part 1: " + str(partOne(data)))
    print("Part 2: " + str(partTwo(data)))
Example #15
def plot_power_sector(s=None, c=None, order=None):
    data = load_data()
    data = data[data.cost != 'none']

    if s is None:
        all_cost = sorted(list(set(data.cost)))
        s = [
            min(all_cost), all_cost[int((len(all_cost) - 1) / 2)],
            max(all_cost)
        ]

    if c is None:
        all_cost = sorted(list(set(data.tax)))
        c = [
            min(all_cost), all_cost[int((len(all_cost) - 1) / 2)],
            max(all_cost)
        ]

    data = data[data.cost.isin(s)]
    data = data[data.tax.isin(c)]

    synonyms = {
        'Solar': 'Renewable',
        'Hydro': 'Renewable',
        'Wind': 'Renewable',
        'Biomass': 'Renewable',
        'Import': 'Others',
        'Oil': 'Others'
    }

    col_dic = {
        'Coal|w/o CCS': '#000000',
        'Coal|w/ CCS': '#918F88',
        'Gas|w/o CCS': '#A3CFD6',
        'Gas|w/ CCS': '#D3EAED',
        'Renewable': '#4EB378',
        'Nuclear': '#724ac1',
        'Others': '#b2b2b2'
    }

    years = [2020, 2030, 2040, 2050]
    activity = get_plot_data(data,
                             keyword='Secondary Energy|Electricity',
                             synonyms=synonyms,
                             col_dic=col_dic)
    activity[years] = activity[years] * 8.760
    plot_facet_grids(activity,
                     y_title='PPL Activity [TWh]',
                     figure_title='Power_Activity_TWh',
                     col_dic=col_dic,
                     y_max=880,
                     order=order)

    capacity = get_plot_data(data,
                             keyword='Capacity|Electricity',
                             synonyms=synonyms,
                             col_dic=col_dic)
    plot_facet_grids(capacity,
                     y_title='PPL Capacity [GW]',
                     figure_title='Power_Capacity_GW',
                     col_dic=col_dic,
                     y_max=280,
                     order=order)
Example #16
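# Parses CLI options, seeds NumPy/PyTorch, loads the graph dataset with load_data, and reads
# the 10-fold train/test index files for the requested fold.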
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='MUTAG')
parser.add_argument('--hidden', type=int, default=32)
parser.add_argument('--idx', type=int, default=1)
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--epochs', type=int, default=300)
parser.add_argument('--lr', type=float, default=0.01)
args = parser.parse_args()

np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
# writer = SummaryWriter('runs/la_PROTEINS')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

dataset = load_data(args.dataset)
index_train = []
index_test = []
with open(
        osp.join(osp.dirname(osp.realpath(__file__)), 'datasets',
                 '%s' % args.dataset, '10fold_idx',
                 'train_idx-%d.txt' % args.idx), 'r') as f_train:
    for line in f_train:
        index_train.append(int(line.split('\n')[0]))
with open(
        osp.join(osp.dirname(osp.realpath(__file__)), 'datasets',
                 '%s' % args.dataset, '10fold_idx',
                 'test_idx-%d.txt' % args.idx), 'r') as f_test:
    for line in f_test:
        index_test.append(int(line.split('\n')[0]))
Example #17
import os
from S_and_R import main
from utils.utils import load_data

if __name__ == '__main__':
    input_parameters = dict(number_of_candles=60,
                            minimum_window_size=5,
                            maximum_window_size=20,
                            tolerance=0.001)

    path = os.path.join("data", "EURUSD1440.csv")
    dataframe = load_data(path)
    unique_resistances_supports_list, scaled_power_resistances_supports_list = main(dataframe, input_parameters)
Example #18
    def load_data_from_json(self):
        title, description = load_data()
        self.set_title(title)
        self.set_description(description)
Example #19
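# Keras pipeline: one-hot encodes categoricals, recovers the train/test frames, replaces NaNs
# with -1, and MinMax-scales the features before model training.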
import numpy as np
import pandas as pd
import utils

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold

from keras.models import load_model
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback, CSVLogger
from keras.wrappers.scikit_learn import KerasClassifier

MODEL_NAME = 'keras_1'

combined = utils.load_data()
combined = utils.cat_transform(combined, 'onehot')

train, test = utils.recover_train_test_na(combined, fillna=True)

# Fillna for minmax scaler
train = train.replace(np.NaN, -1)
test = test.replace(np.NaN, -1)

X_train = train.drop('target', axis=1)
y_train = train.target
X_test = test

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
Example #20
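# GMVAE experiment setup: validates the model type, creates the experiment directories, loads
# the train/valid/test splits, and fills in the network hyper-parameters.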
args = get_args()
if (args.model_type != const.GMVAE and args.model_type != const.GMVAECNN):
    print('Choose a valid model_type!')
    sys.exit()
    # model checkpoint experiments/checkpoint/GMV_FREY_0001_8_2_8_64_2/-1029 ..
config, flags = get_config_and_flags(args)

# create the experiments dirs
utils.create_dirs(
    [config.summary_dir, config.checkpoint_dir, config.results_dir])
utils.save_args(args, config.summary_dir)
'''  ------------------------------------------------------------------------------
                                     GET DATA
    ------------------------------------------------------------------------------ '''
print('\n Loading data...')
data_train, data_valid, data_test = utils.load_data(config.dataset_name)
'''  ------------------------------------------------------------------------------
                                     GET NETWORK PARAMS
    ------------------------------------------------------------------------------ '''
network_params = Bunch()
network_params.input_height = data_train.height
network_params.input_width = data_train.width
network_params.input_nchannels = data_train.num_channels

network_params.hidden_dim = config.hidden_dim
network_params.z_dim = config.z_dim
network_params.w_dim = config.w_dim
network_params.K = config.K_clusters
network_params.num_layers = config.num_layers
'''  -----------------------------------------------------------------------------
                        COMPUTATION GRAPH (Build the model)
Example #21
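# Splits a master dataset into hold-out train/val/test sets: creates the output directories
# and logger, loads the data, optionally plots the target histogram, and calls data_splitter().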
def run(args):

    print("\nInput args:")
    pprint(vars(args))

    t0 = time()
    te_size = verify_size(args.te_size)
    datapath = Path(args.datapath).resolve()

    # Hard split
    # split_on = None if args.split_on is None else args.split_on.upper()
    cv_method = args.cv_method
    te_method = cv_method

    # Specify ML task (regression or classification)
    if cv_method == "strat":
        mltask = "cls"  # cast mltask to cls in case of stratification
    else:
        mltask = args.ml_task

    # Target column name
    trg_name = str(args.trg_name)
    # assert args.trg_name in data.columns, f'The prediction target ({args.name}) \
    #     was not found in the dataset.'

    # import ipdb; ipdb.set_trace()

    # -----------------------------------------------
    #       Create outdir
    # -----------------------------------------------
    if args.gout is not None:
        gout = Path(args.gout).resolve()
        sufx = "none" if args.split_on is None else args.split_on
        gout = gout / datapath.with_suffix(".splits")
        if args.split_on is not None:
            gout = gout / f"split_on_{sufx}"
        else:
            gout = gout / f"split_on_none"
    else:
        # Note! useful for drug response
        sufx = "none" if args.split_on is None else args.split_on
        gout = datapath.with_suffix(".splits")

    outfigs = gout / "outfigs"
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    #       Create logger
    # -----------------------------------------------
    lg = Logger(gout / "data.splitter.log")
    print_fn = get_print_func(lg.logger)
    print_fn(f"File path: {fdir}")
    print_fn(f"\n{pformat(vars(args))}")
    dump_dict(vars(args), outpath=gout / "data.splitter.args.txt")

    # -----------------------------------------------
    #       Load data
    # -----------------------------------------------
    print_fn("\nLoad master dataset.")
    data = load_data(datapath)
    print_fn("data.shape {}".format(data.shape))

    # ydata = data[trg_name] if trg_name in data.columns else None
    # if (cv_method == "strat") and (ydata is None):
    #     raise ValueError("Prediction target column must be available if splits need to be stratified.")

    if (cv_method == "strat") and (trg_name not in data.columns):
        raise ValueError(
            "Prediction target column must be available if splits need to be stratified."
        )

    # if ydata is not None:
    #     plot_hist(ydata, title=f"{trg_name}", fit=None, bins=100,
    #               path=outfigs/f"{trg_name}_hist_all.png")

    if trg_name in data.columns:
        plot_hist(data[trg_name],
                  title=f"{trg_name}",
                  fit=None,
                  bins=100,
                  path=outfigs / f"{trg_name}_hist_all.png")

    # -----------------------------------------------
    #       Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn("\n{}".format("-" * 50))
    print_fn("Split data into hold-out train/val/test")
    print_fn("{}".format("-" * 50))

    kwargs = {
        "cv_method": cv_method,
        "te_method": te_method,
        "te_size": te_size,
        "mltask": mltask,
        "split_on": args.split_on
    }

    data_splitter(
        data=data,
        n_splits=args.n_splits,
        gout=gout,
        outfigs=outfigs,
        # ydata = ydata,
        target_name=trg_name,
        print_fn=print_fn,
        seed=seed,
        **kwargs)

    print_fn("Runtime: {:.1f} min".format((time() - t0) / 60))
    print_fn("Done.")
    lg.close_logger()
Example #22
def gen_ml_data(
    fpath,
    common_samples,
    # fea_type,
    # drg_set,
    dd_fea=None,
    fps_fea=None,
    img_fea=None,
    ID='TITLE',
    fea_sep='_',
    score_name='reg',
    n_samples=None,
    n_top=None,
    sampling=None,
    q_cls=0.025,
    binner=False,
    bin_th=2.0,
    baseline=False,
    print_fn=print,
    outdir=Path('out'),
    outfigs=Path('outfigs')):
    """ Generate a single set of ML data for the loaded target from fpath.
    This func was specifically created to process the new LARGE DOE-MD datasets
    with ZINC drugs that contain >6M molecules.
    Args:
        fpath: path to load docking scores file
        common_samples : list of drug names that are common to all feature
                         types including dd_fea, fps_fea, and img_fea
        dd_fea : df of Mordred descriptors
        fps_fea : df of ecfp2 fingerprints
        img_fea : image data (TODO: this is not supported yet!)
        fea_sep : separator between feature prefix string and feature name
        score_name : rename the docking score col with score_name
        n_samples : total number of samples in the final ml_df
        n_top : keep this number of top-most dockers
        sampling : specify the method to use when sampling samples from df
        q_cls : quantile value to compute along the docking scores to generate
                the 'cls' col
        bin_th : threshold value of docking score to generate the 'binner' col
        binner : add binner column
        baseline : whether to compute ML baseline scores

    Returns:
        res : results summary
    """
    print_fn(f'\nProcess {fpath.name} ...')
    res = {}
    trg_name = fpath.with_suffix('').name  # note! depends on dock file names
    res['target'] = trg_name

    # Load docking
    dock = load_data(fpath)
    if dock.empty:
        print_fn('Empty file')
        return None

    if (n_samples is not None) and (dock.shape[0] <= n_samples):
        print_fn("n_samples is larger than len(dock), skip this receptor")
        return res

    # Pre-proc the dock file
    ## ID = 'TITLE'
    scoring_func = 'Chemgauss4'
    dock = proc_dock_score(dock,
                           ID=ID,
                           score_name=score_name,
                           scoring_func=scoring_func)

    # Plot histogram of all (raw) scores
    plot_hist_dock_scores(dock,
                          outfigs=outfigs,
                          subdir_name='all.raw',
                          trg_name=trg_name,
                          scoring_func=scoring_func)

    # Convert and bound scores to >=0
    dock[score_name] = abs(np.clip(dock[score_name], a_min=None, a_max=0))
    print_fn('dock: {}'.format(dock.shape))

    # Plot histogram of all (transformed) scores
    plot_hist_dock_scores(dock,
                          outfigs=outfigs,
                          subdir_name='all.transformed',
                          trg_name=trg_name,
                          scoring_func=scoring_func)

    # -----------------------------------------
    # Sample a subset of scores
    # -------------------------
    # Extract samples that are common to all feature types
    aa = dock[dock[ID].isin(common_samples)].reset_index(drop=True)

    # Extract subset of samples
    if (n_samples is not None) and (n_top is not None):
        n_bot = n_samples - n_top

        aa = aa.sort_values('reg', ascending=False).reset_index(drop=True)
        df_top = aa[:n_top].reset_index(drop=True)  # e.g. 100K
        df_rest = aa[n_top:].reset_index(drop=True)

        # if flatten:
        #     df_bot = flatten_dist(df=df_rest, n=n_bot, score_name=score_name)
        # else:
        #     df_bot = df_rest.sample(n=n_bot, replace=False)
        if sampling == 'flatten':
            df_bot = flatten_dist(df=df_rest, n=n_bot, score_name=score_name)
        elif sampling == 'random':
            df_bot = df_rest.sample(n=n_bot, replace=False)
        else:
            raise ValueError("'sampling' arg must be specified.")

        assert df_top.shape[1] == df_bot.shape[1], 'Num cols must be the same when concat.'
        aa = pd.concat([df_top, df_bot], axis=0).reset_index(drop=True)

        # Plot histogram of sampled scores
        outfigs_dir = outfigs / 'sampled.transformed'
        os.makedirs(outfigs_dir, exist_ok=True)
        fig, ax = plt.subplots()
        ax.hist(df_top[score_name],
                bins=100,
                facecolor='r',
                alpha=0.7,
                label='Top 10K Docking Ligands')
        ax.hist(df_bot[score_name],
                bins=100,
                facecolor='b',
                alpha=0.7,
                label='Other Ligands (balanced)')
        ax.set_xlabel(f'Docking Score ({scoring_func})')
        ax.set_ylabel('Count')
        plt.grid(True)
        plt.legend(loc='best', framealpha=0.5)
        plt.title(f'sampled.transformed; Samples {n_samples}; n_top {n_top}')
        plt.savefig(outfigs_dir / f'dock.dist.{trg_name}.png', dpi=150)
        del df_top, df_bot, df_rest

    elif (n_samples is not None):
        # if flatten:
        #     aa = flatten_dist(df=aa, n=n_samples, score_name=score_name)
        # else:
        #     aa = aa.sample(n=n_samples, replace=False)
        if sampling == 'flatten':
            aa = flatten_dist(df=aa, n=n_samples, score_name=score_name)
        elif sampling == 'random':
            aa = aa.sample(n=n_samples, replace=False)
        else:
            raise ValueError("'sampling' arg must be specified.")

        plot_hist_dock_scores(dock,
                              outfigs=outfigs,
                              subdir_name='sampled.transformed',
                              trg_name=trg_name,
                              scoring_func=scoring_func)
    dock = aa
    del aa

    # -----------------------------------------
    # Create cls col
    # --------------
    # Find quantile value
    if dock[score_name].min() >= 0:  # if scores were transformed to >=0
        q_cls = 1.0 - q_cls
    cls_th = dock[score_name].quantile(q=q_cls)
    res['cls_th'] = cls_th
    print_fn('Quantile score (q_cls={:.3f}): {:.3f}'.format(q_cls, cls_th))

    # Generate a classification target col
    if dock[score_name].min() >= 0:  # if scores were transformed to >=0
        value = (dock[score_name] >= cls_th).astype(int)
    else:
        value = (dock[score_name] <= cls_th).astype(int)
    dock.insert(loc=1, column='cls', value=value)
    # print_fn('Ratio {:.2f}'.format( dd['dock_bin'].sum() / dd.shape[0] ))

    # Plot
    hist, bin_edges = np.histogram(dock[score_name], bins=100)
    x = np.ones((10, )) * cls_th
    y = np.linspace(0, hist.max(), len(x))

    fig, ax = plt.subplots()
    plt.hist(dock[score_name],
             bins=200,
             density=False,
             facecolor='b',
             alpha=0.7)
    plt.title(f'Scores clipped to 0: {trg_name}')
    plt.xlabel(f'Docking Score ({scoring_func})')
    plt.ylabel('Count')
    plt.plot(x, y, 'm--', alpha=0.7, label=f'{q_cls}-th quantile')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(outfigs / f'dock.dist.cls.{trg_name}.png')
    # -----------------------------------------

    # Save dock scores
    cols = ['Inchi-key', 'SMILES', 'TITLE', 'reg', 'cls']
    dock = dock[[c for c in cols if c in dock.columns]]
    trg_outdir = outdir / f'DIR.ml.{trg_name}'
    outpath = trg_outdir / f'docks.df.{trg_name}.csv'
    os.makedirs(trg_outdir, exist_ok=True)
    dock.to_csv(outpath, index=False)

    # Add binner (note! may not be necessary since we get good dock scores)
    # if binner:
    #     dock = add_binner(dock, score_name=score_name, bin_th=bin_th)

    # Merge only on TITLE (when also including SMILES, there is a mismatch on
    # certain samples; maybe the SMILES that come with features are canonicalized)
    merger = ID

    def merge_dock_and_fea(dock,
                           fea_df,
                           fea_prfx,
                           fea_sep,
                           merger='TITLE',
                           fea_name=None,
                           baseline=False):
        """ ... """
        # drug_names = set(common_samples).intersection(set(dock[ID].values))

        ml_df = pd.merge(dock, fea_df, how='inner',
                         on=merger).reset_index(drop=True)
        del fea_df

        # bb = fea_df[ fea_df[merger].isin(dock[merger].tolist()) ].reset_index(drop=True)
        # xdata = extract_subset_fea(bb, fea_list=[fea_prfx], fea_sep=fea_sep)
        # bb = pd.concat([bb[merger], xdata], axis=1)  # keep only the merger meta col from fea_df

        # xdata = extract_subset_fea(fea_df, fea_list=[fea_prfx], fea_sep=fea_sep)
        # fea_df = pd.concat([fea_df[merger], xdata], axis=1)  # keep only the merger meta col from fea_df
        # ml_df = pd.merge(dock, fea_df, how='inner', on=merger).reset_index(drop=True)
        # del fea_df, xdata

        # Re-org cols
        fea_cols = extract_subset_fea_col_names(ml_df,
                                                fea_list=[fea_prfx],
                                                fea_sep=fea_sep)
        meta_cols = ['Inchi-key', 'SMILES', 'TITLE', 'CAT', 'reg', 'cls']
        cols = meta_cols + fea_cols
        # ml_df = ml_df[cols]
        ml_df = ml_df[[c for c in cols if c in ml_df.columns]]
        print_fn('{}: {}'.format(fea_name, ml_df.shape))

        # Save
        outpath = trg_outdir / f'ml.{trg_name}.{fea_name}'
        ml_df.to_parquet(str(outpath) + '.parquet')

        # Compute baseline if specified
        if baseline:
            te_scr = trn_baseline(ml_df, fea_list=[fea_prfx], fea_sep=fea_sep)
            res[f'{fea_prfx}_r2'] = te_scr['r2']
            res[f'{fea_prfx}_mae'] = te_scr['median_absolute_error']
            del te_scr

        del ml_df

    if dd_fea is not None:
        merge_dock_and_fea(dock,
                           fea_df=dd_fea,
                           fea_prfx='dd',
                           fea_sep=fea_sep,
                           merger=ID,
                           fea_name='descriptors',
                           baseline=baseline)

    if fps_fea is not None:
        merge_dock_and_fea(dock,
                           fea_df=fps_fea,
                           fea_prfx='ecfp2',
                           fea_sep=fea_sep,
                           merger=ID,
                           fea_name='ecfp2',
                           baseline=baseline)

    if img_fea is not None:
        pass

    # if n_samples is not None:
    #     assert n_samples == ml_df.shape[0], 'Final ml_df size must match n_samples {}'.format(fpath)
    return res
Example #23
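# Loads a recorded auditory P300 session, plots its PSD, band-pass filters it (1-30 Hz, IIR),
# finds the stimulus events, and epochs the raw data around them.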
from time import time
from optparse import OptionParser
from multiprocessing import Process
from mne import Epochs, find_events
from time import time, strftime, gmtime
import os
from stimulus_presentation import auditory_p300
from utils import utils
from collections import OrderedDict
import numpy as np
from pandas import DataFrame
from psychopy import visual, core, event, sound, monitors
from pylsl import StreamInfo, StreamOutlet, resolve_byprop, StreamInlet

raw = utils.load_data('auditory/P300',
                      sfreq=256.,
                      subject_nb=subject,
                      session_nb=session)

raw.plot_psd()

raw.filter(1, 30, method='iir')

events = find_events(raw)
event_id = {'Non-Target': 1, 'Target': 2}

epochs = Epochs(raw,
                events=events,
                event_id=event_id,
                tmin=-0.1,
                tmax=0.8,
                baseline=None,
Example #24
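# Sequence-labeling training loop: loads the token and tag vocabularies with load_data, builds
# the chosen CRF-based network, and reports per-epoch loss and NER F-measure.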
def train(network='rnn'):
    word2id, id2word = load_data(TOKEN_DATA)
    tag2id, id2tag = load_data(TAG_DATA)
    x_train, y_train, seq_lens, _, _ = generate_data(TRAIN_DATA,
                                                     word2id,
                                                     tag2id,
                                                     max_len=hp.max_len)
    x_dev, y_dev, dev_seq_lens, _, source_tag = generate_data(
        DEV_DATA, word2id, tag2id, max_len=hp.max_len)
    vocab_size = len(word2id)
    num_tags = len(tag2id)
    if network == "transformer":
        model = TransformerCRFModel(vocab_size, num_tags, is_training=True)
    elif network == 'rnn':
        model = BiRnnCRF(vocab_size, num_tags)
    elif network == 'cnn':
        model = CnnCRF(vocab_size, num_tags)
    elif network == 'match-pyramid':
        model = CnnCRF(vocab_size, num_tags)
    else:
        return
    sv = tf.train.Supervisor(graph=model.graph,
                             logdir=logdir,
                             save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs + 1):
            if sv.should_stop():
                break
            train_loss = []
            for x_batch, y_batch, len_batch in batch_data(
                    x_train, y_train, seq_lens, hp.batch_size):
                feed_dict = {
                    model.x: x_batch,
                    model.y: y_batch,
                    model.seq_lens: len_batch
                }
                loss, _ = sess.run([model.loss, model.train_op],
                                   feed_dict=feed_dict)
                train_loss.append(loss)

            dev_loss = []
            predict_lists = []
            for x_batch, y_batch, len_batch in batch_data(
                    x_dev, y_dev, dev_seq_lens, hp.batch_size):
                feed_dict = {
                    model.x: x_batch,
                    model.y: y_batch,
                    model.seq_lens: len_batch
                }
                loss, logits = sess.run([model.loss, model.logits], feed_dict)
                dev_loss.append(loss)

                transition = model.transition.eval(session=sess)
                pre_seq = model.predict(logits, transition, len_batch)
                pre_label = recover_label(pre_seq, len_batch, id2tag)
                predict_lists.extend(pre_label)
            train_loss_v = np.round(float(np.mean(train_loss)), 4)
            dev_loss_v = np.round(float(np.mean(dev_loss)), 4)
            print('****************************************************')
            acc, p, r, f = get_ner_fmeasure(source_tag, predict_lists)
            print('epoch:\t{}\ttrain loss:\t{}\tdev loss:\t{}'.format(
                epoch, train_loss_v, dev_loss_v))
            print('acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}'.format(acc, p, r, f))
            print('****************************************************\n\n')
Example #25
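# End-to-end translation training: builds the vocabularies, optionally back-translates the
# unaligned corpus, loads the datasets, and trains a GRU / Seq2Seq / Transformer model,
# keeping the checkpoint with the best validation BLEU.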
def main(
        data_dir: str = '/project/cq-training-1/project2/teams/team12/data/',
        model_name: str = 'seq2seqgru',
        epochs: int = 20,
        optimizer: str = 'adam',
        lr: float = 1e-3,
        batch_size: int = 32,
        vocab_size: int = None,  # If None, all tokens will be in the vocab
        seq_len: int = None,  # If None, the seq len is dynamic (might not work with all models)
        seed: bool = True,
        model_config: dict = None,
        embedding: str = None,
        embedding_dim: int = 128,
        back_translation_model: str = 'saved_model/<model_folder_name>',
        back_translation: bool = False,
        back_translation_ratio: float = 1.0,
        fr_to_en: bool = False):

    # Call to remove tensorflow warning about casting float64 to float32
    tf.keras.backend.set_floatx('float32')

    # Set random seed
    if seed:
        tf.random.set_seed(SEED)
        np.random.seed(SEED)

    # Data paths
    path_en = os.path.join(data_dir, 'train.lang1')
    path_fr = os.path.join(data_dir, 'train.lang2')
    path_unaligned_en = os.path.join(data_dir, 'unaligned-tok.en')
    path_unaligned_fr = os.path.join(data_dir, 'unaligned-tok.fr')
    if fr_to_en:  # Switch paths
        tmp = path_en
        path_en = path_fr
        path_fr = tmp

    # Create vocabs
    logger.info('Creating vocab...')
    word2idx_en, idx2word_en = utils.create_vocab(path_en, vocab_size)
    word2idx_fr, idx2word_fr = utils.create_vocab(path_fr, vocab_size)
    logger.info(
        f'Size of english vocab : {len(word2idx_en)}, size of french vocab : {len(word2idx_fr)}'
    )

    # Back translation
    prediction_file = None
    if back_translation:
        prediction_file = os.path.join(utils.SHARED_PATH,
                                       'translated_unaligned.en')
        if os.path.exists(prediction_file):
            logger.info(
                f'Using translation from {prediction_file} for back-translation.'
            )
        else:
            logger.info(
                f'Translating {path_unaligned_fr} for back-translation...')
            # Load data
            data = utils.load_data(path_unaligned_fr, word2idx_fr)
            dataset = tf.data.Dataset.from_generator(
                lambda: [ex for ex in data],
                tf.int64,
                output_shapes=tf.TensorShape([None])).padded_batch(
                    128, padded_shapes=[None])
            # Load model
            model_config = {
                'num_layers': 2,
                'd_model': 128,
                'dff': 512,
                'num_heads': 8
            }
            model = Transformer(model_config, len(word2idx_fr), word2idx_en)
            model.load_weights(os.path.join(back_translation_model, "model"))

            # Write prediction to file
            with open(prediction_file, 'w') as f:
                print('opening file and writing predictions...')
                for batch in tqdm(dataset,
                                  desc='Translating...',
                                  total=len(data) // 128 + 1):
                    preds = model({
                        'inputs': batch,
                        'labels': tf.zeros_like(batch)
                    })
                    for pred in preds:
                        sentence = utils.generate_sentence(
                            np.argmax(pred.numpy(), axis=1).astype('int'),
                            idx2word_en)
                        f.writelines([sentence, '\n'])

    # Load datasets
    logger.info('Loading datasets...')
    train_dataset, valid_dataset, nb_train_ex, nb_valid_ex = utils.load_training_data(
        path_en,
        path_fr,
        word2idx_en,
        word2idx_fr,
        seq_len,
        batch_size,
        en_back_translated_path=prediction_file,
        fr_unaligned_path=path_unaligned_fr,
        back_translation_ratio=back_translation_ratio)
    logger.info(
        f'Number of training examples : {nb_train_ex}, number of valid examples : {nb_valid_ex}'
    )

    # Load embeddings
    embedding_matrix = None
    if embedding:
        logger.info(f'Loading embedding {embedding} ...')
        if embedding == 'fasttext':
            embedding_matrix = utils.create_fasttext_embedding_matrix(
                path_unaligned_en, word2idx_en, embedding_dim)
        elif embedding == 'word2vec':
            raise Exception(f'Embedding "{embedding}" not implemented yet')
        elif embedding == 'glove':
            raise Exception(f'Embedding "{embedding}" not implemented yet')
        else:
            raise Exception(f'Embedding "{embedding}" not recognized.')

    # Create model
    if model_name == 'gru':
        model = baselines.GRU(len(word2idx_fr), batch_size)
    elif model_name == 'seq2seqgru':
        if model_config is None:
            model_config = {
                'embedding_dim': 256,
                'encoder_units': 512,
                'decoder_units': 512,
                'n_layers': 1
            }
        model = Seq2SeqGRU(len(word2idx_en),
                           word2idx_fr,
                           batch_size,
                           model_config,
                           embedding_matrix=embedding_matrix)
    elif model_name == 'transformer':
        if model_config is None:
            model_config = {
                'num_layers': 2,
                'd_model': 128,
                'dff': 512,
                'num_heads': 8
            }
        model = Transformer(model_config,
                            len(word2idx_en),
                            word2idx_fr,
                            embedding_matrix=embedding_matrix)
    else:
        raise Exception(f'Model "{model}" not recognized.')

    # Optimizer
    if optimizer == 'adam':
        if model_name == 'transformer':  # Use adam according to transformer paper
            optimizer = tf.keras.optimizers.Adam(
                utils.CustomSchedule(model_config['d_model']),
                beta_1=0.9,
                beta_2=0.98,
                epsilon=1e-9)
            logger.info(
                'Using custom scheduler for learning rate, --lr argument ignored.'
            )
        else:
            optimizer = tf.keras.optimizers.Adam(lr)
    elif optimizer == 'sgd':
        optimizer = tf.keras.optimizers.SGD(lr)
    else:
        raise Exception(f'Optimizer "{optimizer}" not recognized.')

    # Training loop
    logger.info(f'Training with model {model.get_name()} ...')

    metrics = {
        'train_accuracy': [],
        'valid_accuracy': [],
        'train_loss': [],
        'valid_loss': [],
        'train_bleu': [],
        'valid_bleu': []
    }
    model_path = model.get_name() + f'_fr_to_en_{fr_to_en}_embedding_{embedding}_embedding_dim_{embedding_dim}'\
                                    f'_back_translation_{back_translation}_ratio_{back_translation_ratio}'
    best_valid_bleu = 0
    for epoch in range(epochs):
        train_epoch(model, train_dataset, optimizer,
                    np.ceil(nb_train_ex / batch_size), idx2word_fr)
        test_epoch(model, valid_dataset, np.ceil(nb_valid_ex / batch_size),
                   idx2word_fr, idx2word_en)
        train_accuracy = train_accuracy_metric.result().numpy()
        valid_accuracy = valid_accuracy_metric.result().numpy()
        train_loss = train_loss_metric.result().numpy()
        valid_loss = valid_loss_metric.result().numpy()
        train_bleu = train_bleu_metric.result()
        valid_bleu = valid_bleu_metric.result()

        if valid_bleu > best_valid_bleu:
            best_valid_bleu = valid_bleu
            utils.save_model(model, model_path)

        # Logs
        logger.info(f'Epoch {epoch}\n'\
                    f'    Train BLEU : {train_bleu:.4f} - Valid BLEU : {valid_bleu:.4f}\n'\
                    f'    Train Accuracy : {train_accuracy:.4f} - Valid Accuracy : {valid_accuracy:.4f}\n'\
                    f'    Train Loss : {train_loss:.4f} - Valid Loss : {valid_loss:.4f}')

        metrics['train_accuracy'].append(train_accuracy)
        metrics['valid_accuracy'].append(valid_accuracy)
        metrics['train_loss'].append(train_loss)
        metrics['valid_loss'].append(valid_loss)
        metrics['train_bleu'].append(train_bleu)
        metrics['valid_bleu'].append(valid_bleu)

        # If using back translation, sample new generated examples for next epoch
        if back_translation:
            train_dataset, _, _, _ = utils.load_training_data(
                path_en,
                path_fr,
                word2idx_en,
                word2idx_fr,
                seq_len,
                batch_size,
                en_back_translated_path=prediction_file,
                fr_unaligned_path=path_unaligned_fr,
                back_translation_ratio=back_translation_ratio)

        # If training with embeddings, unfreeze embedding layer at 50th epoch
        if epoch == 48 and embedding and model_name == 'transformer':
            model.unfreeze_embedding_layer()

    # save metrics
    utils.save_metrics(metrics, model_path)

    # Plot accuracy
    plots.plot_accuracy(metrics['train_accuracy'], metrics['valid_accuracy'])
Example #26
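# Compares Bagging, AdaBoost, and Random Forest regressors on the forest-fires data after
# z-score normalization, reporting the mean absolute error of each model.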
def ensemble_methods_regressor_forest_dataset():

    data = utils.load_data('forestfires.csv')

    new_data = utils.convert_data_to_numeric(data, [2, 3])

    feature_vector = new_data[:, 0:-1]
    targets = new_data[:, -1]

    # Data normalization
    data_features_normalized = normalization.z_score_normalization(feature_vector)

    data_features_train, data_features_test, data_targets_train, data_targets_test = \
        train_test_split(data_features_normalized,
                         targets,
                         test_size=0.25)

    # Model declaration
    """
    Parameters to select:

    n_estimators: The number of base estimators in the ensemble.
            Values: Random Forest and Bagging. Default 10
                    AdaBoost. Default: 50

    ###Only for Bagging and Boosting:###
    base_estimator: Base algorithm of the ensemble. Default: DecisionTree

    ###Only for Random Forest:###
    criterion: "entropy" or "gini": default: gini
    max_depth: maximum depth of tree, default: None
    """

    names = ["Bagging Regressor", "AdaBoost Regressor", "Random Forest Regressor"]

    models = [
        BaggingRegressor(
            base_estimator=tree.DecisionTreeRegressor(
                criterion='mse',
                max_depth=10)
        ),
        AdaBoostRegressor(
            base_estimator=tree.DecisionTreeRegressor(
                criterion='mse',
                max_depth=10)
        ),
        RandomForestRegressor(
            criterion='mse',
            max_depth=10
        )
    ]

    for name, em_reg in zip(names, models):
        logger.info("###################---" + name + "---###################")

        em_reg.fit(data_features_train, data_targets_train)

        # Model evaluation
        test_data_predicted = em_reg.predict(data_features_test)

        error = metrics.mean_absolute_error(data_targets_test, test_data_predicted)

        logger.debug('Total Error: %s', error)
Example #27
def incremental_load():
    kwargs = request.args.to_dict()
    return load_data("incremental", **kwargs)
Example #28
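# Graph-classification baselines: flattens each adjacency matrix into a feature vector and
# compares DecisionTree and RandomForest classifiers on the validation split.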
from utils import utils
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
import numpy as np
from sklearn import svm
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, precision_score
from sklearn.decomposition import TruncatedSVD

features, labels, valid_adj, valid_graph_labels, train_adj, train_graph_labels = utils.load_data(
    cuda=False)

svd = TruncatedSVD(300)
tr = [[v for row in graph for v in row] for graph in train_adj]
#tr = svd.fit_transform(tr)
vr = [[v for row in graph for v in row] for graph in valid_adj]
#vr = svd.fit_transform(vr)

tl = np.argmax(train_graph_labels, axis=1)
vl = np.argmax(valid_graph_labels, axis=1)

print('Decision Tree:')
cl = DecisionTreeClassifier()
cl = cl.fit(tr, tl)
pl = cl.predict(vr)
print(accuracy_score(vl, pl), precision_score(vl, pl), recall_score(vl, pl))
print('Random Forest:')
cl = RandomForestClassifier(oob_score=True, random_state=10)
cl = cl.fit(tr, tl)
pl = cl.predict(vr)
print(accuracy_score(vl, pl), precision_score(vl, pl), recall_score(vl, pl))
Example #29
device = torch.device(
    'cuda', args.cuda) if torch.cuda.is_available() else torch.device('cpu')

# Reset random state for reproducibility
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

if args.multinet:
    args.sym = 0
    args.embedding = 0

# Load dataset
graph, adj, features, labels, idx_train, idx_val, idx_test = load_data(
    path=args.data_dir, percent=args.train_percent, sym=args.sym)
embedding = node2vec(graph, args.data_dir, args.sym)
if args.multinet:
    embedding1 = node2vec(graph, args.data_dir, 1)
del graph
gc.collect()
if args.pca > 0:
    aff_features = PCA(args.pca, whiten=True).fit_transform(features.numpy())
    features = torch.FloatTensor(aff_features)
if args.embedding == 0:
    del features
    features = embedding.to(device)
    msg = 'Uses only the embedding features (extracting from Node2Vec model).'
    if args.multinet:
        features1 = embedding1.to(device)
        msg = '`multinet` specified, load embeddings for both adj and adj.T.'