Example #1
def ingest(record_type, **kwargs):
    """
    Run the ingestion flow for the given :record_type:.
    """
    datasource = acquire_data(record_type, **kwargs)

    validating = not kwargs.get("no_validate")
    if validating:
        ref = kwargs.get("ref")
        datasource = validate_data(datasource, record_type, ref)

        # clean up missing data
        for k in list(datasource.keys()):
            if not datasource[k]:
                del datasource[k]
    else:
        print("Skipping data validation")

    # output to files if needed
    output = kwargs.get("output")
    if output and os.path.exists(output):
        print(f"Writing data files to {output}")
        start_time, end_time = kwargs.get("start_time"), kwargs.get("end_time")
        output_data(output, datasource, record_type, start_time, end_time)

    loading = not kwargs.get("no_load")
    if loading and len(datasource) > 0:
        load_data(datasource, record_type, **kwargs)

    print(f"{record_type} complete")
Example #2
def main(begin_date, end_date):

    extract_dict = extract.get_extracts(begin_date, end_date)

    output_dict = transform.get_transforms(extract_dict)

    load.load_data(output_dict)
Example #3
def main():
    '''Main function.
    This puts the pieces of the ETL together:
    extract   -- reads raw data from API endpoints and passes it through raw schemas for validation
    transform -- uses parsed schemas to deserialize data into the desired format for the DB
    load      -- uses SQLAlchemy models + session to safely merge the data into the database
    '''
    args = get_args()
    if not args.date:
        logger.error('Must pass date for parsing')
        return

    # extract -- start by extracting all game_ids; then extract each game one by one
    results = {"shot_attempt": [], "goal": []}
    game_ids = extract_game_ids_for_date(args.date)
    logger.info('Received %s game_ids to load', len(game_ids))
    for game_id in game_ids:
        logger.info('Extracting: %s', game_id)
        raw_game = extract_game(game_id)

        # transform -- for each game we get, turn it into parsed shot and goal rows
        parsed_data = transform_game(raw_game)
        results["shot_attempt"].extend(parsed_data["shot_attempt"])
        results["goal"].extend(parsed_data["goal"])

    # load -- once all games have been iterated through, create sqlalchemy models out of the rows and insert
    create_tables()
    load_data(results)
Example #4
def main():
    """
    Parse command line arguments and options and run OceanDiv.
    
    """
    # Read options
    args = parse_args.get_args()
    config = namelist.get_namelist(args)
    tools.print_message(config, 'Running OceanDiv...')
    
    # Load data
    tools.print_message(config, 'Loading data...')
    ohcs = load.load_data(config, dtype='ohc')
    flxs = load.load_data(config, dtype='flx')
    basins = load.load_geodata(config, geotype='basins')
    areas = load.load_geodata(config, geotype='areas')

    # Process data 
    ohcs, flxs, basins, areas = process.unify_masks(ohcs, flxs, basins, areas)
    out_cubes = process.process_by_basin(config, ohcs, flxs, basins, areas)
    
    # Save output
    save.save_as_netcdf(config, out_cubes)
    
    # Finished
    tools.print_message(config, 'Finished!')
Example #5
    def __init__(self,
                 paths,
                 patch_size,
                 batch_size,
                 transformations=(),  # avoid a shared mutable default argument
                 augment=False,
                 mean=True):
        self.orig_patch_size = patch_size
        if augment:
            # enlarge the patch side to at least orig_patch_size * sqrt(2)
            patch_size = int(math.sqrt(2 * self.orig_patch_size**2)) + 1

        self.patch_size = patch_size
        self.batch_size = batch_size
        self.augment = augment

        self.data_details = []
        self.data = []

        for path in paths:
            self.data_details.append(load.load_data(path, details_only=True))
            self.data.append(load.load_data(path))

        #ASSERT ALL SAME SIZE???

        for i, trans in enumerate(transformations):
            if trans:
                for j in range(len(self.data[i])):
                    self.data[i][j] = trans(self.data[i][j])
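The enlarged `patch_size` above is presumably chosen so that a patch can be rotated during augmentation and still fully contain a crop of the original size: a square of side p fits inside a square of side p·√2 at any rotation angle. A quick check of the arithmetic:

import math

patch_size = 64                                      # example value
enlarged = int(math.sqrt(2 * patch_size ** 2)) + 1   # same formula as in __init__
assert enlarged >= math.ceil(patch_size * math.sqrt(2))
print(patch_size, "->", enlarged)                    # 64 -> 91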
Example #6
def main(args):
    usaCovidDataUrl = args['usaCovidDataUrl']
    johnHopkinsDataUrl = args['johnHopkinsDataUrl']
    loggerLevel = logging.__dict__[args['loggerLevel']]

    dynamodb_resource = boto3.resource('dynamodb',
                                       region_name='ap-southeast-2')

    try:
        setupLogger(loggerLevel)
        logger.info('Starting...')
        usaCovidDataFilename = getRemoteFile(usaCovidDataUrl, 'usaCovidData')
        johnHopkinsRecoveryDataFilename = getRemoteFile(
            johnHopkinsDataUrl, 'johnHopkinsData')
        mergedData = transform.mergeCsvFiles(usaCovidDataFilename,
                                             johnHopkinsRecoveryDataFilename)
        scanResponse = getTableScanResponse('covidData', dynamodb_resource)
        latestDate = getLatestRecordDate(scanResponse)
        initialLoad = tableIsEmpty(scanResponse)
        load.load_data(mergedData, latestDate, dynamodb_resource, initialLoad)
        logger.info('Done!')
    except Exception:
        logger.exception('Error in processing!')
    finally:
        if os.path.isdir('/tmp/download/'):
            cleanupFiles('/tmp/download/')
Example #7
def pre_processing():
    test_data = load_data(True)
    train_data = load_data(False)
    a = test_data['a']
    a1 = train_data['a']
    x = train_data['x']
    columns_rm = [
        get_names().index('sex_Female'),
        get_names().index('sex_Male')
    ]
    indx = [j for j in range(len(x.T)) if j not in columns_rm]

    train_data['x'] = train_data['x'][:, indx]
    test_data['x'] = test_data['x'][:, indx]

    xtrain_fe = train_data['x'][a1 == 0]
    xtrain_ma = train_data['x'][a1 == 1]

    xtest_fe = test_data['x'][a == 0]
    xtest_ma = test_data['x'][a == 1]

    ytrain_fe = train_data['y'][a1 == 0]
    ytrain_ma = train_data['y'][a1 == 1]

    ytest_fe = test_data['y'][a == 0]
    ytest_ma = test_data['y'][a == 1]

    return ytrain_fe, ytrain_ma, ytest_fe, ytest_ma, xtrain_fe, xtrain_ma, xtest_fe, xtest_ma
Example #8
def q3():
    data_train = load_data(False)
    xtrain = data_train['x']
    ytrain = data_train['y']
    atrain = data_train['a']

    data_test = load_data(True)
    xtest = data_test['x']
    ytest = data_test['y']
    atest = data_test['a']

    xtrain = torch.tensor(xtrain).float()
    ytrain = torch.tensor(ytrain[:, None]).float()
    atrain = torch.tensor(atrain[:, None]).float()

    xtest = torch.tensor(xtest).float()
    ytest = torch.tensor(ytest[:, None]).float()
    atest = torch.tensor(atest[:, None]).float()
    accuracy = []
    d_p = []
    alphas = [.01, .1, 1, 10, 100]
    for alpha in alphas:
        print('Alpha :', alpha)
        features_extractor, classifiers = NN_mmd(xtrain, ytrain, atrain, alpha)
        acc = accuracy_(classifiers, xtest, ytest)
        accuracy.append(acc)
        print('acc for MMD NN: ', acc)
        delta_dp = dp(atrain.int().numpy().ravel(),
                      (classifiers(xtrain) > 0).numpy().ravel())
        d_p.append(delta_dp)
        print('dp for features:', delta_dp)
    print('accuracy ', accuracy)
    print('alpha', alphas)
    print('d_p', d_p)
Example #9
def execute_compemploy():
    search_term='warehouse&20philadelphia'
    search_type='lab'
    scrape.read_rss_and_load(search_type,search_term,cf['data_dir'])
    df = parse.create_df(cf['data_dir'],search_term)
    df_valid = parse.get_valid_texts(df)
    Session = load.bind_to_database(cf['postgres_username']
            ,cf['postgres_password'],cf['postgres_db'])
    load.load_data(Session,df_valid)
    send.send_from_database(Session)
Example #10
def main(main_config_fpath='../data/example/main_config.cfg'):
    '''Get user-specified information from main_config.cfg'''
    cfg_parser = ConfigParser.SafeConfigParser()
    cfg_parser.readfp(open(main_config_fpath, 'r'))

    # get directory paths
    data_dir = add_pathsep(cfg_parser.get('general', 'data_dir'))
    downsample_dir = data_dir[0:-1] + "_downsampled" + os.sep
    preprocess_dir = data_dir[0:-1] + "_preprocessed" + os.sep
    ttv_list = ['training' + os.sep, 'validation' + os.sep, 'test' + os.sep]

    # ensure directories exist
    if not os.path.isdir(data_dir):
        sys.exit("Specified data directory " + data_dir + " does not exist.")
    for ttv in ttv_list if is_labeled(data_dir) else ['']:
        if not os.path.isdir(downsample_dir + ttv):
            os.makedirs(downsample_dir + ttv)
        if not os.path.isdir(preprocess_dir + ttv):
            os.makedirs(preprocess_dir + ttv)

    # get remaining preprocessing parameters
    img_width = cfg_parser.getint('general', 'img_width')
    img_height = cfg_parser.getint('general', 'img_height')
    mean_proj_bins = cfg_parser.getint('preprocessing', 'mean_proj_bin')
    max_proj_bins = cfg_parser.getint('preprocessing', 'max_proj_bin')
    new_time_depth = cfg_parser.getint('preprocessing', 'time_equalize')
    upper_contrast = cfg_parser.getfloat('preprocessing', 'upper_contrast')
    lower_contrast = cfg_parser.getfloat('preprocessing', 'lower_contrast')
    centroid_radius = cfg_parser.getint('preprocessing', 'centroid_radius')

    # run preprocessing
    for ttv in ttv_list if is_labeled(data_dir) else ['']:
        if cfg_parser.getboolean('general', 'do_downsample'):
            downsample(data_dir + ttv, downsample_dir + ttv, img_width,
                       img_height, mean_proj_bins, max_proj_bins)
            time_equalize(downsample_dir + ttv, downsample_dir + ttv,
                          img_width, img_height, new_time_depth)
        else:
            time_equalize(data_dir + ttv, downsample_dir + ttv, img_width,
                          img_height, new_time_depth)

        if is_labeled(data_dir):
            stks, rois, file_names = load_data(downsample_dir + ttv, img_width,
                                               img_height)
            stks = improve_contrast(stks, upper_contrast, lower_contrast)
            rois = get_centroids(rois, centroid_radius, img_width, img_height)
            save_image_tifs(stks, file_names, preprocess_dir + ttv)
            save_roi_tifs(rois, file_names, preprocess_dir + ttv)
        else:
            stks, file_names = load_data(downsample_dir + ttv,
                                         img_width,
                                         img_height,
                                         no_rois=True)
            stks = improve_contrast(stks, upper_contrast, lower_contrast)
            save_image_tifs(stks, file_names, preprocess_dir + ttv)
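For reference, a minimal sketch of the sections and keys this script reads from main_config.cfg; the values are placeholders, and the snippet parses it with Python 3's configparser (the script itself uses the Python 2 ConfigParser module, but the keys are the same).

import configparser

EXAMPLE_CONFIG = """
[general]
data_dir = ../data/example/images/
img_width = 512
img_height = 512
do_downsample = True

[preprocessing]
mean_proj_bin = 2
max_proj_bin = 2
time_equalize = 100
upper_contrast = 99.9
lower_contrast = 0.1
centroid_radius = 3
"""

cfg = configparser.ConfigParser()
cfg.read_string(EXAMPLE_CONFIG)
assert cfg.getint('general', 'img_width') == 512
assert cfg.getboolean('general', 'do_downsample') is True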
Example #11
def execute_compemploy():
    search = cf['searches']
    for search_type in search.keys():
        for search_term in search[search_type]:
            scrape.read_rss_and_load(search_type,search_term,cf['data_dir'])
            df = parse.create_df(cf['data_dir'],search_term)
            df_valid = parse.get_valid_texts(df)
            Session = load.bind_to_database(cf['postgres_username']
                    ,cf['postgres_password'],cf['postgres_db'])
            load.load_data(Session,df_valid)
    send.send_from_database(Session)
Example #12
def get_data():
    test_data = load_data(True)
    train_data = load_data(False)
    x = train_data['x']
    columns_rm = [
        get_names().index('sex_Female'),
        get_names().index('sex_Male')
    ]
    indx = [j for j in range(len(x.T)) if j not in columns_rm]

    train_data['x'] = train_data['x'][:, indx]
    test_data['x'] = test_data['x'][:, indx]
    return (train_data['x'], test_data['x'], train_data['y'], test_data['y'],
            train_data['a'], test_data['a'])
Example #13
	def test_load(self):
		# create test log files instead so we can delete
		self.datelog = open('testimport.log', "w")
		self.datelog_fn = 'testimport.log'
		# run load_data
		load_data(self.db_cursor, self.db_connection, self.datelog_fn)

		# run some test queries to make sure we have what we expect
		self.db_cursor.execute("SELECT * FROM fileformat1")
		assert(len(self.db_cursor.fetchall()) == 3)
		self.db_cursor.execute("SELECT * FROM fileformat1 WHERE valid=True")
		assert(len(self.db_cursor.fetchall()) == 2 )
		self.db_cursor.execute("SELECT * FROM fileformat1 WHERE name='Corey'")
		assert(len(self.db_cursor.fetchall()) == 0)
Example #14
def merge(files, fill_na):
    df = load.load_data(PREFIX + files[0])
    for file in files[1:]:
        file_df = load.load_data(PREFIX + file)
        # Dropping stars since this is how we will judge our accuracy
        if file_df.get('stars') is not None:
            file_df.drop('stars', axis=1, inplace=True)
        df = df.merge(file_df, on='business_id', how='outer')

    if fill_na is not None:
        df.fillna(fill_na, inplace=True)
    stars = df.stars.copy()
    df.drop('stars', axis=1, inplace=True)

    return df, stars
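A self-contained sketch of the same merge pattern on tiny in-memory frames (in the real code each frame comes from `load.load_data(PREFIX + file)`); the column names mirror the function above and the toy values are made up.

import pandas as pd

reviews = pd.DataFrame({'business_id': [1, 2, 3],
                        'stars': [4.0, 3.5, 5.0],
                        'review_count': [10, 3, 7]})
checkins = pd.DataFrame({'business_id': [1, 3], 'checkins': [25, 4]})

df = reviews.merge(checkins, on='business_id', how='outer')  # outer join on business_id
df.fillna(0, inplace=True)                                   # corresponds to fill_na=0

stars = df.stars.copy()                                      # keep the target aside
df.drop('stars', axis=1, inplace=True)
print(df)
print(stars)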
Example #15
 def __init__(self, bot, config):
     super(Extension, self).__init__(bot, config)
     self.data = load.load_data('profile')
     self.register_commands('profile', 'verify')
     self.mention_regex = re.compile(r'<@!?(\d+)>')
     self.domain = config.get('domain', 'undertale.fandom.com')
     self.initialized = False
Example #16
def run_simulation(args, sim_num=0, header=None):
    """Run ANN simulation"""
    # Always run verbosely (for now)
    args['verbose'] = True

    # Load training and test data
    training_ds, testing_ds = load_data(args)

    # Build and train feed-forward neural network
    trainer, ff_network = train(args, training_ds)

    # Initialize results output file with given or default header
    if header is None:
        header = ['hidden_neurons', 'learning_rate', 'max_epochs',
                  'activation', 'hits', 'mse']

    # Create results directory to hold simulation files if not existing
    if not os.path.exists('results'):
        os.makedirs('results')

    # Write table header to simulation file
    with open('results/simulation{}.txt'.format(sim_num), 'a') as sim_file:
        sim_file.write('{}\n'.format('|'.join(header)))

    # Use the trainer to evaluate the network on the training and test data
    evaluate(args, trainer, ff_network, training_ds, testing_ds, sim_num, header)
Example #17
def load_datasets_from_file(filename, debug=False, read_size=100):
    """
    Function for loading dataset before initializing neural network and evaluating the model.
    If you get/build dataset in fasta format beforehand, provide filename in argument when calling build.py. We expect
    provided filename is located in media directory.
    If filename is empty/not provided, then specify all the needed params for expected data loading. Filename is build from
    md5 from sorted genome IDs, depth param, sample param, read_size param, onehot param and seed param. File is saved
    in fasta format and zipped with gzip.
    :param filename: filename, given from
    :param debug: if the flag for debug is present, run in debug mode (controlled seed, smaller taxonomy)
    :param read_size: input length
    :return: train and test datasets as well as number of classes
    """
    transmission_dict = {'A': [1, 0, 0, 0], 'T': [0, 1, 0, 0], 'C': [0, 0, 1, 0], 'G': [0, 0, 0, 1]}
    test = 0.2
    depth = 4
    sample = 0.2
    #read_size = 100
    onehot = True
    # taxonomy_el_count = 20 and seed = 0 for debug only
    if debug:
        seed = 0
        taxonomy_el_count = 20
    else:
        seed = random.randint(0, 4294967295)
        taxonomy_el_count = -1
    if not filename:
        filename = "%s_%d_%.3f_%d_%d_%d_%d%s" % (hashlib.md5(str(sorted(get_gids()))).hexdigest(), depth, sample, read_size, onehot,
                                                 seed, taxonomy_el_count, ".fasta.gz")
    trX, teX, trY, teY, trteX, trteY, \
    num_of_classes, train_class_sizes = load_data(filename=filename, test=test, depth=depth, read_size=read_size,
                                                  transmission_dict=transmission_dict, sample=sample, seed=seed,
                                                  taxonomy_el_count=taxonomy_el_count)
    return trX, teX, trY, teY, trteX, trteY, num_of_classes, train_class_sizes
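A small sketch of how the cache filename is assembled when none is provided; the genome IDs are hypothetical stand-ins for `get_gids()`, and on Python 3 the string has to be encoded before hashing (the snippet above targets Python 2, where `hashlib.md5(str(...))` works directly).

import hashlib

gids = ["g3", "g1", "g2"]        # hypothetical genome IDs
depth, sample, read_size, onehot = 4, 0.2, 100, True
seed, taxonomy_el_count = 0, 20  # debug-mode values

digest = hashlib.md5(str(sorted(gids)).encode("utf-8")).hexdigest()
filename = "%s_%d_%.3f_%d_%d_%d_%d%s" % (digest, depth, sample, read_size, onehot,
                                         seed, taxonomy_el_count, ".fasta.gz")
print(filename)   # e.g. '<md5>_4_0.200_100_1_0_20.fasta.gz'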
Example #18
def scrape_world_select():
    dt = datetime.now()
    response = get_osrs_world_select()
    status_code = response.status_code

    if (response.ok):
        world_data, total_player_data = extract_data(response)

        world_data, total_player_count = (transform_data(
            world_data, total_player_data, dt))

        load_data(world_data, total_player_count)
    else:
        print('Bad Response - HTTP', status_code)

    update_logs(dt, status_code)
Example #19
def main():
    six.print_('loading data')
    train_x, train_y, val_x, val_y = load_data()
    train_x = train_x.reshape(-1, 64 * 64)
    val_x = val_x.reshape(-1, 64 * 64)
    six.print_('load data complete')

    six.print_('start PCA')
    try:
        pca = pickle.load(open('pca.pickle', 'rb'))
    except Exception:
        pca = decomposition.PCA(n_components=8*8)
        pca.fit(train_x[:])
    train_x = pca.transform(train_x)
    six.print_('PCA complete')

    clf = SVC(C=0.0001, kernel='linear', verbose=True, max_iter=100)
    six.print_('start training')
    clf.fit(train_x, train_y)
    six.print_('training complete')

    val_x = pca.transform(val_x)
    acc = sum(val_y == clf.predict(val_x)) / float(len(val_y))
    print(acc)

    pickle.dump(pca, open('pca.pickle', 'wb'))
    pickle.dump(clf, open('svm.pickle', 'wb'))
Example #20
def main():
    six.print_('loading data')
    train_x, train_y, val_x, val_y = load_data()
    train_x = train_x.reshape(-1, 64 * 64)
    val_x = val_x.reshape(-1, 64 * 64)
    six.print_('load data complete')

    six.print_('start PCA')
    try:
        pca = pickle.load(open('pca.pickle', 'rb'))
    except Exception:
        pca = decomposition.PCA(n_components=8 * 8)
        pca.fit(train_x[:])
    train_x = pca.transform(train_x)
    six.print_('PCA complete')

    clf = SVC(C=0.0001, kernel='linear', verbose=True, max_iter=100)
    six.print_('start training')
    clf.fit(train_x, train_y)
    six.print_('training complete')

    val_x = pca.transform(val_x)
    acc = sum(val_y == clf.predict(val_x)) / float(len(val_y))
    print(acc)

    pickle.dump(pca, open('pca.pickle', 'wb'))
    pickle.dump(clf, open('svm.pickle', 'wb'))
Example #21
    def __init__(self, args):
        self.train_data, self.test_data, self.valid_data, self.mapping = load_data(
            args.data_dir)
        self.learning_rate = args.learning_rate
        self.epochs = args.epochs
        use_gpu = args.gpu
        hidden_units = args.hidden_units
        self.architecture = args.arch
        use_dropout = args.dropout
        dropout_ratio = args.dropout_ratio

        base_model = load_base_model(self.architecture)

        features_in = get_classifier_inputs_number(base_model)

        classifier = Classifier(features_in=features_in,
                                hidden_units=hidden_units,
                                use_dropout=use_dropout,
                                dropout_ratio=dropout_ratio)

        self.save_dir = args.save_dir
        self.model = get_model(base_model, classifier)

        cuda_is_available = torch.cuda.is_available()
        if not cuda_is_available:
            print("Cuda is not available. Only the CPU will be used")
        self.device = torch.device(
            "cuda:0" if cuda_is_available and use_gpu else "cpu")
Example #22
def main():
    parser = argparse.ArgumentParser(description='Train a neural network')

    parser.add_argument('--model', type=str)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--decay', type=float, default=1e-4)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--batch', type=int, default=128)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--output', type=str, default='weight')
    args = parser.parse_args()

    model = importlib.import_module(args.model).build()

    six.print_('loading data')
    (train_x, train_y, val_x, val_y) = load_data()
    six.print_('load data complete')

    sgd = SGD(lr=args.lr,
              decay=args.decay,
              momentum=args.momentum,
              nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=sgd)
    six.print_('build model complete')

    six.print_('start training')
    model.fit(train_x,
              train_y,
              batch_size=args.batch,
              nb_epoch=args.epoch,
              verbose=2,
              show_accuracy=True,
              shuffle=True,
              validation_data=(val_x, val_y))
    model.save_weights(args.output + '.hdf5')
Example #23
def main():
    parser = argparse.ArgumentParser(description='Train a neural network')

    parser.add_argument('--model', type=str)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--decay', type=float, default=1e-4)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--batch', type=int, default=128)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--output', type=str, default='weight')
    args = parser.parse_args()

    model = importlib.import_module(args.model).build()

    six.print_('loading data')
    (train_x, train_y, val_x, val_y) = load_data()
    six.print_('load data complete')

    sgd = SGD(lr=args.lr,
              decay=args.decay,
              momentum=args.momentum,
              nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=sgd)
    six.print_('build model complete')

    six.print_('start training')
    model.fit(train_x, train_y, batch_size=args.batch, nb_epoch=args.epoch,
              verbose=2,
              show_accuracy=True,
              shuffle=True,
              validation_data=(val_x, val_y))
    model.save_weights(args.output + '.hdf5')
Example #24
def score_labeled_data(postprocess_dir, data_dir, img_width, img_height):
    categories = ["training/", "validation/", "test/"]
    for c in categories:
        ground_truth_rois, filenames = load.load_data(data_dir + c,
                                                      img_width,
                                                      img_height,
                                                      rois_only=True)
        rois = defaultdict(lambda: [None, None])
        for i, r in enumerate(ground_truth_rois):
            rois[filenames[i]][0] = r
        for f in os.listdir(postprocess_dir + c):
            filename = os.path.splitext(os.path.basename(f))[0]
            if f.endswith('.npz'):
                rois[filename][1] = np.load(postprocess_dir + c + f)['rois']
        files_to_remove = []
        for f in rois:
            if rois[f][0] is None:
                print "Unable to score " + f + " : missing ground truth data"
                files_to_remove.append(f)
            elif rois[f][1] is None:
                print "Unable to score " + f + " : missing convnet data"
                files_to_remove.append(f)
        for f in files_to_remove:
            rois.pop(f)
        ground_truth_rois, convnet_rois = zip(*rois.values())
        score = Score(ground_truth_rois, convnet_rois)
        with open(postprocess_dir + c + "score.txt", 'w') as score_file:
            score_file.write(str(score))
Example #25
 def __init__(self, bot, config):
     super(Extension, self).__init__(bot, config)
     self.data = load.load_data('cvn')
     self.register_commands('links', 'cancelprocess')
     self.confirm = False
     self.loop = asyncio.new_event_loop()
     self.namespaces = list(range(
         0, 16)) + [110, 111, 502, 503, 828, 1201, 2001]
Example #26
def compute_histogram_database(vocabulary, max_im=None):
    res = []
    gen = load_data()
    for i, (im, mask) in enumerate(gen):
        if max_im and max_im < i:
            break
        res.append(compute_histogram(im, mask, vocabulary))
    return np.array(res)
Example #27
def co_with_y():
    data = load_data(False)
    x, y = data['x'], data['y']
    corr = []
    for i in x.T:
        corr.append(abs(stats.pearsonr(i, y.reshape(-1))[0]))
    sorting = np.argsort(corr)[-10:]
    for j in sorting:
        print(get_names()[j])
Example #28
def main():
    business_reviews = load_data()
    sample_business = business_reviews[0]

    # Let's look at the first business just to get an idea of what we're
    # working with...
    print sample_business['reviews'][0]
    print sample_business['categories']
    print sample_business['name']
Example #29
def removed_columns():
    test_data = load_data(True)
    train_data = load_data(False)
    x, a = test_data['x'], test_data['a']

    columns_rm = [
        get_names().index('sex_Female'),
        get_names().index('sex_Male')
    ]
    indx = [j for j in range(len(x.T)) if j not in columns_rm]
    train_data['x'] = train_data['x'][:, indx]
    test_data['x'] = test_data['x'][:, indx]
    clf = LogisticRegression(C=1000).fit(train_data['x'],
                                         train_data['a'].reshape(-1))
    print(clf.score(test_data['x'], test_data['a'].reshape(-1)))
    print(
        re_accuracy(test_data['a'].reshape(-1), clf.predict(test_data['x']),
                    test_data['a'].reshape(-1)))
Example #30
    def test_load_test(self):
        f = open("test.txt")
        wd, pd, ctd, etd = load.load_data(f)
        f.close()

        self.assertEqual(wd,
                         [["Peter", "Blackburn"], ["1966", "World", "Cup"]])
        self.assertEqual(pd, [["NNP", "NNP"], ["CD", "NNP", "NNP"]])
        self.assertEqual(ctd, [["I-NP", "I-NP"], ["I-NP", "I-NP", "I-NP"]])
Example #31
def compute_histogram_database(vocabulary, max_im=None):
    res = []
    gen = load_data()
    for i, (im, mask) in enumerate(gen):
        if max_im and max_im < i:
            break
        res.append(compute_histogram(im,
                                     mask,
                                     vocabulary))
    return np.array(res)
Example #32
def load_test_dataset():
    print("Loading unlabeled dataset")

    test_df = read_labels(DATA_DIR / 'test.txt', column_names=['name'])

    # Index by name to make sure pandas doesn't
    # add an extra column in the submission
    test_df = test_df.set_index('name')

    return test_df, load_data(DATA_DIR / 'test' / 'test', test_df.index)
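The `set_index('name')` call matters once the frame is written out: `DataFrame.to_csv` writes the index by default, so indexing by name avoids an extra unnamed integer column in the submission file. A small self-contained illustration:

import io
import pandas as pd

test_df = pd.DataFrame({'name': ['img_001.png', 'img_002.png'], 'label': [0, 1]})

buf = io.StringIO()
test_df.set_index('name').to_csv(buf)
print(buf.getvalue())
# name,label
# img_001.png,0
# img_002.png,1
# (without set_index, the first column would be the default 0..N-1 RangeIndex)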
Example #33
def main():
    device = torch.device(args.device)

    # Loading the train and dev data and save them in a loader + the encoder of the classes
    train_loader, dev_loader, label_encoder = load_data(
        args.train_path, args.dev_path, args.batch_size, args.tokens_column,
        args.predict_column, args.lang_model_name, args.max_len,
        args.separator, args.pad_label, args.null_label, device)

    train(train_loader, dev_loader, label_encoder, device)
Example #34
def main():
    business_reviews = load_data()
    sample_business = business_reviews[0]

    # Let's look at the first business just to get an idea of what we're
    # working with...
    print '\n'
    print 'First review: {0}'.format(sample_business['reviews'][0])
    print 'Categories: {0}'.format(sample_business['categories'])
    summary = interesting_words(sample_business['reviews'])
    print '\nReviews summary: {0}'.format(summary)
Example #35
def predict_corr():
    test_data = load_data(True)
    train_data = load_data(False)
    x, a = test_data['x'], test_data['a']
    clf = LogisticRegression(C=1000).fit(train_data['x'],
                                         train_data['y'].reshape(-1))
    y_pred = clf.predict(test_data['x'])

    corr = []
    for i in x.T:
        replace_nan = abs(stats.pearsonr(i, y_pred)[0])
        if np.isnan(replace_nan):
            corr.append(0)
        else:
            corr.append(replace_nan)

    sorted_corr = np.argsort(corr)[-3:]
    print(sorted_corr)
    for j in sorted_corr:
        print(get_names()[j])
Example #36
def testAll(args):
    word2idx, vectors = create_model(args)
    global idx2word
    idx2word = {b: a for a, b in word2idx.items()}
    print("> Loading trained model and Test")
    max_recall = load_model(args.model_dump)
    print(f"max_recall: {max_recall}")
    test_data = load_test_data(args, word2idx)
    with torch.no_grad():
        model.eval()
        dataset = load_data(args, word2idx, vectors)
        calculateRecall(dataset)
Example #37
def trainInit(args):
    max_recall = 0
    word2idx, vectors = create_model(args)
    idx2word = {b:a for a,b in word2idx.items()}

    if args.model_load is not None:
        print("> Loading trained model and Train")
        max_recall = load_model(args.model_load)

    dataset = load_data(args, word2idx, vectors)
    objective = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([RATE])).to(device)

    return dataset, objective, word2idx, max_recall
Example #38
 def __init__(self, bot, config):
     super(Extension, self).__init__(bot, config)
     self.temp = load.load_data('report')
     self.register_commands('report', 'unreport', 'resolve', 'kocka')
     self.mw = mwclient.Site('vstf.wikia.com', path='/')
     self.mw.login(config['username'], config['password'])
     for t in ['w', 's', 'p', 'b']:
         if t not in self.temp:
             if t in ('w', 'b'):
                 self.temp[t] = []
             else:
                 self.temp[t] = {}
     self.message = False
Example #39
def main():
    business_reviews = load_data()

    # Find the first Mexican business
    for biz in business_reviews:
        if 'Mexican' in biz['categories']:
            sample_business = biz
            break

    # Let's see how we classify the first business...
    result = naive_mexican_classifier(sample_business['reviews'])
    print '\n'
    print 'First review: {0}'.format(sample_business['reviews'][0])
    print 'Categories: {0}'.format(sample_business['categories'])
    print '\nClassified as Mexican?: {0}'.format(result)
Example #40
File: cnn.py Project: raj347/solml
def compute_cnn_features(ident_list):
    data = load.load_data(ident_list, 96, 96, color=True)
    N = data.shape[0]
    X = np.zeros((N, 3, 96, 96))
    for i in range(N):
        im = data[i, :].copy().reshape((96, 96, 3)).copy()
        #plt.imshow(im.astype(np.uint8))
        im[:, :, 0] -= 103.939
        im[:, :, 1] -= 116.779
        im[:, :, 2] -= 123.68
        im = im.transpose((2, 0, 1))
        im = np.expand_dims(im, axis=0)
        X[i, :, :, :] = im
    cnn_features = model.predict(X)

    return cnn_features
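The three constants subtracted above are the per-channel pixel means commonly used with ImageNet-pretrained VGG-style models (BGR order). The per-image loop can also be written with NumPy broadcasting; a small sketch on random data standing in for `load.load_data(...)`:

import numpy as np

N = 4
data = np.random.rand(N, 96 * 96 * 3) * 255      # toy stand-in for the loaded images

mean = np.array([103.939, 116.779, 123.68])      # same per-channel means as above
X = data.reshape(N, 96, 96, 3) - mean            # broadcast over the channel axis
X = X.transpose(0, 3, 1, 2)                      # to (N, channels, height, width)
assert X.shape == (N, 3, 96, 96)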
Example #41
def prepare_data():

    X = load_data()
    features = X[:, 5:43]

    city_encoder = LabelEncoder()
    city_group_encoder = LabelEncoder()
    type_encoder = LabelEncoder()

    raw_city = city_encoder.fit_transform(X[:, 2:3].flatten())
    raw_city_group = city_group_encoder.fit_transform(X[:, 3:4].flatten())
    raw_type = type_encoder.fit_transform(X[:, 4:5].flatten())

    features = np.concatenate((np.array([raw_type]).T, features), axis=1)
    features = np.concatenate((np.array([raw_city_group]).T, features), axis=1)
    features = np.concatenate((np.array([raw_city]).T, features), axis=1)

    return train_test_split(features, X[:, 42:43].flatten(), test_size=0.33, random_state=42)
Example #42
def main(filename, station_filename, output_filename):
    times, station_ids, empty_slots, available_bikes = load_data(filename)
    sid2data = load_stations(station_filename)

    total_slots = np.add(empty_slots, available_bikes)
    # gen empty_ratio 
    empty_ratios = np.divide(empty_slots, total_slots)
    # gen available ratio
    available_bike_ratios = np.divide(available_bikes, total_slots)

    # gen hours 
    hours = get_hour(times)
    # gen minutes
    minutes = get_minute(times, delta=15)

    # for each 15 minutes generate a snap shot
    data = []
    for h in range(24):
        for m in range(0, 60, 15):
            slot_station_ids = station_ids[(hours == h) & (minutes == m)]
            slot_available_bike_ratios = available_bike_ratios[(hours == h) & (minutes == m)]

            slot_data = {}
            slot_data['hour'] = h 
            slot_data['minute'] = m
            slot_data['station'] = []
            for station_id in range(1, 119+1):
                mean_available_bike_ratio = np.mean(slot_available_bike_ratios[slot_station_ids == station_id])
                slot_data['station'].append({
                    'bike_ratio': mean_available_bike_ratio, 
                    'name': sid2data[station_id]['name'],
                    'e_name': sid2data[station_id]['e_name'],
                    'lat': sid2data[station_id]['lat'],
                    'lng': sid2data[station_id]['lng'],
                    'sid': station_id
                })

            data.append(slot_data)
    
    # print out 
    outf = open(output_filename, 'w')
    print >> outf, json.dumps(data)
    outf.close()
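One detail worth noting in the slot selection above: NumPy's `&` binds more tightly than `==`, so the two comparisons must be parenthesized before being combined. A short demonstration:

import numpy as np

hours = np.array([0, 0, 1, 1])
minutes = np.array([0, 15, 0, 15])

mask = (hours == 1) & (minutes == 15)   # element-wise AND of two boolean arrays
print(mask)                             # [False False False  True]

# Without the parentheses, `hours == 1 & minutes == 15` is parsed as the chained
# comparison `hours == (1 & minutes) == 15`, which raises ValueError for arrays.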
Example #43
def main():
    filename = "../ubike_record.09_13_09_26.csv"
    # filename = "../../ubike_record.csv"

    times, station_ids, empty_slots, available_bike = load_data(filename)
    # fetch Taipei city government data
    start_t = mdates.strpdate2num("%Y-%m-%d %H:%M:%S")("2013-09-15 00:00:00")
    end_t = mdates.strpdate2num("%Y-%m-%d %H:%M:%S")("2013-09-16 00:00:00")

    station_ids = station_ids[times < end_t]
    empty_slots = empty_slots[times < end_t]
    times = times[times < end_t]

    station_ids = station_ids[start_t < times]
    empty_slots = empty_slots[start_t < times]
    times = times[start_t < times]

    station_id = 4 
    times = times[station_ids == station_id] 
    empty_slots = empty_slots[station_ids == station_id]
    available_bike = available_bike[station_ids == station_id]
    # plot 
    plot(times, available_bike)
Example #44
import tensorflow as tf
from load import load_data
import pickle

# Call load.py to load the data and run preprocessing
# Load the data
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = load_data()
# print(title_count)
# print(title_set)
# print(genres2int)
# print(features)
# print(targets_values)
# print(ratings)
# print(movies)
# print(movies_orig)

sentences_size = title_count                            # length of the movie titles
embed_dim = 32                                          # dimensionality of the embedding matrices
num_epochs = 10                                         # number of training epochs
batch_size = 256                                        # batch size
dropout_keep = 0.5                                      # dropout keep probability
learning_rate = 0.0001                                  # learning rate
save_dir = './model/save'                               # where the trained model is saved

# User information
user_gender = {'M': '男性', 'F': '女性'}    # '男性' = male, '女性' = female
user_age = {1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44", 45: "45-49", 50: "50-55", 56: "56+"}
user_occupation = {0: "other", 1: "academic/educator", 2: "artist",
                   3: "clerical/admin", 4: "college/grad student", 5: "customer service",
                   6: "doctor/health care", 7: "executive/managerial", 8: "farmer",
                   9: "homemaker", 10: "K-12 student", 11: "lawyer",
Example #45
import model
from imp import reload
reload(model)
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import os
import load



data = load.load_data("cifar10")

X = data[0]
img_depth = data[4]
img_x = data[5]

import Plots
print(X.shape)
Plots.plot_testimg(X[:10, :])

batch_size = 32

num_f1 = 50
num_f2 = 50
num_f3 = 50
f1 = (num_f1, img_depth, 11, 11)
f2 = (num_f2, num_f1, 2, 2)
f3 = (num_f3, num_f2, 3, 3)
filters = [f1]
Example #46
def parameter_tuning(methodology, nfold, is_pca, is_testing, n_jobs, conf, n_estimator):
    drop_fields = []

    parser = ModelConfParser(conf)

    objective = parser.get_objective()
    cost = parser.get_cost()

    filepath_training, filepath_testing, filepath_submission, filepath_tuning = parser.get_filepaths(methodology)
    filepath_feature_importance, top = parser.get_feature_importance()
    filepath_feature_interaction, binsize, top_feature = parser.get_feature_interaction()

    for filepath in [filepath_tuning, filepath_submission]:
        create_folder(filepath)

    filepath_cache_1 = "{}/input/train.pkl".format(BASEPATH)
    train_x, test_x, train_y, test_id, _ = load_data(filepath_cache_1, filepath_training, filepath_testing, drop_fields)

    pool = []
    for value, count in zip(values, counts):
        if count > 2:
            pool.append(value)

    idxs = train_y.isin(pool)

    train_x = train_x[idxs].values
    train_y = train_y[idxs].astype(str).values

    test_x = test_x.values
    test_id = df_testing["row_id"].values

    if filepath_feature_interaction:
        for layers, value in load_interaction_information(filepath_feature_interaction, str(top_feature)):
            for df in [train_x, test_x]:
                t = value
                breaking_layer = None
                for layer in layers:
                    if layer in train_x.columns:
                        t *= df[layer]
                    else:
                        breaking_layer = layer
                        break

                if breaking_layer is None:
                    df[";".join(layers)] = t
                else:
                    log("Skip {}".format(layers), WARN)
                    break

    if is_pca:
        train_x, test_x = pca(train_x, train_y.values, test_x)

    if is_testing:
        train_x = train_x.head(1000)
        train_y = train_y.head(1000)

    params = tuning(train_x, train_y, test_id, test_x, cost, objective,
                    filepath_feature_importance, filepath_tuning, filepath_submission, methodology, nfold, top_feature,
                    n_estimator=n_estimator, thread=n_jobs)

    log("The final parameters are {}".format(params))
Example #47
#!/usr/bin/python

import argparse
import load as Loader 
import analyze as Analyzer


parser = argparse.ArgumentParser(description='Analyze scraped data.')
parser.add_argument('directory', metavar='dir',
                   help='directory to process')

args = parser.parse_args()
data = Loader.load_data(args.directory)

print("==== Loaded Data =====")
Analyzer.histogram("hits.png", 
   data["base"]["hits"], 25,
   "Histogram of Number of Hits","Number of Hits")

Analyzer.histogram("words.png", 
   data["base"]["words"], 25,
   "Histogram of Number of Words","Number of Words")

Analyzer.scatter("hits-vs-words.png",
   data["base"]["words"],data["base"]["hits"], 
   "Histogram of Hits vs Words","Number of Words", "Number of Hits")

Analyzer.scatter("bookmarks-vs-words.png",
   data["base"]["words"],data["base"]["bookmarks"], 
   "Histogram of Bookmarks vs Words","Number of Words", "Number of Bookmarks")
Example #48
File: cnn.py Project: MaGold/cnn
        filter_params.append(w)
        outshp = (outshp - f[2] + 1)/2

    outshp = filters[-1][0] * outshp * outshp
    w = init_weights((outshp, fc[0]))
    fc_params.append(w)

    for i in range(len(fc)-1):
        w = init_weights((fc[i], fc[i+1]))
        fc_params.append(w)
    return filter_params, fc_params




trX, trY, teX, teY, channels, img_x = load.load_data("mnist")
#trX, trY, teX, teY, channels, img_x = load.load_data("cifar10")
img_y = img_x


X = T.ftensor4()
Y = T.fmatrix()

f1 = (10, channels, 7, 7)
f2 = (25, f1[0], 4, 4)
filters = [f1, f2]
fc = [500, trY.shape[1]]

filter_params, fc_params = get_params(img_x, filters, fc)
params = filter_params + fc_params
print(params)
Example #49
def feature_engineer(conf, thread, feature_importance, interaction_information, merge_ii, split_idx, split_num, testing, combinations_size):
    drop_fields = []
    transform2 = True

    parser = ModelConfParser(conf)
    BASEPATH = parser.get_workspace()
    binsize, top = parser.get_interaction_information()
    top_feature = parser.get_top_feature()

    if feature_importance:
        filepath_training = "{}/input/train.csv".format(BASEPATH)
        filepath_testing = "{}/input/test.csv".format(BASEPATH)
        filepath_cache_1 = "{}/input/train.pkl".format(BASEPATH)
        folder_ii = "{}/input/interaction_information/transform2=True_testing=-1_binsize={}".format(BASEPATH, binsize)
        folder_feature = "{}/etc/feature_profile/transform2=True_binsize={}_top={}".format(BASEPATH, binsize, top)

        train_x, test_x, train_y, test_id, train_id = load_data(filepath_cache_1, filepath_training, filepath_testing, drop_fields)

        columns = train_x.columns
        for layers, value in load_interaction_information(folder_ii, threshold=top):
            for df in [train_x, test_x]:
                t = value
                breaking_layer = None
                for layer in layers:
                    if layer in columns:
                        t *= df[layer].values
                    else:
                        breaking_layer = layer
                        break
                if breaking_layer is None:
                    df[";".join(layers)] = t
                else:
                    log("Skip {} due to {} not in columns".format(layers, breaking_layer), WARN)
                    break

        names = train_x.columns
        print "Data Distribution is ({}, {}), and then the number of feature is {}".format(np.sum(train_y==0), np.sum(train_y==1), len(names))

        fp = FeatureProfile()
        fp.profile(train_x.values, train_y, names, folder_feature, int(min(512, len(names))))

    if interaction_information:
        log("Try to calculate the interaction information", INFO)

        filepath_training = "{}/input/train.csv".format(BASEPATH)
        filepath_testing = "{}/input/test.csv".format(BASEPATH)

        train_x, test_x, train_y, id_train, id_test = None, None, None, None, None
        if transform2:
            train_x, test_x, train_y, id_train, id_test = data_transform_2(filepath_training, filepath_testing, keep_nan=True)
        else:
            train_x, train_y, test_x, test_id = data_load(drop_fields=drop_fields)

        filepath_cache = "{}/input/transform2={}_binsize={}_cache.pkl".format(BASEPATH, transform2, binsize)
        folder_couple = "{}/input/interaction_information/transform2={}_testing={}_binsize={}".format(BASEPATH, transform2, testing, binsize)

        results_couple = feature_engineering.calculate_interaction_information(filepath_cache, train_x, train_y, folder_couple, \
            binsize=binsize, nthread=thread, combinations_size=combinations_size, n_split_idx=split_idx, n_split_num=split_num,
            is_testing=int(testing) if testing > 0 else None)

    if merge_ii:
        folder_couple = "{}/input/interaction_information/transform2={}_testing={}_binsize={}".format(BASEPATH, transform2, testing, binsize)

        count_filepath, count_couple, final_count_filepath, final_count_couple = feature_engineering.merge_interaction_information(folder_couple)
        log("Originally. we have {} records in {} files. After merging, we have {} records in {} files".format(count_couple, count_filepath, final_count_couple, final_count_filepath), INFO)
Example #50
import numpy as np

from sklearn.externals.joblib import Memory

from load import load_data
from descriptors import compute_boundary_desc, get_interest_points

NUM_IMAGES = 300

mem = Memory(cachedir='.')

gen = load_data(test=True)
descriptors = []
print "Compute descriptors"
d = 0
for i, (im, mask) in enumerate(gen):
    if i % 10 == 0:
        print "Computed %d images" % i
    if NUM_IMAGES is not None and i == NUM_IMAGES:
        break
    interest_points = mem.cache(get_interest_points)(mask)
    descriptor, coords = mem.cache(compute_boundary_desc)(im,
                                                          mask,
                                                          interest_points)
    for element in descriptor:
        descriptors.append(element)

# Let's dump descriptors to not recompute them later
descriptors = np.array(descriptors)
descriptors.dump('./data/descriptors.npy')
Example #51
import sys
import subprocess
import os
import random

from load import load_data, load_emb, load_veclist
from tree_blstm import model
from accuracy import conlleval
from tools import shuffle
  
if __name__ == '__main__':

    s = {'seed':345, 'epoch':20, 'lr':0.01, 'decay':0.95, 'hnum':100 , 'dnum':340, 'ynum':2, 'wnum':6206, 'L2': 0.000001,
         'me':50, 'md':50, 'mx':50, 'Wlnum':13, 'Wrnum':297,'kalpha':0.2}
    print 'load train data'
    train_e = load_data("data/train_e.txt")
    train_d = load_data("data/train_d.txt")
    train_l = load_data("data/train_l.txt")
    train_s = load_data("data/train_s.txt")
    train_tl = load_veclist("data/train_tl.txt")
    train_tr = load_veclist("data/train_tr.txt")
    train_ta = load_veclist("data/train_ta.txt")
    train_y = load_data("data/train_y.txt")
    
    print 'load test data'
    test_e = load_data("data/test_e.txt")
    test_d = load_data("data/test_d.txt")
    test_l = load_data("data/test_l.txt")
    test_s = load_data("data/test_s.txt")
    test_tl = load_veclist("data/test_tl.txt")
    test_tr = load_veclist("data/test_tr.txt")
Example #52
import numpy as np

from sklearn.externals.joblib import Memory
from load import load_data

from descriptors import compute_boundary_desc, get_interest_points
from histograms import compute_visual_words

NUM_IMAGES = None

mem = Memory(cachedir='.')

vocabulary = np.load('./data/vocabulary.npy')
gen = load_data()
res = []

# FIXME needs to lookup the number of images
postings = np.zeros((len(vocabulary), 3170))

for i, (im, mask) in enumerate(gen):
    if i % 10 == 0:
        print "computed %d images" % i
    if NUM_IMAGES is not None and i == NUM_IMAGES:
        break

    interest_points = mem.cache(get_interest_points)(mask)
    descriptor, coords = mem.cache(compute_boundary_desc)(im,
                                                  mask,
                                                  interest_points)
    vw = compute_visual_words(descriptor, vocabulary)
    if vw is not None:
Example #53
def train(args):
    """
    This function trains the dietnetwork using the histogram embedding. The idea is to
    use batch learning from numpy files using basic dict_feed (not much improvement in time)
    Args:
        - args.path: path to the data dir which contains train/val/test
        - args.learning_rate: learning rate for the optimizer
        - args.sum_dir: summry
        - args.num_epoch: ...
        - args.batchsize: ...
    """
 


    # load the data: (note:already preshuffled)
    trainX, trainY, validX, validY, testX, testY = load_data(args.path)
    trainX = np.array(trainX).astype(np.float32)
    trainY = np.array(trainY).astype(np.float32)
    validX = np.array(validX).astype(np.float32)
    validY = np.array(validY).astype(np.float32)
    testX = np.array(testX).astype(np.float32)
    testY = np.array(testY).astype(np.float32)
 
    val_len = np.shape(validX)[0]
    test_len = np.shape(testX)[0]

    # get dietnet input values:
    input_dim=np.shape(trainX)[1]
    output_dim=np.shape(trainY)[1]
    embed_size=input_dim
    
    ############### Get Hyperparameter grid and iterate  ##################3
    grid_list, grid_string = grid()
    
    for hparam, hparam_list in zip(grid_list, grid_string):
        tf.reset_default_graph()
        print("new combination")
        # Begin loop for each parameter combination
        # build the graph:
        loss, accuracy = dietnet(path=args.path,
                                 input_size=input_dim, 
                                 output_size=output_dim,
                                 dropout_rate=args.dropout_rate,
                                 embed_size=embed_size,
                                 hidden_size=100,
                                 gamma=args.gamma,
                                 w_init=hparam['w_init_dist'],
                                 activ_fun=hparam['act_funs'],
                                 )

        #final ops: accuracy, loss, optimizer:
        optimizer = hparam['optims'] 
        training_op = slim.learning.create_train_op(loss, optimizer,
                                                    #summarize_gradients=True,
                                                    clip_gradient_norm=10)
        
        # Summary stuff: get the train/valid/test loss and accuracy
        test_acc_summary = tf.summary.scalar('test_accuracy', accuracy, collections=['test'])
        valid_acc_summary = tf.summary.scalar('valid_accuracy', accuracy, collections=['valid'])
        train_acc_summary = tf.summary.scalar('train_accuracy', accuracy, collections=['train'])

        test_loss_summary = tf.summary.scalar('test_loss', loss, collections=['test'])
        valid_loss_summary = tf.summary.scalar('valid_loss', loss, collections=['valid'])
        train_loss_summary = tf.summary.scalar('train_loss', loss, collections=['train'])

        # separates the summaries according to the collection
        train_ops = tf.summary.merge_all('train')
        valid_ops = tf.summary.merge_all('valid')
        test_ops = tf.summary.merge_all('test')

        with tf.Session() as sess:
            # init variables
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            # print out all trainable variables
            #print([i for i in  tf.trainable_variables()])

            # saver for summary
            swriter = tf.summary.FileWriter(args.sum_dir + hparam_list, sess.graph)

            step = 0

            try:
                for i in range(args.num_epoch):
                    for idx in range(int(np.shape(trainX)[0] / args.batchsize)):
                        # prep data for train:
                        a,b = idx*args.batchsize, (idx+1)*args.batchsize
                        batch_x = trainX[a:b,:]
                        batch_y = trainY[a:b,:]

                        #get time
                        start_time=time.time()

                        # run train op and get train loss
                        trainloss, accur, summaries = sess.run([training_op, accuracy, train_ops],
                                                        feed_dict={
                                                            'inputs:0': batch_x,
                                                            'outputs:0': batch_y,
                                                            'is_training:0': True})

                        # add summaries only every 25 steps to save memory
                        if not idx % 25: swriter.add_summary(summaries,step)
                        
                        duration=time.time() - start_time

                        # every 25 steps get train/valid/test loss and accuracy
                        if not idx % 25:
                            # sample random 25% from test/valid for error
                            val_ind = [i for i in random.sample(xrange(val_len), args.batchsize)]
                            test_ind = [i for i in random.sample(xrange(test_len),
                                        args.batchsize)]
                            val_x = validX[val_ind,:]
                            val_y = validY[val_ind,:]
                            
                            test_x = testX[test_ind,:]
                            test_y = testY[test_ind,:]
                            
                            # get val loss/accur:
                            val_loss, accur_valid, summaries = sess.run([loss, 
                                                                        accuracy, 
                                                                        valid_ops],
                                                                feed_dict={
                                                                    'inputs:0': val_x,
                                                                    'outputs:0': val_y,
                                                                    'is_training:0': False})
                            swriter.add_summary(summaries,step)

                            # get test loss/accur
                            test_loss,accur_test, summaries = sess.run([loss, accuracy,test_ops],
                                                            feed_dict={
                                                                'inputs:0': test_x,
                                                                'outputs:0': test_y,
                                                                'is_training:0': False})
                            swriter.add_summary(summaries, step)
                            
                            # print to console in order to watch:
                            print('step {:d}-train/v/test acc:={:.3f},{:.3f},{:.3f}'.format(step, 
                                                                                    accur,
                                                                                    accur_valid,
                                                                                    accur_test))

                        step += 1

                        # add checkpoint here:...
                
                # if num_epochs is complete close swriter
                swriter.close()

            finally:
                swriter.close()
Example #54
def learning(conf, thread, is_testing):
    drop_fields = []

    parser = ModelConfParser(conf)

    BASEPATH = parser.get_workspace()
    objective = parser.get_objective()
    binsize, top = parser.get_interaction_information()
    cost = parser.get_cost()
    nfold = parser.get_nfold()
    top_feature = parser.get_top_feature()

    filepath_training = "{}/input/train.csv".format(BASEPATH)
    filepath_testing = "{}/input/test.csv".format(BASEPATH)
    filepath_cache_1 = "{}/input/train.pkl".format(BASEPATH)
    folder_ii = "{}/input/interaction_information/transform2=True_testing=-1_binsize={}".format(BASEPATH, binsize)
    filepath_feature_importance = "{}/etc/feature_profile/transform2=True_binsize={}_top={}.pkl".format(BASEPATH, binsize, top)

    train_x, test_x, train_y, test_id, train_id = load_data(filepath_cache_1, filepath_training, filepath_testing, drop_fields)
    if is_testing:
        train_x = train_x.head(1000)
        train_y = train_y.head(1000)
    basic_columns = train_x.columns

    for layers, value in load_interaction_information(folder_ii, threshold=str(top_feature)):
        for df in [train_x, test_x]:
            t = value
            breaking_layer = None
            for layer in layers:
                if layer in basic_columns:
                    t *= df[layer].values
                else:
                    breaking_layer = layer
                    break

            if breaking_layer is None:
                df[";".join(layers)] = t
            else:
                log("Skip {} due to {} not in columns".format(layers, breaking_layer), WARN)
                break

    ii_columns = train_x.columns
    importance_columns = load_feature_importance(filepath_feature_importance, top_feature)

    predictors = {"basic": basic_columns,
                  "interaction-information-3": [column for column in ii_columns if column.count(";") == 1],
                  "interaction-information-4": [column for column in ii_columns if column.count(";") == 2],
                  "feature-importance": importance_columns}

    train_y = train_y.values
    test_id = test_id.values
    train_Y = train_y.astype(float)

    layer1_models, layer2_models, last_model = [], [], []
    data_dimension = []

    # Init the parameters of deep learning
    checkpointer = KaggleCheckpoint(filepath="{epoch}.weights.hdf5",
                                    training_set=([train_x], train_Y),
                                    testing_set=([test_x], test_id),
                                    folder=None,
                                    cost_string=cost,
                                    verbose=0, save_best_only=True, save_training_dataset=False)

    # Init the parameters of cluster
    for idx, layer_models in enumerate([layer1_models, layer2_models, last_model]):
        data_dimension.append([])

        for model_section in parser.get_layer_models(idx+1):
            for method, setting in parser.get_model_setting(model_section):
                if method.find("deep") > -1:
                    setting["folder"] = None

                    if "data_dimension" in setting:
                        if setting["data_dimension"] == "basic":
                            setting["input_dims"] = len(basic_columns)
                        elif setting["data_dimension"] == "importance":
                            setting["input_dims"] = len(importance_columns)
                        elif setting["data_dimension"].find("interaction-information") != -1:
                            setting["input_dims"] = top_feature
                        else:
                            log("Invalid data_dimension '{}' so input_dims cannot be set".format(setting["data_dimension"]), ERROR)
                            sys.exit(100)

                        data_dimension[idx].append(setting["data_dimension"])
                    else:
                        log("data_dimension not found in LAYER{}".format(idx+1), INFO)

                        data_dimension[idx].append("all")

                    setting["callbacks"] = [checkpointer]
                    setting["number_of_layer"] = setting.pop("layer_number")
                else:
                    if "data_dimension" in setting:
                        data_dimension[idx].append(setting["data_dimension"])
                    else:
                        data_dimension[idx].append("all")

                layer_models.append((method, setting))
                log("Get the configuration of {} from {}".format(method, conf), INFO)
                log("The setting is {}".format(setting), INFO)

    folder_model = "{}/prediction_model/ensemble_learning/conf={}_is_testing={}_nfold={}_layer1={}_layer2={}_binsize={}_top={}".format(\
                        BASEPATH, os.path.basename(conf), is_testing, nfold, len(layer1_models), len(layer2_models), binsize, top_feature)
    folder_middle = "{}/etc/middle_layer/is_testing={}_nfold={}_binsize={}_top={}".format(\
                        BASEPATH, is_testing, nfold, binsize, top_feature)

    if is_testing and os.path.isdir(folder_model):
        log("Testing mode: removing {} first".format(folder_model), INFO)
        shutil.rmtree(folder_model)

    folder_submission = "{}/submission".format(folder_model)
    create_folder(folder_submission + "/dummy.txt")

    filepath_training = "{}/training_proba_tracking.csv".format(folder_model)
    filepath_testing = "{}/testing_proba_tracking.csv".format(folder_model)

    previous_training_dataset, previous_testing_dataset = train_x, test_x
    prediction_testing_history, prediction_training_history, learning_loss_history = {"ID": test_id}, {"target": train_Y}, []

    # Model Training
    m = [layer1_models, layer2_models, last_model]
    for idx, models in enumerate(m):
        filepath_queue = "{}/layer{}_queue.pkl".format(folder_model, idx+1)
        filepath_nfold = "{}/layer{}_nfold.pkl".format(folder_model, idx+1)

        for idx_col, (method, setting) in enumerate(models):
            if method.find("deep") > -1:
                # the deep model's input width is taken from the data entering this layer
                models[idx_col][1]["input_dims"] = len(previous_training_dataset[0])

            if "auto_tuning" in setting and setting["auto_tuning"] == 1:
                filepath_tuning = "{}/etc/parameter_tuning/layer{}/method={}_testing={}_nfold={}_top={}_binsize={}_feature={}.pkl".format(folder_model, idx+1, method, is_testing, nfold, top, binsize, len(previous_training_dataset[0]))
                filepath_submission = "{}/etc/parameter_tuning/layer{}/method={}_binsize={}_top={}_feature={}.submission.csv".format(folder_model, idx+1, method, binsize, top, len(previous_training_dataset[0]))

                create_folder(filepath_tuning)

                log("Start the process of auto-tuning parameters for {}".format(method), INFO)
                params = tuning(previous_training_dataset, train_Y, test_id, previous_testing_dataset, cost,
                                None, filepath_tuning, filepath_submission, method, nfold, top_feature, binsize,
                                thread=parser.get_n_jobs())

                log("The final parameters of layer{}-{} are {}".format(idx+1, method, params), INFO)
                for k, v in zip(["max_iter", "n_estimators", "learning_rate"], [parser.get_n_estimators(), parser.get_n_estimators(), parser.get_learning_rate()]):
                    if k in params:
                        params[k] = v

                models[idx_col][1].update(params)

            if "auto_tuning" in setting:
                models[idx_col][1].pop("auto_tuning")

        layer_train_x, layer_test_x, learning_loss = layer_model(\
                                 objective, folder_model, folder_middle, predictors, previous_training_dataset, train_Y, previous_testing_dataset, models,
                                 filepath_queue, filepath_nfold, n_folds=nfold, cost_string=cost, number_of_thread=thread,
                                 saving_results=(idx == 0 or method.find("deep") == -1))

        learning_loss_history.append(learning_loss)

        col = layer_test_x.shape[1]
        for idx_col in range(0, col):
            submission = layer_test_x[:,idx_col]
            filepath_submission = "{}/layer={}_dimension={}_model={}_params={}.csv".format(folder_submission, idx+1, data_dimension[idx][idx_col], models[idx_col][0], make_a_stamp(models[idx_col][1]))
            save_kaggle_submission({"ID": test_id, "Target": submission}, filepath_submission)

            prediction_training_history["layer={}_method={}_feature={}_params={}".format(idx+1, models[idx_col][0], data_dimension[idx][idx_col], make_a_stamp(models[idx_col][1]))] = layer_train_x[:, idx_col]
            prediction_testing_history["layer={}_method={}_feature={}_params={}".format(idx+1, models[idx_col][0], data_dimension[idx][idx_col], make_a_stamp(models[idx_col][1]))] = layer_test_x[:, idx_col]

        previous_training_dataset = layer_train_x
        previous_testing_dataset = layer_test_x

        log("Layer{} is done...".format(idx+1), INFO)

    filepath_history_training_prediction = "{}/history_training.csv".format(folder_model)
    save_kaggle_submission(prediction_training_history, filepath_history_training_prediction)

    filepath_history_testing_prediction = "{}/history_testing.csv".format(folder_model)
    save_kaggle_submission(prediction_testing_history, filepath_history_testing_prediction)

    filepath_history_learning_loss = "{}/learning_loss.pkl".format(folder_model)
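    # A hypothetical sketch of the step this path implies: persist the collected
    # per-layer losses with the standard pickle module.
    #     import pickle
    #     with open(filepath_history_learning_loss, "wb") as f:
    #         pickle.dump(learning_loss_history, f)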
Ejemplo n.º 55
0
def main(argv):
    
    parser = argparse.ArgumentParser(description='Run test')
    parser.add_argument('-o', '--output', type=str, 
                   help='output directory')
    
    parser.add_argument('--db', nargs='+',  choices=('vista', 'vista1500', 'fantom', 'fantom_long', 'promoters'), required=True)
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--saveclass', action='store_true', help='train classifier on whole dataset and save to pickle')
    group.add_argument('--useclass', type=str, help='use classifier saved in file')
    
    parser.add_argument('-p', '--positives', choices=tissue_choices, required=True)
    parser.add_argument('-n', '--negatives',  choices=tissue_choices, required=True)
    parser.add_argument('--boruta', action='store_true')
    parser.add_argument('--distinct', action='store_true')
    parser.add_argument('--histmods', type=str, default='', help='histone modifications list file')
    parser.add_argument('--kmers', type=str, nargs='+', default='', help='kmers extension')
    
    #parser.add_argument('--usegc', action='store_true')
    
    args = parser.parse_args()
    
    if not args.kmers and not args.histmods:
        parser.error('Specify kmers or histmods')
    
    if len(args.db) == 1:
        args.db.append(args.db[0])
    
    if 'fantom' in args.db and ('positives' in args.positives or 'positives' in args.negatives ):
        parser.error('Positives for FANTOM not defined')
    
    
    #prepare output directory
    outdir = RESULTSPATH+args.output

    try:
        maks = 0
        for i in xrange(100, 0, -1):
            if os.path.exists(outdir+'_'+str(i)):
                maks = i
                break
        #print 'moving to', outdir+str(maks+1)

        outdir += '_'+str(maks+1)
        os.mkdir(outdir)
    except OSError:
        parser.error('Cannot create directory %s' % outdir)
    outdir += '/'
    
    #write report, redirect stdout to log file
    orig_stdout = sys.stdout
    outfile = open(outdir+'log.txt', 'w')
    sys.stdout = outfile
    
    print args
    #print "cv_folds=%d, N_trees=%d, N_repeats=%d" % ( shared.cv_folds, shared.N_trees, shared.N_repeats )
    print "cv_folds=%d, N_trees=%d, N_repeats=%d, usegc=%s" % ( cv_folds, N_trees, N_repeats, USEGC )

    
    #load pos and neg data without balance
    datan = load_data(args.db[1], args.histmods, args.kmers, args.negatives, args.distinct )
    datap = load_data(args.db[0], args.histmods, args.kmers, args.positives, args.distinct )
    print "data sizes: %s %d %d, %s %d %d" % (args.positives, datap.shape[0], len(datap.dtype.names), args.negatives, datan.shape[0], len(datan.dtype.names))
    sys.stdout.flush()
    if args.boruta:
        boruta.start_boruta()
        
    
    #load classifier from pickle
    if args.useclass:
        auc = predict(datap, datan, args.useclass, outdir )
    #train new classifier
    else:
        if args.saveclass:
            name = args.db[0] + '-' + args.db[1]+"_" + args.positives + "_vs_" + args.negatives + "_" + args.histmods + "_" + str(args.kmers)
            auc = train_save(datap, datan, name, outdir)
            
        #else:
        name = args.positives + " vs " + args.negatives
        auc = train_cv(datap, datan, name, outdir, args.boruta)
        
    
    #finish 
    #all options: DB, POS, NEG, kmers, hmods, ntrees, cv_folds (0 if no cv), used_class, auc
    
    if args.kmers == '': args.kmers = '-'
    if args.histmods == '': args.histmods = '-'
    summary = "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%s" % (args.db, args.positives, 
                                              args.negatives, args.kmers, 
                                              args.histmods, args.distinct, N_trees, USEGC)
    
    if not args.useclass and not args.saveclass:
        summary += "\t%d\t" % (cv_folds )
    elif args.useclass:
        summary += "\t0\t%s" % (args.useclass)
    else:
        summary += "\t0\t%s" % (args.saveclass)
    summary += "\t%f" % auc[0]
    summary += "\t%s" % time.ctime()
    summary += "\t%f" % auc[1]
    summary += "\t%s" % outdir
    summary += "\t%d\t%d" % (datap.shape[0], datan.shape[0])
    
    summary += "\n"
    
    print summary
    
    sys.stdout = orig_stdout
    outfile.close()
    
    summaryf = open(SUMMARYFILE, 'a+')
    summaryf.write(summary)
    summaryf.close()
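
# Hypothetical invocation sketch (script, file, and tissue names are placeholders;
# the flags mirror the argparse setup in main above):
#     python run_test.py --db vista -p <positive_tissue> -n <negative_tissue> \
#         --histmods mods.txt -o myrun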
Ejemplo n.º 56
0
import sys

import tensorflow as tf

import load  # project-local data loader
import test  # project-local test harness
import th    # project-local training helpers (module name assumed from usage below)

print("Init..")
model_name = "model.ckpt"
epochs = 2
if "e" in sys.argv:
    epochs = int(sys.argv[sys.argv.index("e")+1])
files = ["mute", "volume", "channel"]
print("Files: " + ", ".join(files))

print("Loading data..")
inputs, outputs, words = load.load_data(files)

if "t" in sys.argv:
    print("Setup train..")
    sess = tf.InteractiveSession()
    x, y, y_ = th.setup(len(words), len(files))
    train_step, writer, merged, accuracy = th.trainSetup(y, y_, sess)

    print("Train..")
    th.train(inputs, outputs, x, y_, train_step, sess, epochs, writer, merged, accuracy)

    print("Save..")
    th.save(sess, model_name)
else:
    print("Test..")
    test.test(model_name, words, files)
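
# Hypothetical usage sketch (the script name is a placeholder; the tokens match
# the sys.argv checks above):
#     python run.py t e 4    # train for 4 epochs, then save to model.ckpt
#     python run.py          # without "t": load model.ckpt and run the test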