Code Example #1
def main():
    # Load json config
    config = json.load(open("config.json"))
    extracted_features_root = config["extracted_features"]
    print("[+] Load features ...")

    X_test_num = utils.load_features(extracted_features_root, "X_test_num")
    X_test_cat = utils.load_features(extracted_features_root, "X_test_cat")
    X_test_desc = utils.load_features(extracted_features_root,
                                      "X_test_desc").any()
    X_test_title = utils.load_features(extracted_features_root,
                                       "X_test_title").any()
    #X_test_param = utils.load_features(extracted_features_root, "X_test_param").any()

    token_len = utils.load_features(extracted_features_root, "token_len")

    #X_test_text = [X_test_desc, X_test_title, X_test_param]
    X_test_text = [X_test_desc, X_test_title]

    n_folds = config["n_fold"]
    if n_folds:
        predict_fold(config, n_folds, X_test_num, X_test_cat, X_test_text,
                     token_len)
    else:
        predict_one(config, X_test_num, X_test_cat, X_test_text, token_len)
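The utils.load_features(root, name) helper itself is not shown in these excerpts. As a point of reference, here is a minimal sketch of such a loader, assuming each feature set is pickled as <name>.pkl under the features directory (the file name, format, and use of pickle are assumptions, not the project's confirmed implementation):

import os
import pickle


def load_features(root, name):
    # Hypothetical loader: read one pickled feature array, e.g. <root>/X_test_num.pkl.
    path = os.path.join(root, f"{name}.pkl")
    with open(path, "rb") as f:
        return pickle.load(f)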
Code Example #2
def main(args):

    _, q, x = utils.load_benchmark(args.dataset, args.features)

    q = utils.load_features(q, chunks=(2500, 2048))
    x = utils.load_features(x, chunks=(2500, 2048))
    dim = q.shape[1]

    if args.random_rot is not None:
        rot = args.random_rot
        rot = os.path.join('features', 'random_ortho', f'rand_ortho_{dim}_{rot}.npy')
        rot = np.load(rot).astype(np.float32)
        q = q.dot(rot.T)
        x = x.dot(rot.T)

    # centering
    x_mean = x.mean(axis=0)
    q -= x_mean
    x -= x_mean

    out_dir = os.path.join('features', args.output)
    os.makedirs(out_dir, exist_ok=True)
    _, q_out, x_out = utils.load_benchmark(args.dataset, args.output)

    if not os.path.exists(q_out) or args.force:
        utils.save_as_hdf5(q, q_out, progress=True)
    if not os.path.exists(x_out) or args.force:
        utils.save_as_hdf5(x, x_out, progress=True)
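Example #2 assumes pre-generated random orthogonal matrices under features/random_ortho/. A minimal sketch of how such a matrix could be produced and saved with the same naming scheme (the QR-based construction and the seed argument are assumptions, not necessarily what the project uses):

import os
import numpy as np


def make_random_rotation(dim, tag, out_dir='features/random_ortho'):
    # Draw a Gaussian matrix and orthonormalize it with a QR decomposition.
    rng = np.random.default_rng(int(tag))
    a = rng.standard_normal((dim, dim))
    q, r = np.linalg.qr(a)
    # Sign correction so the rotation is uniformly distributed over the orthogonal group.
    q *= np.sign(np.diag(r))
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, f'rand_ortho_{dim}_{tag}.npy'), q.astype(np.float32))
    return q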
Code Example #3
def main():
    parameters = {}
    parameters['axes'] = parameter_indices_to_plot
    assert len(parameters['axes']) == 2, \
        'Plot_3D can only plot over 2 parameters.'
    assert all(0<=p<=config.P-1 for p in parameters['axes']), \
        'Provided parameters do not coincide with those in config.'
    parameters['sliders'] = list(set(range(config.P))-set(parameters['axes']))
        
    # Store references to plots, otherwise the widgets become unresponsive due
    # to garbage collector. https://stackoverflow.com/a/42884505
    plots = {}

    scaler = utils.load_scaler()
    for component in config.components:
        model_constructor = utils.models[model_key]
        model = model_constructor()
        model.load(utils.model_dir, component)
        
        # Initialize and load data structures
        features = {}; targets = {}; outputs = {}
        for dataset in ['train', 'test']:
            features[dataset] = utils.load_features(dataset)
            targets[dataset] = utils.load_targets(dataset, component)
            outputs[dataset] = None
        # Create the interactive 3D plot
        plots[component] = Plot_3D(component, outputs, targets, features, model, parameters, scaler)
    pyplot.show()
Code Example #4
    def initialize(self):

        print('Initialization in progress...!\n')        
        
        start = time.time()
        yolo = YOLO(**{"model_path": self.model_path, 
            "anchors_path": self.anchors,
            "classes_path": self.yolo_classes_path,
            "score" : self.confidence,
            "gpu_num" : self.gpu_num,
            "model_image_size" : (416, 416),
            })
        
        # load pre-processed features database
        features, _, _ = load_features(self.recog_model)
        with open(self.classes_path, 'rb') as f:
            #img_input, input_labels = pickle.load(f)
            input_feats, input_labels = pickle.load(f)

        # load pre-trained recognition model
        model, preprocessed, input_shape = load_extractor_model(self.recog_model)
        my_preprocess = lambda x: preprocessed(pad_image(x, input_shape))

        #input_feat = extract_features(img_input, model, my_preprocess)
        sim_cutoff, (bins, cdf_list) = similarity_cutoff(input_feats, features, 0.95)

        print("Done...! It tooks {:.3f} mins\n".format((time.time() - start)/60))
        
        self.model_preproc = (yolo, model, my_preprocess)
        self.params = (input_feats, sim_cutoff, bins, cdf_list, input_labels)        
        return True
Code Example #5
File: 02_tune.py  Project: srinathdama/sniROM
def main():

    for component in config.components:

        D, denom_sq = utils.load_POD_D_and_denom_sq(component)

        features = {}
        targets = {}
        for dataset in ['train', 'validate']:
            features[dataset] = utils.load_features(dataset)
            targets[dataset] = utils.load_targets(dataset, component)

        ## Wrapper for the training routine
        def train_wrapper(tune_config):
            model_constructor = utils.models[model_key]
            model = model_constructor()
            model.set_data(features, targets, D, denom_sq)
            model.train(tune_config)
            model.save(utils.model_dir, component)

        for model_key in models_to_be_trained:
            ## Train without a tuning config
            t0 = time.time()
            train_wrapper(None)
            dt = time.time() - t0
            print(F"Trained {model_key} for {component} in {dt:.4} s")

        for model_key in models_to_be_tuned:
            ## Tune using the defined tuning config
            analysis = tune.run(train_wrapper,
                                local_dir=join(utils.model_dir, model_key),
                                name=component,
                                config=tune_config[model_key],
                                stop={'time_total_s': 1800})
Code Example #6
def initialize(filename):

    print('Initialization in progress...!\n')
    start = time.time()
    yolo = YOLO(
        **{
            "model_path":
            './model/keras_yolo3/model_data/yolo_weights_logos.h5',
            "anchors_path": './model/keras_yolo3/model_data/yolo_anchors.txt',
            "classes_path": './data/preprocessed/classes.txt',
            "score": 0.05,
            "gpu_num": 1,
            "model_image_size": (416, 416),
        })
    # get Inception/VGG16 model and flavor from filename
    model_name, flavor = model_flavor_from_name(filename)
    ## load pre-processed features database
    features, brand_map, input_shape = load_features(filename)

    ## load inception model
    model, preprocess_input, input_shape = load_extractor_model(
        model_name, flavor)
    my_preprocess = lambda x: preprocess_input(utils.pad_image(x, input_shape))

    with open('./data/preprocessed/trained_brands.pkl', 'rb') as f:
        img_input, input_labels = pickle.load(f)

    (img_input, feat_input, sim_cutoff,
     (bins, cdf_list)) = load_brands_compute_cutoffs(img_input,
                                                     (model, my_preprocess),
                                                     features, sim_threshold)
    print('Done! It took {:.2f} mins.\n'.format((time.time() - start) / 60))

    return (yolo, model, my_preprocess), (feat_input, sim_cutoff, bins,
                                          cdf_list, input_labels)
Code Example #7
def main(fea_dir, pos_file, neg_file, out_dir, model_file):
  model = train("{0}/all.fea".format(fea_dir), pos_file, neg_file)
  if save_model is not None:
    save_model(model, model_file)

  for f in listdir(fea_dir):
    #if f == "all.fea":
    #  continue
    
    file_path = "{0}/{1}".format(fea_dir, f)
    ph_fea = load_features(file_path)

    phrase_list = []
    fea_list = []

    for ph in ph_fea:
      phrase_list.append(ph)
      fea_list.append(ph_fea[ph])

    X = np.asarray(fea_list)
    scores = model.decision_function(X)

    items = [(scores[i], phrase_list[i]) for i in range(len(phrase_list))]
    items.sort(reverse=True)

    save_result(items, "{0}/{1}".format(out_dir, f))
Code Example #8
def main(args):
  feat, case_ids = load_features(args.src, zscore=True)
  lab = load_labels(args.labsrc)

  ((nepc_f, nepc_lab), (m0_f, m0_lab), (m0p_f, m0p_lab), (m1_f, m1_lab)) = split_sets(feat, lab)

  yvect = ['M0']*m0_f.shape[0] + ['NEPC']*nepc_f.shape[0]
  ttests = []
  fig = plt.figure()
  for f in feat.columns:
    m0_ = m0_f.loc[:, f]
    nepc_ = nepc_f.loc[:, f]
    tt = ttest_ind(m0_, nepc_)
    if tt.pvalue < 1e-10:
      feature_data = pd.DataFrame({'group': yvect, 
        'feature': np.concatenate([m0_, nepc_], axis=0)})
      print(f, tt)
      out = os.path.join(args.dst, 'f_{}.png'.format(f))
      plt.clf()
      # sns.boxplot(x='group', y='feature', data=feature_data)
      sns.distplot(m0_, label='M0')
      sns.distplot(nepc_, label='NEPC')
      plt.legend()
      plt.title('Feature {}'.format(f))
      plt.savefig(out, bbox_inches='tight')
Code Example #9
def main(args):
    feat, case_ids = load_features(args.src)
    lab = load_labels(args.labsrc)

    feat = drop_high_cor(feat, cor_thresh=0.8)
    print('Features after high cor drop')
    print(feat.head())

    run_tsne(feat, lab)
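drop_high_cor is not defined in the excerpt; a minimal sketch of the usual implementation of this pattern for a pandas DataFrame (a hypothetical stand-in, not the project's exact code): for every pair of columns whose absolute correlation exceeds cor_thresh, one of the two is dropped.

import numpy as np
import pandas as pd


def drop_high_cor(df, cor_thresh=0.8):
    # Upper triangle of the absolute correlation matrix (excluding the diagonal).
    corr = df.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    # Drop any column that is highly correlated with an earlier column.
    to_drop = [c for c in upper.columns if (upper[c] > cor_thresh).any()]
    return df.drop(columns=to_drop)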
Code Example #10
def __init__(self, args):
    """
    MUSAE and AE machine constructor.
    :param args: Arguments object with the model hyperparameters.
    """
    self.args = args
    self.log = dict()
    self.graph = load_graph(args.graph_input)
    self.features = load_features(args.features_input)
Code Example #11
def main():
    # register arguments
    args = register_arguments()

    input_sequences = SeqIO.parse(args.sequences, 'fasta')
    clade_designations = read_in_clade_definitions(
        f"config/clades_{args.lineage}_ha.tsv")

    refname = (f"config/reference_{args.lineage}_ha.gb")
    ref = SeqIO.read(refname, 'genbank')
    features = load_features(refname)
    refstr, refCDS, refAA, cds_start, cds_end = get_cds(ref)

    # get clade internal clade and likeness
    clades_relatives, internal_clades = load_relatives()

    # output files
    prov_out = args.batchName + "_provanence.txt"
    results_out = args.batchName + "_cladeResults.txt"
    errors_out = args.batchName + "_error.txt"
    bucket_out = args.batchName + "_bucket.txt"

    # results
    results_bucket = ResultsBucket()

    for seq in input_sequences:

        seq_container = tmpNode()
        seq_aln = codon_align(seq, refstr, refAA, cds_start, cds_end)

        # error checking
        if seq_aln is None:
            print(
                f"{seq.id}\tError translating, check lineage and correct", file=sys.stdout)
            with open(errors_out, 'a') as ef:
                print(
                    f"{seq.id}\tError translating, check lineage and correct", file=ef)
            continue

        clade_provanence = get_provanence(
            seq_aln, features, clade_designations, ref)
        # write out results
        with open(prov_out, 'a') as cf:
            print(f"{seq.description}\t{', '.join(clade_provanence)}", file=cf)

        #clade_final = get_likeness(seq, clade_provanence, clades_relatives, internal_clades)
        clade_desig, virus_like, desig = get_likeness(
            seq, clade_provanence, clades_relatives, internal_clades)
        with open(results_out, 'a') as rf:
            print(clade_desig, file=rf)
        print(clade_desig, file=sys.stdout)

        results_bucket.add_result(seqno=seq.description, ha_clade=desig, result=virus_like,
                                  prov=', '.join(clade_provanence))
    print(results_bucket)
    results_bucket.write_results(bucket_out)
Code Example #12
File: features.py  Project: pristinelife/ds-project
def get_features(limit=1000, features=[], stemmer_type="RegexpStemmer", db_name="yelp_train", standardized=False):
    """
    -----------------------------------------------
    It does a bit of optimization
    Loads features from pickle, if the features with
    the specified input conditions are already pickled
    
    If not fetches from the database (MongoDB)
    -----------------------------------------------
    """
    if os.path.exists(data_dir + "/X_%s_%s_%s_%s.pickle"%(limit, db_name, stemmer_type, "-".join(features))):
        X = load_features("X_%s_%s_%s_%s.pickle"%(limit, db_name, stemmer_type, "-".join(features)))
        y = load_features("Y_%s_%s_%s_%s.pickle"%(limit, db_name, stemmer_type, "-".join(features)))
        z = load_features("Z_%s_%s_%s_%s.pickle"%(limit, db_name, stemmer_type, "-".join(features)))
        
    else:
        #! fetch features from database
        X, y, z = extract_and_save_features(limit=limit, features=features, stemmer_type=stemmer_type, db_name=db_name, standardized=standardized)
        
    return X, y, z
Code Example #13
File: train.py  Project: ngxbac/Avito
def main():
    torch.backends.cudnn.benchmark = True

    # Load json config
    config = json.load(open("config.json"))
    extracted_features_root = config["extracted_features"]
    # Load data and token len of embedding layers
    print("[+] Load features ...")
    y = utils.load_features(extracted_features_root, "y_train")
    token_len = utils.load_features(extracted_features_root, "token_len")

    X_train_num = utils.load_features(extracted_features_root, "X_train_num")
    X_train_cat = utils.load_features(extracted_features_root, "X_train_cat")
    X_train_desc = utils.load_features(extracted_features_root,
                                       "X_train_desc").any()
    X_train_title = utils.load_features(extracted_features_root,
                                        "X_train_title").any()
    # X_train_word_desc = utils.load_features(extracted_features_root, "X_train_word_description")
    # X_train_word_title = utils.load_features(extracted_features_root, "X_train_word_title")
    embedding_weights = utils.load_bcolz(extracted_features_root,
                                         "embedding_weights")
    X_train_word = [utils.load_bcolz(extracted_features_root, "X_train_word")]

    X_train_text = [X_train_desc, X_train_title]
    # X_train_word = [X_train_word_desc, X_train_word_title]

    n_folds = config["n_fold"]
    if n_folds:
        train_fold(config, n_folds, X_train_num, X_train_cat, X_train_text,
                   X_train_word, embedding_weights, y, token_len)
    else:
        train_normal(config, X_train_num, X_train_cat, X_train_text,
                     X_train_word, embedding_weights, y, token_len)
Code Example #14
def train():

    path_to_data = '../../data/processed/'
    path_to_output = '../../data/submissions/'
    path_to_preds = '../../data/predictions/'

    version = '1.1'
    random_seed = 8675309
    sample_size = 50000
    n_folds = 5

    params = {
        'nthread': 8,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1
    }

    train, labels, test, train_ids, test_ids = utils.load_features(
        path_to_data, version, sample_size)
    oof_train, oof_test = utils.kfold(classifier_builder=LightGBMWrapper,
                                      base_classifier=lightgbm.LGBMClassifier,
                                      classifier_params=params,
                                      train=train,
                                      labels=labels,
                                      test=test,
                                      n_folds=n_folds,
                                      random_seed=random_seed,
                                      use_smote=True)

    df_oof_train = pd.DataFrame({
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        'lightgbm': oof_train
    })
    #    df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')

    df_oof_test = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': oof_test})
    #    df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(path_to_preds + version + '-lightgbm.csv', index=False)
    df_oof_test.to_csv(path_to_output + version + '-lightgbm.csv', index=False)
Code Example #15
def main(args):
  feat_importance = pd.read_csv(args.src, sep='\t', index_col=0, header=None)
  features, _ = load_features(args.featsrc, zscore=True)
  labels = load_labels(args.labelsrc)
  
  feat_importance.sort_values(1, ascending=False, inplace=True)
  sns.distplot(feat_importance)
  plt.savefig('tile_feature_importance_dist.png', bbox_inches='tight')

  sns.regplot(np.squeeze(feat_importance.index.values), np.squeeze(feat_importance.values))

  feat_importance = feat_importance.iloc[:args.n, :]
  print('highest feature importance:')
  for f in feat_importance.index.values:
    print(f, feat_importance.loc[f].values)
Code Example #16
def train():

    path_to_data   = '../../data/processed/'
    path_to_output = '../../data/submissions/'
    path_to_preds  = '../../data/predictions/'

    version = '1.3'
    random_seed = 8675309
    sample_size = None
    n_folds = 5

    xgb_params = {
        'learning_rate':0.1,
        'n_estimators':10000,
        'max_depth':4,
        'min_child_weight':5,
        'subsample':0.8,
        'colsample_bytree':0.8,
        'objective':'binary:logistic',
        'nthread':8,
        'seed':random_seed,
        'scale_pos_weight':2.5,
        'reg_alpha':1.2,
        'early_stopping_rounds':50,
        'verbose':20,
        'eval_metric':'auc'
    }

    train, labels, test, train_ids, test_ids = utils.load_features(path_to_data, version, sample_size)
    oof_train, oof_test = utils.kfold(classifier_builder=XgboostWrapper,
                                      base_classifier=XGBClassifier,
                                      classifier_params=xgb_params,
                                      train=train,
                                      labels=labels,
                                      test=test,
                                      n_folds=n_folds,
                                      random_seed=random_seed)

    df_oof_train = pd.DataFrame({'SK_ID_CURR':train_ids, 'TARGET':labels, 'xgboost':oof_train})
    df_oof_train.fillna(0, inplace=True)
    df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')

    df_oof_test = pd.DataFrame({'SK_ID_CURR':test_ids, 'TARGET':oof_test})
    df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(path_to_preds + version + '-xgboost.csv', index=False)
    df_oof_test.to_csv(path_to_output + version + '-xgboost.csv', index=False)
Code Example #17
def main():
    dataset = 'test'
    features = utils.load_features(dataset)

    error_table = []  #list of dictionaries

    for component in config.components:
        L = config.num_basis[component]
        D, denom_sq = utils.load_POD_D_and_denom_sq(component)
        eps_pod_sqs = utils.load_error_POD_sq(dataset, component)

        targets = utils.load_targets(dataset, component)
        outputs = {}
        for model_key, model_constructor in utils.models.items():
            model = model_constructor()
            model.load(utils.model_dir, component)
            outputs[model_key] = model.evaluate(features)

        ## for each sample in test set
        for i in range(len(targets)):

            for l in range(L + 1):
                eps_pod_sq = eps_pod_sqs[l, i]
                line = {
                    'component': component,
                    'l': l,
                    'sample': i,
                    'dataset': dataset,
                    'eps_pod_sq': eps_pod_sq
                }

                for model_key in utils.models:
                    q_rb_model = outputs[model_key][i][:l]
                    q_rb_truth = targets[i][:l]
                    D_l = D[:l]
                    eps_reg_sq = np.sum(
                        (D_l * (q_rb_model - q_rb_truth))**2) / denom_sq
                    eps_sq = eps_pod_sq + eps_reg_sq
                    eps_key = F'eps_pod{model_key.lower()}_sq'
                    line[eps_key] = eps_sq

                error_table.append(line)

    df = pd.DataFrame(error_table)
    utils.save_error_table(df, dataset)
Code Example #18
def initialize(yolo, model_name, DB_path):
    print("\n\nInitialization in progress...!\n")
    start = time.time()
    
    # load pre-processed features database
    features, _, _ = load_features(model_name)
    with open(args.classes_path, 'rb') as f:
        #img_input, input_labels = pickle.load(f)
        input_feats, input_labels = pickle.load(f)

    # load pre-trained recognition model
    model, preprocessed, input_shape = load_extractor_model(model_name)
    my_preprocess = lambda x: preprocessed(pad_image(x, input_shape))

    #input_feats = extract_features(img_input, model, my_preprocess)
    sim_cutoff, (bins, cdf_list) = similarity_cutoff(input_feats, features, 0.95)

    print("Done...! It tooks {:.3f} mins\n".format((time.time() - start)/60))

    return (yolo, model, my_preprocess), (input_feats, sim_cutoff, bins, cdf_list, input_labels)
Code Example #19
File: main.py  Project: preetam01/FEATHER
def main(args):
    """
    Characteristic function embedding wrapper.
    :param args: Arguments object parsed up.
    """
    if args.model_type == "FEATHER":
        print("\nFitting a node embedding.\n")
        graph = load_graph(args.graph_input)
        features = load_features(args.feature_input)
        model = FEATHER()
        model.fit(graph, features)
    elif args.model_type == "FEATHER-G":
        print("\nFitting a graph level embedding.\n")
        graphs = load_graphs(args.graphs_input)
        model = FEATHERG()
        model.fit(graphs)
    else:
        quit()
    X = model.get_embedding()
    save_embedding(X, args.output)
Code Example #20
File: nearestNeighbors.py  Project: aiporre/uBAM
        'Chosen features: "%s". Compute %i Nearest Neighbors of %i randomly chosen postures. The Results will be saved in "%s".'
        % (args.feature_type, args.nn_per_query, args.number_of_queries,
           results_fold))
elif 'lstm' in args.feature_type:
    print(
        'Chosen features: "%s". Compute %i Nearest Neighbors of %i randomly chosen sequences. The Results will be saved in "%s".'
        % (args.feature_type, args.nn_per_query, args.number_of_queries,
           results_fold))
else:
    raise ValueError(
        'Chosen Features (%s) are not available. Please choose "fc6", "fc7" or "fc6fc7" for posture features or "lstm" for sequence features.'
        % args.feature_type)

print('Load features...')
feat, frames, coords, vids = load_features(args.feature_type,
                                           cfg.features_path,
                                           uni_videos.tolist())

############################################
# 2. compute NN and plot it
############################################
k = args.nn_per_query  #number of nearest neighbor per query
nr = min(args.number_of_queries, len(feat))  #number of queries
idx = np.random.permutation(len(feat))[:nr]  #choose randomly queries

#plot queries and NN
if 'fc6' in args.feature_type or 'fc7' in args.feature_type:
    n_mean_nn = 100  #we also want to plot the mean over the 100 nearest neighbor of the queries
    print('Compute %i Nearest Neighbor for %i queries' % (k, nr))
    D, I = compute_NN(feat, min(100 * n_mean_nn, feat.shape[0]), idx)
    nr_rows, r, fig_nr = 10, 0, 1
Code Example #21
File: main.py  Project: awoo769/Level_8_Lab
	# Columns that we want to train on
	cols = [ 'ax_l', 'ay_l', 'az_l', 'ax_r', 'ay_r', 'az_r', 'a_res_l', 'a_res_r']
	cols = [ 'ax_l', 'ay_l', 'az_l', 'ax_r', 'ay_r', 'az_r']
	cols = ['ax_diff', 'ay_diff', 'az_diff']
	cols = ['ax_diff', 'ay_diff', 'az_diff','a_res_diff']
	cols = [ 'ax_l', 'ay_l', 'az_l', 'ax_r', 'ay_r', 'az_r', 'ax_diff', 'ay_diff', 'az_diff']

	for event in events:
			for event_type in event_types:
				x = []
				
				for col in cols:
					directory = get_directory(initial_directory=data_folder, columns=col, est_events=True, event=event, event_type=event_type)

					# Load features (after extract data has been run)
					X_dictionary, y_dictionary, groups = load_features(data_folder, directory, est_events=True)

					x.append(X_dictionary)

				X = {}

				for k in X_dictionary.keys():
					concat_list = []

					for idx in x:
						concat_list.append(idx[k])

					X[k] = pd.concat(concat_list, axis=1)
				
				y = y_dictionary
Code Example #22
############################################
# 1. Load sequences and features
############################################
detections = load_table(cfg.detection_file,asDict=False)
det_cohort= np.array(detections['cohort']) # Used for classifier and plots
det_time  = np.array(detections['time'])   # Used for classifier and plots
det_frames= np.array(detections['frames'])
det_videos= np.array(detections['videos'])
uni_videos= np.unique(detections['videos'].values)
#uni_videos= [v for v in uni_videos if '2kmh' in v]
#uni_videos= [v for v in uni_videos if 'H' in v]
uni_videos= np.array([v for v in uni_videos if os.path.isdir(cfg.crops_path+v)])

print('Load features...')
pos_features,pos_frames,pos_coords,pos_videos = load_features('fc6', cfg.features_path,uni_videos.tolist())

############################################
# 2. Posture healthy/impaired assignment
############################################
video_time  =np.array([det_time[det_videos==v][0]   for v in uni_videos]).astype(int)
video_cohort=np.array([det_cohort[det_videos==v][0] for v in uni_videos]).astype(int)
pos_time    =np.concatenate([video_time[uni_videos==v] for v in pos_videos])

healthy, impaired = pos_time==0, pos_time==1
h_pos_feat,  h_pos_videos= pos_features[healthy], pos_videos[healthy]
h_pos_frames,h_pos_coords= pos_frames[healthy], pos_coords[healthy]
i_pos_feat,  i_pos_videos= pos_features[impaired], pos_videos[impaired]
i_pos_frames,i_pos_coords= pos_frames[impaired], pos_coords[impaired]

############################################
Code Example #23
def data_processing(files, seg_per_sent=3, debug=False):
    data_all = []

    # stats
    full_sent_pos, seg_num_doc = [], []
    total_instance = 0
    for fi, file in tqdm(enumerate(files)):
        data = load_features(file)
        # [(name, [(time_elapsed, parsed_words, segments, full_sent_pos), ...])]

        doc_data = []
        for doc in data:
            # doc
            doc_name = doc[0]
            if debug:
                print(doc_name, fi)

            num_sent = len(doc[1])
            seg_nums = 0
            sent_data = []

            for i in range(num_sent):
                # sentence
                sent = doc[1][i]

                segments = []
                for ssent in sent:
                    parsed_words, segments_all = ssent[1], ssent[2]
                    num_seg = len(segments_all)

                    sub_segments = []
                    is_full_sent = False
                    sum_list = [0, 1]

                    if isinstance(segments_all, str):
                        full_sent_key = '000{:03}'.format(len(parsed_words))
                        sub_segments.append([
                            full_sent_key, 0, 0, True, True,
                            [0 for _ in sum_list]
                        ])
                        full_sent_pos.append(0)
                    else:
                        segments_all = list(segments_all.items())

                        probs = [[] for _ in sum_list]
                        max_ed = -1
                        for j in range(num_seg):
                            key, prob = segments_all[j][0], segments_all[j][1]

                            for p in range(len(probs)):
                                probs[p].append(
                                    prob[p][1] +
                                    prob[p][2])  # P(comma) + P(period)
                            _, ed = decode_index(key)
                            max_ed = max(max_ed, ed)

                        full_sent_key = '000{:03d}'.format(max_ed)

                        mode = 0  # if > 0, scaling
                        if mode > 0:
                            probs = [scaling(pr, mode) for pr in probs]

                        # ! filtering RULE !
                        quant_ratio = 0.75  # 0.5 for median
                        thres = [
                            np.quantile(prs, quant_ratio) for prs in probs
                        ]

                        num_sel_seg = 0
                        for j in range(num_seg):
                            key, prob = segments_all[j][0], segments_all[j][1]
                            st, ed = decode_index(key)
                            rouge_score = 0
                            sel_type = 0

                            # segment selection by threshold
                            if probs[0][j] < thres[0] or probs[1][j] < thres[1]:
                                sel_type = 1

                            # optional RULE
                            if parsed_words[st] == 'and' or parsed_words[
                                    ed - 1] == 'and':
                                sel_type = 2

                            if sel_type == 0:
                                num_sel_seg += 1

                            if key == full_sent_key:
                                full_sent_pos.append(j)
                                is_full_sent = True

                            # segment: [key, psum, rouge, is_full_sent, is_sel, probs.]
                            sub_segments.append([
                                key,
                                np.sum([probs[p][j] for p in sum_list]),
                                rouge_score, is_full_sent, sel_type,
                                [probs[p][j] for p in sum_list]
                            ])
                            is_full_sent = False

                        if debug:
                            # raw segments based on XLNet prob. dist.
                            segment_sorted = sorted(sub_segments,
                                                    key=lambda x: x[1],
                                                    reverse=True)
                            print('\n[Full sentence]: ',
                                  ' '.join(parsed_words))
                            print('\n[XLNet segments - sorted by prob. sum]')
                            topn_seg = 10
                            for si, seg in enumerate(segment_sorted):
                                if si < topn_seg or si >= len(
                                        segment_sorted) - topn_seg:
                                    seg_text = gen_segment_text(
                                        seg, parsed_words)
                                    app_txt = 'top-{}'.format(
                                        topn_seg
                                    ) if si < topn_seg else 'bot-{}'.format(
                                        topn_seg)
                                    fi = '-f' if seg[0] == full_sent_key else ''
                                    prob_txt = 'sel:[{}] Sum:{:.3e}, L-C + L-P:{:.3e}, R-C: + R-P:{:.3e}'.format(
                                        seg[4], seg[1], seg[5][0], seg[5][1])
                                    print(
                                        '{:03}/{:03} [{}{}]'.format(
                                            si + 1, len(segment_sorted),
                                            app_txt, fi), prob_txt, seg_text)

                        sub_segments = [
                            sseg for sseg in sub_segments
                            if (sseg[4] == 0) and (not sseg[3])
                        ]
                        sub_segments = sorted(sub_segments,
                                              key=lambda x: x[1],
                                              reverse=True)

                        if debug:
                            # selected segments
                            print(
                                '\n[candidate segments filtered by median - sorted]'
                            )
                            for si, seg in enumerate(sub_segments):
                                seg_text = gen_segment_text(seg, parsed_words)
                                fi = '-f' if seg[3] else ''
                                prob_txt = 'sel:[{}] Sum:{:.3e}, L-C + L-P:{:.3e}, R-C: + R-P:{:.3e}'.format(
                                    seg[4], seg[1], seg[5][0], seg[5][1])
                                print(
                                    '{:03}/{:03}{}'.format(
                                        si + 1, num_sel_seg, fi), prob_txt,
                                    seg_text)

                        # final sub-segments
                        if len(sub_segments) == 0:
                            sub_segments.append([
                                full_sent_key, 0, 0, True, True,
                                [0 for _ in sum_list]
                            ])
                        else:
                            sub_segments = sub_segments[:seg_per_sent]

                        if debug:
                            # selected segments
                            print('\n[final candidate segments]')
                            for si, seg in enumerate(sub_segments):
                                seg_text = gen_segment_text(seg, parsed_words)
                                fi = '-f' if seg[3] else ''
                                prob_txt = 'sel:[{}] Sum:{:.3e}, L-C + L-P:{:.3e}, R-C: + R-P:{:.3e}'.format(
                                    seg[4], seg[1], seg[5][0], seg[5][1])
                                print(
                                    '{:03}/{:03}{}'.format(
                                        si + 1, num_sel_seg, fi), prob_txt,
                                    seg_text)
                            pdb.set_trace()

                        seg_nums += len(sub_segments)

                    # exclude less than 5 words (not chunks)
                    final_sub_segments = []
                    for sseg in sub_segments:
                        seg_text = gen_segment_text(sseg, parsed_words)
                        if len(seg_text.split()) >= 5:
                            final_sub_segments.append(sseg)
                    segments.append((parsed_words, final_sub_segments))

                sent_data.append(segments)

            doc_data.append((doc_name, sent_data))
            seg_num_doc.append(seg_nums)

        data_all = data_all + doc_data
        total_instance += len(data)

    print('data num.: {} {}'.format(total_instance, len(data_all)))

    return data_all, full_sent_pos, seg_num_doc
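decode_index and gen_segment_text are helpers from the same project and are not shown. Judging from how keys like '000{:03d}'.format(max_ed) are built above, a key appears to encode a segment as two zero-padded 3-digit offsets; a minimal sketch consistent with that usage (an assumption, not the confirmed implementation):

def decode_index(key):
    # Key format assumed to be '<start:03d><end:03d>', e.g. '000015' -> (0, 15).
    return int(key[:3]), int(key[3:])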
Code Example #24
        print(input_path + '/{}*'.format(split))
        files = sorted(glob.glob(input_path + '/{}*'.format(split)))
        print('{} files are found'.format(len(files)))

        filename = os.path.join(output_path, split + '.pkl')
        filename_stats = os.path.join(output_path, split + '_stats.pkl')

        if not os.path.exists(filename):
            st_time = time.time()

            # merge data based on filtering rule
            data_all, full_sent_pos, seg_num_doc = data_processing(
                files, args.seg_per_sent, args.debug)

            save_features(filename, data_all)
            save_features(filename_stats, [full_sent_pos, seg_num_doc])

            print('total num. sentences', len(full_sent_pos))
            print('elapsed time: {:.3f}s'.format(time.time() - st_time))
        else:
            data_all = load_features(filename)
            full_sent_pos, seg_num_doc = load_features(filename_stats)
            print('data is loaded from {} and {}'.format(
                filename, filename_stats))

        full_sent_pos_list.append(full_sent_pos)
        print_stats(seg_num_doc, '{}-seg_num_doc'.format(split))

    # draw data stats
    draw_stats(full_sent_pos_list, splits, data_name)
Code Example #25
''' extract feature '''
#print('===> extract features for every videos ...')
#utils.extract_feature_p1(feature_extractor, train_loader, val_loader, args)
''' define loss '''
criterion = nn.CrossEntropyLoss()
''' setup optimizer '''
FC.cuda()
optimizer = torch.optim.Adam(FC.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
sched = lr_scheduler.StepLR(optimizer, step_size=50)
''' setup tensorboard '''
writer = SummaryWriter(os.path.join(args.save_dir, 'train_info'))
''' load train and val features '''
print('===> load train and val features and labels ...')
train_features, train_label, valid_features, valid_labels = utils.load_features(
    args)
''' train model '''
print('===> start training ...')
iters = 0
best_acc = 0
for epoch in range(1, args.epoch + 1):
    FC.train()
    utils.set_requires_grad(FC, True)
    total_length = train_features.shape[0]
    perm_index = torch.randperm(total_length)
    train_X_sfl = train_features[perm_index]
    train_y_sfl = train_label[perm_index]
    # construct training batch
    for index in range(0, total_length, args.train_batch):
        train_info = 'Epoch: [{0}][{1}/{2}]'.format(
            epoch, index + 1, len(train_loader))
Code Example #26
File: test.py  Project: DBone32/Logohunter-API
def test(filename):
    """
    Test function: runs pipeline for a small set of input images and input
    brands.
    """
    yolo = YOLO(**{"model_path": 'keras_yolo3/yolo_weights_logos.h5',
                "anchors_path": 'keras_yolo3/model_data/yolo_anchors.txt',
                "classes_path": 'data_classes.txt',
                "score" : 0.05,
                "gpu_num" : 1,
                "model_image_size" : (416, 416),
                }
               )
    save_img_logo, save_img_match = True, True

    test_dir = os.path.join(os.path.dirname(__file__), os.path.pardir, 'data/test')

    # get Inception/VGG16 model and flavor from filename
    model_name, flavor = model_flavor_from_name(filename)
    ## load pre-processed features database
    features, brand_map, input_shape = load_features(filename)

    ## load inception model
    model, preprocess_input, input_shape = load_extractor_model(model_name, flavor)
    my_preprocess = lambda x: preprocess_input(utils.pad_image(x, input_shape).astype(np.float32))

    ## load sample images of logos to test against
    input_paths = ['test_batman.jpg', 'test_robin.png', 'test_lexus.png', 'test_champions.jpg',
                   'test_duff.jpg', 'test_underarmour.jpg', 'test_golden_state.jpg']
    input_labels = [ s.split('test_')[-1].split('.')[0] for s in input_paths]
    input_paths = [os.path.join(test_dir, 'test_brands/', p) for p in input_paths]

    # compute cosine similarity between input brand images and all LogosInTheWild logos
    ( img_input, feat_input, sim_cutoff, (bins, cdf_list)
    ) = load_brands_compute_cutoffs(input_paths, (model, my_preprocess), features, sim_threshold, timing=True)

    images = [ p for p in os.listdir(os.path.join(test_dir, 'sample_in/')) if p.endswith('.jpg')]
    images_path = [ os.path.join(test_dir, 'sample_in/',p) for p in images]

    start = timer()
    times_list = []
    img_size_list = []
    candidate_len_list = []
    for i, img_path in enumerate(images_path):
        outtxt = img_path

        ## find candidate logos in image
        prediction, image = detect_logo(yolo, img_path, save_img = True,
                                          save_img_path = test_dir, postfix='_logo')

        ## match candidate logos to input
        outtxt, times = match_logo(image, prediction, (model, my_preprocess),
                outtxt, (feat_input, sim_cutoff, bins, cdf_list, input_labels),
                save_img = save_img_match, save_img_path=test_dir, timing=True)

        img_size_list.append(np.sqrt(np.prod(image.size)))
        candidate_len_list.append(len(prediction))
        times_list.append(times)

    end = timer()
    print('Processed {} images in {:.1f}sec - {:.1f}FPS'.format(
            len(images_path), end-start, len(images_path)/(end-start)
           ))

    fig, axes = plt.subplots(1,2, figsize=(9,4))
    for iax in range(2):
        for i in range(len(times_list[0])):
            axes[iax].scatter([candidate_len_list, img_size_list][iax], np.array(times_list)[:,i])

        axes[iax].legend(['read img','get box','get features','match','draw','save'])
        axes[iax].set(xlabel=['number of candidates', 'image size'][iax], ylabel='Time [sec]')
    plt.savefig(os.path.join(test_dir, 'timing_test.png'))
Code Example #27
                                         sum_path=test_sum_path,
                                         is_duc=False,
                                         topn_sent=args.topn_sent)

        dest_dir = os.path.join(args.base_path,
                                os.path.dirname(args.TAC_data_path[0]),
                                args.data_type)
        data_name = 'TAC'

    train_ids = None
    test_ids = None

    if args.data_type == 'xlnet':

        train_file = os.path.join(dest_dir, 'train.pkl')
        train_data = load_features(train_file)
        test_file = os.path.join(dest_dir, 'test.pkl')
        test_data = load_features(test_file)

        data_ext = []
        summary, Y = text_train.ref[:train_ids], text_train.Y[:train_ids]
        name, pos = text_train.name[:train_ids], text_train.pos[:train_ids]
        data_ext.append([summary, Y, name, pos])
        summary, Y = text_test.ref[:test_ids], text_test.Y[:test_ids]
        name, pos = text_test.name[:test_ids], text_test.pos[:test_ids]
        data_ext.append([summary, Y, name, pos])
        text_dir = [
            os.path.join(dest_dir, 'train'),
            os.path.join(dest_dir, 'test')
        ]
Code Example #28
features_file = sys.argv[1]
input_file = sys.argv[2]
output_file = sys.argv[3]

target_col = 'SalaryNormalized'
cols2tokenize = [ 'Title', 'FullDescription' ]
cols2binarize = [ 'Loc1', 'Loc2', 'Loc3', 'Loc4', 'Loc5', 'ContractType', 'ContractTime', 'Company', 'Category', 'SourceName' ]
cols2drop = [ 'SalaryRaw' ]

# only some features from these columns
cols2filter = [ 'Title', 'FullDescription', 'FullDescription' ]

###

print( "loading features..." )
features_by_col = load_features( features_file )

print( "%s ---> %s" % ( input_file, output_file ) )

i_f = open( input_file )
o_f = open( output_file, 'wb' )

reader = csv.reader( i_f )
headers = next( reader )

target_index = headers.index( target_col )
indexes2tokenize = [ headers.index( x ) for x in cols2tokenize ]
indexes2binarize = [ headers.index( x ) for x in cols2binarize ]
indexes2drop = [ headers.index( x ) for x in cols2drop ]
indexes2filter = [ headers.index( x ) for x in cols2filter ]
Code Example #29
File: train_model.py  Project: tkino15/kaggle_malware
def train_model(config, _debug, logger, start_dt, train_and_predict):
    """
    train model with features. model and features are designated in config
    """
    features = config['features']
    label_name = config['label_name']
    id_name = config['id_name']

    # load only train features and label
    x_train_all = load_features(features, _debug, target='train')
    y_train_all = load_target(label_name, _debug)

    gc.collect()

    logger.debug('x_train_all:{0}'.format(x_train_all.shape))
    logger.debug('y_train_all:{0}'.format(y_train_all.shape))

    # save feature names and index
    feature_names = x_train_all.columns.tolist()
    x_train_idx = x_train_all.index

    # convert from df to matrix
    x_train_all = df_to_matrix(x_train_all)

    # load model params
    params = config['params']
    seed = config['seed']
    model_name = config['model_name']

    # generate stratified k-fold instance
    n_splits = config['n_splits']
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # to store results
    y_te_prs = np.zeros(len(y_train_all))
    scores_tr, scores_te = defaultdict(list), defaultdict(list)
    importances_df = pd.DataFrame()
    trained_models = []

    # cross validation
    for _fold, (tr_idx,
                te_idx) in enumerate(skf.split(x_train_idx, y_train_all)):
        _fold += 1
        logger.debug('------ {0} / {1} fold ------'.format(_fold, n_splits))

        # extract dataset
        x_tr, x_te = x_train_all[tr_idx, :], x_train_all[te_idx, :]
        y_tr, y_te = y_train_all[tr_idx], y_train_all[te_idx]

        logger.debug('x_tr:{0} x_te:{1}'.format(x_tr.shape, x_te.shape))
        logger.debug('y_tr:{0} y_te:{1}'.format(y_tr.shape, y_te.shape))

        # train model
        y_tr_pr, y_te_pr, model = train_and_predict(x_tr, y_tr, x_te, params)

        # save prediction
        y_te_prs[te_idx] += y_te_pr / (n_splits - 1)

        # compute metric
        scores_tr = calc_metrics(scores_tr, y_tr_pr, y_tr)
        scores_te = calc_metrics(scores_te, y_te_pr, y_te)

        logger.debug('[{0}f] train_acc:{1} test_acc:{2}'.format(
            _fold, scores_tr['acc'][-1], scores_te['acc'][-1]))
        logger.debug('[{0}f] train_auc:{1} test_auc:{2}'.format(
            _fold, scores_tr['auc'][-1], scores_te['auc'][-1]))

        # save model
        trained_models.append(model)

        # feature importance
        if hasattr(model, 'feature_importances_'):
            importances_df['{}_fold'.format(
                _fold)] = model.feature_importances_
        elif hasattr(model, 'coef_'):
            importances_df['{}_fold'.format(_fold)] = model.coef_.flatten()

        del x_tr, x_te, y_tr, y_te, y_tr_pr, y_te_pr, model
        gc.collect()

    # mean metrics
    scores_cv_tr = np.mean(pd.DataFrame(scores_tr), axis=0)
    scores_cv_te = np.mean(pd.DataFrame(scores_te), axis=0)

    logger.debug('------ cross validation ------')
    logger.debug('[cv] train_acc:{0}, test_acc:{1}'.format(
        scores_cv_tr['acc'], scores_cv_te['acc']))
    logger.debug('[cv] train_auc:{0}, test_auc:{1}'.format(
        scores_cv_tr['auc'], scores_cv_te['auc']))

    if importances_df.any(axis=None):
        # mean feature importance
        importances_df = pd.DataFrame({
            'feature':
            feature_names,
            'importance':
            np.mean(importances_df, axis=1)
        })

        # save
        file_name = 'importances_{0:%m%d_%H%M%S}_{1:.5f}_{2}'.format(
            start_dt, scores_cv_te['auc'], model_name)
        importances_df.to_csv('../../data/output/{0}.csv'.format(file_name),
                              index=False)

        # plot
        fig = plot_importances(importances_df, file_name)
        fig.savefig(
            '../../figures/feature_importance/{0}.png'.format(file_name))

    # save prediction on te dataset
    train_df = pd.read_pickle('../../data/input/train.pkl')
    if _debug:
        train_df = train_df.iloc[:int(train_df.shape[0] / 100)]

    y_te_prs_df = pd.DataFrame({
        'id': train_df[id_name],
        'pred': y_te_prs,
        'truth': y_train_all
    })
    logger.debug('y_te_prs_df:{0}'.format(y_te_prs_df.shape))

    del train_df
    gc.collect()

    # save prediction on cross-validation test
    y_te_prs_df.to_pickle(
        '../../data/output/val_{0:%m%d_%H%M%S}_{1:.5f}_{2}.pkl'.format(
            start_dt, scores_cv_te['auc'], model_name))

    del y_te_prs_df
    gc.collect()

    # save models
    model_path = '../../models/models_{0:%m%d_%H%M%S}_{1:.5f}_{2}.pkl'.format(
        start_dt, scores_cv_te['auc'], model_name)
    with open(model_path, 'wb') as f:
        pickle.dump(trained_models, f)
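calc_metrics is not shown in this excerpt; a minimal sketch consistent with how it is called above (appending per-fold accuracy and AUC to a defaultdict of lists), assuming binary class probabilities and scikit-learn metrics:

from sklearn.metrics import accuracy_score, roc_auc_score


def calc_metrics(scores, y_pred_proba, y_true):
    # Append this fold's accuracy (probabilities thresholded at 0.5) and ROC AUC.
    scores['acc'].append(accuracy_score(y_true, (y_pred_proba > 0.5).astype(int)))
    scores['auc'].append(roc_auc_score(y_true, y_pred_proba))
    return scores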
Code Example #30
import random
import copy

import numpy as np
# ML
import torch
from torch.utils.tensorboard import SummaryWriter

from models import RecursiveNN_Linear
from rosetta import train_model, test_model
from utils import create_loader, load_features

logdir = "./logs/"

folds = 3

dataset, _ = load_features(split=False, nt=False)


def population_generator(pop, pop_size):
    """Generate a random population of size pop_size."""
    for _ in range(pop_size + 1):
        epochs = np.random.randint(low=1, high=100)
        pop.append({
            "N1": np.random.randint(low=4, high=64),
            "N2": np.random.randint(low=4, high=64),
            "lr": np.random.randint(low=1, high=10) * 1e-4,
            "gamma": np.random.random_sample(),
            "batch_size_train": np.random.randint(low=32, high=512),
            "epochs": epochs,
            "out_features": np.random.randint(low=1, high=15),
            "leaky_relu": bool(random.getrandbits(1)),
Code Example #31
import argparse
import faiss
import time
import utils

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Train and save an empty FAISS index')
    parser.add_argument('index_type',
                        type=str,
                        help='String for index_factory()')
    parser.add_argument('train_data', type=str, help='Path to train data')
    parser.add_argument('index_file', type=str, help='Output Index file')
    args = parser.parse_args()

    x = utils.load_features(args.train_data, 'rmac')[...]
    n, d = x.shape

    index = faiss.index_factory(d, args.index_type)
    train_time = time.time()
    index.train(x)
    train_time = time.time() - train_time
    print('Training Time:', train_time)
    faiss.write_index(index, args.index_file)
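Example #31 trains and saves an index that contains no vectors yet. A minimal sketch of how the saved index might be used downstream (the file name, the database/query arrays, and the choice of k are illustrative assumptions):

import faiss
import numpy as np

# Load the trained (still empty) index written above, then fill and query it.
index = faiss.read_index('trained.index')
xb = np.random.rand(10000, index.d).astype(np.float32)  # database vectors
xq = np.random.rand(5, index.d).astype(np.float32)      # query vectors
index.add(xb)
distances, ids = index.search(xq, 10)                   # 10 nearest neighbours per query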
Code Example #32
    sim_file = '{}_sim*'.format(args.split)
    file_pattern = [y_name_pos_file, imp_file, sim_file, imp_vector_file]
    file_names = ['y_name_pos.pkl', 'imp.pkl', 'sim.pkl', 'imp_vector.pkl']

    for i, pf in enumerate(zip(file_pattern, file_names)):
        pattern, fn = pf
        pattern_ = os.path.join(BERT_base_dir, pattern)
        file_n = 'imp_vector.h5' if args.dataset == 2 and i == 3 else fn
        file_name = os.path.join(BERT_output_dir, file_n)

        files = sorted(glob.glob(pattern_))
        print('found {} files for {}'.format(len(files), pattern_))
        if i == 0:
            Y_data, name_data, pos_data = [], [], []
            for file in files:
                data = load_features(file)
                # 'Y': Y, 'name': name, 'pos': pos
                Y_data = Y_data + data['Y']
                name_data = name_data + data['name']
                pos_data = pos_data + data['pos']
            save_features(file_name, {
                'Y': Y_data,
                'name': name_data,
                'pos': pos_data
            })
        else:
            data_all = []
            for file in files:
                data = load_features(file)
                data_all = data_all + data
            if args.dataset == 2 and i == 3:
Code Example #33
                     default='index_img')
    par.add_argument('--input_image',
                     type=str,
                     dest='input_image',
                     help='input image path to search query',
                     required=True)
    return par


def build_search(images_features, file_index, image_feature):
    image_index = utils.index_features(images_features, dims=4096)
    results = utils.search_index_by_value(image_feature, image_index,
                                          file_index)
    print(results)


if __name__ == "__main__":
    parser = build_parser()
    options = parser.parse_args()
    features_path = options.features_path
    file_mapping = options.file_mapping
    input_image = options.input_image

    model = utils.load_headless_pretrained_model()
    image = utils.load_img(input_image)
    image_feature = model.predict(image).reshape((4096, ))
    print(image_feature.shape)
    images_features, file_index = utils.load_features(features_path,
                                                      file_mapping)
    build_search(images_features, file_index, image_feature)
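This last variant of load_features takes a features file and a separate file-mapping file and returns both the feature matrix and an index-to-path mapping. A minimal sketch consistent with that call, assuming the features are stored as a NumPy array and the mapping as JSON (both storage formats are assumptions):

import json
import numpy as np


def load_features(features_path, file_mapping_path):
    # Hypothetical loader: feature matrix plus {row index: image path} mapping.
    images_features = np.load(features_path)
    with open(file_mapping_path) as f:
        file_index = {int(k): v for k, v in json.load(f).items()}
    return images_features, file_index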