Example #1
def check_train(trainfile, classifier_file):
    """Create an instance of 'Classifier', train it with the dataset in 'trainfile'
    and save the trained model to 'classifier_file' on disk."""
    # Load training data
    texts, labels = dataloader.load(trainfile)
    # Create a Classifier instance and train it on data
    classifier = Classifier()
    classifier.train(texts, labels)
    # Save classifier
    save_classifier(classifier, classifier_file)
    print("Done.\n--------------------------------------\n")
Example #2
def main():
    samples = []
    load(samples)

    space_groups = {}
    for i in samples:
        grp = i.space_group
        if grp in space_groups:
            space_groups[grp] += 1
        else:
            space_groups[grp] = 1
    print(len(space_groups))

    X = []
    y = []
    for i in samples:
        X_temp = i.image_stack
        X.append(X_temp)
        y_temp = i.space_group
        y.append(y_temp)
    X = np.array(X)
    y = np.array(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

    model = keras.Sequential([
        keras.layers.Flatten(input_shape=(3, 512, 512)),
        keras.layers.Dense(1000, activation=tf.nn.relu),
        keras.layers.Dense(231, activation=tf.nn.softmax)
    ])

    model.compile(optimizer='adam', 
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=3)

    test_loss, test_acc = model.evaluate(X_test, y_test)
    print('Test accuracy:', test_acc)
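sparse_categorical_crossentropy expects integer class indices in [0, num_classes); the 231-unit output layer suggests the space-group numbers 1-230 are used directly as labels. If space_group held non-integer symbols instead, they would have to be encoded first; a minimal sketch of that assumed preprocessing (not part of the original script):

import numpy as np

def encode_labels(y):
    # map arbitrary space-group labels to contiguous integer class indices
    classes = sorted(set(y))
    index = {c: i for i, c in enumerate(classes)}
    return np.array([index[c] for c in y]), classes

# y, class_names = encode_labels(y)  # would run before train_test_split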
Example #3
    def __init__(self, params, mode, device):
        assert mode in ['train', 'test', 'valid']
        np.random.seed(params['seed'])
        #self._const = 0  # constrain counter
        self._device = device
        self._maxlen = params['maxlen']
        self._benchmark = params['benchmark']
        self._batchsize = params['batch_size']

        # builds an ad hoc dataset; the number of violated constraints can be tuned
        (X, y) = dataloader.load(self._benchmark, maxlen=self._maxlen)
        self._len, self._nfeatures = X.shape
        indices = self._get_indexes(params, self._len, mode, params['seed'])
        X, y = X[indices], np.reshape(y[indices], (len(indices), 1))
        self._dataset = (Ten(X), Ten(y))
Example #4
def eval(classifier_file, testfile):
    """Loads a classifier model from disk, and evaluate it with test data from
    'testfile' file"""
    try:
        # load classifier
        classifier = load_classifier(classifier_file)
        # load test data
        texts, labels = dataloader.load(testfile)
        # classify input texts with the classifier (let the classifier predict classes for the input texts)
        predictions = classifier.predict(texts)
        # compute accuracy score by comparing the model predictions with the true labels
        acc = accuracy_score(labels, predictions)
        print("\nAccuracy: %.4f\n" % acc)
    except FileNotFoundError:
        print(" Error: cannot find model file '%s'" % classifier_file)
    print("Done.\n--------------------------------------\n")
Example #5
def plot_incidents_per_station(san_diego, years, with_fire_station=False):
    '''
    Plot number of incidents per station over years

    Parameters
    ----------
    san_diego : gpd.GeoDataFrame
        map of San Diego
    years : iterable of int
        years to plot (between 2007 and 2020)
    with_fire_station : bool, optional
        overlay fire station locations on the map

    Returns
    -------
    None.

    '''
    assert isinstance(san_diego, gpd.GeoDataFrame)
    assert hasattr(years, '__iter__')
    assert all(2007<=i<=2020 and isinstance(i, int) for i in years)  
    assert isinstance(with_fire_station, bool)
    
    # information of incidents over years
    incidents_df = dataloader.load(years)
    
    # get dataframe with incidents per stations
    incidents_per_station_df = calculate_incidents_per_station(incidents_df)
    
    # merge incidents_per_station with san diego map
    years_san_diego = san_diego.merge(incidents_per_station_df, on = 'address_zip', how = 'inner')
    
    # plot the heatmap
    if len(years) == 1:
        title = 'Number of incidents per station in %d' % (years[0])
    else:
        title = 'Number of incidents per station over %d-%d'%(years[0], years[-1])
    ax = years_san_diego.plot(column = 'incidents_per_station', scheme = 'quantiles', legend=True, cmap = 'OrRd', figsize=(12,20))
    
    # combine with fire station
    if with_fire_station:
        fire_df = pd.read_csv('./data/fire_station_position.csv')
        gdf = gpd.GeoDataFrame(fire_df, geometry=gpd.points_from_xy(fire_df.longitude, fire_df.latitude))
        gdf.plot(ax = ax, color = 'blue')
        
    plt.title(title)
    #plt.savefig('incidents_per_station_%d-%d'%(years[0], years[-1])+'.png')
    plt.show()
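calculate_incidents_per_station is not shown in this example. As a rough illustration only, a groupby-based stand-in with a station table passed in explicitly (hypothetical signature and column names; the original helper may differ):

import numpy as np
import pandas as pd

def calculate_incidents_per_station(incidents_df, stations_df):
    # count incidents and stations per ZIP code, then take their ratio
    incidents = incidents_df.groupby('address_zip').size().rename('num_incidents')
    stations = stations_df.groupby('address_zip').size().rename('num_stations')
    out = pd.concat([incidents, stations], axis=1).fillna(0).reset_index()
    out['incidents_per_station'] = (out['num_incidents'] / out['num_stations']).replace(np.inf, 0)
    return out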
Example #6
def plot_std_incidents(years):
    '''
    Plot how the std of incidents per station changes as fire stations are added

    Parameters
    ----------
    years : iterable of int
        years to load (between 2007 and 2020)

    Returns
    -------
    None.

    '''
    assert hasattr(years, '__iter__')
    assert all(2007<=i<=2020 and isinstance(i, int) for i in years)  
    
    std_list = []
    
    # information of incidents over years
    incidents_df = dataloader.load(years)
    
    # get dataframe with incidents per stations
    incidents_per_station_df = calculate_incidents_per_station(incidents_df)
    
    # remove samples with no stations
    incidents_per_station_df = incidents_per_station_df[incidents_per_station_df['incidents_per_station'] != 0].reset_index(drop=True)
    std_list.append(incidents_per_station_df['incidents_per_station'].std())
    
    for i in range(len(incidents_per_station_df)):
        incidents_per_station_df.loc[i, 'num_stations'] += 1
        incidents_per_station_df['incidents_per_station'] = incidents_per_station_df['num_incidents']/incidents_per_station_df['num_stations']
        incidents_per_station_df.replace(np.inf, 0, inplace=True)
        std_list.append(incidents_per_station_df['incidents_per_station'].std())
        
    title = 'std of number of incidents per station %d-%d'%(years[0],years[-1])
    plt.figure()
    plt.plot(std_list)
    plt.xlabel('number of added fire stations')
    plt.ylabel('std of number of incidents per station')
    plt.title(title)
    #plt.savefig('std_incidents_per_station_%d-%d'%(years[0],years[-1]) + '.png')
    plt.show()
Example #7
def plot_fire_heatmap(san_diego, years, with_fire_station=False):
    '''
    Plot fire stations with number of incidents over years

    Parameters
    ----------
    san_diego : gpd.GeoDataFrame
        map of san diego
    years : iterable of int
        recorded duration

    '''
    
    assert isinstance(san_diego, gpd.GeoDataFrame)
    assert hasattr(years, '__iter__')
    assert all(2007<=i<=2020 and isinstance(i, int) for i in years)  
    assert isinstance(with_fire_station, bool)
    
    # incidents over years
    years_df = dataloader.load(years)
    years_df['address_zip'] = years_df['address_zip'].fillna(0.0).astype(int).astype(str)
    years_incidents = pd.pivot_table(years_df, values='incident_number', index=['address_zip'], columns=[], aggfunc=np.ma.count, fill_value=0)
    years_san_diego = san_diego.merge(years_incidents, on = 'address_zip', how = 'inner')
    years_san_diego['incident_number'] = years_san_diego['incident_number'].fillna(0.0).astype(int)
    
    # plot san diego map with incidents
    if len(years) == 1:
        title = 'Number of incidents in %d'%(years[0])
    else:
        title = 'Incidents over %d-%d'%(years[0],years[-1])
    ax = years_san_diego.plot(column = 'incident_number', scheme = 'quantiles', legend=True, cmap = 'OrRd', figsize=(12,20))
    
    # combine with fire station
    if with_fire_station:
        fire_df = pd.read_csv('./data/fire_station_position.csv')
        gdf = gpd.GeoDataFrame(fire_df, geometry=gpd.points_from_xy(fire_df.longitude, fire_df.latitude))
        gdf.plot(ax = ax, color = 'blue')
    
    plt.title(title)
    plt.axis('off')
    #plt.savefig('Incidents_over_%d-%d'%(years[0],years[-1])+'.png')
    plt.show()    
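The pivot_table call above with aggfunc=np.ma.count is only counting incidents per ZIP code; an equivalent groupby formulation, shown here just for comparison:

years_incidents = (years_df.groupby('address_zip')['incident_number']
                   .count()
                   .reset_index())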
Example #8
def main():
    sc = SparkContext('local[15]', 'haha')
    d = load(sc)
    data_train_lp, data_dev_p, label_dev_gt, test_p = d['train_tfidf_lp'], d[
        'dev_tfidf'], d['dev_gt'], d['test_tfidf']
    data_train_p, label_train_gt = d['train_tfidf'], d['train_gt']
    # print("count =", data_train_lp.take(10))
    sample_train = data_train_lp
    print("sample in total: ", sample_train.count())
    print("___________train______________")
    sys.stdout.flush()
    lg = LogisticRegressionWithSGD.train(sample_train, step=0.005)
    print("___________trained____________")
    sys.stdout.flush()
    # lg.save(sc, 'logistic.model')
    result_dev = lg.predict(data_dev_p).map(int)
    result_train = lg.predict(data_train_p).map(int)

    print("train info:")
    valid(result_train, label_train_gt)
    print("dev info:")
    valid(result_dev, label_dev_gt)
    dump(lg.predict(test_p).map(int).collect())
Example #9
def reset_dir(dirname):
    # remove any existing output directory, then recreate it (assumes shutil is imported)
    try:
        shutil.rmtree(dirname)
    except:
        pass
    os.mkdir(dirname)

def visualize(discretizer, dirname):
    reset_dir(dirname)

    discretizer.visualize("%s/full.png" % dirname)
    for i in range( len(norm_sensor_states[0]) ):
        discretizer.visualize("%s/%d.png" % (dirname, i), (i, i))

if __name__ == '__main__':
    output_dir = 'discretizer_visualizations'
    reset_dir(output_dir)

    sensor_states, behaviors = dataloader.load('test.out')
    norm_sensor_states = list(map(normalization.normalize, sensor_states))  # list so it can be sliced below

    for data_length in [300, 1000, 10000]:
        data = norm_sensor_states[:data_length]

        for i in range(2, 26):
            dirname = '%s/kmeans_%d_%d' % (output_dir, i, data_length)

            discretizer = discretization.KMeansDiscretizer(i)
            discretizer.train(data)

            visualize(discretizer, dirname)

        for i in range(2, 6):
            dirname = '%s/som_%dx%d_%d' % (output_dir, i, i, data_length)
Example #10
from model import dcnn
from time import time

batch_size = 128
num_classes = 299
epochs = 1000
size = (81, 78)

try:
    x_train.shape
    y_train.shape
    x_test.shape
    y_test.shape
except NameError:
    now = time()
    (x_train, y_train), (x_test, y_test) = load(size=size,
                                                _writer_num=num_classes)
    print('data loaded')

print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

model = dcnn(input_shape=(size[0], size[1], 1), num_classes=num_classes)

save_path = 'model/model04052059.h5'
log_path = './logs/train_logs'

earlystopping = EarlyStopping(monitor='val_loss', patience=5, verbose=0)
modelcheckpoint = ModelCheckpoint(save_path,
                                  monitor='val_loss',
                                  save_best_only=True,
Example #11
import modshogun as sg
import dataloader as loader

# load data
feature_matrix = loader.load('mnist.mat')
# create features instance
features = sg.RealFeatures(feature_matrix)

# create Linear Local Tangent Space Alignment converter instance
converter = sg.LinearLocalTangentSpaceAlignment()

# set target dimensionality
converter.set_target_dim(2)
# set number of neighbors 
converter.set_k(10)
# set number of threads
converter.parallel.set_num_threads(2)
# set nullspace shift (optional)
converter.set_nullspace_shift(-1e-6)

# compute embedding with Linear Local Tangent Space Alignment method
embedding = converter.embed(features)
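To inspect the two-dimensional result, the embedded coordinates can be pulled back into NumPy; this assumes the returned RealFeatures object exposes get_feature_matrix() as in Shogun's modular interface (treat the call as an assumption):

import matplotlib.pyplot as plt

coords = embedding.get_feature_matrix()  # expected shape: (target_dim, n_points)
plt.scatter(coords[0], coords[1], s=3)
plt.title('LLTSA embedding')
plt.show()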
Example #12
parser.add_argument("--lamb",
                    type=float,
                    default=0.0,
                    help="Regularization constant.")
parser.add_argument("--cross",
                    action="store_true",
                    help="Do cross validation or not.")
parser.add_argument("--patience",
                    type=int,
                    default=5,
                    help="Patience to stop.")
args = parser.parse_args()

train = {}
test = {}
train["feature"], train["label"] = load(os.path.join(args.data, "a9a"))
test["feature"], test["label"] = load(os.path.join(args.data, "a9a.t"))
# add for w_0
ones = np.ones([train["feature"].shape[0], 1])
train["feature"] = np.append(train["feature"], ones, axis=1)
ones = np.ones([test["feature"].shape[0], 1])
test["feature"] = np.append(test["feature"], ones, axis=1)

if not args.cross:
    w, acc_list, ll_list = irls(train, test, args.lamb, args.patience)
    print("Final step: %d" % len(acc_list))
    print("Final training acc: %f" %
          evaluate(train["feature"], train["label"], w))
    print("Final testing acc: %f" % acc_list[-1])
    print("Final L2 norm of w: %f" % np.linalg.norm(w))
else:
Example #13
def main():
    parser = argparse.ArgumentParser(
        description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-f', '--fp', type=str, help='Fingerprints')
    parser.add_argument('-o',
                        '--output',
                        default='out',
                        type=str,
                        help='Output directory')
    parser.add_argument('-t',
                        '--target',
                        default='raw_density',
                        type=str,
                        help='Fitting target')
    parser.add_argument('-p',
                        '--part',
                        default='',
                        type=str,
                        help='Partition cache file')
    parser.add_argument('-l',
                        '--layer',
                        default='16,16',
                        type=str,
                        help='Size of hidden layers')
    parser.add_argument('--visual',
                        default=1,
                        type=int,
                        help='Visualization data')
    parser.add_argument('--gpu', default=1, type=int, help='Using gpu')
    parser.add_argument('--epoch',
                        default="200",
                        type=str,
                        help='Number of epochs')
    parser.add_argument('--step',
                        default=500,
                        type=int,
                        help='Number of steps trained for each batch')
    parser.add_argument('--batch',
                        default=int(1e9),
                        type=int,
                        help='Batch size')
    parser.add_argument('--lr',
                        default="0.005",
                        type=str,
                        help='Initial learning rate')
    parser.add_argument('--l2', default=0.000, type=float, help='L2 Penalty')
    parser.add_argument(
        '--check',
        default=10,
        type=int,
        help='Number of epochs between convergence checks. Set 0 to disable.')
    parser.add_argument('--minstop',
                        default=0.2,
                        type=float,
                        help='Minimum fraction of step to stop')
    parser.add_argument('--maxconv',
                        default=2,
                        type=int,
                        help='Number of times convergence must be observed before stopping')
    parser.add_argument('--featrm',
                        default='',
                        type=str,
                        help='Remove features')
    parser.add_argument('--optim', default='rms', type=str, help='optimizer')
    parser.add_argument('--continuation',
                        action='store_true',
                        help='continue training')
    parser.add_argument('--plotsize',
                        default=500,
                        type=int,
                        help='plotting size')
    parser.add_argument('--seed',
                        default=233,
                        type=int,
                        help='random seed used to select samples for plotting')

    opt = parser.parse_args()

    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []

    opt_lr = list(map(float, opt.lr.split(',')))
    opt_epochs = list(map(int, opt.epoch.split(',')))

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    logger = logging.getLogger('train')
    logger.setLevel(logging.INFO)
    flog = logging.FileHandler(opt.output + '/log.txt', mode='w')
    flog.setLevel(logging.INFO)
    formatter = logging.Formatter(
        fmt='[%(asctime)s] (%(levelname)s) %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    flog.setFormatter(formatter)
    clog = logging.StreamHandler()
    clog.setFormatter(formatter)
    logger.addHandler(flog)
    logger.addHandler(clog)

    if opt.featrm == 'auto':
        logger.info('Automatically remove features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))
    logger.info('Remove Feature: %s' % featrm)

    logger.info('Reading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning(
            "Partition file not found. Using auto-partition instead.")
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    scaler = preprocessing.Scaler()
    scaler.fit(trainx)
    # scaler.save(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)

    model = fitting.TorchMLPRegressor(len(trainx[0]),
                                      len(trainy[0]),
                                      layers,
                                      batch_size=opt.batch,
                                      batch_step=opt.step,
                                      is_gpu=False,
                                      args_opt={
                                          'optimizer': torch.optim.Adam,
                                          'lr': opt.lr,
                                          'weight_decay': opt.l2
                                      })
    model.load(opt.output + '/model.pt')

    size = 500

    data_smiles = [i.split('\t')[0] for i in validname]

    data_T = [i.split('\t')[1] for i in validname]

    data_P = [i.split('\t')[2] for i in validname]

    predy = model.predict_batch(torch.Tensor(normed_validx))

    error_y = abs(predy - validy).reshape(-1)

    draw(validy[:size], predy[:size], data_smiles[:size], data_T[:size],
         data_P[:size])
Example #14
def main_worker(gpu, args):
    """ 模型训练、测试、转JIT、蒸馏文件制作
    :param gpu: 运行的gpu id
    :param args: 运行超参
    """
    args.gpu = gpu
    utils.generate_logger(
        f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}-{gpu}.log")
    logging.info(f'args: {args}')

    # Reproducibility
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        logging.warning('You have chosen to seed training. '
                        'This will turn on the CUDNN deterministic setting, '
                        'which can slow down your training considerably! '
                        'You may see unexpected behavior when restarting '
                        'from checkpoints.')

    if args.cuda:
        logging.info(f"Use GPU: {args.gpu} ~")
        if args.distributed:
            args.rank = args.rank * args.gpus + gpu
            dist.init_process_group(backend='nccl',
                                    init_method=args.init_method,
                                    world_size=args.world_size,
                                    rank=args.rank)
    else:
        logging.info(f"Use CPU ~")

    # Create/load the model; when using a pretrained model, download it beforehand into the 'pretrained' folder, named after the network
    logging.info(f"=> creating model '{args.arch}'")
    model = my_models.get_model(args.arch,
                                args.pretrained,
                                num_classes=args.num_classes)

    # Reload a previously trained checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            logging.info(f"=> loading checkpoint '{args.resume}'")
            checkpoint = torch.load(args.resume,
                                    map_location=torch.device('cpu'))
            acc = model.load_state_dict(checkpoint['state_dict'])
            logging.info(f'missing keys of models: {acc.missing_keys}')
            del checkpoint
        else:
            raise Exception(
                f"No checkpoint found at '{args.resume}' to be resumed")

    # Model information
    image_height, image_width = args.image_size
    logging.info(
        f'Model {args.arch} input size: ({image_height}, {image_width})')
    utils.summary(size=(image_height, image_width), channel=3, model=model)

    # Model conversion: convert to torch.jit.script
    if args.jit:
        if not args.resume:
            raise Exception('Option --resume must be specified!')
        applications.convert_to_jit(model, args=args)
        return

    if args.criterion == 'rank':
        criterion = criterions.RankingLoss(args=args)  # pairwise ranking loss
    elif args.criterion == 'emd':
        criterion = criterions.EMDLoss()  # earth mover's distance loss
    elif args.criterion == 'regress':
        criterion = criterions.RegressionLoss()  # MSE regression loss
    else:
        raise NotImplementedError(
            f'Unknown loss function {args.criterion}, only (rank, emd, regress) are supported!')

    if args.cuda:
        if args.distributed and args.sync_bn:
            model = apex.parallel.convert_syncbn_model(model)
        torch.cuda.set_device(args.gpu)
        model.cuda(args.gpu)
        criterion = criterion.cuda(args.gpu)

    # Optimizer: Adam > SGD > SWA (SGD > Adam)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    # Alternative optimizers worth trying
    # optimizer = torch.optim.SGD(model.parameters(),
    #                             args.lr, momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    # from optim.torchtools.optim import RangerLars, Ralamb, Novograd, LookaheadAdam, Ranger, RAdam, AdamW
    # optimizer = RangerLars(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = Ralamb(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = Novograd(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = LookaheadAdam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = Ranger(model_params, lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # Stochastic Weight Averaging optimizer
    # from optim.swa import SWA
    # optimizer = SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=0.05)

    # Mixed-precision training
    if args.cuda:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        model = DDP(model)
    else:
        model = torch.nn.DataParallel(model)

    if args.train:
        train_loader = dataloader.load(args, 'train')
        val_loader = dataloader.load(args, 'val')
        scheduler = LambdaLR(
            optimizer, lambda epoch: adjust_learning_rate(epoch, args=args))
        applications.train(train_loader, val_loader, model, criterion,
                           optimizer, scheduler, args)
        args.evaluate = True

    if args.evaluate:
        torch.set_flush_denormal(True)
        test_loader = dataloader.load(args, name='test')
        acc, loss, test_results = applications.test(test_loader, model,
                                                    criterion, args)
        logging.info(f'Evaluation: * Acc@1 {acc:.3f} and loss {loss:.3f}.')
        logging.info(f'Evaluation results:')
        for result in test_results:
            logging.info(' '.join([str(r) for r in result]))
        logging.info('Evaluation Over~')
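adjust_learning_rate is not shown here; LambdaLR multiplies the optimizer's base lr by the factor returned for each epoch, so a simple step-decay stand-in could look like this (hypothetical sketch, not the original schedule):

def adjust_learning_rate(epoch, args):
    # multiply the base learning rate by 0.1 every 30 epochs
    return 0.1 ** (epoch // 30)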
Example #15
import pytorch_lightning as pl

import dataloader
from config import *
from net.charrnn import CharRNN
from net.lightning import Lightning

(trl, tel, val), vocab = dataloader.load(FILE_PATH,
                                         DEVICE,
                                         SPLITS,
                                         BATCH_SIZE,
                                         SEQ_LEN,
                                         unique=True)

for MODEL_NAME in ["rnn", "gru", "lstm"]:  #["lstm"]:
    for N_LAYERS in [1, 2, 3]:  #[1, 2, 3, 4]:
        for HIDDEN_SIZE in [32, 64, 128, 256, 512]:
            net = CharRNN(len(vocab), HIDDEN_SIZE, EMBEDDING_DIM, MODEL_NAME,
                          DROPOUT, N_LAYERS, DEVICE)
            lightning = Lightning(net, LR)
            esc = pl.callbacks.EarlyStopping(monitor='val_acc',
                                             min_delta=0.00,
                                             patience=3,
                                             mode="max")
            trainer = pl.Trainer(gpus=int(DEVICE == "cuda"),
                                 precision=PRECISION,
                                 gradient_clip_val=CLIP,
                                 max_epochs=MAX_EPOCHS,
                                 progress_bar_refresh_rate=10,
                                 callbacks=[esc],
                                 benchmark=True,
Example #16
timestamp = datautils.timeStamped()
datautils.createDirectory(outputDirectory)

##############################################
# This creates a logger. You can use this to create an output file
# which logs the steps of your analysis
##############################################
logger = datautils.createLogger(outputDirectory)

######################################################
# Now lets try with tfidf
######################################################

logger.info("\load data\n\n")

data_tuple = dataloader.load(dataDirectory, dataFile)

#The loaded column map
columnMap = dataloader.loadColumnMap(dataDirectory, dataFile)
#the loaded target map
targetMap = dataloader.loadTargetMap(dataDirectory, dataFile)

kmeansplots.elbow(data_tuple[0], outputDirectory, "elbow", 2, 8)
###############Run the kmeans and get back a model "kmeanstfidf"
X = conversions.convertToTfIdf(data_tuple[0])
kmeanstfidf = kmeansTools.doKmeans(X,
                                   clusterCount=10,
                                   maxIterations=10,
                                   init="k-means++",
                                   n_init=2,
                                   precompute_distances='auto',
Example #17
def main():
    sc = SparkContext('local[15]', 'haha')
    # sc._conf.set("spark.python.profile", "true")

    print(sc.getConf().getAll())

    d = load(sc)
    data_train_lp, data_dev_p, label_dev_gt, test_p = d['train_tfidf_lp'], d['dev_tfidf'], d['dev_gt'], d['test_tfidf']
    data_train_p, label_train_gt = d['train_tfidf'], d['train_gt']
    data_train, data_dev, data_test = d['train_raw'], d['dev_raw'], d['test_raw']

    data_train_lp = data_train_lp.sample(False, 0.01)
    
    # print(sum(data_train_lp.first()[0]))
    # print(data_train_lp.zipWithIndex().collect())
    print(data_train_lp.take(2))
    print("___________train_bayes_____________")
    sys.stdout.flush()
    nb = NaiveBayes.train(data_train_lp)
    print("___________trained_bayes___________")
    sys.stdout.flush()
    # nb.save(sc, 'bayes.model')
    bayes_result_dev = nb.predict(data_dev_p).map(int)
    bayes_result_dev.count()
    bayes_result_train = nb.predict(data_train_p).map(int)
    bayes_result_train.count()
    bayes_result_test = nb.predict(test_p).map(int)
    bayes_result_test.count()
    
    print("train info:")
    valid(bayes_result_train, label_train_gt)
    print("dev info:")
    valid(bayes_result_dev, label_dev_gt)

    print("___________train_logistic_____________")
    sys.stdout.flush()
    lg = LogisticRegressionWithSGD.train(data_train_lp, step=0.005)
    print("___________trained_logisitc___________")
    sys.stdout.flush()
    # lg.save(sc, 'logistic.model')
    logistic_result_dev = lg.predict(data_dev_p).map(int)
    logistic_result_train = lg.predict(data_train_p).map(int)
    logistic_result_test = lg.predict(test_p).map(int)

    print("train info:")
    valid(logistic_result_train, label_train_gt)
    print("dev info:")
    valid(logistic_result_dev, label_dev_gt)

    fused_train_p = stack_label([bayes_result_train, logistic_result_train])
    fused_dev_p = stack_label([bayes_result_dev, logistic_result_dev])
    fused_test_p = stack_label([bayes_result_test, logistic_result_test])

    fused_train_lp = label(data_train, fused_train_p)

    print("___________train_GBDT___________")
    sys.stdout.flush()
    gbdt = GradientBoostedTrees.trainClassifier(fused_train_lp, {})
    print('___________trained_GBDT_________')
    sys.stdout.flush()

    fused_result_train = gbdt.predict(fused_train_p)
    fused_result_dev = gbdt.predict(fused_dev_p)
    fused_result_test = gbdt.predict(fused_test_p)

    print("train info:")
    valid(fused_result_train, label_train_gt)
    print("dev info:")
    valid(fused_result_dev, label_dev_gt)

    dump(fused_result_test.map(int).collect())

    sc.show_profiles()
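The helpers used above (valid, stack_label, dump) are not shown. Purely as an illustration, an accuracy check in the spirit of valid might look like this (hypothetical; it assumes the predictions and ground-truth labels are RDDs of equal length and partitioning, as RDD.zip requires):

def valid(predictions, labels):
    # compare an RDD of predicted labels against an RDD of ground-truth labels
    pairs = predictions.zip(labels)
    correct = pairs.filter(lambda p: p[0] == p[1]).count()
    total = pairs.count()
    print("accuracy: %.4f (%d/%d)" % (correct / float(total), correct, total))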
Example #18
    None.

    '''

    assert isinstance(fireDataFrame, pd.DataFrame)

    zip_count = fireDataFrame['address_zip'].value_counts()
    zip_count = zip_count[0:30]
    plt.figure(figsize=(12, 14))
    ax = sns.barplot(y=zip_count.index,
                     x=zip_count.values,
                     alpha=0.8,
                     color='orangered',
                     orient='h',
                     order=zip_count.index)
    ax.xaxis.set_major_formatter(
        matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
    plt.title('Number of fires by zipcode', fontsize=20)
    plt.ylabel('ZIP Code(top 30)', fontsize=18)
    plt.xlabel('Number of fire incidents', fontsize=18)
    ax.text(123600, 1.3, 'Max at 92101(Downtown)', fontsize=20)
    #plt.savefig('pic_zip-num.png')

    plt.show()


if __name__ == '__main__':

    fireDataFrame = dataloader.load(range(2007, 2020))
    fireIncidentPerZipcode(fireDataFrame)
Example #19
from cntk import Trainer, StreamConfiguration, text_format_minibatch_source, learning_rate_schedule, UnitType
from cntk.initializer import glorot_uniform
from cntk.learner import sgd
from cntk.ops import *
from cntk.utils import get_train_eval_criterion, get_train_loss

import dataloader

image_size = 28
input_dim = image_size * image_size
num_output_classes = 10
num_hidden_layers = 2
hidden_layers_dim = 400

dataloader.load()

train_file = "data/MNIST/Train-28x28_cntk_text.txt"

if os.path.isfile(train_file):
    path = train_file
else:
    print("Cannot find data file")

feature_stream_name = 'features'
labels_stream_name = 'labels'

mb_source = text_format_minibatch_source(path, [
    StreamConfiguration(feature_stream_name, input_dim),
    StreamConfiguration(labels_stream_name, num_output_classes)
])
Example #20
from normalization import normalize
from graph import MarkovChainGraph
import matplotlib.pyplot as plt
stateTranslate = {
        '0.0, 0.0, 0.0': 'nothing to see',
        '1.0, 0.0, 0.0': 'object in front',
        '1.0, 1.0, 0.0': 'facing object',
        '0.0, 0.0, 1.0': 'object grabbed',
        '1.0, 0.0, 1.0': 'object grabbed and object in front',
        '1.0, 1.0, 1.0': 'object grabbed and facing object',
        }

if __name__ == '__main__':
    plt.rcParams.update({'figure.autolayout': True})
    filename = sys.argv[1]
    sensorStates, behaviors = dataloader.load(filename)
    discretizer = SimplifyingDiscretizer()
    graph = MarkovChainGraph()
    result = {}
    ss = []
    for sensorState in sensorStates:
        sensorState = normalize(sensorState)
        s = discretizer.discretize(sensorState)
        s = graph.state_to_key(s)
        ss.append(stateTranslate[s])
        if stateTranslate[s] in result:
            result[stateTranslate[s]] += 1
        else:
            result[stateTranslate[s]] = 1

    graph.construct(ss, behaviors)
Example #21
        {
            'Emergency Medical Response': 'EMR',
            'Urgent Medical Response': 'UMR',
            'Non-Emergency Medical Response': 'NEMR'
        },
        inplace=True)
    category = category.sort_values(ascending=True)

    if len(years) == 1:
        title = 'Call Category in %d' % (years[0])
    else:
        title = 'Call Category over %d-%d' % (years[0], years[-1])

    fig, ax = plt.subplots(figsize=(30, 30))
    ax.barh(category.index, category.values, color='orangered')
    ax.xaxis.set_major_formatter(
        matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
    matplotlib.rc('xtick', labelsize=30)
    matplotlib.rc('ytick', labelsize=30)
    plt.title(title)
    #plt.savefig('call_category_%d-%d'%(years[0],years[-1]))
    plt.show()


if __name__ == '__main__':

    years = range(2010, 2011)
    fire_df = dataloader.load(years)

    plot_category_over_years(fire_df, years)
Example #22
from matplotlib import pyplot as plt
from rx import operators
from rx import scheduler
from rx.scheduler.eventloop import AsyncIOThreadSafeScheduler
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", help="config file", required=True)
    args = parser.parse_args()

    args_config = os.path.abspath(args.config)
    cfg.merge_from_file(args_config)
    cfg_dir = os.path.dirname(args_config)

    print("Loading training data...")
    loader = cfg.train_data.loader
    train_dataset = dataloader.load(loader, cfg_dir)
    train_dataset.generate_original = False

    if cfg.valid_data.enabled:
        print("Loading validation data...")
        loader = cfg.valid_data.loader
        valid_dataset = dataloader.load(loader, cfg_dir)
        valid_dataset.normalize_with(train_dataset.input_mean,
                                     train_dataset.input_std,
                                     train_dataset.target_mean,
                                     train_dataset.target_std)
        valid_dataset.generate_original = False
    else:
        valid_dataset = None
    if cfg.test_data.enabled:
        print("Loading testing data...")
Example #23
    'numWsClus': 10,  # number of clusters of services
    'saveTimeInfo': False,  # whether to keep track of the running time
    'saveLog': True,  # whether to save log into file
    'debugMode': False,  # whether to record the debug info
    'parallelMode': True  # whether to leverage multiprocessing for speedup
}

initConfig(para)
#########################################################

startTime = time.clock()  # start timing
logger.info('==============================================')
logger.info('CLUS: [Silic et al., FSE\'2013].')

# load the dataset
dataTensor = dataloader.load(para)
logger.info('Loading data done.')

# run for each density
if para['parallelMode']:  # run on multiple processes
    pool = multiprocessing.Pool()
    for density in para['density']:
        pool.apply_async(evaluator.execute, (dataTensor, density, para))
    pool.close()
    pool.join()
else:  # run on single processes
    for density in para['density']:
        evaluator.execute(dataTensor, density, para)

logger.info(
    time.strftime(
Example #24
def main():
	optparser = OptionParser()

	# options for input and output
	optparser.add_option('-m', '--model', action='store', dest='model_name')

	# options for recomia
	optparser.add_option('-e', '--embed', action='store', dest='key_embed')
	optparser.add_option('-s', '--dataset', action='store', dest='key_dataset')
	optparser.add_option('-p', '--prefix', action='store', dest='prefix')

	# options for training
	optparser.add_option('-r', '--resume', action='store_true', dest='resume', default = False)
	optparser.add_option('-d', '--dim_hidden', action='store', dest='dim_hidden', type='int', default = None)
	optparser.add_option('-b', '--batch_size', action='store', type='int', dest='batch_size', default = 16)
	optparser.add_option('-l', '--learning_rate', action='store', type='float', dest='lrate', default = 0.05)
	optparser.add_option('-c', '--decay_c', action='store', type='float', dest='decay_c', default = 1e-4)

	optparser.add_option('-v', '--valid_freq', action='store', type='int', dest='valid_freq', default=5000)

	opts, args = optparser.parse_args()

	dir_exp = '../'
	fname_dataset = dir_exp + 'dataset/%s.pkl'%(opts.key_dataset)
	
	fname_embed = dir_exp + 'wemb/%s.txt'%(opts.key_embed)
	fname_model = dir_exp + 'model/%s'%(opts.prefix)
	fname_test = dir_exp + 'test/%s_test.pkl'%(opts.prefix)
	fname_prec = dir_exp + 'test/%s_prec.pkl'%(opts.prefix)
	dir_tokid = dir_exp + 'tokid/%s/'%(opts.key_embed)

	dataset = dataloader.load(fname_dataset, dir_tokid)
	wembedder = WordEmbedder.load(fname_embed)

	if not opts.resume:
		Wemb = wembedder.get_Wemb()
	else:
		Wemb = None

	print >> sys.stderr, 'main: [info] start training'
	clf = Classifier()

	res = clf.train(
			dataset = dataset,
			Wemb = Wemb,

			fname_model = fname_model,
			resume = opts.resume,

			model_name = opts.model_name,
			dim_hidden = opts.dim_hidden,
			batch_size = opts.batch_size,
			decay_c = opts.decay_c,
			lrate = opts.lrate,

			validFreq = opts.valid_freq,
			saveFreq = opts.valid_freq,
		)

	test_x, test_y = dataset[2]

	proba = clf.predict_proba(test_x)
	cPickle.dump((test_y, proba), open(fname_test, 'w'))	

	prec = precision_at_n(test_y, proba)
	cPickle.dump(prec, open(fname_prec, 'w'))
Example #25
def anomaly(experiment_name,
            network_model,
            dataset,
            inside_labels,
            unknown_labels,
            with_unknown,
            batch_size=100,
            nb_epochs=200,
            save_weights=True):

    print('#' * 50)
    print('Experiment:', experiment_name)
    print('model:', network_model)
    print('dataset:', dataset)
    print('inside_labels:', str(inside_labels))
    print('unknown_labels:', str(unknown_labels))
    print('batch_size:', batch_size)
    print('nb_epochs:', nb_epochs)
    print('-' * 50)

    inside_labels.sort()
    unknown_labels.sort()

    (X_train_all,
     y_train_all), (X_train, y_train), (X_test, y_test) = dataloader.load(
         dataset, inside_labels, unknown_labels, with_unknown)

    if 'mlp' in network_model:
        X_train_all = X_train_all.reshape(X_train_all.shape[0], -1)
        X_train = X_train.reshape(X_train.shape[0], -1)
        X_test = X_test.reshape(X_test.shape[0], -1)

    input_shape = X_train.shape[1:]
    nb_classes = y_train.shape[1]
    nb_batchs = X_train.shape[0] // batch_size

    model = create_model(network_model, batch_size, input_shape, nb_classes,
                         nb_batchs)

    if network_model.endswith('bayesian') and 'poor' not in network_model:
        mod = X_train.shape[0] % batch_size
        if mod:
            X_train = X_train[:-mod]
            y_train = y_train[:-mod]

    print('Training')
    start_time = time.time()
    for i in tqdm.tqdm(list(range(nb_epochs))):
        model.fit(X_train,
                  y_train,
                  nb_epoch=1,
                  batch_size=batch_size,
                  verbose=0)
    end_time = time.time()

    if save_weights:
        path = '{0}_results/weights/{1}_{2}/'
        wu = 'with' if with_unknown else 'without'
        path = path.format(dataset, network_model, wu)
        os.makedirs(path, exist_ok=True)
        model.save_weights(os.path.join(path, experiment_name + '.h5'),
                           overwrite=True)

    print('Collecting measures of train')
    measures_train, train_acc = get_measures(X_train_all, y_train_all, model,
                                             batch_size, inside_labels)
    print('Collecting measures of test')
    measures_test, test_acc = get_measures(X_test, y_test, model, batch_size,
                                           inside_labels)

    print('Classification')
    clf = uncertainty_classifier(measures_train, inside_labels, unknown_labels)
    measures_test['classifier'] = {l: [] for l in range(10)}
    for l in range(10):
        n = len(measures_test['entropy_std_samples'][l])
        for i in range(n):
            f = [
                # measures_test['variation_ratio'][l][i],
                measures_test['mean_entropy'][l][i],
                measures_test['pred_std_mean'][l][i],
                measures_test['entropy_std_samples'][l][i],
                measures_test['entropy_mean_samples'][l][i],
            ]
            p = clf.predict_proba([f])[0, 1]
            measures_test['classifier'][l].append(p)

    # Anomaly detection
    # by classical prediction entropy
    def anomaly_detection(anomaly_score_dict, metric_name, df):
        threshold = np.logspace(-10.0, 1.0, 1000)
        acc = {}
        for t in threshold:
            tp = 0.0
            tn = 0.0
            for l in anomaly_score_dict:
                if l in unknown_labels:
                    continue

                if l in inside_labels:
                    tp += (np.array(anomaly_score_dict[l]) < t).mean()
                else:
                    tn += (np.array(anomaly_score_dict[l]) >= t).mean()

            tp /= len(inside_labels)
            tn /= (10.0 - len(unknown_labels)) - len(inside_labels)
            bal_acc = (tp + tn) / 2.0
            f1_score = 2.0 * tp / (2.0 + tp - tn)
            acc[t] = [bal_acc, f1_score, tp, tn]

        trues = []
        scores = []
        for l in anomaly_score_dict:
            if l in unknown_labels: continue

            scores += anomaly_score_dict[l]
            if l in inside_labels:
                trues += [0] * len(anomaly_score_dict[l])
            else:
                trues += [1] * len(anomaly_score_dict[l])
        assert len(trues) == len(scores)

        auc = metrics.roc_auc_score(trues, scores)

        sorted_acc = sorted(acc.items(), key=lambda x: x[1][0], reverse=True)
        df.set_value(experiment_name, metric_name + '_bal_acc',
                     sorted_acc[0][1][0])
        bal_acc = sorted_acc[0][1][0]

        sorted_acc = sorted(acc.items(), key=lambda x: x[1][1], reverse=True)
        df.set_value(experiment_name, metric_name + '_f1_score',
                     sorted_acc[0][1][1])
        f1_score = sorted_acc[0][1][1]
        df.set_value(experiment_name, metric_name + '_auc', auc)

        msg = '{0}: (auc, {1:.2f}), (bal_acc, {2:.2f}), (f1_score, {3:.2f})'
        print(msg.format(metric_name, auc, bal_acc, f1_score))

        return df

    print('-' * 50)
    df = pd.DataFrame()
    df.set_value(experiment_name, 'experiment_name', experiment_name)
    df.set_value(experiment_name, 'train_time', end_time - start_time)
    df.set_value(experiment_name, 'dataset', dataset)
    df.set_value(experiment_name, 'test_acc', test_acc)
    df.set_value(experiment_name, 'inside_labels', str(inside_labels))
    df.set_value(experiment_name, 'unknown_labels', str(unknown_labels))
    df.set_value(experiment_name, 'epochs', nb_epochs)
    df = anomaly_detection(measures_test['pred_std_mean'], 'pred_std_', df)
    df = anomaly_detection(measures_test['mean_entropy'], 'entropy_', df)
    df = anomaly_detection(measures_test['entropy_mean_samples'],
                           'entropy_expectation_', df)
    df = anomaly_detection(measures_test['variation_ratio'],
                           'variation_ratio_', df)
    df = anomaly_detection(measures_test['classifier'], 'classifier_', df)

    return df
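DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0; on current pandas the same scalar writes can be expressed with .at, for example:

df.at[experiment_name, 'test_acc'] = test_acc
df.at[experiment_name, metric_name + '_auc'] = auc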
Example #26
def main_worker(gpu, args):
    """
    Model training, testing, JIT conversion and distillation-file creation.
    :param gpu: id of the GPU to run on
    :param args: runtime hyperparameters
    """
    args.gpu = gpu
    utils.generate_logger(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}-{gpu}.log")
    logging.info(f'args: {args}')

    # Reproducibility
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        logging.warning('You have chosen to seed training. '
                        'This will turn on the CUDNN deterministic setting, '
                        'which can slow down your training considerably! '
                        'You may see unexpected behavior when restarting '
                        'from checkpoints.')

    if args.cuda:
        logging.info(f"Use GPU: {args.gpu} ~")
        if args.distributed:
            args.rank = args.rank * args.gpus + gpu
            dist.init_process_group(backend='nccl', init_method=args.init_method,
                                    world_size=args.world_size, rank=args.rank)
    else:
        logging.info(f"Use CPU ~")

    # Create/load the model; when using a pretrained model, download it beforehand into the 'pretrained' folder, named after the network
    logging.info(f"=> creating model '{args.arch}'")
    model = my_models.get_model(args.arch, args.pretrained, num_classes=args.num_classes)

    # Reload a previously trained checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            logging.info(f"=> loading checkpoint '{args.resume}'")
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            acc = model.load_state_dict(checkpoint['state_dict'], strict=True)
            logging.info(f'missing keys of models: {acc.missing_keys}')
            del checkpoint
        else:
            raise Exception(f"No checkpoint found at '{args.resume}' to be resumed")

    # Model information
    image_height, image_width = args.image_size
    logging.info(f'Model {args.arch} input size: ({image_height}, {image_width})')
    utils.summary(size=(image_height, image_width), channel=3, model=model)

    # Model conversion: convert to torch.jit.script
    if args.jit:
        if not args.resume:
            raise Exception('Option --resume must specified!')
        applications.convert_to_jit(model, args=args)
        return

    if args.criterion == 'softmax':
        criterion = criterions.HybridCELoss(args=args)  # hybrid multi-class cross-entropy loss
    elif args.criterion == 'bce':
        criterion = criterions.HybridBCELoss(args=args)  # hybrid multi-label binary cross-entropy loss
    else:
        raise NotImplementedError(f'Unknown loss function {args.criterion}')

    if args.cuda:
        if args.distributed and args.sync_bn:
            model = apex.parallel.convert_syncbn_model(model)
        torch.cuda.set_device(args.gpu)
        model.cuda(args.gpu)
        criterion = criterion.cuda(args.gpu)

    if args.knowledge in ('train', 'test', 'val'):
        torch.set_flush_denormal(True)
        distill_loader = dataloader.load(args, name=args.knowledge)
        applications.distill(distill_loader, model, criterion, args, is_confuse_matrix=True)
        return

    if args.make_curriculum in ('train', 'test', 'val'):
        torch.set_flush_denormal(True)
        curriculum_loader = dataloader.load(args, name=args.make_curriculum)
        applications.make_curriculum(curriculum_loader, model, criterion, args, is_confuse_matrix=True)
        return

    if args.visual_data in ('train', 'test', 'val'):
        torch.set_flush_denormal(True)
        test_loader = dataloader.load(args, name=args.visual_data)
        applications.Visualize.visualize(test_loader, model, args)
        return

    # Optimizer
    opt_set = {
        'sgd': partial(torch.optim.SGD, momentum=args.momentum),
        'adam': torch.optim.Adam, 'adamw': AdamW,
        'radam': RAdam, 'ranger': Ranger, 'lookaheadadam': LookaheadAdam,
        'ralamb': Ralamb, 'rangerlars': RangerLars,
        'novograd': Novograd,
    }
    optimizer = opt_set[args.opt](model.parameters(), lr=args.lr)  # weight decay is applied in train() instead
    # Stochastic Weight Averaging optimizer
    # from optim.swa import SWA
    # optimizer = SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=0.05)

    # Mixed-precision training
    if args.cuda:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    if args.distributed:
        model = apex.parallel.DistributedDataParallel(model)
    else:
        model = torch.nn.DataParallel(model)

    if args.train:
        train_loader = dataloader.load(args, 'train')
        val_loader = dataloader.load(args, 'val')
        scheduler = LambdaLR(optimizer,
                             lambda epoch: adjust_learning_rate(epoch, args=args))
        applications.train(train_loader, val_loader, model, criterion, optimizer, scheduler, args)
        args.evaluate = True

    if args.evaluate:
        torch.set_flush_denormal(True)
        test_loader = dataloader.load(args, name='test')
        acc, loss, paths_targets_preds_probs = applications.test(test_loader, model,
                                                                 criterion, args, is_confuse_matrix=True)
        logging.info(f'Evaluation: * Acc@1 {acc:.3f} and loss {loss:.3f}.')
        logging.info(f'Evaluation Result:\n')
        for path, target, pred, prob in paths_targets_preds_probs:
            logging.info(path + ' ' + str(target) + ' ' + str(pred) + ' ' + ','.join([f'{num:.2f}' for num in prob]))
        logging.info('Evaluation Over~')
Example #27
		'saveTimeInfo': False, # whether to keep track of the running time
		'saveLog': True, # whether to save log into file
		'debugMode': False, # whether to record the debug info
		'parallelMode': True # whether to leverage multiprocessing for speedup
		}

initConfig(para)
#########################################################


startTime = time.clock() # start timing
logger.info('==============================================')
logger.info('ADF: [Wu et al., TSMC\'2013].')

# load the dataset
dataMatrix = dataloader.load(para)
logger.info('Loading data done.')

# run for each density
if para['parallelMode']:  # run on multiple processes
    pool = multiprocessing.Pool()
    for density in para['density']:
        pool.apply_async(evaluator.execute, (dataMatrix, density, para))
    pool.close()
    pool.join()
else:  # run on single processes
    for density in para['density']:
        evaluator.execute(dataMatrix, density, para)

logger.info(time.strftime('All done. Total running time: %d-th day - %Hhour - %Mmin - %Ssec.',
         time.gmtime(time.clock() - startTime)))
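time.clock() was removed in Python 3.8; on Python 3 the same timing can be written with time.perf_counter(), for example:

startTime = time.perf_counter()
# ... work ...
elapsed = time.perf_counter() - startTime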
Example #28
opt = Config().parse()

if not os.path.exists(opt.RESULT):
    os.makedirs(opt.RESULT)

os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.GPU)
print("using gpu {}".format(opt.GPU))
"""
Data preparation.
"""
opt.use_PCA = True
opt.use_SuperPCA = False
print(opt.use_PCA)
print(opt.use_SuperPCA)
assert not (opt.use_PCA and opt.use_SuperPCA)
data, label = load(opt.DATASET)
if opt.use_PCA:
    data, pca = apply_pca(data)

init_tr_labeled_idx, init_tr_unlabeled_idx, te_idx = get_init_indices(
    data, label)

init_trl_data, init_trl_label = get_data(data, label, init_tr_labeled_idx)
init_trunl_data, init_trunl_label = get_data(data, label,
                                             init_tr_unlabeled_idx)
te_data, te_label = get_data(data, label, te_idx)

init_trl_data = np.expand_dims(init_trl_data, axis=4)
init_trl_label = keras.utils.to_categorical(init_trl_label)
Example #29
def anomaly(experiment_name, network_model, dataset,
            inside_labels, unknown_labels, with_unknown,
            batch_size=100, nb_epochs=200, save_weights=True):

    print('#'*50)
    print('Experiment:', experiment_name)
    print('model:', network_model)
    print('dataset:', dataset)
    print('inside_labels:', str(inside_labels))
    print('unknown_labels:', str(unknown_labels))
    print('batch_size:', batch_size)
    print('nb_epochs:', nb_epochs)
    print('-'*50)

    inside_labels.sort()
    unknown_labels.sort()

    (X_train_all, y_train_all), (X_train, y_train), (X_test, y_test) = dataloader.load(dataset,
                                                                                       inside_labels,
                                                                                       unknown_labels,
                                                                                       with_unknown)

    if 'mlp' in network_model:
        X_train_all = X_train_all.reshape(X_train_all.shape[0], -1)
        X_train = X_train.reshape(X_train.shape[0], -1)
        X_test = X_test.reshape(X_test.shape[0], -1)

    input_shape = X_train.shape[1:]
    nb_classes = y_train.shape[1]
    nb_batchs = X_train.shape[0]//batch_size

    model = create_model(network_model, batch_size, input_shape, nb_classes, nb_batchs)

    if network_model.endswith('bayesian') and 'poor' not in network_model:
        mod = X_train.shape[0]%batch_size
        if mod:
            X_train = X_train[:-mod]
            y_train = y_train[:-mod]

    print('Training')
    start_time = time.time()
    for i in tqdm.tqdm(list(range(nb_epochs))):
        model.fit(X_train, y_train, nb_epoch=1, batch_size=batch_size, verbose=0)
    end_time = time.time()

    if save_weights:
        path = '{0}_results/weights/{1}_{2}/'
        wu = 'with' if with_unknown else 'without'
        path = path.format(dataset, network_model, wu)
        os.makedirs(path, exist_ok=True)
        model.save_weights(os.path.join(path, experiment_name+'.h5'), overwrite=True)

    print('Collecting measures of train')
    measures_train, train_acc = get_measures(X_train_all, y_train_all, model, batch_size, inside_labels)
    print('Collecting measures of test')
    measures_test, test_acc = get_measures(X_test, y_test, model, batch_size, inside_labels)

    print('Classification')
    clf = uncertainty_classifier(measures_train, inside_labels, unknown_labels)
    measures_test['classifier'] = {l:[] for l in range(10)}
    for l in range(10):
        n = len(measures_test['entropy_std_samples'][l])
        for i in range(n):
            f = [
                # measures_test['variation_ratio'][l][i],
                measures_test['mean_entropy'][l][i],
                measures_test['pred_std_mean'][l][i],
                measures_test['entropy_std_samples'][l][i],
                measures_test['entropy_mean_samples'][l][i],
            ]
            p = clf.predict_proba([f])[0, 1]
            measures_test['classifier'][l].append(p)

    # Anomaly detection
    # by classical prediction entropy
    def anomaly_detection(anomaly_score_dict, metric_name, df):
        threshold = np.logspace(-10.0, 1.0, 1000)
        acc = {}
        for t in threshold:
            tp = 0.0
            tn = 0.0
            for l in anomaly_score_dict:
                if l in unknown_labels:
                    continue

                if l in inside_labels:
                    tp += (np.array(anomaly_score_dict[l]) < t).mean()
                else:
                    tn += (np.array(anomaly_score_dict[l]) >= t).mean()

            tp /= len(inside_labels)
            tn /= (10.0 - len(unknown_labels)) - len(inside_labels)
            bal_acc = (tp + tn)/2.0
            f1_score = 2.0*tp/(2.0 + tp - tn)
            acc[t] = [bal_acc, f1_score, tp, tn]

        trues = []
        scores = []
        for l in anomaly_score_dict:
            if l in unknown_labels: continue

            scores += anomaly_score_dict[l]
            if l in inside_labels:
                trues += [0]*len(anomaly_score_dict[l])
            else:
                trues += [1]*len(anomaly_score_dict[l])
        assert len(trues) == len(scores)

        auc = metrics.roc_auc_score(trues, scores)

        sorted_acc = sorted(acc.items(), key=lambda x: x[1][0], reverse=True)
        df.set_value(experiment_name, metric_name + '_bal_acc', sorted_acc[0][1][0])
        bal_acc = sorted_acc[0][1][0]

        sorted_acc = sorted(acc.items(), key=lambda x: x[1][1], reverse=True)
        df.set_value(experiment_name, metric_name + '_f1_score', sorted_acc[0][1][1])
        f1_score = sorted_acc[0][1][1]
        df.set_value(experiment_name, metric_name + '_auc', auc)

        msg = '{0}: (auc, {1:.2f}), (bal_acc, {2:.2f}), (f1_score, {3:.2f})'
        print(msg.format(metric_name, auc, bal_acc, f1_score))

        return df

    print('-'*50)
    df = pd.DataFrame()
    df.set_value(experiment_name, 'experiment_name', experiment_name)
    df.set_value(experiment_name, 'train_time', end_time - start_time)
    df.set_value(experiment_name, 'dataset', dataset)
    df.set_value(experiment_name, 'test_acc', test_acc)
    df.set_value(experiment_name, 'inside_labels', str(inside_labels))
    df.set_value(experiment_name, 'unknown_labels', str(unknown_labels))
    df.set_value(experiment_name, 'epochs', nb_epochs)
    df = anomaly_detection(measures_test['pred_std_mean'], 'pred_std_', df)
    df = anomaly_detection(measures_test['mean_entropy'], 'entropy_', df)
    df = anomaly_detection(measures_test['entropy_mean_samples'], 'entropy_expectation_', df)
    df = anomaly_detection(measures_test['variation_ratio'], 'variation_ratio_', df)
    df = anomaly_detection(measures_test['classifier'], 'classifier_', df)

    return df
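For readability, here is a minimal, self-contained sketch of the threshold sweep performed by anomaly_detection above, assuming the snippet's conventions (scores keyed by label, inside labels counted as non-anomalous, higher score meaning more anomalous); the function name and return shape are illustrative, not part of the original script.

import numpy as np
from sklearn import metrics

def sweep_anomaly_thresholds(score_dict, inside_labels, unknown_labels):
    # Drop the held-out "unknown" labels, exactly as the snippet does.
    known = {l: np.asarray(s) for l, s in score_dict.items() if l not in unknown_labels}
    best_bal_acc, best_t = 0.0, None
    for t in np.logspace(-10.0, 1.0, 1000):
        # Fraction of inside-label scores falling below the threshold.
        tpr = np.mean([(known[l] < t).mean() for l in known if l in inside_labels])
        # Fraction of outside-label scores at or above the threshold.
        tnr = np.mean([(known[l] >= t).mean() for l in known if l not in inside_labels])
        bal_acc = (tpr + tnr) / 2.0
        if bal_acc > best_bal_acc:
            best_bal_acc, best_t = bal_acc, t
    # Threshold-free summary: AUC with "outside" as the positive class.
    trues = np.concatenate([np.full(len(known[l]), int(l not in inside_labels)) for l in known])
    scores = np.concatenate([known[l] for l in known])
    return best_bal_acc, best_t, metrics.roc_auc_score(trues, scores)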
Example No. 30
0
        'saveTimeInfo': False,  # whether to keep track of the running time
        'saveLog': True,        # whether to save the log to a file
        'debugMode': False,     # whether to record debug info
        'parallelMode': True    # whether to use multiprocessing for speed-up
        }

initConfig(para)
#########################################################


startTime = time.clock() # start timing
logger.info('==============================================')
logger.info('Approach: [UMEAN, IMEAN, UPCC, IPCC, UIPCC].')

# load the dataset
dataTensor = dataloader.load(para)
logger.info('Loading data done.')

# run for each density
numTimeSlice = dataTensor.shape[2]
if para['parallelMode']: # run on multiple processes
    pool = multiprocessing.Pool()
    for cxtId in xrange(numTimeSlice):
        dataMatrix = dataTensor[:, :, cxtId]
        for density in para['density']:
            pool.apply_async(evaluator.execute, (dataMatrix, density, para, cxtId))
    pool.close()
    pool.join()
else: # run on a single process
    for cxtId in xrange(numTimeSlice):
        dataMatrix = dataTensor[:, :, cxtId]
        for density in para['density']:
            # same work as the parallel branch, run synchronously
            evaluator.execute(dataMatrix, density, para, cxtId)
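As a rough Python 3 rendering of the fan-out above: evaluator.execute and its (matrix, density, para, context-id) argument order come from the snippet, while the helper name, the execute_fn parameter, and the assumption that each worker persists its own output (the snippet discards the AsyncResult) are illustrative.

import multiprocessing

def run_all_slices(data_tensor, para, execute_fn):
    num_slices = data_tensor.shape[2]
    if para['parallelMode']:
        pool = multiprocessing.Pool()
        for cxt_id in range(num_slices):
            data_matrix = data_tensor[:, :, cxt_id]
            for density in para['density']:
                # fire-and-forget: execute_fn is assumed to write its own results
                pool.apply_async(execute_fn, (data_matrix, density, para, cxt_id))
        pool.close()  # no more tasks will be submitted
        pool.join()   # block until every worker has finished
    else:
        for cxt_id in range(num_slices):
            data_matrix = data_tensor[:, :, cxt_id]
            for density in para['density']:
                execute_fn(data_matrix, density, para, cxt_id)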
Example No. 31
0
def main(unused_argv):
    # Load datasets
    # Get signals and labels
    if FLAGS.data_type == 'all':
        trn_data_list = [
            dataloader.load(which_data='s{}'.format(i),
                            gesture=FLAGS.gesture,
                            for_merge=True,
                            train=True) for i in [1, 2, 3]
        ]
        tst_data_list = [
            dataloader.load(which_data='s{}'.format(i),
                            gesture=FLAGS.gesture,
                            for_merge=True,
                            train=False) for i in [1, 2, 3]
        ]

        signals_trn = np.concatenate([data[0] for data in trn_data_list],
                                     axis=0)
        labels_trn = np.concatenate([data[1] for data in trn_data_list],
                                    axis=0)
        signals_tst = np.concatenate([data[0] for data in tst_data_list],
                                     axis=0)
        labels_tst = np.concatenate([data[1] for data in tst_data_list],
                                    axis=0)

    else:
        signals_trn, labels_trn = dataloader.load(which_data=FLAGS.data_type,
                                                  gesture=FLAGS.gesture,
                                                  train=True)
        signals_tst, labels_tst = dataloader.load(which_data=FLAGS.data_type,
                                                  gesture=FLAGS.gesture,
                                                  train=False)

    # Set model params
    model_params = dict(
        learning_rate=FLAGS.learning_rate,
        # regularization type and strength
        wd=FLAGS.wd,
        # convolutional layer
        window_size=5,
        n_conv=1,
        n_filter=100,
        # fully connected layer
        n_fully_connected=1,
        # pooling
        pooling_size=2,
        pooling_stride=1,
        # n_labels
        n_labels=labels_trn.shape[1],
    )  # model0.+n_conv1.+n_filter.+n_fc1.*/eval

    model_id = 'GEST' if FLAGS.gesture is True else 'LOCO'
    model_id += '_%s' % FLAGS.data_type.upper()
    model_id += '_wd{}'.format(FLAGS.wd)
    # model_id += '_n_conv%s' % model_params['n_conv']
    # model_id += '_n_filter%s' % model_params['n_filter']
    # model_id += '_n_fc%s' % model_params['n_fully_connected']

    dt_now = datetime.now().strftime('%Y%m%d_%H%M%S')
    fname = '{}__{}'.format(model_id, dt_now)
    print('-' * 5, model_id, '-' * 5)
    print('-' * 5, dt_now, '-' * 5)

    # Model dir
    model_dir = './tf_models/{}/{}'.format(model_id, dt_now)
    # if FLAGS.restart is True:
    if tf.gfile.Exists(model_dir):
        tf.gfile.DeleteRecursively(model_dir)
    tf.gfile.MakeDirs(model_dir)

    # Instantiate Estimator
    estimator = tf.estimator.Estimator(model_fn=func.model_fn,
                                       params=model_params,
                                       model_dir=model_dir)

    # Input functions
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'signals': signals_trn},
        y=labels_trn,
        batch_size=FLAGS.batch_size,
        num_epochs=None,
        shuffle=True)
    test_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'signals': signals_tst}, y=labels_tst, num_epochs=1, shuffle=False)

    # Train and test ==> record test summary
    # We iterate train and evaluation to save summaries
    if FLAGS.train is True:
        for i in range(FLAGS.steps // FLAGS.log_freq):
            iter_up_to_now = i * FLAGS.log_freq
            print('-' * 10, 'Begin training - iteration', iter_up_to_now,
                  '-' * 10)
            estimator.train(input_fn=train_input_fn, steps=FLAGS.log_freq)

            # Evaluate and save the result
            iter_up_to_now = (i + 1) * FLAGS.log_freq
            ev = estimator.evaluate(
                input_fn=test_input_fn,
                hooks=[_SaveWeightHook(fname, iter_up_to_now)])
            save_the_evaluation(ev, fname, iter_up_to_now)
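The 'all' branch above simply concatenates the per-subject arrays returned by dataloader.load; a small helper makes that explicit. The keyword arguments are taken from the snippet, while the helper name and the subject_ids argument are illustrative, and dataloader is assumed to be importable as in the original script.

import numpy as np

def load_merged(subject_ids, gesture, train):
    # Load each subject separately, then stack signals and labels along axis 0.
    parts = [dataloader.load(which_data='s{}'.format(i),
                             gesture=gesture, for_merge=True, train=train)
             for i in subject_ids]
    signals = np.concatenate([p[0] for p in parts], axis=0)
    labels = np.concatenate([p[1] for p in parts], axis=0)
    return signals, labels

# e.g. signals_trn, labels_trn = load_merged([1, 2, 3], FLAGS.gesture, train=True)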
Example No. 32
0
        'saveTimeInfo': False,  # whether to keep track of the running time
        'saveLog': False,       # whether to save the log to a file
        'debugMode': False,     # whether to record debug info
        'parallelMode': True    # whether to use multiprocessing for speed-up
        }

initConfig(para)
#########################################################


startTime = time.clock() # start timing
logger.info('==============================================')
logger.info('Approach: HMF [He et al., ICWS\'2014].')

# load the dataset
dataMatrix = dataloader.load(para)
logger.info('Loading data done.')

# get the location groups for users as well as for services
locGroup = dataloader.getLocGroup(para)
logger.info('Clustering done.') 

# run for each density
if para['parallelMode']: # run on multiple processes
    pool = multiprocessing.Pool()
    for density in para['density']:
        pool.apply_async(evaluator.execute, (dataMatrix, locGroup, density, para))
    pool.close()
    pool.join()
else: # run on a single process
    for density in para['density']:
        # same work as the parallel branch, run synchronously
        evaluator.execute(dataMatrix, locGroup, density, para)
Example No. 33
0
def main(args=None):

    parser = argparse.ArgumentParser(
        description='Simple training script for training a RetinaNet network.')

    parser.add_argument('--dataset',
                        help='Dataset type, must be one of csv or coco.')
    parser.add_argument('--coco_path', help='Path to COCO directory')
    parser.add_argument(
        '--csv_train',
        help='Path to file containing training annotations (see readme)')
    parser.add_argument('--csv_classes',
                        help='Path to file containing class list (see readme)')
    parser.add_argument(
        '--csv_val',
        help=
        'Path to file containing validation annotations (optional, see readme)'
    )

    parser.add_argument(
        '--depth',
        help='Resnet depth, must be one of 18, 34, 50, 101, 152',
        type=int,
        default=50)
    parser.add_argument('--epochs',
                        help='Number of epochs',
                        type=int,
                        default=100)

    parser = parser.parse_args(args)

    # Create the data loaders
    if parser.dataset == 'coco':

        if parser.coco_path is None:
            raise ValueError('Must provide --coco_path when training on COCO.')

        # dataset_train is still referenced below (num_classes, classes, len),
        # so it has to be built even though batches come from the episodic loader
        dataset_train = CocoDataset(parser.coco_path, set_name='train2017',
                                    transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]))
        #dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose([Normalizer(), Resizer()]))

        dataloader_train = load()

    else:
        raise ValueError(
            'Dataset type not understood (must be csv or coco), exiting.')

    #sampler = AspectRatioBasedSampler(dataset_train, batch_size=2, drop_last=False)
    #dataloader_train = DataLoader(dataset_train, num_workers=3, collate_fn=collater, batch_sampler=sampler)

    #if dataset_val is not None:
    #		sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False)
    #	dataloader_val = DataLoader(dataset_val, num_workers=3, collate_fn=collater, batch_sampler=sampler_val)

    # Create the model
    # returns [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]]
    if parser.depth == 18:
        retinanet = model.resnet18(num_classes=dataset_train.num_classes(),
                                   pretrained=True)
    elif parser.depth == 34:
        retinanet = model.resnet34(num_classes=dataset_train.num_classes(),
                                   pretrained=True)
    elif parser.depth == 50:
        retinanet = model.resnet50(num_classes=dataset_train.num_classes(),
                                   pretrained=True)
    elif parser.depth == 101:
        retinanet = model.resnet101(num_classes=dataset_train.num_classes(),
                                    pretrained=True)
    elif parser.depth == 152:
        retinanet = model.resnet152(num_classes=dataset_train.num_classes(),
                                    pretrained=True)
    else:
        raise ValueError(
            'Unsupported model depth, must be one of 18, 34, 50, 101, 152')

    use_gpu = True

    if use_gpu:
        retinanet = retinanet.cuda()

    retinanet = torch.nn.DataParallel(retinanet).cuda()

    retinanet.training = True

    optimizer = optim.Adam(retinanet.parameters(), lr=1e-5)

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     patience=3,
                                                     verbose=True)

    loss_hist = collections.deque(maxlen=500)

    retinanet.train()
    # Freeze batch norm layers
    retinanet.module.freeze_bn()

    print('Num training images: {}'.format(len(dataset_train)))

    for epoch_num in range(parser.epochs):

        retinanet.train()
        retinanet.module.freeze_bn()

        epoch_loss = []

        # ~ engine in prototypical network
        for iter_num, data_temp in enumerate(
                dataloader_train):  #iterates through the episodes
            try:
                optimizer.zero_grad()

                #print('size of data')
                #print(data.items())
                # data is a dictionary with keys: img, annot and scale
                #print(data['img'])
                #print(data['img'].size())
                #(batch size (2), channels (3), width and height of image) (padded by 0 so every image in the batch has the same dimension)
                #print(data['annot'])
                #print(data['annot'].size())
                #(batch size (2), maximum number of annotations per image in the batch, coordinates + class id (5))
                # annotations are padded by -1 so every image in the batch has the same number of annotations
                #print(data['scale'])
                # vector of size 2 (size of batch) with the scale of the image

                # same for when using anchors: take the mean excluding values of -1

                classes_ids = dataset_train.classes
                relevant_ids = [classes_ids[x] for x in data_temp['class']]

                sample = []
                normalizer = Normalizer()
                resizer = Resizer()

                for i in range(len(data_temp['x'])):
                    for j in range(len(data_temp['x'][0])):
                        idx = data_temp['x'][i][j].item()
                        img = load_image(idx)
                        annots = load_annotations(idx)
                        # only keep annotations for the considered classes
                        annots = annots[np.isin(annots[:, 4], relevant_ids)]
                        temp = {'img': img, 'annot': annots}
                        sample.append(resizer(normalizer(temp)))

                data = collater(sample)
                #print(data)
                # now the data is still a dictionary with keys: img, annot and scale
                #print(data['img'].size())
                #print('test initial image')
                #print(data['img'].sum())
                #(number of images ~ batch size (=(n_support + n_query)*n_way), channels (=3), width, height)
                #print(data['annot'].size())
                #(number of images, max number of annotations in those images, coordinates & class (=5))
                #print(len(data['scale']))
                # list of length number of images, containing the scale for each
                #sys.exit()

                # need to change classification loss, and format of regression loss to accept the new form of the batch

                classification_loss, regression_loss = retinanet(
                    [data['img'].cuda().float(), data['annot']])

                classification_loss = classification_loss.mean()
                regression_loss = regression_loss.mean()

                loss = classification_loss + regression_loss

                if bool(loss == 0):
                    continue

                loss.backward()

                torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1)

                optimizer.step()

                loss_hist.append(float(loss))

                epoch_loss.append(float(loss))

                print(
                    'Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Running loss: {:1.5f}'
                    .format(epoch_num, iter_num, float(classification_loss),
                            float(regression_loss), np.mean(loss_hist)))

                del classification_loss
                del regression_loss
            except Exception as e:
                print(e)
                continue

        # if parser.dataset == 'coco':

        # 	print('Evaluating dataset')

        # 	coco_eval.evaluate_coco(dataset_val, retinanet)

        scheduler.step(np.mean(epoch_loss))

        #torch.save(retinanet.module, '{}_retinanet_{}.pt'.format(parser.dataset, epoch_num))

    retinanet.eval()

    torch.save(retinanet, 'model_final.pt')
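The training loop above mixes episode bookkeeping with optimization; the sketch below isolates just the batch-assembly step, reusing the snippet's own helpers (load_image, load_annotations, Normalizer, Resizer, collater). The function name and the assumption that data_temp['x'] holds tensors of image ids are inferred from context rather than guaranteed.

import numpy as np

def build_episode_batch(data_temp, class_name_to_id):
    # Map the episode's class names to the detector's numeric class ids.
    relevant_ids = [class_name_to_id[c] for c in data_temp['class']]
    normalizer, resizer = Normalizer(), Resizer()
    sample = []
    for row in data_temp['x']:            # (n_support + n_query) rows, n_way ids each
        for idx_tensor in row:
            idx = int(idx_tensor.item())  # image id as a plain int
            img = load_image(idx)
            annots = load_annotations(idx)
            # keep only boxes whose class belongs to this episode
            annots = annots[np.isin(annots[:, 4], relevant_ids)]
            sample.append(resizer(normalizer({'img': img, 'annot': annots})))
    # collater pads images and annotations into one batch dict
    # with keys 'img', 'annot' and 'scale'
    return collater(sample)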