Example #1
# NOTE: Space, Conf, utils, and cwi are project-local modules; the imports
# below are the third-party ones this snippet needs to run.
import logging
import random
from functools import partial

import lasagne
import numpy as np
from hyperopt import fmin, tpe, STATUS_OK

import utils

def main(opts, args):
    random.seed(0)
    rng = np.random.RandomState(1234567)
    lasagne.random.set_rng(rng)
    from cwi import xvalidate
    dset = utils.get_dset()
    objfunc = partial(xvalidate, dset, args['kfold'])

    hpspace = Space(args['layers_max'], opts)
    logging.critical('""" space """')
    logging.critical(hpspace)
    logging.critical('""" space end """\n')

    def objwrapper(spsample):
        conf = Conf(spsample, args)
        logging.critical(conf)
        f1 = objfunc(conf.params)
        loss = 1 - f1
        logging.critical('f1:{:.2f}\n'.format(f1))

        return {'loss': loss, 'status': STATUS_OK}


    best = fmin(objwrapper,
                space=hpspace.space,
                algo=tpe.suggest,
                max_evals=args['evals'])
    """
    {'opt': 0, 'dr2m3': 0.4161315610584588, 'activation': 2, 'n_batch': 1, 'h2m3': 0, 'dr0m3': 0.5461767196698459, 'dr1m3': 0.6720178139274229, 'h3m3': 0, 'dr3m3': 0.419643470550215, 'h1m3': 1, 'dpart': 2, 'lr': 0.0008630799496044787, 'norm': 8.679706724218939}
    """
    logging.critical(best)
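
For reference, the space/objective/fmin pattern used above in isolation: a minimal, self-contained hyperopt sketch. The toy space and objective here are illustrative, not the project's.

from hyperopt import fmin, hp, tpe, STATUS_OK

space = {'lr': hp.loguniform('lr', -8, 0)}  # toy one-parameter space

def objective(sample):
    # hyperopt minimises the returned loss, so 1 - score turns a
    # higher-is-better metric into a minimisation target, as above.
    score = 1.0 / (1.0 + abs(sample['lr'] - 0.01))
    return {'loss': 1 - score, 'status': STATUS_OK}

best = fmin(objective, space=space, algo=tpe.suggest, max_evals=20)
print(best)  # dict mapping parameter labels to the best sampled values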
Example #2

# NOTE: get_run, get_dset, subset_arrays, get_coordinates, get_colormap, and
# single_plot come from the project's own utils module; the imports below are
# the third-party ones this snippet needs.
import os
from functools import partial
from multiprocessing import Pool, cpu_count

import metpy.calc as mpcalc
import numpy as np
import pandas as pd
import xarray as xr

def plot_vars(f_step, projection, load_all=False):
    # Variable name used in the exported figure's file name
    variable_name = 'gph_t_850'
    # Build the name of the output image
    run_string, _ = get_run()

    if load_all:
        f_steps = list(range(0, 79)) + list(range(81, 121, 3))
    else:
        f_steps = [f_step]

    filenames = ['/tmp/' + projection + '_' + variable_name +
                 '_%s_%03d.png' % (run_string, f_step) for f_step in f_steps]
    test_filenames = [os.path.exists(f) for f in filenames]

    if all(test_filenames):  # means the files already exist
        return filenames

    # otherwise do the plots
    dset = get_dset(vars_3d=['t@850', 'fi@500'], f_times=f_steps).squeeze()
    # Add a fictitious 1-D 'step' dimension to avoid special-casing below
    if 'step' not in dset.dims:
        dset = dset.expand_dims('step')
    dset = subset_arrays(dset, projection)
    time = pd.to_datetime(dset.valid_time.values)
    cum_hour = dset.step.values.astype(int)

    temp_850 = dset['t'] - 273.15
    z_500 = dset['z']
    gph_500 = mpcalc.geopotential_to_height(z_500)
    gph_500 = xr.DataArray(gph_500.magnitude, coords=z_500.coords,
                           attrs={'standard_name': 'geopotential height',
                                  'units': gph_500.units})

    levels_temp = np.arange(-30., 30., 1.)
    levels_gph = np.arange(4700., 6000., 70.)

    lon, lat = get_coordinates(temp_850)
    lon2d, lat2d = np.meshgrid(lon, lat)

    cmap = get_colormap('temp')

    args = dict(filenames=filenames, projection=projection, levels_temp=levels_temp,
                cmap=cmap, lon2d=lon2d, lat2d=lat2d, lon=lon, lat=lat, temp_850=temp_850.values,
                gph_500=gph_500.values, levels_gph=levels_gph, time=time, run_string=run_string)

    if load_all:
        single_plot_param = partial(single_plot, **args)
        iterator = range(len(f_steps))
        pool = Pool(cpu_count())
        results = pool.map(single_plot_param, iterator)
        pool.close()
        pool.join()
    else:
        results = single_plot(0, **args)

    return results
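
The load_all branch parallelises one plot per forecast step by binding the shared arguments with partial and mapping over frame indices. A minimal sketch of that Pool + partial pattern, with a hypothetical render() standing in for single_plot:

from functools import partial
from multiprocessing import Pool, cpu_count

def render(i, frames, prefix):
    # hypothetical stand-in for single_plot: worker i handles frames[i]
    return '%s_%03d.png' % (prefix, frames[i])

if __name__ == '__main__':  # guard required by multiprocessing on some platforms
    frames = list(range(0, 12, 3))
    worker = partial(render, frames=frames, prefix='/tmp/demo')
    with Pool(cpu_count()) as pool:
        print(pool.map(worker, range(len(frames))))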
Example #3
# NOTE: get_arg_parser, setup_logger, xvalidate, and utils are project-local;
# the imports below are the third-party ones this snippet needs.
import logging
import random

import lasagne
import numpy as np
from tabulate import tabulate

import utils

def main():
    random.seed(0)
    rng = np.random.RandomState(1234567)
    lasagne.random.set_rng(rng)
    parser = get_arg_parser()
    args = vars(parser.parse_args())

    setup_logger(args)
    dset = utils.get_dset()
    sent_lens = [len(sent['ws']) for sent in dset]
    logging.debug('# of words per sent, min:{} max:{} mean:{:.2f} std:{:.2f}'.format(
        min(sent_lens), max(sent_lens), np.mean(sent_lens), np.std(sent_lens)))

    logging.critical(tabulate([args], headers='keys'))

    xvalidate(dset, 1, args)
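
tabulate([args], headers='keys') renders a dict as a one-row table, which is why it is handy for logging run configurations. Roughly, with illustrative values:

from tabulate import tabulate

args = {'kfold': 5, 'lr': 0.001}
print(tabulate(args, headers='keys'))
#   kfold     lr
# -------  -----
#       5  0.001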
Example #4
# NOTE: xvalidate, Emb, and utils are project-local (imported inside main);
# the imports here are the third-party ones this snippet needs.
import logging
import random

from hyperopt import fmin, hp, tpe, STATUS_OK
from tabulate import tabulate

def main(args):
    logging.critical(tabulate([args], headers='keys', floatfmt='.2f') + '\n')
    random.seed(0)
    defaults = {'embs': args['embs'], 'n_fold': 5, 'e_context': args['e_context'],
                'feats': args['feats'], 'percentile': 20, 'unkt': 2,
                'clf': 'svm', 'kerntype': 'rbf', 'kerngamma': 1, 'kerncoef0': 1,
                'kerndegree': 2, 'cweights': 1}

    assert hasattr(hp, 'loguniform')
    space = {
        # The distribution name and its parameters come from the command line,
        # e.g. ['loguniform', '-20', '8'] builds hp.loguniform('C', -20, 8).
        'kerngamma': getattr(hp, args['kerngamma'][0])('kerngamma', *map(int, args['kerngamma'][1:])),
        'C': getattr(hp, args['C'][0])('C', *map(int, args['C'][1:])),
        'percentile': getattr(hp, args['percentile'][0])('percentile', *map(int, args['percentile'][1:])),
        # Alternatives tried earlier:
        # 'kerngamma': hp.loguniform('kerngamma', -20, 8),
        # 'percentile': hp.quniform('percentile', 5, 30, 1),
        # 'C': hp.loguniform('C', -20, 8),
        # 'C': hp.lognormal('C', -4, 1),
        # 'percentile': hp.normal('percentile', 20, 5),
        # 'kerngamma': hp.uniform('kerngamma', .00001, .1),
    }

    from cwi import xvalidate, Emb
    import utils
    dset = utils.get_dset(args['data'])
    emb = Emb(dset)


    def objwrapper(spsample):
        conf = defaults.copy()
        conf.update(spsample)
        logging.critical(tabulate([conf], headers='keys', floatfmt='.2e'))
        f1, f1std = xvalidate(dset, conf, emb)
        loss = 1-f1
        logging.critical('f1:{:.2f} f1std:{:.2f}\n'.format(f1, f1std))

        return {'loss': loss, 'status': STATUS_OK}


    best = fmin(objwrapper,
                space=space,
                algo=tpe.suggest,
                max_evals=args['evals'])
    logging.critical(best)
    defaults.update(best)
    f1, f1std = xvalidate(dset, defaults, emb)
    logging.critical('f1:{:.2f} f1std:{:.2f}\n'.format(f1, f1std))
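
The getattr(hp, ...) idiom above builds a hyperopt distribution whose name and parameters come from the command line. A minimal sketch with an illustrative spec value:

from hyperopt import hp

spec = ['loguniform', '-20', '8']  # e.g. a parsed command-line argument
dist = getattr(hp, spec[0])('C', *map(int, spec[1:]))
# equivalent to hp.loguniform('C', -20, 8)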
Example #5
# NOTE: C_values, opt, setup_logger, utils, and cwi are defined elsewhere in
# the project and are not shown in this snippet.
import logging
from tabulate import tabulate

def c_opt(dset, emb, dargs):
    logging.critical(tabulate([dargs], headers='keys'))
    f1s = []
    for C in C_values:
        targs = dargs.copy()
        targs['C'] = C
        f1, f1std = cwi.xvalidate(dset, targs, emb)
        f1s.append((f1, f1std, C))
        logging.critical('{}\t{}\t{}'.format(C, f1, f1std))
    logging.critical('\n')
    return max(f1s)

if __name__ == '__main__':
    setup_logger({'log':'road'})

    dset = utils.get_dset()

    emb = cwi.Emb(dset)

    infolist = []
    infolist.append(opt(dset, emb, ['r50'], 0))

    """
    for ec in range(4):
        infolist.extend(opt(dset,emb,['{}{}'.format(e,dim)],ec) for e,dim in product(['s','g'],[50,100,200]))
        # infolist.extend(opt(dset,emb,ename,ec) for ename in [['s50']])

    for ec in range(4):
        infolist.extend(opt(dset,emb,[e1,e2],ec) for e1,e2 in product(['s50','s100','s200'],['g50','g100','g200']))
    """
Example #6
# NOTE: get_dset, get_test, and get_tagged_vocab are project-local helpers not
# shown in this snippet.
from collections import Counter

def get_vocab(dset):
    return set(w for sent in dset for w in sent['ws'])

def get_contexts(sent, c):
    ws = (['<s>']*c) + sent['ws'] + (['</s>']*c)

    contexts = []
    for i, w in enumerate(sent['ws']):
        wi = i + c
        if sent['ii'][i]:
            contexts.append(' '.join(ws[wi-c:wi] + ['___'] + ws[wi+1:wi+c+1]))
    return contexts
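
A quick check of get_contexts on a made-up sentence dict: with c=1 it emits one window per word whose 'ii' flag is truthy, with the target word replaced by '___'.

sent = {'ws': ['the', 'cat', 'sat'], 'ii': [0, 1, 0]}  # made-up input
print(get_contexts(sent, 1))  # ['the ___ sat']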

if __name__ == '__main__':
    trn = get_dset()
    tst = get_test()
    print(list(map(len, map(get_tagged_vocab, [trn, tst]))))
    print('tagged vocab size trn {} tst {}'.format(*map(len, map(get_tagged_vocab, [trn, tst]))))
    print('all vocab size trn {} tst {}'.format(*map(len, map(get_vocab, [trn, tst]))))

    vtrn, vtst = map(get_tagged_vocab, [trn,tst])
    print('tagged vtst diff: {:.2f}'.format(len(vtst.difference(vtrn)) / len(vtst)))

    vtrn, vtst = map(get_vocab, [trn,tst])
    print('all vtst diff: {:.2f}'.format(len(vtst.difference(vtrn)) / len(vtst)))

    precnt = Counter(w[:j] for sent in trn for w, lbl in zip(sent['ws'],sent['ls']) for j in range(3,5) if lbl==1 and len(w)>j)
    sufcnt = Counter(w[-j:] for sent in trn for w, lbl in zip(sent['ws'],sent['ls']) for j in range(3,5) if lbl==1 and len(w)>j)
    print('most common prefixes:', precnt.most_common(100))
    print('most common suffixes:', sufcnt.most_common(100))
Example #7
# NOTE: this snippet starts mid-function; the header and gold_labels
# initialisation below are reconstructed so it parses, and the function name
# 'evaluate' is a hypothetical stand-in. evaluate_system, utils, Emb,
# fit_predict, xvalidate, get_arg_parser, and setup_logger are project-local.
import logging
import random
from collections import OrderedDict

import numpy as np
from sklearn.metrics import confusion_matrix
from tabulate import tabulate

def evaluate(dset, preds):  # hypothetical name, not shown in the snippet
    gold_labels = []
    pred_labels = []
    for sent, pred in zip(dset,preds):
        gold_labels.extend([sent['ls'][ii] for ii, interested in enumerate(sent['ii']) if interested])
        # pred_labels.extend([pred[ii] for ii, interested in enumerate(sent['ii']) if interested])
        pred_labels.extend(pred)
    logging.debug(tabulate(confusion_matrix(np.array(gold_labels), np.array(pred_labels)), headers=[0,1]))
    p, r, f = evaluate_system.evaluateIdentifier(gold_labels, pred_labels)
    return p, r, f
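
The confusion-matrix logging above in isolation: for binary labels sklearn's confusion_matrix returns a 2x2 array (rows are gold classes, columns predictions), and tabulate prints it with the class headers. Illustrative labels, output roughly as commented:

from sklearn.metrics import confusion_matrix
from tabulate import tabulate

gold = [0, 1, 1, 0, 1]
pred = [0, 1, 0, 0, 1]
print(tabulate(confusion_matrix(gold, pred), headers=[0, 1]))
#   0    1
# ---  ---
#   2    0
#   1    2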

if __name__ == '__main__':
    parser = get_arg_parser()
    args = vars(parser.parse_args())

    setup_logger(args)

    logging.debug(tabulate([OrderedDict(sorted(args.items()))], headers='keys'))

    if args['testf']:
        trn = utils.get_dset(args['data'])
        tst = utils.get_test()
        ytrn, ytst = fit_predict(trn, tst, args, Emb(trn+tst))
        with open(args['testf'], 'w') as out:
            out.write('\n'.join([str(y) for y in ytst]))
    else:
        dset = utils.get_dset(args['data'])
        if args['sample'] > 0:
            random.seed(0)
            dset = random.sample(dset, args['sample'])
        xvalidate(dset, args, Emb(dset))

Example #8
# NOTE: this snippet starts mid-function, inside a loop over C values; the
# enclosing definition (compare c_opt in Example #5) is not shown.
        f1s.append((f1, f1std, C))
        logging.critical('{}\t{}\t{}'.format(C, f1, f1std))
    logging.critical('\n')
    return max(f1s)

import argparse

# setup_logger, utils, and cwi are project-local helpers not shown here.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--log', default='dont')
    parser.add_argument('--feats', default='')
    parser.add_argument('--percentile', default=20, type=int)
    parser.add_argument('--e_context', default=0, type=int)
    parser.add_argument('--fast', default=False, action='store_true')
    args = parser.parse_args()
    setup_logger({'log':args.log})

    dset0 = utils.get_dset('training')
    dset = utils.get_dset('testing_annotated')
    emb = cwi.Emb(dset0+dset)


    dargs = {'embs': ['s50', 'g50'], 'e_context': args.e_context, 'feats': args.feats,
             'percentile': args.percentile, 'n_fold': 5, 'cweights': 1, 'clf': 'svm',
             'kerntype': 'lin', 'C': 1, 'kerngamma': 1, 'kerncoef0': 1, 'kerndegree': 1}
    # logging.critical(tabulate([dargs],headers='keys'))

    if args.fast:
        add_data_sizes = [0,200,400]
        C_powers = [-2,+1]
        seeds = [7,5,9]
    else:
        C_powers = [-20, +8]
        # 0, 200, 400, ..., 6400: geometric growth of the added-data sizes
        add_data_sizes = [0] + [2 ** x * 100 for x in range(1, 7)]
Example #9

# NOTE: get_run, get_dset, subset_arrays, get_coordinates, get_colormap,
# get_projection_cartopy, plot_maxmin_points, annotation_forecast, annotation,
# annotation_run, figsize_x, figsize_y, and options_savefig come from the
# project's own utils module; the imports below are the third-party ones.
import matplotlib.pyplot as plt
import metpy.calc as mpcalc
import numpy as np
import pandas as pd
import xarray as xr
from matplotlib.colors import BoundaryNorm

def plot_var(f_step, projection):
    # NOTE: if we are inside this function, the picture does not exist yet.
    # Variable name used in the exported figure's file name
    variable_name = 'gph_t_850'
    # Build the name of the output image
    run_string, _ = get_run()
    filename = '/tmp/' + projection + '_' + \
        variable_name + '_%s_%03d.png' % (run_string, f_step)

    """In the main function we basically read the files and prepare the variables to be plotted.
  This is not included in utils.py as it can change from case to case."""
    dset = get_dset(vars_3d=['t@850', 'fi@500'], f_times=f_step).squeeze()
    dset = subset_arrays(dset, projection)
    time = pd.to_datetime(dset.valid_time.values)
    cum_hour = dset.step.values.astype(int)

    temp_850 = dset['t'] - 273.15
    z_500 = dset['z']
    gph_500 = mpcalc.geopotential_to_height(z_500)
    gph_500 = xr.DataArray(gph_500.magnitude, coords=z_500.coords,
                           attrs={'standard_name': 'geopotential height',
                                  'units': gph_500.units})

    levels_temp = np.arange(-30., 30., 1.)
    levels_gph = np.arange(4700., 6000., 70.)

    cmap = get_colormap('temp')

    fig = plt.figure(figsize=(figsize_x, figsize_y))

    lon, lat = get_coordinates(temp_850)
    lon2d, lat2d = np.meshgrid(lon, lat)

    ax = get_projection_cartopy(plt, projection, compute_projection=True)

    if projection == 'euratl':
        norm = BoundaryNorm(levels_temp, ncolors=cmap.N)
        cs = ax.pcolormesh(lon2d, lat2d, temp_850, cmap=cmap, norm=norm)
    else:
        cs = ax.contourf(lon2d, lat2d, temp_850, extend='both',
                         cmap=cmap, levels=levels_temp)

    c = ax.contour(lon2d, lat2d, gph_500, levels=levels_gph,
                   colors='white', linewidths=1.)

    labels = ax.clabel(c, c.levels, inline=True, fmt='%4.0f', fontsize=6)

    maxlabels = plot_maxmin_points(ax, lon, lat, gph_500,
                                   'max', 80, symbol='H', color='royalblue', random=True)
    minlabels = plot_maxmin_points(ax, lon, lat, gph_500,
                                   'min', 80, symbol='L', color='coral', random=True)

    an_fc = annotation_forecast(ax, time)
    an_var = annotation(
        ax, 'Geopotential height @500hPa [m] and temperature @850hPa [C]', loc='lower left', fontsize=6)
    an_run = annotation_run(ax, time)

    plt.colorbar(cs, orientation='horizontal',
                 label='Temperature', pad=0.03, fraction=0.04)

    plt.savefig(filename, **options_savefig)
    plt.clf()

    return filename
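
On the euratl branch above, pcolormesh with a BoundaryNorm reproduces contourf's discrete colour binning while drawing the raw grid cells. A minimal, self-contained sketch of that choice, with random data and the Agg backend assumed for headless rendering:

import matplotlib
matplotlib.use('Agg')  # headless backend, as when writing to /tmp
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import BoundaryNorm

data = np.random.uniform(-30, 30, size=(50, 50))
levels = np.arange(-30., 30., 5.)
cmap = plt.cm.RdBu_r
norm = BoundaryNorm(levels, ncolors=cmap.N)

cs = plt.pcolormesh(data, cmap=cmap, norm=norm)
plt.colorbar(cs, orientation='horizontal', label='Temperature',
             pad=0.03, fraction=0.04)
plt.savefig('/tmp/demo.png')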