Example #1
def xgboost_train(sym_pred=False):
    train_explicit, train_implicit, train_tag = load('train')
    dev_explicit, dev_implicit, dev_tag = load('dev')
    if sym_pred:
        train_explicit = train_explicit + train_implicit * symptom_predict(
            train_explicit)
        dev_explicit = dev_explicit + dev_implicit * symptom_predict(
            dev_explicit)
    xg_train = xgb.DMatrix(train_explicit, label=train_tag)
    xg_test = xgb.DMatrix(dev_explicit, label=dev_tag)
    # 1. Train the model
    # setup parameters for xgboost
    param = {}
    # use softmax multi-class classification
    param['objective'] = 'multi:softmax'
    # scale weight of positive examples
    param['eta'] = 0.1
    param['max_depth'] = 10
    param['silent'] = 1
    param['nthread'] = 4
    param['num_class'] = 12

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 100
    bst = xgb.train(param, xg_train, num_round, watchlist)

    pred = bst.predict(xg_test)
    train_pred = bst.predict(xg_train)
    print('dev accuracy=%f' % accuracy_score(dev_tag, pred))
    print('train accuracy=%f' %
          accuracy_score(train_tag, train_pred))
    pickle.dump(bst, open("disease.pickle.dat", "wb"))
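A minimal usage sketch (not part of the original example): it assumes the same project-level load() helper used above and the disease.pickle.dat file written by xgboost_train().

import pickle

import xgboost as xgb

with open("disease.pickle.dat", "rb") as f:
    bst = pickle.load(f)

dev_explicit, _, dev_tag = load('dev')             # project helper (assumed)
dev_pred = bst.predict(xgb.DMatrix(dev_explicit))  # multi:softmax returns class ids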
Example #2
    def main(self):
        """ Tests data processing methods
        """

        try:
            preprocess.setup()
        except:
            print('SETUP failed')
        else:
            print('SETUP succeeded')

        try:
            d = preprocess.load(prefix=PAR.OBSERVATIONS)
            s = preprocess.load(prefix=PAR.SYNTHETICS)
        except:
            print('LOAD failed')
        else:
            print('LOAD succeeded')

        try:
            d = preprocess.process_traces(d)
            s = preprocess.process_traces(s)
        except:
            print('PROCESS_TRACES failed')
        else:
            print('PROCESS_TRACES succeeded')
Example #3
def predict(input_path, output_path, resources_path):
    """
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the BIES format.
    
    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.
    
    N.B. DO NOT HARD CODE PATHS IN HERE. Use resources_path instead, otherwise we will not be able to run the code.

    :param input_path: the path of the input file to predict.
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """

    model = load_model(resources_path + '/model.h5')

    model.summary()

    dictionary = load(resources_path + '/dictionary')
    word2id = dictionary['word2id']
    id2label = dictionary['id2label']

    X_chinese, y, characters, sizes = file2BIES(input_path)

    # Process X
    X_processed = processX(X_chinese, word2id, sentence_size=626)

    y_pred = model.predict(X_processed)

    prediction = []

    arg = np.argmax(y_pred, axis=2)

    for i in range(len(arg)):
        sentence = arg[i]
        labels = []
        num_char = np.count_nonzero(X_processed[i])
        for char in sentence[0:num_char]:
            labels.append(id2label[char])

        prediction.append(labels)

    score(prediction, y, verbose=True)

    # Write prediction file
    filename, extension = os.path.splitext(output_path)
    with open(filename + '_prediction' + extension, "w+") as f:
        for line in prediction:
            f.write(''.join(str(e) for e in line))
            f.write('\n')

    # Write gold file
    with open(output_path, "w+") as f:
        for line in y:
            f.write(''.join(str(e) for e in line))
            f.write('\n')

    pass
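As a side note on the BIES format mentioned in the docstring, each character is tagged B(egin), I(nside), E(nd), or S(ingle) according to its position in a word; a tiny hypothetical illustration:

# Segmentation ["今天", "天气", "好"] -> five characters, three words
chars = ["今", "天", "天", "气", "好"]
labels = ["B", "E", "B", "E", "S"]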
Example #4
 def write_receivers(self):
     unix.cd(self.getpath)
     key = 'use_existing_STATIONS'
     val = '.true.'
     solvertools.setpar(key, val)
     _, h = preprocess.load('traces/obs')
     solvertools.write_receivers(h.nr, h.rx, h.rz)
Example #5
def train_and_test():
    train_explicit, train_implicit, train_tag = load('train')
    dev_explicit, dev_implicit, dev_tag = load('dev')
    train_symptom = np.array(train_implicit != 0, dtype=int)
    dev_symptom = np.array(dev_implicit != 0, dtype=int)

    clf_multilabel = MultiOutputClassifier(
        XGBClassifier(tree_method='gpu_hist',
                      gpu_id=0,
                      eval_metric='logloss',
                      use_label_encoder=False))

    clf_multilabel.fit(train_explicit, train_symptom)
    val_pred = clf_multilabel.predict(dev_explicit)
    print("f1 score", f1_score(dev_symptom, val_pred, average='macro'))
    pickle.dump(clf_multilabel, open("symptom.pickle.dat", "wb"))
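A hedged reload sketch, assuming the same load() helper and the symptom.pickle.dat file produced by train_and_test() above:

import pickle

with open("symptom.pickle.dat", "rb") as f:
    clf_multilabel = pickle.load(f)

dev_explicit, dev_implicit, dev_tag = load('dev')
dev_symptom_pred = clf_multilabel.predict(dev_explicit)  # one 0/1 column per implicit symptom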
Example #6
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    x_text, y = load(FLAGS.positive_data_file, FLAGS.negative_data_file)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, vocab_processor, x_dev, y_dev
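Downstream evaluation (Example #35 near the end of this page) restores the vocabulary with VocabularyProcessor.restore, so a matching save step usually follows; a minimal sketch, assuming FLAGS is already configured and out_dir is a hypothetical checkpoint directory:

import os

out_dir = "runs/checkpoint"                           # assumed output directory
os.makedirs(out_dir, exist_ok=True)
x_train, y_train, vocab_processor, x_dev, y_dev = preprocess()
vocab_processor.save(os.path.join(out_dir, "vocab"))  # restored later via VocabularyProcessor.restore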
Example #7
 def write_receivers(self):
     unix.cd(self.getpath)
     key = 'use_existing_STATIONS'
     val = '.true.'
     setpar(key, val)
     _, h = preprocess.load('traces/obs')
     solvertools.write_receivers(h.nr, h.rx, h.rz)
Example #8
    def main(self):
        unix.rm(PATH.SCRATCH)
        unix.mkdir(PATH.SCRATCH)
        preprocess.setup()


        print('SIMULATION 1 OF 3')
        system.run('solver', 'setup',
                   hosts='all')

        print('SIMULATION 2 OF 3')
        self.prepare_model()
        system.run('solver', 'eval_func',
                   hosts='all',
                   path=PATH.SCRATCH)

        print('SIMULATION 3 OF 3')
        system.run('solver', 'eval_grad',
                   hosts='all',
                   path=PATH.SCRATCH)

        # collect traces
        obs = join(PATH.SOLVER, self.event, 'traces/obs')
        syn = join(PATH.SOLVER, self.event, 'traces/syn')
        adj = join(PATH.SOLVER, self.event, 'traces/adj')

        obs,_ = preprocess.load(obs)
        syn,_ = preprocess.load(syn)
        adj,_ = preprocess.load(adj, suffix='.su.adj')

        # collect model and kernels
        model = solver.load(PATH.MODEL_INIT)
        kernels = solver.load(PATH.SCRATCH+'/'+'kernels'+'/'+self.event, suffix='_kernel')

        # dot product in data space
        keys = obs.keys()
        LHS = DotProductLHS(keys, syn, adj)

        # dot product in model space
        keys = ['rho', 'vp', 'vs'] # model.keys()
        RHS = DotProductRHS(keys, model, kernels)

        print()
        print('LHS:', LHS)
        print('RHS:', RHS)
        print('RELATIVE DIFFERENCE:', (LHS-RHS)/RHS)
        print()
Example #9
def grid_search_train(train, test, subm):
    '''
    Arguments:
    sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
    sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
    cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None,
    sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=()
    '''
    data = preprocess.load()
    for sg, size, window, min_count, hs, neg, iter, sample in product(
        [0], [300], [5], [1], [0], [5], [25], [0.001]):
        # for sg,size,window,min_count,hs,neg,iter,sample in product( [1,0],
        #                                                             [100,300],
        #                                                             [5,10],
        #                                                             [1],
        #                                                             [0,1],
        #                                                             [5,10],
        #                                                             [5,25],
        #                                                             [0.1,0.01,0.001]):
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()), sg, size,
              window, min_count, hs, neg, iter, sample)

        model = gensim.models.word2vec.Word2Vec(data['train_tokens'],
                                                sg=sg,
                                                size=size,
                                                window=window,
                                                min_count=min_count,
                                                hs=hs,
                                                negative=neg,
                                                iter=iter,
                                                sample=sample)
        model_name = 'sg{0}-sz{1}-win{2}-minc{3}-hs{4}-neg{5}-iter{6}-samp{7}'.format(
            sg, size, window, min_count, hs, neg, iter, sample)
        model.save('data/w2v-' + model_name + '.model')

        embedding_vectorizer = TfidfEmbeddingVectorizer(
            model.wv, data['vocabulary'], data['idf'])
        train_embedded = embedding_vectorizer.fit(data['train_tokens'], None)
        train_embedded = embedding_vectorizer.transform(data['train_tokens'])
        test_embedded = embedding_vectorizer.transform(data['test_tokens'])

        label_cols = [
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ]
        preds = np.zeros((len(test), len(label_cols)))
        for i, j in enumerate(label_cols):
            m = LogisticRegression()
            mf = m.fit(train_embedded, train[j])
            preds[:, i] = mf.predict_proba(test_embedded)[:, 1]

        submid = pd.DataFrame({'id': subm["id"]})
        submission = pd.concat(
            [submid, pd.DataFrame(preds, columns=label_cols)], axis=1)
        submission.to_csv(
            'submission/submission-toxicw2v-doctfidf-lr-{}.csv'.format(
                model_name),
            index=False)
Example #10
 def initialize_adjoint_traces(self, path='traces/obs'):
     """ Adjoint traces are initialized by writing zeros for all components.
         Components actually in use during an inversion or migration will be
         overwritten with nonzero values later on.
     """
     _, h = preprocess.load(path)
     zeros = np.zeros((h.nt, h.nr))
     for channel in ['x', 'y', 'z']:
         preprocess.writer(zeros, h, channel=channel, prefix='traces/adj/')
Example #11
 def initialize_adjoint_traces(self, path='traces/obs'):
     """ Adjoint traces are initialized by writing zeros for all components.
         Components actually in use during an inversion or migration will be
         overwritten with nonzero values later on.
     """
     _, h = preprocess.load(path)
     zeros = np.zeros((h.nt, h.nr))
     for channel in ['x', 'y', 'z']:
         preprocess.writer(zeros, h, channel=channel, prefix='traces/adj/')
Example #12
    def main(self):
        unix.rm(PATH.SCRATCH)
        unix.mkdir(PATH.SCRATCH)
        preprocess.setup()

        print('SIMULATION 1 OF 3')
        system.run('solver', 'setup', hosts='all')

        print('SIMULATION 2 OF 3')
        self.prepare_model()
        system.run('solver', 'eval_func', hosts='all', path=PATH.SCRATCH)

        print('SIMULATION 3 OF 3')
        system.run('solver', 'eval_grad', hosts='all', path=PATH.SCRATCH)

        # collect traces
        obs = join(PATH.SOLVER, self.event, 'traces/obs')
        syn = join(PATH.SOLVER, self.event, 'traces/syn')
        adj = join(PATH.SOLVER, self.event, 'traces/adj')

        obs, _ = preprocess.load(obs)
        syn, _ = preprocess.load(syn)
        adj, _ = preprocess.load(adj, suffix='.su.adj')

        # collect model and kernels
        model = solver.load(PATH.MODEL_INIT)
        kernels = solver.load(PATH.SCRATCH + '/' + 'kernels' + '/' +
                              self.event,
                              suffix='_kernel')

        # dot product in data space
        keys = obs.keys()
        LHS = DotProductLHS(keys, syn, adj)

        # dot product in model space
        keys = ['rho', 'vp', 'vs']  # model.keys()
        RHS = DotProductRHS(keys, model, kernels)

        print()
        print('LHS:', LHS)
        print('RHS:', RHS)
        print('RELATIVE DIFFERENCE:', (LHS - RHS) / RHS)
        print()
Example #13
 def initialize_adjoint_traces(self):
     """ Adjoint traces must be initialized by writing zeros for all 
       components. This is because when reading traces at the start of an
       adjoint simulation, SPECFEM3D_GLOBE expects that all components exist.
       Components actually in use during an inversion or migration will
       be overwritten with nonzero values later on.
     """
     _, h = preprocess.load('traces/obs')
     zeros = np.zeros((h.nt, h.nr))
     for channel in ['x', 'y', 'z']:
         preprocess.writer(zeros, h, channel=channel, prefix='traces/adj')
Example #14
def get_data_loader(name, train=True):
    print("use dataset: {}".format(name))
    if name == "MNIST":
        return get_mnist(train)
    elif name == "USPS":
        return get_usps(train)
    elif name == "SVHN":
        return get_svhn(train)
    elif name == "A":
        return load_images('data/office/',
                           'amazon',
                           batch_size=config.batch_size,
                           is_train=train)
    elif name == "W":
        return load_images('data/office/',
                           'webcam',
                           batch_size=config.batch_size,
                           is_train=train)
    elif name == "D":
        return load_images('data/office/',
                           'dslr',
                           batch_size=config.batch_size,
                           is_train=train)
    elif name == "B":
        return load('data/image-clef/b_list.txt',
                    batch_size=config.batch_size,
                    is_train=train)
    elif name == "C":
        return load('data/image-clef/c_list.txt',
                    batch_size=config.batch_size,
                    is_train=train)
    elif name == "I":
        return load('data/image-clef/i_list.txt',
                    batch_size=config.batch_size,
                    is_train=train)
    elif name == "P":
        return load('data/image-clef/p_list.txt',
                    batch_size=config.batch_size,
                    is_train=train)
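A minimal call sketch, assuming the project's config.batch_size and data-loading helpers are importable and that the returned object behaves like a PyTorch DataLoader:

train_loader = get_data_loader("MNIST", train=True)
test_loader = get_data_loader("MNIST", train=False)
images, labels = next(iter(train_loader))   # one batch of MNIST digits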
Example #15
def main_load_and_plot():
    """ Loads the dataset and projects it onto the region """
    from mpl_toolkits.basemap import Basemap
    from matplotlib import pyplot as plt
    import matplotlib.cm as cm
    import time, datetime

    column_indexes, data = load()
    long_index, lat_index = column_indexes.index(
        'location-long'), column_indexes.index('location-lat')
    tmp_index = column_indexes.index('timestamp')

    lats, longs, i = [], [], 0
    tmpstamps = []
    for w in data:
        lon, lat = w[long_index], w[lat_index]
        tmp = w[tmp_index]
        if lon == '' or lat == '':
            i += 1
            continue

        tmp = time.mktime(
            datetime.datetime.strptime(tmp,
                                       "%Y-%m-%d %H:%M:%S.000").timetuple())

        lats.append(float(lat))
        longs.append(float(lon))
        tmpstamps.append(int(tmp))
    print("Skipped " + str(i) + " data point.")

    # projection='ortho', projection='mill'
    m = Basemap(projection='mill',
                llcrnrlon=-10,
                llcrnrlat=2,
                urcrnrlon=70,
                urcrnrlat=70,
                lon_0=30,
                lat_0=35,
                resolution='l')
    x1, y1 = m(longs, lats)
    m.scatter(x1, y1, s=30, c=tmpstamps, marker="o", cmap=cm.cool, alpha=0.7)
    m.drawmapboundary(fill_color='black')  # fill to edge
    m.drawcountries()
    m.fillcontinents(color='white', lake_color='black', zorder=0)
    plt.colorbar()
    plt.show()
Example #16
    def fix_near_field(self, path=''):
        """
        """
        import preprocess
        preprocess.setup()

        name = solver.check_source_names()[solver.getnode]
        fullpath = path + '/' + name
        g = solver.load(fullpath, suffix='_kernel')
        if not PAR.FIXRADIUS:
            return

        mesh = self.getmesh()
        x, z = self.getxz()

        lx = x.max() - x.min()
        lz = z.max() - z.min()
        nn = x.size
        nx = np.around(np.sqrt(nn * lx / lz))
        nz = np.around(np.sqrt(nn * lz / lx))
        dx = lx / nx
        dz = lz / nz

        sigma = 0.5 * PAR.FIXRADIUS * (dx + dz)
        _, h = preprocess.load(solver.getpath + '/' + 'traces/obs')

        # mask sources
        mask = np.exp(-0.5 * ((x - h.sx[0])**2. + (z - h.sy[0])**2.) /
                      sigma**2.)
        for key in solver.parameters:
            weight = np.sum(mask * g[key][0]) / np.sum(mask)
            g[key][0] *= 1. - mask
            g[key][0] += mask * weight

        # mask receivers
        for ir in range(h.nr):
            mask = np.exp(-0.5 * ((x - h.rx[ir])**2. + (z - h.ry[ir])**2.) /
                          sigma**2.)
            for key in solver.parameters:
                weight = np.sum(mask * g[key][0]) / np.sum(mask)
                g[key][0] *= 1. - mask
                g[key][0] += mask * weight

        solver.save(fullpath, g, suffix='_kernel')
Example #17
    def fix_near_field(self, path=''):
        """
        """
        import preprocess
        preprocess.setup()

        name = solver.check_source_names()[solver.getnode]
        fullpath = path +'/'+ name
        g = solver.load(fullpath, suffix='_kernel')
        if not PAR.FIXRADIUS:
            return

        mesh = self.getmesh()
        x,z = self.getxz()

        lx = x.max() - x.min()
        lz = z.max() - z.min()
        nn = x.size
        nx = np.around(np.sqrt(nn*lx/lz))
        nz = np.around(np.sqrt(nn*lz/lx))
        dx = lx/nx
        dz = lz/nz

        sigma = 0.5*PAR.FIXRADIUS*(dx+dz)
        _, h = preprocess.load(solver.getpath +'/'+ 'traces/obs')

        # mask sources
        mask = np.exp(-0.5*((x-h.sx[0])**2.+(z-h.sy[0])**2.)/sigma**2.)
        for key in solver.parameters:
            weight = np.sum(mask*g[key][0])/np.sum(mask)
            g[key][0] *= 1.-mask
            g[key][0] += mask*weight

        # mask receivers
        for ir in range(h.nr):
            mask = np.exp(-0.5*((x-h.rx[ir])**2.+(z-h.ry[ir])**2.)/sigma**2.)
            for key in solver.parameters:
                weight = np.sum(mask*g[key][0])/np.sum(mask)
                g[key][0] *= 1.-mask
                g[key][0] += mask*weight

        solver.save(fullpath, g, suffix='_kernel')
Example #18
def get_paths():
    from preprocess import load
    column_indexes, data = load()
    import time, datetime
    import itertools
    from operator import itemgetter

    long_index, lat_index = column_indexes.index(
        'location-long'), column_indexes.index('location-lat')
    tmp_index = column_indexes.index('timestamp')
    local_tag_index = column_indexes.index('tag-local-identifier')

    groups = itertools.groupby(data, key=lambda x: x[local_tag_index])

    i = 0

    path_by_tag_id = {}

    for k, g in groups:
        path_by_tag_id[k] = []
        #    lats, longs = [], []
        #    tmpstamps = []
        for w in g:
            lon, lat = w[long_index], w[lat_index]
            tmp = w[tmp_index]
            tmp = time.mktime(
                datetime.datetime.strptime(
                    tmp, "%Y-%m-%d %H:%M:%S.000").timetuple())

            if lon == '' or lat == '':
                i += 1
                continue

            path_by_tag_id[k].append((float(lon), float(lat), int(tmp)))

        # Sorting each path time wise
        for k in path_by_tag_id.keys():
            path_by_tag_id[k] = sorted(path_by_tag_id[k], key=itemgetter(2))

    return path_by_tag_id
Example #19
def run_training():
    train = preprocess.load(path=config.PATH,
                            filename=config.FILENAME,
                            col_list=[0, 2, 3, 4, 6, 7, 8, 9],
                            dtypes=config.DTYPES)
    targets = train[config.POSSIBLE_LABELS]
    class_weights = custom_loss.calculating_class_weights(targets.values, 6)
    with open(os.path.join(config.PATH, "class_weights.pkl"), "wb") as f:
        pickle.dump(class_weights, f)

    # extract and load pretrained wordembeddings
    word2vec = word_embeddings.load_pretr_wv()
    # tokenize and pad sequences
    tokenizer_object = Tokenize_Object(config.PATH)
    data = tokenizer_object.tokenize(train)
    # save tokenizer
    tokenizer_object.save_tokenizer()
    # apply embeddings to layer weights
    embedding_layer = word_embeddings.apply_embeddings(word2vec)
    # build model
    m = model.build_model(embedding_layer)
    # apply class weights via custom loss
    m.compile(loss=custom_loss.get_weighted_loss(class_weights),
              optimizer='adam',
              metrics=['accuracy'])

    X_train, X_valid, y_train, y_valid = preprocess.split_train_valid(
        data, train[config.POSSIBLE_LABELS])

    # fitting the model
    print('Training model...')
    r = m.fit(
        X_train,
        y_train,
        epochs=config.EPOCHS,
        batch_size=config.BATCH_SIZE,
        validation_data=(X_valid, y_valid),
        verbose=2,
        callbacks=[config.callback_checkpoint, config.callback_earlystop])
Example #20
def conv_net(x, keep_prob):
    """
    Create a convolutional neural network model
    : x: Placeholder tensor that holds image data.
    : keep_prob: Placeholder tensor that hold dropout keep probability.
    : return: Tensor that represents logits
    """
    # TODO: Apply 1, 2, or 3 Convolution and Max Pool layers
    #    Play around with different number of outputs, kernel size and stride
    # Function Definition from Above:
    #    conv2d_maxpool(x_tensor, conv_num_outputs, conv_ksize, conv_strides, pool_ksize, pool_strides)

    layer = nl.conv2d_maxpool(x, 16, (4, 4), (1, 1), (2, 2), (2, 2))
    layer = tf.nn.dropout(layer, keep_prob=keep_prob)

    # TODO: Apply a Flatten Layer
    # Function Definition from Above:
    #   flatten(x_tensor)
    layer = nl.flatten(layer)

    # TODO: Apply 1, 2, or 3 Fully Connected Layers
    #    Play around with different number of outputs
    # Function Definition from Above:
    #   fully_conn(x_tensor, num_outputs)

    layer = nl.fully_conn(layer, 400)
    layer = tf.nn.dropout(layer, keep_prob)

    # TODO: Apply an Output Layer
    #    Set this to the number of classes
    # Function Definition from Above:
    #   output(x_tensor, num_outputs)

    categories = preprocess.load('data/categories.p')
    res = nl.output(layer, len(categories))

    # TODO: return output
    return res
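A short sketch of wiring conv_net into a TF1-style graph; the 32x32x3 input shape is an assumption (CIFAR-10-like), and nl and preprocess come from the surrounding project:

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3), name='x')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
logits = conv_net(x, keep_prob)   # logits over len(categories) classes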
Example #21
def main():

    ############## PRE PROCESS DATA (only once) #############################
    data = preprocess.load(filename)
    clean_data = preprocess.clean(data)

    ############## READ CLEANED DATA ########################################
    data = pd.read_csv(filename_clean)
    data = data.drop(columns=["Unnamed: 0"])
    print(data.head())

    ############## EXTRACT FEATURES #########################################

    unobtrusive = data

    ## Create dataset including obtrusive features ##

    # removing all redundant columns / keeping those that we want features for
    cols_to_keep = ["id", "time", "mood", "sun", \
        "rain", "max_temp", "total_appuse", "activity", "circumplex.arousal", \
        "circumplex.valence", "weekdaydummy0", "weekdaydummy1", "weekdaydummy2", \
        "weekdaydummy3", "weekdaydummy4", "weekdaydummy5", "weekdaydummy6"]

    data = data[cols_to_keep]

    # creating lagged variables for the following columns (with defined durations)
    columns_to_lag = [
        "mood", "circumplex.arousal", "circumplex.valence", "total_appuse",
        "max_temp"
    ]
    lags = [4, 3, 3, 3, 3]

    for i, col in enumerate(columns_to_lag):
        data = pivot.create_lagged_vars(data, col, lags=lags[i])

    # many rows are unusable so we drop them
    data = data.dropna()

    data.to_csv("with_features.csv")

    ## Creating unobtrusive-only dataset ##

    # removing all redundant columns / keeping those that we want features for
    un_cols_to_keep = ["id", "time", "mood", "sun", \
        "rain", "max_temp", "total_appuse", "activity", "weekdaydummy0", "weekdaydummy1", \
        "weekdaydummy2", "weekdaydummy3", "weekdaydummy4", "weekdaydummy5", "weekdaydummy6"]

    unobtrusive = unobtrusive[un_cols_to_keep]

    # creating lagged variables for the following columns (with defined durations)
    un_columns_to_lag = ["total_appuse", "max_temp"]
    lags = [4, 3]

    for i, col in enumerate(un_columns_to_lag):
        unobtrusive = pivot.create_lagged_vars(unobtrusive, col, lags=lags[i])

    # many rows are unusable so we drop them
    unobtrusive = unobtrusive.dropna()

    unobtrusive.to_csv("unobtrusive_with_features.csv")

    ## Correlations

    features = pd.read_csv('with_features.csv', index_col=0)
    correlations = calculate_pvalues(features)
    correlations.to_csv('correlations.csv')
    correlations = correlations.astype(float)
    correlations = correlations.drop(['time'], axis=1)
    correlations = correlations.drop(['time', 'mood', 'total_appuse_lag2', 'total_appuse_lag3', \
        'max_temp_lag2', 'max_temp_lag3', 'circumplex.arousal_lag2', 'circumplex.arousal_lag3', \
        'circumplex.valence_lag2', 'circumplex.valence_lag3'], axis=0)
    plt.figure()
    sns.heatmap(
        correlations[['mood', 'circumplex.arousal', 'circumplex.valence']],
        vmin=0,
        vmax=1,
        center=0.5,
        linewidth=3)
    plt.show()
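For context, a hypothetical re-implementation of pivot.create_lagged_vars consistent with the lagged column names used above (e.g. total_appuse_lag2); this is an assumption about the helper, not its actual source:

import pandas as pd

def create_lagged_vars(df, col, lags):
    # Add col_lag1 ... col_lagN, shifting within each individual's series.
    # Assumes rows are already ordered by time within each id.
    for lag in range(1, lags + 1):
        df[f"{col}_lag{lag}"] = df.groupby("id")[col].shift(lag)
    return df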
Example #22
import time

import numpy as np
import configparser

from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from preprocess import load

if __name__ == '__main__':
    p = [
        'mozilla4.arff', 'waveform-5000.arff', 'diabetes.arff', 'pc5.arff',
        'pc1.arff'
    ]
    cl = ['naive bayes', 'decision tree', 'KNN', 'MLP', 'LinearSVM']
    for fpath in p:
        print(fpath)
        X, y = load(fpath)
        n_classes = np.arange(np.unique(y).size)
        for j in range(2, 3):
            start_time = time.time()
            if j == 0:
                clf = BaggingClassifier(base_estimator=GaussianNB(),
                                        n_estimators=10,
                                        max_samples=0.5,
                                        max_features=0.5)
            elif j == 1:
                clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(
                    random_state=0, criterion='gini'),
                                        n_estimators=10,
                                        max_samples=0.5,
                                        max_features=0.5)
            elif j == 2:
Example #23
 def write_sources(self):
     unix.cd(self.getpath)
     _, h = preprocess.load(dir='traces/obs')
     solvertools.write_sources(vars(PAR), h)
Example #24
        print("Bot" + sentence_out)
        except:
            print("Error")
        

def chatBegin():
    # Holds the code used for testing the chatbot
    SOS_token = 2
    EOS_token = 3
    train_save_path = 'D:/1000.pth'
    # corpus_paris_path = 'D:/corpus_paris.csv'
    corpus_paris_path = "D:/clean_chat_corpus/xiaohuangji_processed.tsv"
    dict_path = "D:/clean_chat_corpus/xiaohuangji_dict.tsv"
    word2index = []
    index2word = []
    word2index, index2word = preprocess.load(dict_path)
    num_words = len(index2word)

    num_layers = 2
    dropout = 0.1
    hidden_size = 256
    output_size = num_words
    embedding = torch.nn.Embedding(num_words, hidden_size)
    learning_rate = 0.0001
    decoder_lr_ratio = 5
    encoder_lr = learning_rate
    decoder_lr = learning_rate * decoder_lr_ratio
    total_gen = 5000
    batch_size = 1024

Example #25
    parser.error('norm must be 1 or 2')
    sys.exit(1)

print(__doc__)
parser.print_help()
print()

################################################################################
# core logic
################################################################################

if __name__ == '__main__':

    # load the data
    # TODO make the data to load a parameter
    data_obj = load('preprocessed_data/prd.pkl')
    # TODO choose whether to load data by year or by animal
    # if by year, select year, else select animal - print out options
    data_by_year = data_obj.get_data_by_year()
    year, indivs = data_by_year[0]

    print('year')
    print(year)
    print('num individuals for the year')
    print(len(indivs))
    print()

    # get the time series of interest from the loaded data
    tss = []
    for indiv_id, pts in indivs:
        tss.append(pts)
Example #26
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

from preprocess import load
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NeighborhoodComponentsAnalysis



p = ['mozilla4.arff', 'waveform-5000.arff', 'diabetes.arff', 'pc5.arff', 'pc1.arff']
path = p[0]
X, y = load(path)

n_classes = np.arange(np.unique(y).size)

# clf = DecisionTreeClassifier(random_state=0, criterion='gini')
# clf = GaussianNB()
# clf = KNeighborsClassifier(n_neighbors=3)
# clf = MLPClassifier(hidden_layer_sizes=(100),
# 		        activation='relu',
# 		        solver='adam',
# 		        batch_size=128,
# 		        alpha=1e-4,
# 		        learning_rate_init=1e-3,
# 		        learning_rate='adaptive',
# 		        tol=1e-4,
# 		        max_iter=200)
Example #27
        print('Number of clusters: %d' % n_clusts)
        print('_' * 80)
        # create a clusterer
        clusterer = TsClusterer(n_clusts, dist_norm, max_iterations, stopping_threshold)
        avg_err = cv.cross_validate(clusterer, distance_metric=dist_metric, window=window)
        errs.append((n_clusts, avg_err, clusterer))
        print('Average error of %f achieved using %d clusters' % (avg_err, n_clusts))
        print()
    return errs

if __name__ == '__main__':
    print('file path: %s' % args.source_path)
    print()

    # Load the clusterers
    clusterers = load(args.source_path)

    print_errors(clusterers)

    plot_errors(clusterers)

    if args.best_n_clusts:
        print('Getting the clusterer for %d clusters' % args.best_n_clusts)
        clusterer = get_clusterer_for_best_n_clusts(clusterers, args.best_n_clusts)
        if not clusterer:
            print('No clusterer found for %d clusters' % args.best_n_clusts)
            sys.exit(1)
        print('Done')
        print()

        assignments, lowest_err = clusterer.get_best_assignment()
Example #28
                        help='How big is each word vector')
    parser.add_argument('--preprocess',
                        action='store_true',
                        help='Redo preprocessing.')
    parser.add_argument('--embed',
                        action='store_true',
                        help='Redo embedding preprocessing.')

    # Setup
    params = parser.parse_args()

    if params.debug:
        print('Running in debug mode.')

    seed_everything()

    if params.preprocess:
        x_train, x_test, y_train, features, test_features, word_index = preprocess.preprocess(
            params)
    else:
        x_train, x_test, y_train, features, test_features, word_index = preprocess.load(
            params)

    if params.embed:
        embedding_matrix = embeddings.process(params)
    else:
        embedding_matrix = embeddings.load(params)

    preds = run(x_train, y_train, features, test_features, x_test,
                embedding_matrix, params)
Example #29
    else:
        # try appending preprocessed_data/
        args.source_path = 'preprocessed_data/' + args.source_path
        if not os.path.exists(args.source_path):
            parser.error('The file %s does not exist' % args.source_path)
            sys.exit(1)

print(__doc__)
parser.print_help()
print()

if __name__ == '__main__':
    print('file path: %s' % args.source_path)
    print()

    data = load(args.source_path)

    data_by_year = data.get_data_by_year()
    print('num years')
    print(len(data_by_year))
    year_0, indivs = data_by_year[0]

    print('year')
    print(year_0)
    print('number of individuals')
    print(len(indivs))
    indiv_0, pts = indivs[0]
    print('individual')
    print(indiv_0)
    print('shape of path')
    print(pts.shape)
Example #30
import time

from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from preprocess import load
from sklearn.pipeline import Pipeline

if __name__ == '__main__':

    p = [
        'mozilla4.arff', 'waveform-5000.arff', 'diabetes.arff', 'pc5.arff',
        'pc1.arff'
    ]
    cl = [
        'naive bayes', 'decision tree', 'KNN', 'MLP', 'LinearSVM',
        'improve of bagging knn'
    ]
    for fpath in p:

        X, Y = load(fpath)
        for i in range(5, 6):
            start_time = time.time()
            if i == 0:
                clf = BaggingClassifier(base_estimator=GaussianNB(),
                                        n_estimators=10,
                                        max_samples=0.5,
                                        max_features=0.5)
            elif i == 1:
                clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(
                    random_state=0, criterion='gini'),
                                        n_estimators=10,
                                        max_samples=0.5,
                                        max_features=0.5)
            elif i == 2:
                clf = BaggingClassifier(
Example #31
    def fix_near_field(self, path=''):
        """
        """
        import preprocess
        preprocess.setup()

        name = solver.check_source_names()[solver.getnode]
        fullpath = path + '/' + name
        #print 'DB: name=', name
        #print 'DB: fullpath=', fullpath

        g = solver.load(fullpath, suffix='_kernel')
        g_vec = solver.merge(g)
        nproc = solver.mesh.nproc

        #print 'DB: len(g_vec)=', len(g_vec)

        if not PAR.FIXRADIUS:
            return

        x, y, z = self.getcoords()
        #print 'DB: len(g)=', len(g)
        #print 'DB: len(g[vp][0])=', len(g['vp'][0])
        #print 'DB: x.shape=', x.shape
        #print 'DB: len(x)=', len(x)

        ##sys.exit("DB: stop from postporcess-regularize")

        lx = x.max() - x.min()
        ly = y.max() - y.min()
        lz = z.max() - z.min()
        nn = x.size
        nx = np.around(np.sqrt(nn * lx / (lz * ly)))
        ny = np.around(np.sqrt(nn * ly / (lx * lz)))
        nz = np.around(np.sqrt(nn * lz / (lx * ly)))
        dx = lx / nx * 1.25
        dy = ly / ny * 1.25
        dz = lz / nz * 1.25

        #print 'DB: lx=', lx
        #print 'DB: ly=', ly
        #print 'DB: lz=', lz
        #print 'DB: nn=', nn
        #print 'DB: nx=', nx
        #print 'DB: ny=', ny
        #print 'DB: nz=', nz
        #print 'DB: dx=', dx
        #print 'DB: dy=', dy
        #print 'DB: dz=', dz

        sigma = PAR.FIXRADIUS * (dx + dz + dy) / 3.0
        _, h = preprocess.load(solver.getpath + '/' + 'traces/obs')

        # mask sources
        mask = np.exp(-0.5 * ((x - h.sx[0])**2. + (y - h.sy[0])**2. +
                              (z - h.sz[0])**2.) / sigma**2.)

        # mask top
        # for matlab
        # z_sqrt=(abs(z).^(0.25)); depth_scale=1-z_sqrt/max(z_sqrt); figure; plot(depth_scale,z);
        z_factor = np.power(abs(z), 0.5)
        #max_z_factor = np.amax(z_factor)
        #scale_depth = 1.0 - z_factor/max_z_factor
        #print 'DB: max(z_factor)=',max_z_factor
        #print 'DB: max(scale_depth)=',np.amax(scale_depth)
        #print 'DB: min(scale_depth)=',np.amin(scale_depth)
        #mask *= scale_depth

        #mask_depth = solver.split(z)
        mask_depth = solver.split(z_factor)
        mask_d = solver.split(mask)

        ##print 'DB: sigma=',sigma
        ##print 'DB: mask=',mask
        #print 'DB: len(mask)=', len(mask)
        #print 'DB: len(mask_d)=', len(mask_d)
        ##print 'DB: len(g)=', len(g)
        ##print 'DB: len(g)[vp][0]=', len(g['vp'][0])

        for key in solver.parameters:
            for iproc in range(nproc):
                #print 'DB: key, iproc=', key, iproc
                #print 'DB: len(g[key][iproc])=', len(g[key][iproc])
                #print 'DB: len(mask_d[key][iproc])=', len(mask_d[key][iproc])
                weight = np.sum(mask_d['vp'][iproc] * g[key][iproc]) / np.sum(
                    mask_d['vp'][iproc])
                #print 'DB: key, iproc, weigth= ', key, iproc, weight
                g[key][iproc] *= 1. - mask_d['vp'][iproc]
                g[key][iproc] *= mask_depth['vp'][iproc]
                #g[key][iproc] += mask_d['vp'][iproc]*weight

                #weight = np.sum(mask_d['vp'][iproc]*g[key][iproc])/np.sum(mask_d['vp'][iproc])
                ##print 'DB: key, iproc, weigth= ', key, iproc, weight
                #g[key][iproc] *= 1.-mask_d['vp'][iproc]
                #g[key][iproc] += mask_d['vp'][iproc]*weight

        # mask receivers
        #for ir in range(h.nr):
        #    mask = np.exp(-0.5*((x-h.rx[ir])**2.+(z-h.ry[ir])**2.)/sigma**2.)
        #    for key in solver.parameters:
        #        weight = np.sum(mask*g[key][0])/np.sum(mask)
        #        g[key][0] *= 1.-mask
        #        g[key][0] += mask*weight

        solver.save(fullpath, g, suffix='_kernel')
Example #32
    parser.add_argument('--lang', type=str, default='pt', help='pt or es')
    parser.add_argument('--loss_fn',
                        default=torch.nn.BCEWithLogitsLoss(reduction='sum'),
                        help='How big is each word vector')
    parser.add_argument('--preprocess',
                        action='store_true',
                        help='Redo preprocessing.')
    parser.add_argument('--embed',
                        action='store_true',
                        help='Redo embedding preprocessing.')

    # Setup
    params = parser.parse_args()

    if params.debug:
        print('Running in debug mode.')

    if params.preprocess:
        x_train, y_train, x_test = preprocess.preprocess(params)
    else:
        x_train, y_train, x_test = preprocess.load(params)

    print('Loaded:')
    print(x_train.describe())
    """if params.embed:
        embedding_matrix = embeddings.process(params)
    else:
        embedding_matrix = embeddings.load(params)"""

    preds = run(x_train, y_train, x_test, params)
Example #33
class test_preprocess(object):
    """ Preprocess integration test

        Not yet implemented. The following is just a sketch. None of the 
        methods work yet.
    """
    def check(self):
        """ Checks parameters and paths
        """
        #raise NotImplementedError

        # mute settings
        if 'MUTE' not in PAR:
            setattr(PAR, 'MUTE', False)

        if 'MUTESLOPE' not in PAR:
            setattr(PAR, 'MUTESLOPE', 0.)

        if 'MUTECONST' not in PAR:
            setattr(PAR, 'MUTECONST', 0.)

        # filter settings
        if 'BANDPASS' not in PAR:
            setattr(PAR, 'BANDPASS', False)

        if 'FREQLO' not in PAR:
            setattr(PAR, 'FREQLO', 0.)

        if 'FREQHI' not in PAR:
            setattr(PAR, 'FREQHI', 0.)

        # check paths
        if 'OBSERVATIONS' not in PATH:
            raise Exception

        if 'SYNTHETICS' not in PATH:
            raise Exception

        if 'OUTPUT' not in PATH:
            raise Exception

    def main(self):
        """ Tests data processing methods
        """

        try:
            preprocess.setup()
        except:
            print('SETUP failed')
        else:
            print('SETUP succeeded')

        try:
            d, h = preprocess.load(prefix=PATH.OBSERVATIONS)
            s, h = preprocess.load(prefix=PATH.SYNTHETICS)
        except:
            print('LOAD failed')
        else:
            print('LOAD succeeded')

        try:
            d = preprocess.multichannel(preprocess.process_traces, [d], [h])
            s = preprocess.multichannel(preprocess.process_traces, [s], [h])
        except:
            print('PROCESS_TRACES failed')
        else:
            print('PROCESS_TRACES succeeded')

        try:
            preprocess.save(d, h, prefix=PATH.OBSERVATIONS_PRE)
            preprocess.save(s, h, prefix=PATH.SYNTHETICS_PRE)
        except:
            print('OUTPUT_TRACES failed')
        else:
            print('OUTPUT_TRACES succeeded')
Example #34
 def write_sources(self):
     unix.cd(self.getpath)
     _, h = preprocess.load(dir='traces/obs')
     solvertools.write_sources(vars(PAR), h)
Example #35
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
FLAGS(sys.argv)
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = load(FLAGS.positive_data_file, FLAGS.negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)