Example #1
def remove_features_sklearn(criterion, splitter, removal_order, train_file,
                            test_file, attr_file, max_features):
    train_accs = []
    test_accs = []
    remove_columns = []
    for col in removal_order:
        print(col)
        remove_columns.append(col)
        if len(remove_columns) == max_features: break
        print(remove_columns)
        train_data, train_attr = read_data(train_file,
                                           attr_file,
                                           remove_columns=remove_columns)
        test_data, test_attr = read_data(test_file,
                                         attr_file,
                                         remove_columns=remove_columns)
        # class_column is assumed to be defined at module level
        train_acc, test_acc = other_trees.sklearn_decision_tree(
            criterion,
            splitter,
            train_data,
            test_data,
            train_attr,
            class_column,
            ds_name="custom")

        train_accs.append(train_acc)
        test_accs.append(test_acc)
    return train_accs, test_accs
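A minimal usage sketch, not part of the original example: the file paths and the feature names in removal_order are hypothetical, and read_data, other_trees and class_column must already be available in the module.

import matplotlib.pyplot as plt

train_accs, test_accs = remove_features_sklearn(
    criterion="gini",
    splitter="best",
    removal_order=["f3", "f1", "f7", "f2"],   # hypothetical feature order
    train_file="data/train.csv",              # hypothetical paths
    test_file="data/test.csv",
    attr_file="data/attributes.txt",
    max_features=3)

plt.plot(range(1, len(train_accs) + 1), train_accs, label="train")
plt.plot(range(1, len(test_accs) + 1), test_accs, label="test")
plt.xlabel("number of features removed")
plt.ylabel("accuracy")
plt.legend()
plt.show()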
Example #2
def load(histmods, kmers, tissue, short, dist):
    if 'both' in tissue:
        if 'randoms' in tissue:
            randoms = 'randoms_'
        else: randoms = ''
        filename = get_filename(randoms +'heart', short)
        filename2 = get_filename(randoms +'brain', short)
        target_data = load_vista.load_enhancers_with_seq(filename)
        target_data2 = load_vista.load_enhancers_with_seq(filename2)
        files_remove = [filename+".remove"]
        files_remove2 = [filename2+".remove"]
        if dist:
            files_remove.append( "FANTOM_heart_brain.remove" )
            files_remove2.append("FANTOM_brain_heart.remove")
        
        data = read_data(histmods, kmers, target_data, filename)
        data = remove_samples(data, target_data, files_remove)
        data2 = read_data(histmods, kmers, target_data2, filename2)
        data2 = remove_samples(data2, target_data2, files_remove2)
        return join_and_balance(data, data2, False)[0]
    
    else:
        filename = get_filename(tissue, short)
        target_data = load_vista.load_enhancers_with_seq(filename)
        files_remove = [filename+".remove"]
        if dist:
            if tissue == "heart":
                files_remove.append( "FANTOM_heart_brain.remove" )
            elif tissue == "brain":
                files_remove.append("FANTOM_brain_heart.remove")
        print(target_data)
        data = read_data(histmods, kmers, target_data, filename)
        data = remove_samples(data, target_data, files_remove)
        return data
Example #3
def drawEpipolarLine(inputPath, F, outputPath):
    img_a = cv2.imread(os.path.join(inputPath, "pic_a.jpg"))
    img_b = cv2.imread(os.path.join(inputPath, "pic_b.jpg"))
    pts_a = read_data(inputPath, "pts2d-pic_a.txt")
    pts_b = read_data(inputPath, "pts2d-pic_b.txt")
    pts_a = np.column_stack((pts_a, np.ones(pts_a.shape[0])))
    pts_b = np.column_stack((pts_b, np.ones(pts_a.shape[0])))
    eplines_a = np.dot(pts_b, F)
    eplines_b = np.dot(pts_a, F.T)
    height, width, _ = img_a.shape
    boundary_l = np.cross([0, 0, 1], [height - 1, 0, 1])
    boundary_r = np.cross([0, width - 1, 1], [height - 1, width - 1, 1])
    for line_a, line_b in zip(eplines_a, eplines_b):
        pts1 = np.cross(line_a, boundary_l)
        pts2 = np.cross(line_a, boundary_r)
        pts1 /= pts1[2]
        pts2 /= pts2[2]
        cv2.line(img_a,
                 tuple(pts1[:2].astype(int)),
                 tuple(pts2[:2].astype(int)), (0, 255, 0),
                 thickness=2)
        pts1 = np.cross(line_b, boundary_l)
        pts2 = np.cross(line_b, boundary_r)
        pts1 /= pts1[2]
        pts2 /= pts2[2]
        cv2.line(img_b,
                 tuple(pts1[:2].astype(int)),
                 tuple(pts2[:2].astype(int)), (0, 255, 0),
                 thickness=2)
    cv2.imwrite(os.path.join(outputPath, "ps3-2-c-1.png"), img_a)
    cv2.imwrite(os.path.join(outputPath, "ps3-2-c-2.png"), img_b)
    return img_a, img_b
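A small sanity check to go with the function above, not from the original source: for correct correspondences the epipolar constraint x_b^T F x_a should be close to zero, using the same orientation of F that drawEpipolarLine assumes (rows of pts_a @ F.T are the lines in image b).

import numpy as np

def epipolar_residuals(pts_a, pts_b, F):
    # mean absolute value of x_b^T F x_a over all homogeneous correspondences
    return np.mean(np.abs(np.einsum('ij,ij->i', pts_b, pts_a @ F.T)))

A residual far from zero usually means F was estimated with the two images swapped.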
Example #4
def remove_features(removal_order, train_file, test_file, attr_file,
                    max_features):
    train_accs = []
    test_accs = []
    remove_columns = []
    for col in removal_order:
        print(col)
        remove_columns.append(col)
        if len(remove_columns) == max_features: break
        print(remove_columns)
        train_data, train_attr = read_data(train_file,
                                           attr_file,
                                           remove_columns=remove_columns)
        test_data, test_attr = read_data(test_file,
                                         attr_file,
                                         remove_columns=remove_columns)
        tree = decision_tree.DecisionTreeLearning(train_data, train_attr,
                                                  "normal", "class")

        decision_tree.print_tree(tree)
        y_pred, y_true = decision_tree.predict(train_data, tree)
        train_acc = decision_tree.accuracy_score(y_pred, y_true)
        print('Accuracy on Training Data: {0}'.format(train_acc * 100))
        y_pred, y_true = decision_tree.predict(test_data, tree)
        test_acc = decision_tree.accuracy_score(y_pred, y_true)
        print('Accuracy on Test Data: {0}'.format(test_acc * 100))

        train_accs.append(train_acc)
        test_accs.append(test_acc)
    return train_accs, test_accs
Example #5
def load(histmods, kmers, tissue, short, dist):
    if 'both' in tissue:
        if 'randoms' in tissue:
            randoms = 'randoms_'
        else:
            randoms = ''
        filename = get_filename(randoms + 'heart', short)
        filename2 = get_filename(randoms + 'brain', short)
        target_data = load_vista.load_enhancers_with_seq(filename)
        target_data2 = load_vista.load_enhancers_with_seq(filename2)
        files_remove = [filename + ".remove"]
        files_remove2 = [filename2 + ".remove"]
        if dist:
            files_remove.append("FANTOM_heart_brain.remove")
            files_remove2.append("FANTOM_brain_heart.remove")

        data = read_data(histmods, kmers, target_data, filename)
        data = remove_samples(data, target_data, files_remove)
        data2 = read_data(histmods, kmers, target_data2, filename2)
        data2 = remove_samples(data2, target_data2, files_remove2)
        return join_and_balance(data, data2, False)[0]

    else:
        filename = get_filename(tissue, short)
        target_data = load_vista.load_enhancers_with_seq(filename)
        files_remove = [filename + ".remove"]
        if dist:
            if tissue == "heart":
                files_remove.append("FANTOM_heart_brain.remove")
            elif tissue == "brain":
                files_remove.append("FANTOM_brain_heart.remove")
        print(target_data)
        data = read_data(histmods, kmers, target_data, filename)
        data = remove_samples(data, target_data, files_remove)
        return data
Example #6
def main():
    # create logger
    logger = logging.getLogger('main')
    logger.setLevel(logging.DEBUG)

    # create console handler
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)

    # create formatter
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # add formatter to ch
    ch.setFormatter(formatter)

    # add ch to logger
    logger.addHandler(ch)

    logger.debug('Loading training data ' + TR_PATH)
    training_data = read_data.read_data(TR_PATH, 100)
    logger.debug('Loading testing data ' + TS_PATH)
    testing_data = read_data.read_data(TS_PATH)
    logger.debug('Loading root tweets ' + RT_PATH)
    root_data = read_data.read_data(RT_PATH)
    logger.debug('Loading social network ' + SN_PATH)
    social_network = read_data.read_data(SN_PATH)

    # extract_labels
    labels = read_data.extract_labels(training_data, K)

    # run basic features
    basic_vectors = basic.get_vectors(root_data, training_data, social_network)
Example #7
def main(_):

  filename = "Data11-17.txt"
  vectors_data1,labels_data1 = read_data.read_data(filename)
  filename = "valid18-20.txt"
  vectors_data2,labels_data2 = read_data.read_data(filename)
  filename = "Data21-25.txt"
  vectors_data3,labels_data3 = read_data.read_data(filename)

  vectors_data = np.vstack((vectors_data1,vectors_data2,vectors_data3))
  print(vectors_data.shape)
  labels_data = np.vstack((np.reshape(labels_data1,(len(labels_data1),1)),
    np.reshape(labels_data2,(len(labels_data2),1)),
      np.reshape(labels_data3,(len(labels_data3),1))))
  labels_data = np.reshape(labels_data,-1)
  print(labels_data.shape)

  filename = "Data4-10.txt"
  validation_data,vlabels_data = read_data.read_data(filename)
  filename = "Data26-29.txt"
  test_data,tlabels_data = read_data.read_data(filename)
  test_data = test_data[0:8000,]
  tlabels_data = tlabels_data[0:8000,]

  config = get_config()
  eval_config = get_config()
  eval_config.batch_size = 1
  eval_config.num_steps = 1

  with tf.Graph().as_default(), tf.Session() as session:

    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)
    with tf.variable_scope("model", reuse=None, initializer=initializer):
      m = PTBModel(is_training=True, config=config)
    with tf.variable_scope("model", reuse=True, initializer=initializer):
      mvalid = PTBModel(is_training=False, config=config)
      mtest = PTBModel(is_training=False, config=eval_config)
    
    
    tf.initialize_all_variables().run()

    summary_writer = tf.train.SummaryWriter("train/lstm3s",session.graph)

    for i in range(config.max_max_epoch):
      lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
      m.assign_lr(session, config.learning_rate * lr_decay)

      print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))

      train_perplexity = run_epoch(session, m, vectors_data, labels_data, m.train_op,summary_writer, 
                                   verbose=True)
      print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))

      valid_perplexity = run_epoch(session, mvalid, validation_data, vlabels_data, tf.no_op(),summary_writer)
      print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

    test_perplexity = run_epoch(session, mtest, test_data, tlabels_data, tf.no_op(),summary_writer)
    print("Test Perplexity: %.3f" % test_perplexity)
Example #8
def main(_):
    if FLAGS.config == "None":
        config = get_config(FLAGS.__flags, {})
    else:
        # TODO : create configs file (.json)
        config_path = os.path.join("configs", "%s%s" % (FLAGS.model_name, FLAGS.config_ext))
        config = get_config_from_file(FLAGS.__flags, config_path, FLAGS.config)

    load_meta_data(config)
    mkdirs(config)

    # load other files
    init_emb_mat_path = os.path.join(config.data_dir, 'init_emb_mat.h5')
    config.init_emb_mat = h5py.File(init_emb_mat_path, 'r')['data'][:]

    if config.train:
        train_ds = read_data(config, 'train')
        dev_ds = read_data(config, 'dev')
    else:
        test_ds = read_data(config, 'test')

    # For a quick draft run (debugging).
    if config.draft:
        config.train_num_batches = 1
        config.val_num_batches = 1
        config.test_num_batches = 1
        config.num_epochs = 1
        config.val_period = 1
        config.save_period = 1
        # TODO : Add any other parameter that induces a lot of computations

    pprint(config.__dict__)

    # TODO : specify eval tensor names to save in evals folder
    eval_tensor_names = []

    graph = tf.Graph()
    # TODO : initialize BaseTower-subclassed objects
    towers = [BaseTower(config) for _ in range(config.num_devices)]
    sess = tf.Session(graph=graph, config=tf.ConfigProto(allow_soft_placement=True))
    # TODO : initialize BaseRunner-subclassed object
    runner = BaseRunner(config, sess, towers)
    with graph.as_default(), tf.device("/cpu:0"):
        runner.initialize()
        if config.train:
            if config.load:
                runner.load()
            runner.train(train_ds, dev_ds, eval_tensor_names=eval_tensor_names)
        else:
            runner.load()
            runner.eval(test_ds, eval_tensor_names=eval_tensor_names)
Example #9
def fundamentalMatrix(inputPath, rank=3, rank3Matrix=None):
    if rank == 3:
        pts_a, pts_b = read_data(inputPath, "pts2d-pic_a.txt"), read_data(
            inputPath, "pts2d-pic_b.txt")
        F_lstsq = solveEquations(pts_a, pts_b, method="least_square")
        F_SVD = solveEquations(pts_a, pts_b, method="SVD")
        return F_lstsq, F_SVD
    elif rank == 2:
        U, S, V = np.linalg.svd(rank3Matrix)
        S[-1] = 0
        S = np.diag(S)
        F = np.dot(np.dot(U, S), V)
        return F
    return None
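A hedged usage sketch based only on the signatures above: estimate the rank-3 matrices first, then pass one back in to enforce the rank-2 constraint. The input/output paths are hypothetical, and drawEpipolarLine from Example #3 is assumed to live in the same module.

F_lstsq, F_SVD = fundamentalMatrix("input/ps3")
F_rank2 = fundamentalMatrix("input/ps3", rank=2, rank3Matrix=F_SVD)
img_a, img_b = drawEpipolarLine("input/ps3", F_rank2, "output")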
Example #10
def predict_classes_one():
    classifier = get_classifier()
    vectorizer = get_vectorizer()
    classes = get_classes(1)

    read_data.read_data(classifier, vectorizer, classes)

    cross_validation.score(classifier, vectorizer, 1)

    ids = create_id_set(0, io_files.test_path)
    label, y = predict.predict(classifier, vectorizer, ids)

    output = io_files.categorie1_path
    write_output(output, label, y)
Example #11
def train(config):
    train_data = read_data('train')
    dev_data = read_data('dev')

    update_config(config, [train_data, dev_data])
    _config_debug(config)

    word2vec_dict = train_data.shared['lower_word2vec'] 
    word2idx_dict = train_data.shared['word2idx']
    idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
    emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
                        else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
                        for idx in range(config.word_vocab_size)])
    config.emb_mat = emb_mat
    bidaf_model = train_bidaf()
Example #12
def load_other(database, histmods, kmers, tissue, dist):
    if '_notss' in tissue:
        filter_promoters = True
        tissue = tissue[:-6]
    else:
        filter_promoters = False
    if 'randoms' in tissue:
        target_data = load_vista.load_enhancers_with_seq(tissue + '1500')
        data = read_data(histmods, kmers, target_data, tissue + '1500')
        data = choose_tissue(data, target_data, 'all', dist, filter_promoters)
    else:
        target_data = load_vista.load_enhancers_with_seq(database)
        data = read_data(histmods, kmers, target_data, database)
        data = choose_tissue(data, target_data, tissue, dist, filter_promoters)
    return data
Example #13
def load_other(database, histmods, kmers, tissue, dist):
    if '_notss' in tissue:
        filter_promoters = True
        tissue = tissue[:-6]
    else:
        filter_promoters = False
    if 'randoms' in tissue:
        target_data = load_vista.load_enhancers_with_seq(tissue + '1500')
        data = read_data(histmods, kmers, target_data, tissue + '1500')
        data = choose_tissue(data, target_data, 'all', dist, filter_promoters)
    else:
        target_data = load_vista.load_enhancers_with_seq(database)
        data = read_data(histmods, kmers, target_data, database)
        data = choose_tissue(data, target_data, tissue, dist, filter_promoters)
    return data
Example #14
def loop_trac(band='alpha',con=1):
    for i in range(20):
        print('Person {}:'.format(i+1))
        tf.reset_default_graph()
        read_data.read_data(band,con,i+1)
        completeTrajectories()
        computeFeas()
        generate_behavior_sequences()
        generate_normal_behavior_sequence()
        trajectory2Vec(band,con,i+1)
        vecClusterAnalysis(band,con,i+1)
    fout = open('./band_data/deep_svm_con{}_{}'.format(con, band), 'w')

    # save the accuracies (accs is assumed to be collected elsewhere in the module)
    cPickle.dump(accs, fout)
    fout.close()
Example #15
def reprocess_data2(n_samples=-1):
    X, labels = read_data()
    if (n_samples != -1):
        indexes = random.sample(range(1, len(X)), n_samples)
        X = [x for idx, x in enumerate(X) if idx in indexes]
        X = np.array(X)
        labels = [label for idx, label in enumerate(labels) if idx in indexes]

    label_dict = {
        'daisy': 0,
        'dandelion': 1,
        'rose': 2,
        'sunflower': 3,
        'tulip': 4
    }

    y = [label_dict[labels[i]] for i in range(len(labels))]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.15,
                                                        random_state=42)

    X_train = X_train / 255.
    X_test = X_test / 255.

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    X_train = X_train.reshape(X_train.shape[0], 128 * 128 * 3)
    X_test = X_test.reshape(X_test.shape[0], 128 * 128 * 3)

    return X_train, X_test, y_train, y_test
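A usage sketch, not from the original source: the flattened 128x128x3 images returned above can be fed to a simple scikit-learn classifier. scikit-learn is assumed to be installed and the sample count is arbitrary.

from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = reprocess_data2(n_samples=2000)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print("test accuracy:", clf.score(X_test, y_test))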
Example #16
def brabo_starter():
    house_path = '../../Data/wijk1_huizen.csv'
    # battery_path = '../../Data/wijk1_batterijen.txt'
    # battery_path = '../../Results/Battery_configurations/SCORE:4486_SIGMA:10.csv'
    # battery_path = '../../Results/Battery_configurations/leuknaampjes.csv'
    battery_path = '../../Results/Battery_configurations/1137_nice_sigma10.csv'


    houses, batteries = read_data(house_path, battery_path, True)

    max_x = max([dic['position'][0] for dic in houses] +
                [dic['position'][0] for dic in batteries]) + 1
    max_y = max([dic['position'][1] for dic in houses] +
                [dic['position'][1] for dic in batteries]) + 1

    wijk1 = SmartGrid(max_x,max_y)
    wijk1.add_house_dictionaries(houses)
    wijk1.add_battery_dictionaries(batteries)
    houses = wijk1.house_dict_with_manhattan_distances()


    print(houses)
    root = node(batteries, houses, 5000000)
    start = time.time()

    try:
        root.solve()

    except KeyboardInterrupt:
        run_time = time.time() - start
        print(run_time)
    print("klaar")
Example #17
 def prepare(self, filename):
     self.time, self.data = read_data(filename)
     increment = self.time[-1] - self.time[-2]
     self.time = np.append(self.time, np.arange(1, 1 + self.forecast_size) * increment + self.time[-1])
     self.N_all_iter = len(self.time)
     self.operator_view.show()
     self.operator_view.status_bar.showMessage('Loaded successfully.', 1000)
Example #18
def main():
    """
    Plots a distribution using random_solve. The distribution gives a good
    indication of the state space and its density. Increasing the number of
    batteries significantly increases the runtime.
    """

    house_path = '../../Data/wijk1_huizen.csv'
    battery_path = '../../Data/wijk1_batterijen.txt'
    battery_path = '../../Results/Battery_configurations/lucas_1137_nice_sigma10.csv'
    battery_path = '../../Results/Battery_configurations/BESTSCORE_SIGMA_relative.csv'
    houses, batteries = read_data(house_path, battery_path)

    wijk1 = SmartGrid(51, 51)
    wijk1.add_house_dictionaries(houses)
    wijk1.add_battery_dictionaries(batteries)

    for element in houses:
        wijk1.create_house(element['position'], element['output'])
    for element in batteries:
        wijk1.create_battery(element['position'], element['capacity'])

    random_solve(wijk1)
    print("klaar")
Example #19
def test_train_split(fraction):

    # user-item data for model
    df_train = pd.read_csv("data/model_input/df.csv", sep='\t') 

    # dataframe with user detail
    df_user_detail = read_data("user_detail_medium")

    # get list of unique user
    unique_users = list(df_user_detail.drop_duplicates(subset="user_id")["user_id"])

    # number of test users
    n_test_users = int(len(unique_users) * fraction)

    # shuffle and select users to drop
    random.shuffle(unique_users)
    df_test_data = pd.DataFrame(unique_users[:n_test_users])
    df_train_users = pd.DataFrame(unique_users[n_test_users:])

    # set rating to 0 for test users
    for index, user in df_test_data.iterrows():
        #df_train.loc[df_train["user_id"]==user[0], "comment"] = 0
        df_train.drop(df_train[df_train["user_id"]==user[0]].index, inplace=True)

    # check before store
    #print(df_train.loc[df_train["user_id"]==df_test_data.iloc[0][0]])
        
    # save training set
    df_train.to_csv("data/model_input/df_train.csv", sep="\t", index=False)
    df_test_data.to_csv("data/model_input/df_test.csv", index=False)
    df_train_users.to_csv("data/model_input/df_train_users.csv", index=False)

    return 
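A usage sketch, not from the original source: hold out 20% of users and read the resulting splits back; the paths simply mirror those written above.

import pandas as pd

test_train_split(0.2)
df_train = pd.read_csv("data/model_input/df_train.csv", sep="\t")
df_test_users = pd.read_csv("data/model_input/df_test.csv")
print(len(df_train), len(df_test_users))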
Example #20
def get_new_edges(data_type, construction):
    tree_prop_file = 'd6.treeproperties'
    t2props_dict = get_t2props_dict(tree_prop_file)
    t2topsub_dict = get_t2topsub_dict(tree_prop_file)
    ## get predicted_dependencies and apply transformations
    predicted_dependencies = read_data(construction, data_type)
    unbounded_dependencies = read_unbounded(construction, data_type)
    sents = read_stags(construction, data_type, 'sents')
    predicted_stags = read_stags(construction, data_type)
    predicted_pos = read_stags(construction, data_type, 'predicted_pos')
    new_edges = []
    for sent_idx in range(len(unbounded_dependencies)):
        #for sent_idx in [0]:
        sent = sents[sent_idx]
        ## TAG analysis
        predicted_dependencies_sent = predicted_dependencies[sent_idx]
        predicted_stags_sent = predicted_stags[sent_idx]
        predicted_pos_sent = predicted_pos[sent_idx]
        transformed_sent = transform(t2props_dict, t2topsub_dict, sent,
                                     predicted_dependencies_sent,
                                     predicted_stags_sent, predicted_pos_sent)
        new_edges_sent = list(
            set(transformed_sent) - set(predicted_dependencies_sent))
        new_edges_sent = [x for x in new_edges_sent if x[0] != x[1]]
        #print(new_edges_sent)
        new_edges.append(new_edges_sent)
    return new_edges
Example #21
def load_paths(paths, fresh, frames_per_gesture, separate_frames, feature_set_type):
    """Load data from given paths

    Parameters
    ----------
    paths : list
        The paths to the data: every path leads to the Leap subfolder of a participant folder
    fresh : boolean
        Recalculate aggregate of frames
    frames_per_gesture : int
        The number of frames considered to define a gesture
    separate_frames : boolean
        Treat every frame as a separate data point [not recommended]
    feature_set_type : string
        'hands_only', 'fingers_only', 'all'
    """
    all_data = []
    all_target = []
    for path in paths:
        if fresh:
            data, target = read_data(path, frames_per_gesture, separate_frames, feature_set_type)
            try:
                with open(path[:-4] + "Participant_fpg_{}.data".format(frames_per_gesture), 'wb') as fp:
                    pickle.dump((data, target), fp)
            except IOError:
                continue
        else:
            try:
                with open(path[:-4] + "Participant_fpg_{}.data".format(frames_per_gesture), 'rb') as fp:
                    data, target = pickle.load(fp)
            except IOError:
                continue
        all_data.extend(data)
        all_target.extend(target)
    return all_data, all_target
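A usage sketch with hypothetical participant paths; the SVM is only illustrative and not part of the original code.

from sklearn.svm import SVC

paths = ["data/participant01/Leap/", "data/participant02/Leap/"]  # hypothetical
data, target = load_paths(paths, fresh=True, frames_per_gesture=10,
                          separate_frames=False, feature_set_type='all')
clf = SVC().fit(data, target)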
Example #22
def _test(config):
    test_data = read_data(config, 'test', True)
    update_config(config, [test_data])

    _config_debug(config)

    if config.use_glove_for_unk:
        word2vec_dict = test_data.shared[
            'lower_word2vec'] if config.lower_word else test_data.shared[
                'word2vec']
        new_word2idx_dict = test_data.shared['new_word2idx']
        idx2vec_dict = {
            idx: word2vec_dict[word]
            for word, idx in new_word2idx_dict.items()
        }
        new_emb_mat = np.array(
            [idx2vec_dict[idx] for idx in range(len(idx2vec_dict))],
            dtype='float32')
        config.new_emb_mat = new_emb_mat

    pprint(config.__flags, indent=2)
    models = get_multi_gpu_models(config)
    model = models[0]
    evaluator = MultiGPUEvaluator(
        config,
        models,
        tensor_dict=models[0].tensor_dict if config.vis else None)
    graph_handler = GraphHandler(config, model)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    graph_handler.initialize(sess)
    num_steps = math.ceil(test_data.num_examples /
                          (config.batch_size * config.num_gpus))
    if 0 < config.test_num_batches < num_steps:
        num_steps = config.test_num_batches
    e = None
    for multi_batch in tqdm(test_data.get_multi_batches(
            config.batch_size,
            config.num_gpus,
            num_steps=num_steps,
            cluster=config.cluster),
                            total=num_steps):
        ei = evaluator.get_evaluation(sess, multi_batch)
        e = ei if e is None else e + ei
        if config.vis:
            eval_subdir = os.path.join(
                config.eval_dir, "{}-{}".format(ei.data_type,
                                                str(ei.global_step).zfill(6)))
            if not os.path.exists(eval_subdir):
                os.mkdir(eval_subdir)
            path = os.path.join(eval_subdir, str(ei.idxs[0]).zfill(8))
            graph_handler.dump_eval(ei, path=path)

    print("test acc: %f, loss: %f" % (e.acc, e.loss))
    if config.dump_answer:
        print("dumping answer ...")
        graph_handler.dump_answer(e)
    if config.dump_eval:
        print("dumping eval ...")
        graph_handler.dump_eval(e)
Example #23
def plot_beach(columns, df=None, beaches=None, separate_beaches=False, **kwds):
    '''
    TODO: docstring
    '''
    if df is None:
        df = read_data.read_data()
    if beaches is None:
        beaches = df['Client.ID'].dropna().unique().tolist()
    if type(beaches) is str:
        # be flexible with scalar vs. vector input
        beaches = [beaches]

    if separate_beaches:
        fig, ax = plt.subplots(len(beaches), 1, sharex=True, sharey=True)
        for i, beach in enumerate(beaches):
            filt = df['Client.ID'] == beach
            df[filt].plot(y=columns, ax=ax[i], **kwds)
            ax[i].set_title(beach)
    else:
        fig, ax = plt.subplots(1,1)
        l = len(ax.legend().get_texts())
        for beach in beaches:
            filt = df['Client.ID'] == beach
            df[filt].plot(y=columns, ax=ax, **kwds)
            for txt in ax.legend().get_texts()[l:]:
                txt.set_text(beach + ': ' + txt.get_text())
    return fig, ax
Example #24
def main():
    # Get the basic information.
    infile, number_of_components, graphtitle = get_information()

    # Here the data is finally read from the file.
    data, rawdata, variables, observations = rd.read_data(infile)

    # Look at and pre-process the data.
    look_at_and_pre_process_data(data, rawdata, variables)

    # Get the colour-coding (if applicable).
    colours = get_colours(observations)

    # Here the actual NIPALS algorithm is executed.
    print("\nBe patient. The NIPALS algorithm may need some time ...\n")
    Z_merged, P_merged, r_merged, R_2, R_k_2, SPE, T_2 = na.nipals_pca(data, \
                 rawdata, number_of_components)

    # Plotting basic information.
    pl.plotting_the_basics(number_of_components, R_2, graphtitle, observations, \
        variables, Z_merged, P_merged, colours = None)

    # Plotting the "versus-graphs".
    pl.plotting_more_complicated_graphs(number_of_components, R_2, T_2, graphtitle, \
            SPE, observations, variables, Z_merged, \
            P_merged, r_merged, colours)

    print("\nThank you for using this program\n")
Example #25
def create_decision_tree():
    # gets the customer list with preprocessed data -> {1: c, 2: c, ...}
    customer_list = read_data()
    train_data = get_decision_tree_data(customer_list)
    check_bal = train_data[0]
    sav_bal = train_data[1]
    decision = train_data[2]

    model = tree.DecisionTreeClassifier()
    model.fit(check_bal, decision)

    for masked_id in customer_list:
        if customer_list[masked_id].get_checking_status() == '':
            predict = model.predict([customer_list[masked_id].get_check_balances()])
            customer_list[masked_id].checking_status = predict[0]

    model.fit(sav_bal, decision)

    #print the decision tree to file
    dot_data = tree.export_graphviz(model, out_file = None)
    graph = gp.Source(dot_data)
    graph.render("decision_tree_output")

    for masked_id in customer_list:
        if customer_list[masked_id].get_checking_status() == 'close an account':
            predict = model.predict([customer_list[masked_id].get_sav_balances()])
            customer_list[masked_id].checking_status = predict[0]

    for masked_id in customer_list:
        customer_list[masked_id].pref_contact_medium = max(customer_list[masked_id].contact_mediums, key = customer_list[masked_id].contact_mediums.get)


    print_data_report(c_list = customer_list, file_name = output_file, train_data = training_data_for_print)
Example #26
def load(histmods, kmers, tissue, dist):
    if '_notss' in tissue:
        filter_promoters = True
        tissue = tissue[:-6]
    else:
        filter_promoters = False
    if 'random' in tissue:
        target_data = load_vista.load_enhancers_with_seq(tissue)
        data = read_data(histmods, kmers, target_data, tissue)
        data = choose_tissue(data, target_data, 'all', dist, filter_promoters)
    else:
        target_data = load_vista.load_enhancers_with_seq(VISTA_FILE)
        data = read_data(histmods, kmers, target_data, VISTA_FILE)
        data = choose_tissue(data, target_data, tissue, dist, filter_promoters)
    
    return data
Example #27
def main():
    X = read_data()
    p, n = X.shape
    mu_hat = np.sum(X, axis=1) / n
    X = (X.T - mu_hat).T
    Z = X / np.sqrt(n)
    U, S, Vt = np.linalg.svd(Z, full_matrices=False)
    Lam = S**2
    display_eigen_images(U, Lam)

    #projection of images
    Y = np.dot(U.T, X[:, :4])
    plot_projections(Y)

    #reconstruction
    recons = np.zeros((p, len(m)))
    for i, im in enumerate(m):
        recons[:, i] = np.dot(U[:, :im], Y[:im, 0])
    recons = (recons.T + mu_hat).T
    fig, axs = plt.subplots(3, 2)
    for k in range(6):
        img = np.reshape(recons[:, k], (Wd, Ht))
        axs[k // 2, k % 2].imshow(img, cmap=plt.cm.gray, interpolation='none')
        axs[k // 2, k % 2].set_title('m=' + str(m[k]))
        axs[k // 2, k % 2].axis('off')
    plt.savefig('reconstruction.pdf')
    plt.close()
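A short check of the identity the code above relies on, not from the original source: with Z = X / sqrt(n) for centered X, the squared singular values of Z are the eigenvalues of the sample covariance X X^T / n, and the columns of U are its eigenvectors.

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(16, 40))                  # p x n toy data
X = (X.T - X.mean(axis=1)).T                   # center, as in main()
Z = X / np.sqrt(X.shape[1])
U, S, Vt = np.linalg.svd(Z, full_matrices=False)
cov_evals = np.linalg.eigvalsh(X @ X.T / X.shape[1])[::-1]   # descending order
print(np.allclose(cov_evals, S**2))            # True up to round-off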
Example #28
def train(model,data,settings):
    print("-- RUNNING TRAINING --", flush=True)

    # We are caching the partition in the container home dir so that
    # the same training subset is used for each iteration for a client. 
    try:
        with open('/app/mnist_train/x.pyb','rb') as fh:
            x_train=pickle.loads(fh.read())
        with open('/app/mnist_train/y.pyb','rb') as fh:
            y_train=pickle.loads(fh.read())
        with open('/app/mnist_train/classes.pyb','rb') as fh:
            classes=pickle.loads(fh.read())
    except:
        (x_train, y_train, classes) = read_data(data,nr_examples=settings['training_samples'])

        try:
            os.mkdir('/app/mnist_train')
            with open('/app/mnist_train/x.pyb','wb') as fh:
                fh.write(pickle.dumps(x_train))
            with open('/app/mnist_train/y.pyb','wb') as fh:
                fh.write(pickle.dumps(y_train))
            with open('/app/mnist_train/classes.pyb','wb') as fh:
                fh.write(pickle.dumps(classes))
        except:
            pass

    model.fit(x_train, y_train, batch_size=settings['batch_size'], epochs=settings['epochs'], verbose=1)

    print("-- TRAINING COMPLETED --", flush=True)
    return model
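The try/except blocks above implement a simple pickle cache of the training partition in the container. A generic sketch of the same pattern, not taken from the original project; the helper name and cache path are placeholders.

import os
import pickle

def load_cached(cache_path, compute_fn):
    """Return the pickled object at cache_path, computing it with compute_fn on a cache miss."""
    try:
        with open(cache_path, 'rb') as fh:
            return pickle.load(fh)
    except (OSError, EOFError, pickle.UnpicklingError):
        data = compute_fn()
        try:
            cache_dir = os.path.dirname(cache_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            with open(cache_path, 'wb') as fh:
                pickle.dump(data, fh)
        except OSError:
            pass  # caching is best effort; still return the freshly computed data
        return data

For example, the cached branch above could be expressed as load_cached('/app/mnist_train/train.pyb', lambda: read_data(data, nr_examples=settings['training_samples'])), where the combined cache file name is hypothetical.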
Example #29
def get_user_information():

    # read dataframe with raw user information
    df = read_data("response")

    # merge rows of same user
    df = df.groupby(["user_id"]).agg({"post_id": lambda x: ', '.join(x)})

    # reset index
    df.reset_index(level=0, inplace=True)

    # chunk size
    n = 600

    # split dataframe into subset
    list_df = [df[i:i + n] for i in range(0, df.shape[0], n)]

    print("Getting user information from dataframe %s / %s" %
          (chunk_index_userinfo + 1, len(list_df)))

    # sub dataframe of list of articles
    df_sub = list_df[chunk_index_userinfo]

    # retrieve user information
    df_sub["userinfo"] = df_sub["user_id"].apply(get_userinfo)

    # save to csv
    df_sub.to_csv("data/user_detail/users_%s.csv" % chunk_index_userinfo,
                  encoding='utf-8',
                  index=False)

    print("Successfully retrieved user information from dataframe %s / %s" %
          (chunk_index_userinfo + 1, len(list_df)))
Example #30
def main():
    output_path = Path('../output/try2_exactly_7_times')
    output_path.mkdir(exist_ok=True)
    save_path = output_path / 'vader.ckpt'

    # w_train, x_train, names = read_premade(DAYS_ORDERED)
    w_train, x_train, names = read_data()
    x_train = (x_train - np.mean(x_train)) / np.std(x_train)

    vader = VADER(x_train=x_train,
                  w_train=w_train,
                  save_path=save_path,
                  n_hidden=[128, 32],
                  k=5,
                  learning_rate=1e-3,
                  output_activation=None,
                  recurrent=True,
                  batch_size=8,
                  alpha=0.1)
    # pre-train without latent loss
    vader.pre_fit(n_epoch=20, verbose=True)
    # train with latent loss
    vader.fit(n_epoch=100, verbose=True)
    # get the clusters
    c = vader.cluster(x_train, w_train)
    # get the re-constructions
    p = vader.predict(x_train)

    print(vader.get_clusters_on_x())
Example #31
def validate(model, data, sample_fraction=1):

    try:
        x_test, y_test, classes = read_data(data,
                                            sample_fraction=sample_fraction)
        model_score = model.evaluate(x_test, y_test, verbose=0)
        result = open("../result/result.txt", "a")
        result.write("===========================\n")
        result.write("Validation accuracy: %s\n" % model_score[1])
        result.close()
        print("======================================================")
        print('Validation loss:', model_score[0])
        print('Validation accuracy:', model_score[1])
        y_pred = model.predict_classes(x_test)
        clf_report = metrics.classification_report(y_test.argmax(axis=-1),
                                                   y_pred)
    except Exception as e:
        print("failed to validate the model {}".format(e), flush=True)
        raise

    report = {
        "classification_report": clf_report,
        "loss": model_score[0],
        "accuracy": model_score[1]
    }

    return report
Example #32
def main():
    data = []
    inp = []
    num_iter = int(input("Enter the number of iterations"))
    for i in range(0, num_iter):
        inp = make_granules.make_granules(i, data[:2])
        data = svm.svm(inp)

    print("clf is ", data[-1])

    print(
        "***********************************************************************"
    )
    print("ON THE FINAL DATA !!!!")
    df = read_data.read_data()
    number_of_cols = len(df.columns)

    X = df.values
    Y = X[:, -1]
    X = X[:, :-1]

    indices = np.argwhere(Y == 1)
    clf = data[-1]
    indices = indices.ravel()
    X = X[indices]

    predictions = clf.predict(X)
    correctly_done = np.sum(predictions)
    print("Correctly  classified minority points ",
          correctly_done / len(indices))
    print(
        "***********************************************************************"
    )
Example #33
def main(stud_ans):

    op = rd.read_data("train.tsv")
    X_train_raw = op[0]
    y_train = op[1]
    X_test_raw = []
    X_test_raw.extend(stud_ans)

    cv = CountVectorizer(max_df=1.0,
                         min_df=2,
                         ngram_range=(1, 2),
                         max_features=10000,
                         stop_words='english')

    X_vec = cv.fit_transform(X_train_raw)
    selector = SelectKBest(mutual_info_classif, k=300)
    X_vec = selector.fit_transform(X_vec, y_train)
    Y_vec = cv.transform(X_test_raw)
    Y_vec = selector.transform(Y_vec)
    svd = TruncatedSVD(100)
    lsa = make_pipeline(svd, Normalizer(copy=False))

    print("shape", X_vec.shape, Y_vec.shape)
    X_train_lsa = lsa.fit_transform(X_vec)
    X_test_lsa = lsa.transform(Y_vec)
    p = []
    knn_lsa = KNeighborsClassifier(n_neighbors=1,
                                   algorithm='brute',
                                   metric='cosine')
    knn_lsa.fit(X_train_lsa, y_train)
    p.extend(knn_lsa.predict(X_test_lsa))
    float(p[0])
    print("answers modified", p[0])

    return p
Example #34
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--use_gpu', help='Use GPU to train NN', action='store_true', default=False)
    parser.add_argument('--gpu_device', help='GPU device ID', type=int, default=0)
    parser.add_argument('--model_type', help='Architecture of Model(STL/SNN/HPS/TF/PROG/Deconv/DeconvTM/DeconvTM2)', type=str, default='STL')
    parser.add_argument('--test_type', help='Type of test (including regularization scale or etc)', type=int, default=0)
    parser.add_argument('--all_output', help='Train on all outputs, not final stability score', action='store_true', default=False)
    parser.add_argument('--save_mat_name', help='Name of file to save training results', type=str, default='delete_this.mat')
    parser.add_argument('--lifelong', help='Train in lifelong learning setting', action='store_true', default=False)
    args = parser.parse_args()

    do_lifelong = args.lifelong
    mat_file_name = args.save_mat_name

    train_hyperpara = {}
    train_hyperpara['improvement_threshold'] = 1.002  # for accuracy (maximizing it)
    train_hyperpara['patience_multiplier'] = 1.5
    train_hyperpara['lr'] = 0.0001
    train_hyperpara['lr_decay'] = 1.0 / 100.0
    train_hyperpara['num_run_per_model'] = 5
    train_hyperpara['learning_step_max'] = 5000
    train_hyperpara['patience'] = 100

    data_hyperpara = {}
    data_hyperpara['folder_name'] = 'Data'
    data_hyperpara['train_file_name'] = _train_file_name
    data_hyperpara['test_file_name'] = _test_file_name
    data_hyperpara['train_valid_ratio'] = [0.8, 0.2]
    data_hyperpara['all_output'] = args.all_output

    _, datainfo = read_data(data_hyperpara['folder_name'], data_hyperpara['train_file_name'], data_hyperpara['test_file_name'], data_hyperpara['train_valid_ratio'], data_hyperpara['all_output'])
    model_architecture, model_hyperpara = model_setup(args.model_type, datainfo[2], args.test_type)
    train_result = train_run_for_each_model(model_architecture, model_hyperpara, train_hyperpara, data_hyperpara, mat_file_name, useGPU=args.use_gpu, GPU_device=args.gpu_device, doLifelong=do_lifelong)
Example #35
def cal():
    parser = read_data()
    #parser.read_mq2008('./MQ2008')
    parser.read_mq2007('./MQ2007')
    scores = []

    for k in range(5):
        scores.append([])

    for i in range(5):
        print("===========fold{}=================".format(i + 1))
        train, vali, test = parser.get_fold(i)
        X, y, qid = train

        X_test, y_test, qid_test = test
        X_vali, y_vali, qid_vali = vali

        model = AdaRank(scorer=NDCGScorer_qid(K=5))
        model.fit(X, y, qid, X_vali, y_vali, qid_vali)

        pred = model.predict(X_test)
        for k in range(5):
            score = round(
                NDCGScorer_qid(K=k + 1)(y_test, pred, qid_test).mean(), 4)
            scores[k].append(score)
            print('nDCG@{}\t{}\n'.format(k + 1, score))
    print("==============Mean NDCG==================")
    for f in range(5):
        print("mean NDCG@{}\t{}\n".format(f + 1, round(np.mean(scores[f]), 4)))
Example #36
def get_data(data_file,
             table=[2, 3, 4],
             udgs_only=True,
             environment=['all'],
             sort_param='Re',
             verbose=True):
    """
    Load the requested results table(s), optionally keep only the UDGs, filter
    by environment, and return the subset sorted by `sort_param`.
    """

    if verbose:
        print('\n{0}\n'.format('-' * 150))
        print("File:       ", data_file)
        print("Table:      ", table)
        print("Objects:    ", "UDGs" if udgs_only else "Candidates")
        print("Environment:", environment)
        print("Sort By:    ", sort_param, '\n')

    # Load Data from Appropriate Tables
    df_results = read_data(data_file, udg_only=udgs_only)
    df_subset = df_results.loc[df_results["TABLE"].isin(table)]

    # Filter for Environment
    for env in environment:
        if env.lower() in ['sparse', 'dense']:
            df_subset = df_subset.loc[df_subset["LocalEnv"] == env.title()]
        elif env.lower() in ['cluster', 'non-cluster']:
            df_subset = df_subset.loc[df_subset["GlobalEnv"] == env.title()]
        elif env.lower() in ['high', 'low']:
            df_subset = df_subset.loc[df_subset["Density"] == env.title()]

    # Sort Data
    df_subset = df_subset.sort_values(by=sort_param)
    df_subset = df_subset.reset_index(drop=True)

    return df_subset
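A hedged usage sketch; the file name is hypothetical and the column names are the ones referenced inside the function above.

df_udgs = get_data("results_table.csv",        # hypothetical data file
                   table=[2, 3],
                   udgs_only=True,
                   environment=['cluster'],
                   sort_param='Re',
                   verbose=False)
print(df_udgs[["TABLE", "GlobalEnv", "Re"]].head())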
Example #37
def train(model, data, sample_fraction):
    print("-- RUNNING TRAINING --")

    batch_size = 32
    epochs = 1

    # The data, split between train and test sets
    (x_train, y_train, classes) = read_data(data,sample_fraction=sample_fraction)
    """
    num = 3 # Number of Clients
    ran_order = sample(range(0, x_train.shape[0]), x_train.shape[0])
    local_size=int(x_train.shape[0]/num)
    partitionedX=[]
    partitionedY=[]
    for i in range(0,num):
        partitionedX.append(x_train[ran_order[i*local_size:(i+1)*local_size]])
        partitionedY.append(y_train[ran_order[i*local_size:(i+1)*local_size]])
    X = numpy.array(partitionedX)
    Y = numpy.array(partitionedY)
    """

    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)

    print("-- TRAINING COMPLETED --")
    return model
Example #38
def main():
	path = os.getcwd()
	data_path = os.path.join(path, 'train_data/')
	lstm_path = os.path.join(path, 'torcs-server/torcs-client/lstm.h5')
	dense_path = os.path.join(path, 'torcs-server/torcs-client/dense.h5')

	category_index_input, category_index_output, input_data, output_data = rd.read_data(data_path)

	# LSTM_network(lstm_path, input_data, output_data)
	Dense_network(dense_path, input_data, output_data)
Example #39
 def setUp(self):
     """ initialize the variables used in the tests """
     [mylambda, n_iter, tol, temperature,tiheydenmuutos, d_rho, askel,
      ny, nx, nz, elektroni_lkm, elektroni_tiheys,
      V_hartree, ydin_tiheys, ytimien_lkm, ydin_tiheys] = \
      read_data.read_data(filename = '../test/alkuarvot.txt_5x5x0')
     self.elektroni_tiheys = elektroni_tiheys
     self.V_hartree1 = V_hartree
     self.V_hartree2 = V_hartree
     self.V_hartree3 = V_hartree
     self.ydin_tiheys = ydin_tiheys
Example #40
def plot_data(extension):
# Plot data file with a given extension
	rc('text', usetex=False)

	fig = plt.figure()
	ax = fig.add_subplot(111)

	label = []
	for file in os.listdir('.'):
		if file.endswith(extension):
			t, h = rd.read_data(file)
			ax.plot(t, h, 'o-', label=file)

	ax.legend()
	plt.show(block=True)
Example #41
 def setUp(self):
     """ initialize the variables used in the tests """
     [mylambda, n_iter, tol, temperature,tiheydenmuutos, d_rho, askel,
      ny, nx, nz, elektroni_lkm, elektroni_tiheys,
      V_hartree, ydin_tiheys, ytimien_lkm, ydin_tiheys] = \
      read_data.read_data(filename = '../test/alkuarvot.txt_5x5x0')
     self.nx = nx
     self.ny = ny
     self.nz = nz
     self.h = 0.1
     self.askel = askel         
     self.mylambda = None
     self.n_iter = None
     self.tol = None
     self.temperature = None
     self.tiheydenmuutos = None
     self.d_rho = None
     self.askel = None
     init_value = 0.0
     
     self.summa = None
     
     # create the test variable
     self.testgrid = gridi.Gridi(nx=5,ny=5,nz=5,h=0.1,init_value=0.0)
Example #42
# -*- coding: utf-8 -*-

# anomalies

import matplotlib as mpl  # Uncomment so that a window is not shown for each plot
mpl.use('Agg')            # Uncomment so that a window is not shown for each plot
import numpy as np
import os
from PyFuncemeClimateTools import ClimateStats as cs
from PyFuncemeClimateTools import PlotMaps as pm
from read_data import read_data

pcp, pcpe, obs = read_data()

nla = np.linspace(-90., 90., 181)  # lat

nlo = np.linspace(-180., 179., 360)  # lon

x, y = np.meshgrid(nlo, nla)

y1, y2, x1, x2 = -60., 15., -90., -30.  # region

# # diff of the climatologies
# clim_diff = np.nanmean(pcpe[0:30, :, :], axis=0) - \
#             np.nanmean(pcp[0:30, :, :], axis=0)
#
# figtitle = u'CLIMATOLOGIA (MM) (EXP - CONTROL)'
#
# directory = 'figs_expsolar/clim'
#
# if not os.path.exists(directory):
Example #43
def plot_beach(columns, df=None, beaches=None, separate_beaches=False, **kwds):
    '''
    Plots the specified column of data for the specified beaches.

    Inputs
    ------
    columns          : One or more column names/indexes of data to plot.
    df               : The dataframe of data. If None, then the dataframe
                       will be read in using read_data.
    beaches          : Name or list of names of beaches to plot. If None, all
                       beaches will be used.
    separate_beaches : If False, each beach will be plotted on the same axis.
                       Otherwise, each beach will be plotted on its own axis.

    keyword arguments
    -----------------
    Other keyword arguments will be passed to the plot routine.

    Returns
    fig : The figure object.
    ax  : If separate_beaches is false, then this is the axis object.
          Otherwise, it is the array of axis objects.

    Example
    -------
    >>> import read_data as rd
    >>> import visualizations as viz
    >>> df = rd.read_data()
    >>> beaches = ['Juneway', 'Rogers', 'Howard']
    >>> col = 'Escherichia.coli'
    >>> viz.plot_beach(col, df=df, beaches=beaches, separate_beaches=True)
    '''
    if df is None:
        df = read_data.read_data()
    if beaches is None:
        beaches = df['Client.ID'].dropna().unique().tolist()
    if type(beaches) is str:
        # be flexible with scalar vs. vector input
        beaches = [beaches]

    if separate_beaches:
        fig, ax = plt.subplots(len(beaches), 1, sharex=True, sharey=True)
        for i, beach in enumerate(beaches):
            filt = df['Client.ID'] == beach
            df[filt].plot(y=columns, ax=ax[i], **kwds)
            ax[i].set_title(beach)
    else:
        fig, ax = plt.subplots(1,1)
        for i, beach in enumerate(beaches):
            if type(columns) is str:
                l = i
            else:
                l = i * len(columns)
            filt = df['Client.ID'] == beach
            df[filt].plot(y=columns, ax=ax, **kwds)
            # TODO: cannot get this legend stuff to work...
            for txt in ax.legend().get_texts()[l:]:
                txt.set_text(beach + ': ' + txt.get_text())

    plt.show(block=TO_BLOCK)

    return fig, ax
Example #44
def movie(compare_column=None, df=None):
    '''
    Creates an animation of the beaches E. coli levels represented as circles.
    The circle's radius is proportional to the log of the E. coli levels.
    Additionally, when the E. coli level is above the threshold of 235 PPM,
    the circle color changes from blue to purple. You can optionally choose
    to vary the background color of the animation with another column of data,
    however, this does not seem like a great way to visualize the relationship
    between E. coli levels and another data-stream.

    Inputs
    ------
    compare_column : The name or index of the column that will be used to vary
                     the background color. If compare_column is None, then the
                     background color will remain static.
    df             : The dataframe to use. If None, then the dataframe will be
                     read in using read_data.

    Returns
    -------
    anim : The animation object.

    Example
    -------
    >>> import read_data as rd
    >>> import visualizations as viz
    >>> df = rd.read_data()
    >>> viz.movie(df=df)
    '''

    if df is None:
        df = read_data.read_data()
    if compare_column is None:
        to_compare = False
    else:
        to_compare = True

    if to_compare:
        compare_min = df[compare_column].dropna().min()
        compare_max = df[compare_column].dropna().max()
        bg_min_color = np.array([.75, .5, .2])
        bg_max_color = np.array([.999, .999, 0.9])

    file_name = '../data/ExternalData/Beach_Locations.csv'
    beach_locs = read_data.read_locations(file_name)

    # compute Mercator projection of lat/longs
    phi = 0.730191653

    beach_locs['Latitude'] = beach_locs['Latitude'] * 110574.0
    beach_locs['Longitude'] = beach_locs['Longitude'] * 111320.0 * np.cos(phi)

    lat_min = beach_locs['Latitude'].min()
    lat_max = beach_locs['Latitude'].max()
    lat_rng = lat_max - lat_min
    lon_min = beach_locs['Longitude'].min()
    lon_max = beach_locs['Longitude'].max()
    lon_rng = lon_max - lon_min

    def generate_index():
        for timestamp in df.index.unique():
            readings = df.ix[timestamp, 'Escherichia.coli']
            if to_compare:
                compare = df.ix[timestamp, compare_column]
                if type(compare) is pd.Series:
                    compare = compare.dropna().mean()
                if np.isnan(compare):
                    continue
            if ((type(readings) is np.float64 and not np.isnan(readings)) or
                    (type(readings) is not np.float64 and readings.count())):
                if not to_compare:
                    compare = None
                yield timestamp, compare

    def animate(timestamp_and_compare):
        timestamp = timestamp_and_compare[0]
        compare = timestamp_and_compare[1]

        if to_compare:
            compare = (compare - compare_min) / compare_max
            bg_color = bg_min_color * compare + bg_max_color * (1. - compare)
            ax.set_axis_bgcolor(bg_color)

        for i, b in enumerate(beach_locs['Beach']):
            beach_filt = df.ix[timestamp, 'Client.ID'] == b
            beach_skipped = False
            try:
                if not beach_filt.sum() == 1:
                    beach_skipped = True
            except AttributeError:  # is a boolean
                if not beach_filt:
                    beach_skipped = True

            if beach_skipped:
                ecoli = 0
            else:
                ecoli = float(df.ix[timestamp, 'Escherichia.coli'][beach_filt])

            r = 200 * np.log(ecoli)

            if b in circle_indexes:
                ax.artists[circle_indexes[b]].set_radius(r)
                if ecoli >= 235:
                    ax.artists[circle_indexes[b]].set_facecolor(
                        (0.301, 0, 1, 0.75))
                else:
                    ax.artists[circle_indexes[b]].set_facecolor(
                        (0, 0.682, 1, 0.75))
            else:
                circ = plt.Circle((beach_locs.ix[i,'Longitude'],
                                   beach_locs.ix[i,'Latitude']),
                                  radius=r, edgecolor='none')
                ax.add_artist(circ)
                circle_indexes[b] = len(ax.artists) - 1
                if ecoli >= 235:
                    ax.artists[circle_indexes[b]].set_facecolor(
                        (0.301, 0, 1, 0.75))
                else:
                    ax.artists[circle_indexes[b]].set_facecolor(
                        (0, 0.682, 1, 0.75))
        ax.title.set_text(timestamp.strftime('%d %B %Y'))
        return ax

    fig = plt.figure(figsize=(18,10))
    ax = plt.gcf().gca()
    ax.set_xlim([lon_min - lon_rng * 0.4, lon_max + lon_rng * 0.15])
    ax.set_ylim([lat_min - lat_rng * 0.2, lat_max + lat_rng * 0.2])
    ax.set_aspect('equal')
    circle_indexes = {}

    anim = animation.FuncAnimation(fig, animate, generate_index, repeat=False)
    plt.show(block=TO_BLOCK)

    return anim
Example #45
def beach_hist(col='Escherichia.coli', beaches=None,
               subplots=False, transform=lambda x: x, df=None):
    '''
    Plots histograms of a specified column for the specified beaches.

    Inputs
    ------
    col       : Column name or index of the column to be histogrammed
    beaches   : List of beach names to generate histograms for, None indicates
                that all beaches should be used.
    subplots  : False to have each beach's histogram be plotted on the same
                axis. Otherwise, subplots is a list with two elements specifying
                the dimensions of the subplot array. For example, [8, 4] will
                create an 8x4 grid of subplots. There must be at least as many
                subplot axes as beaches.
    transform : A function to transform the data; can be useful to log-scale
                the E. coli readings to make the histogram more spread out.
    df        : The dataframe containing the data. If None, the data will be
                read in using read_data.

    Example
    -------
    >>> import read_data as rd
    >>> import visualizations as viz
    >>> import numpy as np
    >>> df = rd.read_data()
    >>> # Will be very messy, you should only plot on the same axis when there
    >>> # are only a few beaches to plot
    >>> viz.beach_hist(transform=lambda x: np.log(x+1), df=df)
    >>> viz.beach_hist(transform=lambda x: np.log(x+1), df=df, subplots=[7, 4])
    '''

    if df is None:
        df = read_data.read_data()

    if beaches is None:
        beaches = df['Client.ID'].dropna().unique().tolist()

    if subplots:
        try:
            if len(subplots) != 2:
                raise ValueError('subplots must have exactly 2 elements')
        except TypeError:
            raise TypeError('subplots must be an iterable with 2 elements')

        if subplots[0] * subplots[1] < len(beaches):
            raise ValueError('not enough subplots for each beach')

        min_x = np.inf
        max_x = -np.inf
        for b in beaches:
            data = df[df['Client.ID'] == b][col].map(transform)
            if data.min() < min_x and not np.isinf(data.min()):
                min_x = data.min()
            if data.max() > max_x and not np.isinf(data.max()):
                max_x = data.max()

        fig, ax = plt.subplots(subplots[0], subplots[1],
                               sharex=True, sharey=True)
        ax = ax.flatten()

        for i, b in enumerate(beaches):
            df[df['Client.ID'] == b][col].map(transform).hist(
                normed=1, ax=ax[i], bins=np.linspace(min_x, max_x, 15)
            )
            ax[i].set_ylabel(b)
            ax[i].set_yticklabels([])

        for i in range(len(beaches) + 1, len(ax)):
            ax[i].set_yticklabels([])

    else:
        fig, ax = plt.subplots(1)
        for b in beaches:
            df[df['Client.ID'] == b][col].map(transform).hist(
                normed=True, alpha=.5, ax=ax
            )
        ax.legend(beaches)

    plt.show(block=TO_BLOCK)
Example #46
do_sd = False
filenames = ['alkuarvot.txt_5x5x0', 'alkuarvot.txt_10x10x0',
             'alkuarvot.txt_16x16x0']

#filenames = ['alkuarvot.txt_10x10x0',
#             'alkuarvot.txt_16x16x0']
#filenames = ['alkuarvot.txt_5x5x0']
#filenames = ['alkuarvot.txt_10x10x0']
#filenames = ['alkuarvot.txt_16x16x0']
if do_mc:
    for filename in filenames:
        # the number of electrons is constant
        [mylambda, n_iter, tol, temperature,tiheydenmuutos, d_rho, askel,
         ny, nx, nz, elektroni_lkm, elektroni_tiheys,
         V_hartree, ydin_tiheys, ytimien_lkm, ydin_tiheys] = \
         read_data.read_data(filename = filename)

        elektroni_tiheys.set_summa()
        elektroni_tiheys.set_mylambda(mylambda)
        elektroni_tiheys.set_n_iter(n_iter)
        elektroni_tiheys.set_tol(tol)
        elektroni_tiheys.set_temperature(temperature)
        elektroni_tiheys.set_tiheydenmuutos(tiheydenmuutos)
        elektroni_tiheys.set_d_rho(d_rho)
        elektroni_tiheys.set_askel(askel)

        outfile = open('mc_'+filename+'energiat.txt', 'w')
        start = time.time()
        konvergoinut = \
            laskentaa.minimoi_monte_carlolla(
            outfile, 
Example #47
def run_training():
  """Train MNIST for a number of steps."""
  # Get the sets of images and labels for training, validation, and
  # test on MNIST.
  filename = "Data4-10.txt"
  vectors_data1,labels_data1 = read_data.read_data(filename)
  filename = "Data21-25.txt"
  vectors_data2,labels_data2 = read_data.read_data(filename)
  filename = "Data26-29.txt"
  vectors_data3,labels_data3 = read_data.read_data(filename)

  vectors_data = np.vstack((vectors_data1,vectors_data2,vectors_data3))
  print(vectors_data.shape)
  labels_data = np.vstack((np.reshape(labels_data1,(len(labels_data1),1)),
    np.reshape(labels_data2,(len(labels_data2),1)),np.reshape(labels_data3,(len(labels_data3),1))))
  labels_data = np.reshape(labels_data,-1)
  print(labels_data.shape)

  filename = "test30-31.txt"
  validation_data,vlabels_data = read_data.read_data(filename)
  filename = "valid18-20.txt"
  test_data,tlabels_data = read_data.read_data(filename)


  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
    # Generate placeholders for the images and labels.
    vectors_placeholder, labels_placeholder = placeholder_inputs(
        FLAGS.batch_size)

    # Build a Graph that computes predictions from the inference model.
    logits = mnist.inference(vectors_placeholder,
                             FLAGS.hidden1,
                             FLAGS.hidden2)

    # Add to the Graph the Ops for loss calculation.
    loss = mnist.loss(logits, labels_placeholder)

    # Add to the Graph the Ops that calculate and apply gradients.
    train_op = mnist.training(loss, FLAGS.learning_rate)

    # Add the Op to compare the logits to the labels during evaluation.
    eval_correct = mnist.evaluation(logits, labels_placeholder)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()

    # Create a session for running Ops on the Graph.
    sess = tf.Session()

    # Run the Op to initialize the variables.
    init = tf.initialize_all_variables()
    sess.run(init)

    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)

    # And then after everything is built, start the training loop.
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()

      # Fill a feed dictionary with the actual set of images and labels
      # for this particular training step.
      feed_dict = fill_feed_dict(step,vectors_data,labels_data,
                                 vectors_placeholder,
                                 labels_placeholder)

      # Run one step of the model.  The return values are the activations
      # from the `train_op` (which is discarded) and the `loss` Op.  To
      # inspect the values of your Ops or variables, you may include them
      # in the list passed to sess.run() and the value tensors will be
      # returned in the tuple from the call.
      _, loss_value = sess.run([train_op, loss],feed_dict=feed_dict)

      duration = time.time() - start_time

      # Write the summaries and print an overview fairly often.
      if step % 100 == 0:
        # Print status to stdout.
        print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
        # Update the events file.
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, step)

      # Save a checkpoint and evaluate the model periodically.
      if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        saver.save(sess, FLAGS.train_dir, global_step=step)
        # Evaluate against the training set.
        print('Training Data Eval:')
        do_eval(sess,
                eval_correct,
                vectors_placeholder,
                labels_placeholder,
                vectors_data,
                labels_data)
        # Evaluate against the validation set.
        print('Validation Data Eval:')
        do_eval(sess,
                eval_correct,
                vectors_placeholder,
                labels_placeholder,
                validation_data,
                vlabels_data)
        # Evaluate against the test set.
        print('Test Data Eval:')
        do_eval(sess,
                eval_correct,
                vectors_placeholder,
                labels_placeholder,
                test_data,
                tlabels_data)
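
# The helpers fill_feed_dict() and do_eval() are not part of this excerpt. Below is
# a minimal sketch of what a batching helper like fill_feed_dict might look like,
# assuming simple wrap-around slicing of the full arrays (hypothetical, not the
# author's implementation):

import numpy as np

def fill_feed_dict_sketch(step, vectors, labels, vectors_pl, labels_pl,
                          batch_size=100):
  # Pick a contiguous batch of examples, wrapping around the data set so the
  # training loop above can run for an arbitrary number of steps.
  idx = np.arange(step * batch_size, (step + 1) * batch_size) % len(vectors)
  return {vectors_pl: vectors[idx], labels_pl: labels[idx]}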
Beispiel #48
0
from sklearn.ensemble import AdaBoostClassifier
from read_data import read_data

def results_from_examples(ps,ls):
    return [1 if p == l else 0 for p,l in zip(ps,ls)]

def error_rate(rs):
    return 1.0-((1.0*sum(rs))/len(rs))

print "Sklearn"
examples,labels = read_data('Data/clean1_clean.data')
clf = AdaBoostClassifier(n_estimators=50)
a = clf.fit(examples, labels)
score = a.score(examples, labels)
i = 0
print "Estimator, Ensemble error, Classifier error"
for value in clf.staged_predict(examples):
    rs = results_from_examples(value, labels)
    #print "Estimator: " + str(i) + " Ensemble error: " + str(error_rate(rs)) + " Classifier error: " + str(clf.estimator_errors_[i])
    print str(i) + "," + str(error_rate(rs)) + "," + str(clf.estimator_errors_[i])
    i = i + 1

print score
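
# An equivalent, more idiomatic sketch using sklearn's staged_score, which yields
# the ensemble accuracy after each boosting round (assumes the same data loaded
# above with read_data('Data/clean1_clean.data')):
clf2 = AdaBoostClassifier(n_estimators=50)
clf2.fit(examples, labels)
for i, acc in enumerate(clf2.staged_score(examples, labels)):
    # ensemble error = 1 - staged accuracy; estimator_errors_ gives each round's
    # weak-learner error
    print("%d,%f,%f" % (i, 1.0 - acc, clf2.estimator_errors_[i]))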
Beispiel #49
0
  ylabel('magnitude')
  legend(loc=1)
 #plot(ct,cx,'k.', ms=3)
  ax=subplot(313)
  ct, cx, ce = merge(t, lc1, e1, lc2, e2, mc_delay[0], mc_delta_mag[0])
  errorbar(ct,cx, ce,fmt='b.', ms=3, label = 'merged light curve')
  xlabel('time, day')
  ylabel('magnitude')
  legend(loc=1)
  fig.savefig("mc_light_curves_%s.png"%(output_tag))

  print 'Process Started on', t0
  print 'It is currently   ', datetime.datetime.now()
  #show()

#################################
#END def emcee_delay_estimator(...)

t, mag1, e1, mag2, e2, mag3, e3, mag4, e4 = read_data('../data/cosmograil/RXJ1131_Tewes2013.rdb') 
e1=array([max(0.1,x) for x in e1])
e2=array([max(0.1,x) for x in e2])
e3=array([max(0.1,x) for x in e3])
e4=array([max(0.1,x) for x in e4])
t    =t[0:70]
lc1  =mag1[0:70]
err1 =e1[0:70]
lc2  =mag4[0:70]
err2 =e4[0:70]
emcee_delay_estimator(t, lc1, err1, lc2, err2, 'RXJ1131_curves_AD')
#emcee_delay_estimator(t, mag2, e2, mag3, e3, 'RXJ1131_curves_BC')
Beispiel #50
0
import read_data as rd
import basic_analyze as az
import regsvd_sgd as mf
import datetime as dt

if __name__ == '__main__':

    path = '/home/bliuab/tencent/data/all'
    data = rd.read_data(path)
    #az.analyze_count(data)

    #the data before split_time is for training, after that for testing
    k = 100
    split_time = dt.datetime(2014,9,24,1)
    mf.mf(data, split_time, k)
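    # The train/test split itself happens inside mf.mf(); conceptually it is a
    # time-based cut at split_time, e.g. (hypothetical sketch, assuming each
    # record carries a timestamp field):
    #
    #   train = [r for r in data if r.timestamp < split_time]
    #   test = [r for r in data if r.timestamp >= split_time]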
def check_sample_times(df=None, to_plot=False):
    '''
    Investigates whether there is a relationship between the time a
    sample was taken and the E. coli reading. A possible hypothesis
    is that samples taken later in the day tend to read higher.

    The conclusions from this function seem to indicate that there
    is not a substantial relationship between sample time and E.
    coli reading.

    Inputs
    ------
    df      : Dataframe object, should contain at least the columns
              'Client.ID', 'Escherichia.coli', 'Sample.Collection.Time',
              if df is None, then it will be read in from read_data.
    to_plot : Boolean, if true, the results will be printed and
              plotted. Otherwise, just the cleansed dataframe will
              be returned.

    Returns
    -------
    ct : Dataframe of collection times and E. coli readings.
         The column 'Sample.Collection.Time' is the fraction of the day,
         for example, a value of 0.50 indicates the collection happened
         at noon, a value of 0.25 would indicate 6:00 AM, etc.
    '''
    if df is None:
        df = rd.read_data()

    ct = df[['Client.ID', 'Escherichia.coli', 'Sample.Collection.Time']].dropna()

    def clean_times(s):
        '''
        Takes in a string from the sample collection column and
        makes it machine readable if possible, and a NaN otherwise
        '''
        if type(s) is not str:
            if type(s) is dt.datetime or type(s) is dt.time:
                return dt.datetime(2016, 1, 1, hour=s.hour, minute=s.minute)

        try:
            if ':' not in s:
                return float('nan')
            i = s.index(':')
            hr = int(s[max(i - 2, 0):i])
            mn = int(s[i+1:i+3])

            return dt.datetime(2016, 1, 1, hour=hr, minute=mn)
        except:
            return float('nan')

    ct['Sample.Collection.Time'] = ct['Sample.Collection.Time'].map(clean_times)
    ct = ct.dropna()
    ct['Sample.Collection.Time'] = ct['Sample.Collection.Time'].map(
        lambda x: x.hour / 24. + x.minute / (24. * 60.)
    )
    # Filter out samples taken very early in the morning or late in the evening
    # (the cutoffs below, 0.125 and 0.83, correspond to roughly 3:00 AM and 8:00 PM).
    # It seems like most of the samples recorded before the morning cutoff might
    # actually be occurring in the afternoon. I've tried taking these and manually
    # changing them to the afternoon and there was no significant change in results.
    ct = ct[(ct['Sample.Collection.Time'] > .125) & (ct['Sample.Collection.Time'] < .83)]

    if to_plot:
        # t-test
        ct_low = ct[ct['Escherichia.coli'] < 235]
        ct_high = ct[ct['Escherichia.coli'] >= 235]
        ttest = scipy.stats.ttest_ind(ct_low['Sample.Collection.Time'],
                                      ct_high['Sample.Collection.Time'])
        print('tests comparing below threshold to above threshold:')
        print('\tOVERALL:')
        print('\tt-statistic: {0}\n\tp-value    : {1}'.format(ttest[0], ttest[1]))

        low_mean = ct_low['Sample.Collection.Time'].mean()
        low_mean_hr = int(low_mean * 24)
        low_mean_min = str(int((low_mean * 24 - low_mean_hr) * 60))
        if len(low_mean_min) < 2:
            low_mean_min = '0' + low_mean_min
        print('\tbelow thresh mean: {0} ({1})'.format(
            low_mean, str(low_mean_hr) + ':' + low_mean_min
        ))
        high_mean = ct_high['Sample.Collection.Time'].mean()
        high_mean_hr = int(high_mean * 24)
        high_mean_min = str(int((high_mean * 24 - high_mean_hr) * 60))
        if len(high_mean_min) < 2:
            high_mean_min = '0' + high_mean_min
        print('\tabove thresh mean: {0} ({1})'.format(
            high_mean, str(high_mean_hr) + ':' + high_mean_min
        ))

        ttests = []
        for b in ct['Client.ID'].dropna().unique().tolist():
            xl = ct_low[ct_low['Client.ID'] == b]
            xh = ct_high[ct_high['Client.ID'] == b]
            ttests.append(scipy.stats.ttest_ind(xl['Sample.Collection.Time'],
                                                xh['Sample.Collection.Time']))
            ttest = ttests[-1]
            print('\t' + b)
            print('\t\tt-statistic: {0}\n\t\tp-value    : {1}'.format(ttest[0], ttest[1]))
        plt.hist([tt[1] for tt in ttests])

        # qq-plot
        x = []
        y = []
        for p in np.linspace(0,1,1000):
            x.append(ct_low['Sample.Collection.Time'].quantile(p))
            y.append(ct_high['Sample.Collection.Time'].quantile(p))
        ax = plt.subplots(1)[1]
        ax.plot([0, 1], [0, 1], 'r--')
        ax.hold(True)
        ax.plot(x, y)
        ax.set_xlabel('Below Threshold Quantiles')
        ax.set_ylabel('Above Threshold Quantiles')
        ax.set_aspect('equal')

        # set e coli to log scale
        ct['Escherichia.coli'] = ct['Escherichia.coli'].map(lambda x: np.log(x + 1.))

        # correlations
        print('Correlations between log(E. coli) and Sample collection time:')
        print('\tPearson correlation : ' + str(ct.corr(method='pearson').ix[0,1]))
        print('\tSpearman correlation: ' + str(ct.corr(method='spearman').ix[0,1]))

        # scatter plot
        ct.plot(y='Escherichia.coli', x='Sample.Collection.Time', style='.')
        ax = plt.gca()
        ax.set_xlim([ct['Sample.Collection.Time'].min(), ct['Sample.Collection.Time'].max()])

        # histograms
        tb = viz.TO_BLOCK
        viz.TO_BLOCK = False
        fig, ax = viz.plot_beach(columns='Sample.Collection.Time', df=ct)
        viz.TO_BLOCK = tb
        ax.legend_.remove()
        plt.show(tb)
        ct['Escherichia.coli'] = ct['Escherichia.coli'].map(lambda x: np.exp(x) - 1.)

    return ct
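
# A minimal usage sketch (hypothetical; assumes this analysis module is run
# directly and that rd.read_data() is available as in the function above):
if __name__ == '__main__':
    ct_sample = check_sample_times(to_plot=False)
    # 'Sample.Collection.Time' is a fraction of the day (0.5 == noon), so
    # multiplying by 24 recovers the hour of day.
    print((ct_sample['Sample.Collection.Time'] * 24).describe())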
from evaluation import average_relative_error
from save_result import save_to_file
import sys
# Poulis set k=25, m=2 as default!

if __name__ == '__main__':
    if len(sys.argv) <= 1:
        flag = True
    elif sys.argv[1] == 'DA':
        flag = False
    else:
        flag = True
    #read gentree tax
    att_tree = read_tree()
    #read record
    trans = read_data()
    # remove duplicate items
    for i in range(len(trans)):
        trans[i] = list(set(trans[i]))
    if flag:
        print "Begin AA"
        cut = AA(att_tree, trans)
    else:
        print "Begin DA"
        cut = DA(att_tree, trans)
    # cut = AA(att_tree[-1], trans)
    print "Final Cut"
    print cut
    result = trans_gen(trans, cut)
    save_to_file(result)
    print "Finish T-Anonymization!!"
Beispiel #53
0
task_dir = config().task.dir
kb_index = index_ent_rel(os.path.join(task_dir, 'train.txt'),
                         os.path.join(task_dir, 'valid.txt'),
                         os.path.join(task_dir, 'test.txt'))
n_ent, n_rel = graph_size(kb_index)

models = {'TransE': TransE, 'TransD': TransD, 'DistMult': DistMult, 'ComplEx': ComplEx}
gen_config = config()[config().g_config]
dis_config = config()[config().d_config]
gen = models[config().g_config](n_ent, n_rel, gen_config)
dis = models[config().d_config](n_ent, n_rel, dis_config)
gen.load(os.path.join(task_dir, gen_config.model_file))
dis.load(os.path.join(task_dir, dis_config.model_file))

train_data = read_data(os.path.join(task_dir, 'train.txt'), kb_index)
inplace_shuffle(*train_data)
valid_data = read_data(os.path.join(task_dir, 'valid.txt'), kb_index)
test_data = read_data(os.path.join(task_dir, 'test.txt'), kb_index)
filt_heads, filt_tails = heads_tails(n_ent, train_data, valid_data, test_data)
valid_data = [torch.LongTensor(vec) for vec in valid_data]
test_data = [torch.LongTensor(vec) for vec in test_data]
tester = lambda: dis.test_link(valid_data, n_ent, filt_heads, filt_tails)
train_data = [torch.LongTensor(vec) for vec in train_data]

dis.test_link(test_data, n_ent, filt_heads, filt_tails)

corrupter = BernCorrupterMulti(train_data, n_ent, n_rel, config().adv.n_sample)
src, rel, dst = train_data
n_train = len(src)
n_epoch = config().adv.n_epoch
Beispiel #54
0
"""

import numpy as np
from scipy.optimize import fmin_powell, fmin, anneal
from gridi import *
from energiat import E_tot
import laskentaa
import piirtoa
import string
import read_data


#try:

[ny, nx, nz, elektroni_lkm, elektroni_tiheys,
 V_hartree, ydin_tiheys, ytimien_lkm, ydin_tiheys] = read_data.read_data(filename = 'alkuarvot.txt')

print elektroni_tiheys.get_volume()
print elektroni_tiheys.gridi
print elektroni_tiheys.to_1d_list()

print "ydintiheys",ydin_tiheys.gridi
# The number of electrons is constant
elektroni_tiheys.set_summa()
ntot = np.sum(elektroni_tiheys.gridi)*V_hartree.get_volume_of_a_box()
print "elektronien kokonaisvaraus", ntot, elektroni_tiheys.get_summa_mennyt()

# aim for a move to be accepted roughly every fifth attempt
# that keeps a suitable amount of risk in each trial
tiheydenmuutos = 0.1
outfile = open('energiat.txt', 'w')
Beispiel #55
0
def main():
    filename = 'mnist_100.csv'
    train_perc = 0.7
    label_index = 0

    acc = []
    for i in range(100):
        print(i)
        (train_x, train_y), (test_x, test_y), possibleLabels = read_data.read_data(filename, train_perc, label_index, NUM_LABELS)

        numAttributes = len(train_x[0])
        numLabels = NUM_LABELS

        x = tf.placeholder(tf.float32, shape=[None, numAttributes])
        y = tf.placeholder(tf.float32, shape=[None, numLabels])

        W_hidden = tf.Variable(tf.truncated_normal([numAttributes, NUM_NEURONS], stddev=0.1))
        b_hidden = tf.Variable(tf.constant(0.1, shape=[NUM_NEURONS]))

        hidden_net = tf.matmul(x, W_hidden) + b_hidden
        hidden_out = tf.sigmoid(hidden_net)

        W_outlayer = tf.Variable(tf.truncated_normal([NUM_NEURONS, numLabels], stddev=0.1))
        b_outlayer = tf.Variable(tf.constant(0.1, shape=[numLabels]))

        output_net = tf.matmul(hidden_out, W_outlayer) + b_outlayer

        if numLabels == 1:
            predict = tf.sigmoid(output_net)
        else:
            predict = tf.nn.softmax(output_net)

        if numLabels == 1:
            cost = tf.reduce_sum(0.5 * (y - predict) * (y - predict))
        else:
            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=output_net))

        trainStep = tf.train.AdamOptimizer(LEARNING_RATE).minimize(cost)

        with tf.Session() as sess:

            step = 0
            printEvery = 100
            maxIterations = 1000
            totalTime = 0

            sess.run(tf.global_variables_initializer())

            while step < maxIterations:
                step += 1

                # train the network
                startTime = time.process_time()
                sess.run(trainStep, feed_dict={x: train_x, y: train_y})
                totalTime += time.process_time() - startTime

                if step % printEvery == 0:
                    #p = sess.run(predict, feed_dict={x: train_x})

                    print("\nStep:", step, "\tTime:", totalTime / step)
                    #cm = confusion_matrix.buildConfusionMatrix(p, train_y, numLabels)
                    #confusion_matrix.printConfusionMatrix(cm, possibleLabels)
                    #print("Training:")
                    #confusion_matrix.printAccuracy(cm)

                    #print("Testing:")
                    #p = sess.run(predict, feed_dict={x: test_x})
                    #cm = confusion_matrix.buildConfusionMatrix(p, test_y, numLabels)
                    #confusion_matrix.printAccuracy(cm)

            p = sess.run(predict, feed_dict={x: test_x})

            #print("Confusion Matrix on Test Set:")
            cm = confusion_matrix.buildConfusionMatrix(p, test_y, numLabels)
            #confusion_matrix.printConfusionMatrix(cm, possibleLabels)

            #print("Average time:", totalTime / step)
            accuracy = confusion_matrix.printAccuracy(cm)
            acc.append(float(accuracy))

    print(sum(acc) / float(len(acc)))
    print (acc)
def prepare_data(df=None):
    '''
    Preps the data to be used in the model. Right now, the code itself must
    be modified to control which columns are included and how they are used.

    Parameters
    ----------
    df : Dataframe to use. If not specified, the dataframe is loaded automatically.

    Returns
    -------
    predictors : NxM DataFrame of the predictors for the classification problem.
    meta_info  : Nx2 DataFrame containing the columns 'Escherichia.coli' and
                 'Full_date', to be used, e.g., for leave-one-year-out cross
                 validation and creating the true class labels (elevated vs.
                 not elevated E. coli levels).
    '''
    if df is None:
        df = rd.read_data()

    # Leaving 2015 as the final validation set
    df = df[df['Full_date'] < '1-1-2015']


    ######################################################
    #### Add derived columns here
    ######################################################

    df['DayOfYear'] = df['Full_date'].map(lambda x: x.dayofyear)


    ######################################################
    #### List all columns you will use
    ######################################################

    # Meta columns are not used as predictors
    meta_columns = ['Full_date', 'Escherichia.coli']

    # Deterministic columns are known ahead of time; their same-day values are
    # used directly, with no prior days added.
    deterministic_columns = [
        'Client.ID', 'Weekday', 'sunriseTime', 'DayOfYear'
    ]
    deterministic_hourly_columns = [
        'precipIntensity', 'temperature', 'windSpeed',
        'windBearing', 'pressure', 'cloudCover'
    ]
    for var in deterministic_hourly_columns:
        for hr in [-12, -8, -4, 0, 4]:
            deterministic_columns.append(var + '_hour_' + str(hr))

    # Historical columns have their previous days' values added to the predictors,
    # but not the current day's value(s). The value NUM_LOOKBACK_DAYS set below
    # controls the number of previous days added. Nothing is currently done to
    # fill NA values here, so if you wish to use columns with a high rate of data
    # loss, then you should add logic to fill the NA values.
    historical_columns = [
        'precipIntensity', 'precipIntensityMax',
        'temperatureMin', 'temperatureMax',
        'humidity', 'windSpeed', 'cloudCover'
    ]

    # Each historical column will have the data from 1 day back, 2 days back,
    # ..., NUM_LOOKBACK_DAYS days back added to the predictors.
    NUM_LOOKBACK_DAYS = 3
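    # Conceptually, the prior-day predictors added below resemble a per-beach
    # shift, e.g. (illustrative sketch only, not the actual implementation of
    # rd.add_column_prior_data):
    #
    #   for n in range(1, NUM_LOOKBACK_DAYS + 1):
    #       for c in historical_columns:
    #           df[c + '_prior_' + str(n)] = df.groupby('Client.ID')[c].shift(n)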


    ######################################################
    #### Get relevant columns, add historical data
    ######################################################

    all_columns = list(set(meta_columns + deterministic_columns + historical_columns))

    df = df[all_columns]

    df = rd.add_column_prior_data(
        df, historical_columns, range(1, NUM_LOOKBACK_DAYS + 1),
        beach_col_name='Client.ID', timestamp_col_name='Full_date'
    )

    df.drop(set(historical_columns) - set(deterministic_columns), axis=1, inplace=True)


    ######################################################
    #### Process non-numeric columns
    ######################################################

    # process all of the nonnumeric columns
    # This method just assigns a numeric value to each possible value
    # of the non-numeric column. Note that this will not work well
    # for regression-style models, where instead dummy columns should
    # be created.
    def nonnumericCols(data, verbose=True):
        for f in data.columns:
            if data[f].dtype=='object':
                if (verbose):
                    print('Column ' + str(f) + ' being treated as non-numeric')
                lbl = sklearn.preprocessing.LabelEncoder()
                lbl.fit(list(data[f].values))
                data[f] = lbl.transform(list(data[f].values))
        return data

    df = nonnumericCols(df)
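
    # For regression-style models, one-hot/dummy columns would be the usual
    # alternative to the label encoding above, e.g. (hypothetical, not used here):
    #
    #   df = pd.get_dummies(df, columns=['Client.ID'])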


    ######################################################
    #### Drop any rows that still have NA, set up outputs
    ######################################################

    total_rows_predictors = df.dropna(subset=['Escherichia.coli'], axis=0).shape[0]
    nonnan_rows_predictors = df.dropna(axis=0).shape[0]
    print('Dropping {0:.4f}% of rows because predictors contain NANs'.format(
        100.0 - 100.0 * nonnan_rows_predictors / total_rows_predictors
    ))

    df.dropna(axis=0, inplace=True)

    predictors = df.drop(['Escherichia.coli', 'Full_date'], axis=1)
    meta_info = df[['Escherichia.coli', 'Full_date']]

    return predictors, meta_info
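
# Sketch of the leave-one-year-out splitting that the docstring alludes to
# (illustrative only; the actual cross-validation is handled by model() in the
# __main__ block below):
#
#   predictors, meta_info = prepare_data()
#   years = meta_info['Full_date'].map(lambda d: d.year)
#   for yr in sorted(years.unique()):
#       train_idx, test_idx = years != yr, years == yr
#       # fit on predictors[train_idx], evaluate on predictors[test_idx]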


if __name__ == '__main__':
    df = rd.read_data(read_weather_station=False, read_water_sensor=False)
    epa_model_df = df[['Drek_Prediction', 'Escherichia.coli']].dropna()
    predictors, meta_info = prepare_data(df)
    timestamps = meta_info['Full_date']
    classes = meta_info['Escherichia.coli'] > 235

    print('Using the following columns as predictors:')
    for c in predictors.columns:
        print('\t' + str(c))
    hyperparams = {
        # Parameters that affect computation
        'n_estimators':250, 'max_depth':5,
        # Misc parameters
        'n_jobs':-1, 'verbose':False
    }
    clfs, roc_ax, pr_ax = model(timestamps, predictors, classes,
    # Adapted from sklearn.metrics._binary_clf_curve:
    # scores typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    # We need to use isclose to avoid spurious repeated thresholds
    # stemming from floating point roundoff errors.
    distinct_value_indices = np.where(np.logical_not(np.abs(
        np.diff(scores)) < 0.00001))[0]
    threshold_idxs = np.r_[distinct_value_indices, labels.size - 1]

    return scores, labels, threshold_idxs
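
# In sklearn's _binary_clf_curve the scores are sorted in decreasing order before
# this step; threshold_idxs is then used to accumulate true/false positive counts,
# roughly (illustrative sketch, assuming that ordering):
#
#   tps = np.cumsum(labels)[threshold_idxs]   # true positives at each threshold
#   fps = 1 + threshold_idxs - tps            # false positives at each threshold
#   thresholds = scores[threshold_idxs]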

if __name__ == '__main__':

    TO_BLOCK = False

    df = read_data.read_data()

    scores = df[['Reading.1', 'Escherichia.coli']].dropna()['Reading.1']
    labels = df[['Reading.1', 'Escherichia.coli']].dropna()['Escherichia.coli']
    labels = labels >= 235.0
    roc(scores, labels)
    precision_recall(scores, labels)

    beach_hist(transform=lambda x: np.log(x + 1), df=df, subplots=[7, 4])

    movie(df=df)

    plt.show()
        model_suffix = time.strftime("%d_%m_%Y")
    
    directory = 'model_'+model_suffix    
    if not os.path.exists(directory):
        os.makedirs(directory)

    ##########################    
    ###   Load the data   
    ##########################
    if args.input_data:
        print('Loading data from {0}'.format(args.input_data))
        df = pd.read_csv(args.input_data, parse_dates='Full_date', low_memory=False)
        df['Full_date'] = rd.date_lookup(df['Full_date'])
    else:
        print('Reading and loading data. Saving to {}'.format(directory+'/all_data.csv'))        
        df = rd.read_data(read_weather_station=False, read_water_sensor=False, add_each_beach_data=True)
        df.to_csv(directory+'/all_data.csv', index=False)
               
    ###############################   
    ###   Prepare Predictors  
    ###############################
    if args.input_processed:
        print('Using Preprocessed data from {0} and {1}'.format(args.input_processed, args.input_meta ))
        datafilename = args.input_processed
        metadatafilename = args.input_meta
        data_processed = pd.read_csv(datafilename)
        meta_info = pd.read_csv(metadatafilename, parse_dates='Full_date')      
        meta_info['Full_date'] =  rd.date_lookup(meta_info['Full_date'])
    else:
        print('Preparing data for modeling. Saving to {0} and {1}'.format(directory+'/processed.csv', directory+'/meta_processed.csv'))
        data_processed, meta_info = prepare_data(df)
Beispiel #60
0
# Get data (observations)

DataFilesNames = ['Basket_Ball_1.dat', 'Basket_Ball_2.dat', 'Bowling_Ball_1.dat', 'Bowling_Ball_2.dat']
#DataFilesNames = ['Basket_Ball_1.dat'] 

DataFolder = 'data/'
DataFiles = [DataFolder + name for name in DataFilesNames]

# Read all data points (+ clean-up suspicious ones)
Dx = 10.0
gridpts = 1000

allt, allh, allYP = [], [], []
for datafile in DataFiles:
	YP = []
	t, h = read_data(datafile)
	t = np.array(t[:-1])/600.
	h = h[:-1]
	# Assemble YP:
	for hh in h:
		YP.append(np.linspace(hh-Dx, hh+Dx, gridpts))
	# remove last data points:
	allt.append(t)
	allh.append(h)
	allYP.append(YP)

###############################################################################
# Compute posterior of observations at all points

allINTEGRAL = []