# This file trains the neural network using the encoder and decoder.
import __init__
import sys
import numpy as np
import tensorflow as tf
import pickle
from functools import reduce
import os

# custom imports
from network.config import CONFIG
from network.classifier import build_cnn_classifier, get_batch, get_feed_dict
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens
from preprocessing.dataset import Dataset
from score import Score

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
padded_data, (max_length_question, max_length_context) = D.load_questions(CONFIG.QUESTION_FILE_V2)
print("Loaded data")

tf.reset_default_graph()
embedding = tf.placeholder(
    shape=[len(index2embedding), CONFIG.EMBEDDING_DIMENSION],
    dtype=tf.float32,
    name='embedding_ph')
train_op, loss, classifier_out = build_cnn_classifier(embedding)

root_path = __init__.root_path
results_path = root_path + '/resultsclassifier'
model_path = root_path + '/modelclassifier'
import numpy as np
import glvq  # LgmlvqModel comes from the sklearn-glvq package

from feature_extraction.sliding_window import SlidingWindow
from feature_extraction.static_features import StaticFeatures
from lvq.cross_validation import CrossValidateLvq
from preprocessing.dataset import Dataset

splits = 5

sw_file = '../feature_extraction/data/sliding_window/50_0_10_10.json'
sw = SlidingWindow(file_path=sw_file)
# sw = SlidingWindow()

sf_file = '../feature_extraction/data/static_features.csv'
sf = StaticFeatures(file_path=sf_file)

ds = Dataset()
df, mid_points = ds.generate_dataset(sw, sf, n_label_bins=20)
df = ds.rescale_labels(df, 5, 11)

# Number of label bins actually present; fall back to the largest label + 1
# if some bins are empty.
n_bins = len(np.unique(df.binned_points))
largest_label = np.unique(df.binned_points)[-1]
n_bins = n_bins if largest_label == n_bins - 1 else largest_label + 1

model = glvq.LgmlvqModel(prototypes_per_class=1)
cv_lvq = CrossValidateLvq(n_bins, splits, model)
cv_lvq.cross_validate(df, gradient=False)

print(cv_lvq.print_conv_matrix())
tp = np.sum(cv_lvq.conf_matrix.diagonal())
tot = np.sum(np.sum(cv_lvq.conf_matrix))
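# A plausible next step (a sketch, not part of the original snippet): the two sums
# above give overall accuracy, i.e. the confusion-matrix diagonal (correctly
# classified samples) divided by the total number of samples.
accuracy = float(tp) / tot
print('overall accuracy: {:.3f}'.format(accuracy))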
def test_Dataset_init():
    newDataset = Dataset("someName")
    assert isinstance(newDataset, Dataset)
    assert newDataset.outputName == "someName"
import tensorflow as tf
from functools import reduce
import os
import pickle

from preprocessing.preprocess import answer_span_to_indices

# custom imports
from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens

# Suppress tensorflow verboseness
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

print("Starting testing on dev file...")
D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
padded_data, (max_length_question, max_length_context) = D.load_questions('data/dev.json')
print("Loaded data")

# Split padded data by the first word of the question
split_data_pre = dict()
for qas in padded_data:
    first_word = D.index2word[qas["question"][0]].lower()
    if first_word not in split_data_pre:
        split_data_pre[first_word] = []
    split_data_pre[first_word].append(qas)

# Extract data bigger than batch size
split_data = dict()
# Imports required by this snippet (project modules follow the paths used
# elsewhere in this repository).
import sys

import nltk
import wikipedia
import tensorflow as tf

from preprocessing.dataset import Dataset
from network.config import CONFIG

# nltk.download('averaged_perceptron_tagger')
question_asked = input(
    "Enter a 'wh' question, for example: Who is Sachin Ramesh Tendulkar?\n")
text = nltk.word_tokenize(question_asked)
processed_pos = nltk.pos_tag(text)

# Search Wikipedia for everything after the first verb in the question.
text_to_search = ''
for index in range(len(processed_pos)):
    if "VB" in processed_pos[index][1]:
        text_to_search = ' '.join(question_asked.split(' ')[index + 1:])
        break

summary = wikipedia.summary(text_to_search)
context = ' '.join(summary.split()[:CONFIG.MAX_CONTEXT_LENGTH - 2])

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
question_encoding, context_encoding = D.encode_single_question(
    question_asked, context, CONFIG.MAX_QUESTION_LENGTH, CONFIG.MAX_CONTEXT_LENGTH)
embedding_dimension = 300

init = tf.global_variables_initializer()
latest_checkpoint_path = './model/saved-7'
print("restoring from " + latest_checkpoint_path)
saver = tf.train.import_meta_graph(latest_checkpoint_path + '.meta')
config = tf.ConfigProto()
if '--noGPU' in sys.argv[1:]:
    print("Not using the GPU...")
def test_Dataset_resolveWildcardBranch(selector, inbranches, expectBranches):
    newDataset = Dataset("someName")
    newDataset.filesAdded = True
    newDataset.branches = inbranches
    assert expectBranches == newDataset._resolveWildcardBranch(selector)
def convertTreeMulti(config, treeName, category):
    logging.info("Starting conversion using multi method")
    checkNcreateFolder(config.outputFolder)
    # For multi mode we generate a dataset per sample. Inside the loop the output
    # is disabled; at the end the dataframes of the remaining samples are appended
    # to the first one and saved.
    eventsLeft = config.maxEvents
    dfs = []
    baseDataset = None
    for iSample, sample in enumerate(config.samples):
        logging.info("Processing sample %s", sample)
        if iSample == 0:
            datasetName = config.outputPrefix + "_" + config.sampleName + "_" + config.categories[category].name
        else:
            datasetName = config.outputPrefix + "_" + config.sampleInfo[sample].name + "_" + config.categories[category].name
        dataset = Dataset(datasetName, config.outputFolder, treeName)
        logging.info("Setting sample selection: %s", config.sampleInfo[sample].selection)
        dataset.sampleSelection = config.sampleInfo[sample].selection
        logging.info("Setting category selection: %s", config.categories[category].selection)
        dataset.selection = config.categories[category].selection
        if config.excludeBranches is not None:
            dataset.ignoreBranches = config.excludeBranches
        logging.info("Setting files")
        dataset.addFiles(config.sampleInfo[sample].files)
        logging.info("Setting output branches")
        dataset.setOutputBranches(config.outputVariables)
        logging.debug("Setting indexing branches: %s", config.indexVariables)
        dataset.outputIndex = config.indexVariables
        if config.addRatio:
            dataset.setSF(config.sampleInfo[sample].addSF, "sampleRatio")
        logging.info("Starting processing dataset")
        thisSampleDF = dataset.process(eventsLeft, skipOutput=True)
        eventsLeft -= len(thisSampleDF)
        dfs.append(thisSampleDF)
        if iSample == 0:
            baseDataset = copy(dataset)
    baseDataset.makeOutput(pd.concat(dfs))
    logging.info("Finished processing")
def convertTree(config, treeName, category):
    """ Wrapper for the functionality of preprocessing.dataset """
    logging.info("Starting conversion")
    checkNcreateFolder(config.outputFolder)
    datasetName = config.outputPrefix + "_" + config.sampleName + "_" + config.categories[category].name
    dataset = Dataset(datasetName, config.outputFolder, treeName)
    logging.info("Setting sample selection: %s", config.sampleSelection)
    dataset.sampleSelection = config.sampleSelection
    logging.info("Setting category selection: %s", config.categories[category].selection)
    dataset.selection = config.categories[category].selection
    if config.excludeBranches is not None:
        dataset.ignoreBranches = config.excludeBranches
    logging.info("Setting files")
    dataset.addFiles(config.files)
    logging.info("Setting output branches")
    dataset.setOutputBranches(config.outputVariables)
    logging.debug("Setting indexing branches: %s", config.indexVariables)
    dataset.outputIndex = config.indexVariables
    if config.addRatio:
        dataset.setSF(config.sampleSF, "sampleRatio")
    logging.info("Starting processing dataset")
    dataset.process(config.maxEvents)
    logging.info("Finished processing")
from feature_extraction.sliding_window import SlidingWindow
from feature_extraction.static_features import StaticFeatures
from lvq.predictor import Predictor
from preprocessing.dataset import Dataset

sw_file = '../feature_extraction/data/sliding_window/50_0_10_10.json'
sw = SlidingWindow(file_path=sw_file)
# Gets the sliding window data from the measurements.
# Can be generated from the database or extracted from a file.

sf_file = '../feature_extraction/data/static_features.csv'
sf = StaticFeatures(file_path=sf_file)
# Gets the static features by the measure points.
# Can be generated from the database or extracted from a file.

ds = Dataset()
df, mid_points = ds.generate_dataset(sw, sf, n_label_bins=20)
# Concatenates the static data and the sliding window and bins the labels.

# df = ds.rescale_labels(df, 5, 11)
# mid_points = np.array(mid_points)[4:]
# Optional, removes barely used labels. Still needs to be optimized.

# Take one measure point for testing and remove it from the dataset.
mpdata = df.loc[df.measure_point_id == 767]
df = df[df.measure_point_id != 767]

x = df.iloc[:, 2:-4]
# the data to train lvq on,
def main():
    # TODO hyperopt for optimal parameters

    # --- OPEN CONFIGURATION FILE AND GET PARAMETERS ---
    json_file = open('config/config.json', 'r')
    json_file = json.load(json_file)
    training_path = json_file['Training']
    validation_path = json_file['Validation']
    test_path = json_file['Test']
    parameters = json_file['Parameters']

    # --- CREATE DATASET AND COMPUTE A PRE-PROCESSING ---
    dataset = Dataset(training_path, validation_path, test_path, parameters)
    x_train, y_train = dataset.get_training()
    x_val, y_val = dataset.get_validation()
    x_test, y_test = dataset.get_test()

    # Uncomment this line to perform a pre-processing study.
    # After normalization:
    plot_pixel_intensity(x_train[0], './pixel_intensity_after_normalization.png')

    # Verify that the dataset is balanced
    counters = dataset.get_counters()
    balance(counters)

    datagen = keras.preprocessing.image.ImageDataGenerator()
    train_batches = len(x_train) // parameters['BatchSize']  # // is floor division

    # --- DEFINE MODEL ---
    model = custom_cnn_2(num_classes=parameters['NumClass'],
                         heigth=parameters['Height'],
                         width=parameters['Width'],
                         channels=parameters['Channels'])
    lr = parameters['LearningRate']
    decay = lr / parameters['NumEpoch']

    # --- TRAIN MODEL ---
    model.compile(loss=categorical_crossentropy, optimizer=Adadelta(), metrics=['accuracy'])

    # --- CHECKPOINTING TO SAVE BEST NETWORK ---
    filepath = 'models_saved/custom_cnn_2.hdf5'
    checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_acc', verbose=1,
                                 save_best_only=True, mode='max')

    history = model.fit_generator(
        generator=datagen.flow(x_train, y_train, batch_size=parameters['BatchSize']),
        steps_per_epoch=train_batches,
        validation_data=[x_val, y_val],
        epochs=parameters['NumEpoch'],
        callbacks=[Metrics('logs'), checkpoint])

    model.load_weights('models_saved/custom_cnn_2.hdf5')
    model.compile(loss=categorical_crossentropy, optimizer=Adadelta(), metrics=['accuracy'])

    # --- EVALUATE MODEL ---
    score = model.evaluate(x=x_val, y=y_val, verbose=0)

    # --- PREDICT NEW VALUES ---
    # Predict on the validation set so that some metrics can be evaluated with
    # sklearn (use the test set for the true prediction).
    y_pred = model.predict(x_val)
    write_to_file(score, y_val, y_pred, dict_elem, parameters['BatchSize'],
                  parameters['NumEpoch'], "Custom CNN 2")
    classes_predicted = np.argmax(y_pred, axis=1)
    classes_true = np.argmax(y_val, axis=1)

    # --- PLOT RESULTS ---
    plot_learning_curve(history)
    plt.show()
    confusion_mtx = metrics.confusion_matrix(classes_true, classes_predicted)
    plot_confusion_matrix(confusion_mtx, classes=list(dict_elem.values()))
    plt.show()

    # --- SAVE MODEL AND WEIGHTS ---
    model_json = model.to_json()
    with open('models_saved/custom_cnn_2.json', 'w') as mod:
        mod.write(model_json)
    print("Model was saved successfully!")
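# A possible follow-up to the evaluation above (a sketch, not part of the original
# script): a per-class precision/recall/F1 report computed from the same predicted
# and true class indices via sklearn's classification_report. The helper name and
# the use of dict_elem values as class names are assumptions.
from sklearn import metrics as sk_metrics


def report_per_class_metrics(classes_true, classes_predicted, class_names):
    """Print per-class precision, recall and F1 for the CNN predictions."""
    print(sk_metrics.classification_report(classes_true, classes_predicted,
                                            target_names=class_names))

# Hypothetical usage, right after the confusion matrix is plotted in main():
# report_per_class_metrics(classes_true, classes_predicted, list(dict_elem.values()))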
import numpy as np
import tensorflow as tf
import pickle
from functools import reduce
import os

from preprocessing.preprocess import answer_span_to_indices

# custom imports
from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens

# Suppress tensorflow verboseness
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
#padded_data_squad1, (max_length_question, max_length_context) = D.load_questions('data/train.json')
#padded_data_validation = padded_data_squad1[(int) (CONFIG.TRAIN_PERCENTAGE*len(padded_data_squad1)):]
#untrained_contexts = [x["context"] for x in padded_data_validation]
#print("Loaded data from squad one")

padded_data_squad2, (max_length_question_squad2, max_length_context_squad2) = D.load_questions('data/train-v2.0.json')
print("padded_data_squad2.len = ", len(padded_data_squad2))
print("Max length from squad 2 q and c: ", max_length_question_squad2, max_length_context_squad2)
print("Loaded data from squad two")

'''
padded_data_untrained = [x for x in padded_data_squad2 if x["context"] in untrained_contexts]
unanswerable_data = [x for x in padded_data_untrained if x["answer_start"] == -1]
answerable_data = [x for x in padded_data_untrained if x["answer_start"] >= 0]
print("Number of unanswerable questions: ", len(unanswerable_data))