Example #1
def makeSplit(corpusElements, fraction=0.5):
    documentIds = corpusElements.documentsById.keys()
    sample = Split.getSample(len(documentIds), fraction)
    division = {}
    for i in range(len(documentIds)):
        division[documentIds[i]] = sample[i]
    return division
Example #2
def makeSplit(corpusElements, fraction=0.5):
    documentIds = corpusElements.documentsById.keys()
    sample = Split.getSample(len(documentIds), fraction)
    division = {}
    for i in range(len(documentIds)):
        division[documentIds[i]] = sample[i]
    return division
Example #3
def generate_category_choice(possible):
    """Generates all distinct category splits.
    possible: Possible values for the category.

    If there are n categories, there are 2^(n-1)-1 distinct possible splits.
    All the splits are generated and returned in a list.

    Uses the binary form of the numbers from 1 to 2^(n-1)-1 to generate
    the splits."""
    n = len(possible)
    splits = []
    for i in range(1, pow(2, n-1)):
        split = Split(is_numerical=False)
        for j in xrange(n):
            if (i >> j) % 2 == 1:
                split.add_category_range( possible[j] )
        splits.append( split )
    return splits
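The docstring above rests on the fact that n category values admit 2^(n-1)-1 distinct two-way splits, one per bitmask from 1 to 2^(n-1)-1. A minimal standalone sketch of that enumeration (Python 3; the values and printout are hypothetical, and no Split class is involved):

possible = ['red', 'green', 'blue']  # hypothetical category values, n = 3
n = len(possible)
for i in range(1, 2 ** (n - 1)):
    # bit j of the mask decides which side category j falls on
    left = [possible[j] for j in range(n) if (i >> j) & 1]
    right = [v for v in possible if v not in left]
    print(left, 'vs', right)
# -> ['red'] vs ['green', 'blue']
#    ['green'] vs ['red', 'blue']
#    ['red', 'green'] vs ['blue']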
Example #4
def create_library(img, path_t, lib_dir, w, Ovr, f, aug):

    fn = get_name(img)

    ### Build subfolders
    if os.path.isdir(lib_dir) is False:
        print('\nCreating training library folders at: ' + lib_dir)
        os.makedirs(lib_dir)
        os.makedirs(lib_dir + '/pics')
        os.makedirs(lib_dir + '/masks')
    pics_dir = lib_dir + '/pics'
    masks_dir = lib_dir + '/masks'

    ### Split mosaic into tiles
    print('Splitting image: %s...' % fn)
    with suppress_stdout():  ### suppress the long output
        Split.split_image(input=img,
                          output_dir=pics_dir,
                          patch_w=w,
                          patch_h=w,
                          adj_overlay_x=Ovr,
                          adj_overlay_y=Ovr,
                          out_format=f)
    os.remove('split_image_info.txt')

    truths = gpd.read_file(path_t)
    crs = truths.crs
    # print('\nCascading truths for analysis...')
    truths = gpd.GeoSeries(cascaded_union(truths['geometry']))
    truths = gpd.GeoDataFrame(geometry=truths, crs=crs)

    ### Remove bad tiles from library (usually edge tiles) & Re-number
    Filter.remove(pics_dir, truths)

    ### Create ground truth masks for tiles. Remove non-overlapping tiles
    Masks.create_masks(path_t, lib_dir, pics_dir, masks_dir, img)

    ### Augment tile-mask pairs to grow library
    Augment.augment_images(lib_dir, aug)

    print('\nLibrary build successful\n\n')

    return
Example #5
def generate_numerical_splits( records, index ):
    """Generates and fills all the possible numerical splits with the
    records, with respect to their feature in the given index.

    Returns the list of possible splits
    """
    possible = {}
    for r in records:
        possible[ r.features[index] ] = True
    possible = possible.keys()
    splits = []

    for i in xrange(0, len(possible)-1):
        s = Split(is_numerical=True)
        s.set_numerical_range( possible[i] )
        s.place( records, index )
        splits.append( s )

    return splits
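For reference, a small standalone sketch (Python 3, all names hypothetical, no Split class) of the first step above: collecting the distinct values of one feature, which yield len(distinct) - 1 candidate splits. The function iterates dictionary keys directly, while this sketch sorts the values for readability.

# Hypothetical records: each record exposes a list of feature values.
records = [
    {'features': [0.3, 2.5]},
    {'features': [0.1, 1.0]},
    {'features': [0.9, 2.5]},
    {'features': [0.4, 3.7]},
]
index = 1
distinct = sorted({r['features'][index] for r in records})  # [1.0, 2.5, 3.7]
candidates = distinct[:-1]  # one candidate split per distinct value except the last
print(candidates)  # [1.0, 2.5]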
Example #6
    def AddPoint(self, x):
        print self.name, "adding point", x
        if self.latestPoint is None:
            self.latestPoint = x
        else:
            # create a new split between the last click and this one
            # (getting the order right)
            if x > self.latestPoint:
                self.splits.append(
                    Split.Split(self.motion, self.latestPoint, x))
            else:
                self.splits.append(
                    Split.Split(self.motion, x, self.latestPoint))

            # add the split to the total motion represented by the track
            # If this is the first split, we need to replace the masked motion
            # with the current split, otherwise add the new split to the
            # previous ones
            #if self.motion == self.maskedMotion :
            #	self.motion = self.splits[-1].GetMotion()
            #else:
            #	self.motion = Piavca.MotionAdder(self.splits[-1].GetMotion(), self.motion)

            self.latestPoint = None
Example #7
    def LoadSplits(self):
        filename = "%s.txt" % self.GetName()
        print filename
        file = open(filename, "r")
        lines = file.readlines()
        for line in lines:
            print line
            splits = string.split(line, ",")
            print splits
            self.splits = []
            for split in splits:
                if split == "":
                    continue
                split = string.split(split)
                if split == []:
                    continue
                print split
                split = Split.Split(self.motion, float(split[0]),
                                    float(split[1]))
                self.splits.append(split)
        for s in self.splits:
            print s
Example #8
def main(path):
    A = splitString('data/' + path + '/PersonA.txt', 's')
    A1 = splitString('data/' + path + '/PersonA.txt', 'd')
    B = splitString('data/' + path + '/PersonB.txt', 's')
    B1 = splitString('data/' + path + '/PersonB.txt', 's')
    final = A[0] + B[0] + A1[0] + B1[0]
    final = [int(x) for x in final]
    # Now our array is ordered
    final.sort()
    file = open('data/' + path + '/Time.txt', 'r')
    for line in file.readlines():
        time = line.rstrip().split('¦')  # rstrip removes the trailing newline
        time = [0 if x == '' else x for x in time]
        time = [int(x) for x in time]
    ### doing something ############################
    time = splitTime(final, time)  # this returns the time in conversation
    splittedA = splitText(A[1], final)  # returns the split conversation
    splittedB = splitText(B[1], final)  # returns the split conversation
    split_index = Split.splitter_function('data/' + path)
    index = splitInterface(
        splittedA, split_index[0])  # returns the index where the interface changes
    return (time[0:index], split_index[1], time[index:len(time)],
            splittedA[0:index], splittedA[index:len(splittedA)],
            splittedB[0:index], splittedB[index:len(splittedA)])
import Split
import ColorNeural
import cPickle as pickle
from sklearn.preprocessing import MinMaxScaler
import numpy as np
np.random.seed(488)

#def normalize(imgs, std_reg=1e-5):
#    return (imgs - imgs.mean(axis=0, keepdims=True)) / (imgs.std(axis=0, keepdims=True) + std_reg)

scaler = MinMaxScaler()
mypath = '/home/it-lab412/Desktop/combined'

onlyfiles = colorFilesArray
random_seq = np.random.permutation(72390)
onlyfiles = onlyfiles[random_seq]
files_splits = Split.split_seq(onlyfiles, 30)

y1 = colorYArray.astype(np.int32)
y1 = y1[random_seq]
y_splits = Split.split_seq(y1, 30)

net = ColorNeural.NN()

for i in range(30):

    X = np.ndarray(shape=(2413, 3, 256, 256), dtype='float32')
    print "Batch Number = ", i
    files = files_splits[i]
    Y = y_splits[i]

    for n in range(0, 2413):
h_length = 256
sr = 22050
h_duration = h_length / sr
start_time = 0
steps = 0
thresholds = []
threshold = 0.0
max_threshold = 1.51
while threshold < max_threshold:
    thresholds.append(threshold)
    threshold += 0.01
units = [1, 2, 3, 5, 7, 10]
epsilon = np.finfo(float).eps

for split_id in range(len(units)):
    test_set, training_set = sp.split_units(split_id)
    #training_set.pop(training_set.index(10))
    print("Testing on unit " + str(test_set[0]).zfill(2))

    threshold_Fs = Parallel(n_jobs=-1)(delayed(gf.get_F)(training_set, th) for th in thresholds)

    # find best threshold (argmax) and print it to slurm
    best_threshold_id = np.argmax(threshold_Fs)
    best_threshold = thresholds[best_threshold_id]
    best_threshold_str = "{0:.2f}".format(best_threshold)
    best_F = np.max(threshold_Fs)
    best_F_str = "{0:.2f}".format(100*best_F)
    print("Best F measure for training: " + best_F_str + "%")
    print("Best threshold for training: " + best_threshold_str)

Example #11
import sys
sys.path.insert(0, "../")

import aiml
import Split

import logging

logging.basicConfig()

# The Kernel object is the public interface to
# the AIML interpreter.
k = aiml.Kernel()

# Use the 'learn' method to load the contents
# of an AIML file into the Kernel.
k.learn("cn-startup.xml")

# Use the 'respond' method to compute the response
# to a user's input string.  respond() returns
# the interpreter's response, which in this case
# we ignore.
k.respond("load aiml cn")

# Loop forever, reading user input from the command
# line and printing responses.
while True:
    text = raw_input("> ")
    text = Split.splitChinese(text)
    logging.info(text)
    print k.respond(text)
Example #12
def main():
    # --------------------- import data ---------------------
    glass = dp.GlassImport()
    num_inputs = glass.shape[1] - 2
    # split into training, validation & testing data
    train, validate, test = sp.split(glass, num_inputs, t=0.7, v=0.15)
    # --------------------- specify network architecture ---------------------
    num_neurons1 = 10  # layer 1
    num_neurons2 = 2  # layer 2
    alpha = 0.01  # learning rate
    epoch = 100  # number of iterations
    nlayer1 = nl.NeuronLayer(num_neurons1, num_inputs, nl.tansig,
                             nl.j_tansig)  # instantiate layer 1
    nlayer2 = nl.NeuronLayer(num_neurons2, num_neurons1, nl.softmax,
                             nl.j_softmax)  # instantiate layer 2
    np.random.seed(0)
    # randomly initialize weight and bias on the interval [-0.5, 0.5]
    W1 = np.matrix(np.random.rand(num_neurons1, num_inputs) - 0.5)
    b1 = np.matrix(np.random.rand(num_neurons1, 1) - 0.5)
    W2 = np.matrix(np.random.rand(num_neurons2, num_neurons1) - 0.5)
    b2 = np.matrix(np.random.rand(num_neurons2, 1) - 0.5)
    # initialize cross entropy loss (training & validation)
    ce_t = []
    ce_v = []
    global s1_all, s2_all
    # pass on randomly initialized weights and biases to the network
    nlayer1.setWeightBias(w=W1, b=b1)
    nlayer2.setWeightBias(w=W2, b=b2)
    # ----------------------------------------------------------------------------------------------------------------------
    # training the network
    # ----------------------------------------------------------------------------------------------------------------------
    for j in range(epoch):
        for i in range(len(train)):
            # create input matrix from training dataset
            input = np.matrix(train.iloc[i, slice(0, num_inputs)]).transpose()
            # --------------------- propagate the inputs forward ---------------------
            nlayer1.FP(input)
            nlayer2.FP(nlayer1.a)
            # --------------------- calculate errors ---------------------
            target = np.matrix(train.iloc[i, -2:])
            e = nl.cross_entropy(nlayer2.a, target)
            if i == 0:
                e_all = e
            else:
                e_all = np.concatenate((e_all, e), axis=1)
            # --------------------- backpropagate sensitivities ---------------------
            s2 = nl.senseo(t=target,
                           a=nlayer2.f(nlayer2.n))  # layer 2 sensitivity
            if i == 0:
                s2_all = s2
            else:
                s2_all = np.concatenate((s2_all, s2), axis=1)
            s1 = nl.senseh(F_prime=nlayer1.j(nlayer1.a), W=W2,
                           s=s2)  # layer 1 sensitivity
            if i == 0:
                s1_all = s1
            else:
                s1_all = np.concatenate((s1_all, s1), axis=1)
        # --------------------- cross-entropy loss ---------------------
        ce_t.append(e_all.mean())
        # --------------------- update weights and biases ---------------------
        nlayer2.update(sensitivity=s2_all.mean(axis=1),
                       learning_rate=alpha)  # layer 2 update
        nlayer1.update(sensitivity=s1_all.mean(axis=1),
                       learning_rate=alpha)  # layer 1 update
        # ----------------------------------------------------------------------------------------------------------------------
        # Validating the network
        # ----------------------------------------------------------------------------------------------------------------------
        input = np.matrix(validate.iloc[:, slice(0, num_inputs)]).transpose()
        # --------------------- propagate inputs forward ---------------------
        nlayer1.FP(input)
        nlayer2.FP(nlayer1.a)
        target = np.matrix(validate.iloc[:, -2:])
        # --------------------- compute errors ---------------------
        for i in range(len(target)):
            e = nl.cross_entropy(nlayer2.a[:, i], target[i])
            if i == 0:
                e_all = e
            else:
                e_all = np.concatenate((e_all, e), axis=1)
        ce_v.append(e_all.mean())
        # --------------------- Early Stopping Condition ---------------------
        if j == 0:
            val_fail = []
        elif ce_v[j] > ce_v[j - 1]:
            val_fail.append(1)
            if len(val_fail) == 5:
                print 'Validation error has increased for 5 consecutive epochs. Early stopping at epoch {}'.format(
                    j)
                break
        else:
            val_fail = []


    # ----------------------------------------------------------------------------------------------------------------------
    # Test and evaluate the network
    # ----------------------------------------------------------------------------------------------------------------------
    # --------------------- confusion matrix ---------------------
    actual = pd.Series(test.iloc[:, -2],
                       name='Actual')  # actual values (targets)
    input = np.matrix(
        test.iloc[:, slice(0, num_inputs)]).transpose()  # network inputs, p
    nlayer1.FP(input)  # layer 1 net-input
    nlayer2.FP(nlayer1.a)  # layer 2 net-input
    predict = nl.classify(nlayer2.a)  # predicted values from network
    predict = np.array(predict).flatten()
    predict = pd.Series(predict, name='Predicted')
    confusion = pd.crosstab(actual, predict,
                            margins=False)  # create confusion matrix
    confusion = confusion.astype(float)  # convert values to floats
    print confusion  # output confusion matrix to the console
    # --------------------- accuracy metrics ---------------------
    ERR = (confusion.loc[0, 1] + confusion.loc[1, 0]) / len(predict)
    ACC = 1 - ERR
    FPR = confusion.iloc[0, 1] / (confusion.iloc[0, 1] + confusion.iloc[0, 0])
    TPR = confusion.iloc[1, 1] / (confusion.iloc[1, 0] + confusion.iloc[1, 1])
    print 'Accuracy: %.2f' % ACC
    print 'Error: %.2f' % ERR
    print 'False Positive Rate: %.2f' % FPR
    print 'True Positive Rate: %.2f' % TPR
    # --------------------- plot confusion matrix ---------------------
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.matshow(confusion, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confusion.shape[0]):
        for j in range(confusion.shape[1]):
            ax.text(x=j,
                    y=i,
                    s=confusion.iloc[i, j].astype(int),
                    va='center',
                    ha='center')
    plt.xlabel('Predicted Class')
    plt.ylabel('True Class')
    plt.title('Confusion Matrix of Test Set Predictions')
    # --------------------- plot log(cross-entropy loss) ---------------------
    fig, ax = plt.subplots(ncols=1, nrows=1, figsize=[8, 8])
    ax.plot((np.arange(0, len(ce_t))),
            np.log(ce_t),
            label='Training',
            linewidth=2.0,
            color='blue')
    ax.plot((np.arange(0, len(ce_v))),
            np.log(ce_v),
            label='Validation',
            linewidth=2.0,
            color='green')
    ax.set_ylabel('Log Cross Entropy Loss')
    ax.set_xlabel('Epochs')
    ax.legend()
    plt.grid()
    plt.show()
Example #13
def makeFolds(ids, folds=10):
    sample = Split.getFolds(len(ids), folds)
    division = {}
    for i in range(len(ids)):
        division[ids[i]] = sample[i]
    return division
Example #14
def natural_sort_key(s):
    return [
        int(text) if text.isdigit() else text.lower()
        for text in re.split(_nsre, s)
    ]


mypath = '/home/it-lab412/Desktop/All512'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
onlyfiles.sort(key=natural_sort_key)

onlyfiles = np.array(onlyfiles)
onlyfiles = onlyfiles[2:]
#X = np.ndarray(shape=(5018,1,512,512),dtype='float32')  ### Shifted inside the loop

#random_seq = np.random.permutation(175630)
random_seq = seqArray
random_files = onlyfiles[random_seq]
random_splits = Split.split_seq(random_files, 65)

y = pd.read_csv('trainLabels.csv')
y = y['level']

y1 = y
y2 = y

for i in range(4):
    y1 = y1.append(y2)

y1 = y1.values
y1 = y1.astype(np.int32)

y1 = y1[random_seq]
Example #15
def getDocumentFolds(documentIds, folds):
    sample = Split.getFolds(len(documentIds), folds)
    division = {}
    for i in range(len(documentIds)):
        division[documentIds[i]] = sample[i]
    return division
Example #16
def split(path):
  head, tail = Split(path)
  if len(head) > 1 and head[-1] == '/':
    head = head[:-1]
  return (head, tail)
Example #17
def makeDivision(ids, fraction=0.5, seed=0):
    sample = Split.getSample(len(ids), fraction, seed)
    division = {}
    for i in range(len(ids)):
        division[ids[i]] = sample[i]
    return division
Example #18
def makeFolds(ids, folds=10):
    sample = Split.getFolds(len(ids), folds)
    division = {}
    for i in range(len(ids)):
        division[ids[i]] = sample[i]
    return division
def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(_nsre, s)]

mypath = '/home/it-lab412/Desktop/All512'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
onlyfiles.sort(key=natural_sort_key)

onlyfiles = np.array(onlyfiles)
onlyfiles = onlyfiles[2:]
#X = np.ndarray(shape=(5018,1,512,512),dtype='float32')  ### Shifted inside the loop

#random_seq = np.random.permutation(175630)
random_seq = seqArray
random_files = onlyfiles[random_seq]
random_splits = Split.split_seq(random_files, 65)


y = pd.read_csv('trainLabels.csv')
y = y['level']


y1 = y
y2 = y

for i in range(4):
    y1 = y1.append(y2)

y1 = y1.values
y1 = y1.astype(np.int32)
   
Example #20
    outputTrees = []
    for i in range(options.folds):
        newRoot = ET.Element("corpus")
        for key in corpusElements.rootElement.attrib.keys():
            newRoot.attrib[key] = corpusElements.rootElement.attrib[key]
        outputTrees.append(newRoot)
    
    print >> sys.stderr, "Reading document ids"
    documentIds = []
    for document in corpusElements.documents:
        docId = document.attrib["id"]
        assert docId not in documentIds
        documentIds.append(docId)

    print >> sys.stderr, "Calculating document division"
    sample = Split.getFolds(len(documentIds), options.folds)
    division = {}
    for i in range(len(documentIds)):
        division[documentIds[i]] = sample[i]

    print >> sys.stderr, "Dividing documents"
    for document in corpusElements.documents:
        docId = document.attrib["id"]
        outputTrees[division[docId]].append(document)
    
    for i in range(options.folds):
        if options.output is None:
            filename = options.input + ".fold" + str(i)
        else:
            filename = os.path.join(options.output, os.path.basename(options.input) + ".fold" + str(i))
        print >> sys.stderr, "Writing file", filename
Example #21
import sys
sys.path.insert(0, "../")

import aiml
import Split

import logging
logging.basicConfig()

# The Kernel object is the public interface to
# the AIML interpreter.
k = aiml.Kernel()

# Use the 'learn' method to load the contents
# of an AIML file into the Kernel.
k.learn("cn-startup.xml")

# Use the 'respond' method to compute the response
# to a user's input string.  respond() returns
# the interpreter's response, which in this case
# we ignore.
k.respond("load aiml cn")

# Loop forever, reading user input from the command
# line and printing responses.
while True:
    text = raw_input("> ")
    text = Split.splitChinese(text)
    logging.info(text)
    print k.respond(text)
Example #22
def do_your_thang(img_dir, out_path, path_t, saved_model, w, Ovr, f, timeline):

    truths = gpd.read_file(path_t)
    crs = truths.crs
    # print('\nCascading truths for analysis...')
    truths = gpd.GeoSeries(cascaded_union(truths['geometry']))
    truths = gpd.GeoDataFrame(geometry=truths, crs=crs)

    total = get_number(img_dir, '*tif')

    count = 1
    for pic in glob.glob(img_dir + '/*.tif'):

        if timeline:
            print('########## Timeline Image: %s / %s ##########'
                  % (count, total))

        fn = get_name(pic)
        out_dir = out_path + '/' + fn

        ### Build subfolders
        if os.path.isdir(out_dir) is False:
            os.makedirs(out_dir)
            os.makedirs(out_dir + '/tiles')
            os.makedirs(out_dir + '/predictions')
            os.makedirs(out_dir + '/map')
            os.makedirs(out_dir + '/metrics')
        tiles_dir = out_dir + '/tiles'
        pred_dir = out_dir + '/predictions'
        map_dir = out_dir + '/map'
        met_dir = out_dir + '/metrics'

        ### Split mosaic into tiles
        print('Splitting image: %s...' % fn)
        with suppress_stdout():  ### suppress the long output
            Split.split_image(input=pic,
                              output_dir=tiles_dir,
                              patch_w=w,
                              patch_h=w,
                              adj_overlay_x=Ovr,
                              adj_overlay_y=Ovr,
                              out_format=f)
        os.remove('split_image_info.txt')

        ### Remove tiles that don't intersect ground truths & Re-number
        Filter.remove(tiles_dir, truths, overlap_only=True)

        ### convert to .JPEG
        Convert.to_jpg(tiles_dir)

        ### create & save predictions
        UNet_Predict.deploy_model(saved_model, tiles_dir, pred_dir)

        ### create map from prediction tiles
        if Map.build_map(tiles_dir, pred_dir, map_dir, fn, truths):

            ### calculate performance metrics (and save True Positives for timeline)
            Metrics.run_metrics(truths, map_dir, pic, fn, met_dir, timeline)

        count += 1

    top_folder = out_path + '/Maps'
    ### create topfolder to consolidate prediction/timeline output
    if os.path.isdir(top_folder) is False:
        os.makedirs(top_folder)

        ### copy output maps to top folder
        for file in glob.glob(out_path + '/**/map/*cascaded_map*'):
            shutil.copy(file, top_folder)

    return
Example #23
def run_single_fold_train_test(df, phys_target, run_params, pre, curr_fold_num):
    """
    Train, predict, and calculate evaluation metrics for the model on a single data fold.
    This function receives the data, takes care of splitting it into folds, trains the model, and returns the results
    for the fold (index) it ran on.
    :param df: dataframe or list of dataframes; all columns are those that will be used for training
    :param phys_target: series with the physical model of the target data (for eval purposes)
    :param run_params: instance of the TestInstanceParams class; holds the relevant model configurations
    :param pre: instance of type Process - for data preprocessing
    :param curr_fold_num: the index of the relevant fold; has to be an int between 0 and run_params.k - 1 (inclusive)
    :return: a dictionary with the model, the predictions and ground truth for the test, validation and train datasets
    (for the validation and test sets also the physical model predictions),
    and a dataframe summarizing the evaluation metrics for the fold, for the train, validation and test sets
    """
    fold_dict = {}
    fold_dict["fold_num"] = curr_fold_num
    train, val, test, phys_val, phys_test = Split.kfold_split_train_test(df, curr_fold_num,
                                                                         k=run_params.k, phys_target=phys_target)
    pre.fit(*get_feature_and_target_data(
        train, run_params.target_col, run_params.is_target_in_input))
    fold_dict["preprocess"] = pre
    X_train, y_train, dates_y_train = pre.transform(
        *get_feature_and_target_data(train, run_params.target_col, run_params.is_target_in_input))
    X_val, y_val, dates_y_val = pre.transform(
        *get_feature_and_target_data(val, run_params.target_col, run_params.is_target_in_input))
    X_test, y_test, dates_y_test = pre.transform(
        *get_feature_and_target_data(test, run_params.target_col, run_params.is_target_in_input))
    input_dim = X_train.shape[2]
    model_structure_args = {"look_back": run_params.train_steps, "input_dimension": input_dim,
                            "build_config_description": run_params.desc_str + "_f{}".format(curr_fold_num)}

    fold_dict["train"] = {}
    fold_dict["val"] = {}
    fold_dict["test"] = {}

    fold_dict["train"]["dates"] = dates_y_train
    fold_dict["val"]["dates"] = dates_y_val
    fold_dict["test"]["dates"] = dates_y_test

    with tf.device("/cpu:0"):
        curr_model = run_params.model_class(**model_structure_args)

    with tf.device("/cpu:0"):
        # train model (and save it, if this was implemented in model class)
        curr_model = curr_model.fit(X_train, y_train, val_data=(X_val, y_val), **run_params.model_args)

    fold_dict["model"] = curr_model

    fold_dict["test"]["pred"] = pre.inverse_scale_target(fold_dict["model"].predict(X_test))
    fold_dict["test"]["true"] = pre.inverse_scale_target(y_test.reshape(-1, 1))
    fold_dict["test"]["ww3"] = phys_test.iloc[run_params.train_steps + run_params.pred_forward:].values.reshape(-1, 1)

    fold_dict["val"]["pred"] = pre.inverse_scale_target(fold_dict["model"].predict(X_val))
    fold_dict["val"]["true"] = pre.inverse_scale_target(y_val.reshape(-1, 1))
    fold_dict["val"]["ww3"] = phys_val.iloc[run_params.train_steps + run_params.pred_forward:].values.reshape(-1, 1)

    fold_dict["train"]["pred"] = pre.inverse_scale_target(fold_dict["model"].predict(X_train))
    fold_dict["train"]["true"] = pre.inverse_scale_target(y_train.reshape(-1, 1))

    fold_dict["results_test"] = Eval.eval_pred_phys_const(fold_dict["test"], pre)
    fold_dict["results_val"] = Eval.eval_pred_phys_const(fold_dict["val"], pre)
    # for train we don't look at ww3 model or const guess. these metrics are interesting
    # only for checking overfit in training
    train_eval = Eval.eval_model(
        fold_dict["train"]["true"], fold_dict["train"]["pred"])
    fold_dict["results_train"] = pd.Series(train_eval, name="ML")
    return fold_dict
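The function above handles a single fold. A hypothetical driver (df, phys_target, run_params, and pre are assumed to already exist, exactly as in the function's own signature) would call it once per fold index and collect the per-fold outputs:

# Hypothetical outer loop over all folds (0 .. run_params.k - 1).
fold_results = [
    run_single_fold_train_test(df, phys_target, run_params, pre, fold_num)
    for fold_num in range(run_params.k)
]
# e.g. gather the per-fold test-set evaluation summaries
test_evals = [fd["results_test"] for fd in fold_results]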
Example #24
def makeDivision(ids, fraction=0.5, seed=0):
    sample = Split.getSample(len(ids), fraction, seed)
    division = {}
    for i in range(len(ids)):
        division[ids[i]] = sample[i]
    return division
Example #25
import Split as sp
import Parser as prs
import mathExpr as ME

text = '''
X + y =100*65489-fsdahfj+76
x = c+4u
while (c>c( qb = fds+fgds
if (c>c) qb = fds+fgds
Q = x+y+n+5*9*y*u/u+i
'''

lists = sp.Split(text)

Res = prs.Parse(lists)

for i in range(len(lists)):
    print(lists[i], Res[i])

# text = 'x+y+n+5*9*y*u/u+i'
# lis = sp.Split(text)
# print (lis)
# print(ME.math_expr(lis[0]))
Example #26
    outputTrees = []
    for i in range(options.folds):
        newRoot = ET.Element("corpus")
        for key in corpusElements.rootElement.attrib.keys():
            newRoot.attrib[key] = corpusElements.rootElement.attrib[key]
        outputTrees.append(newRoot)

    print >> sys.stderr, "Reading document ids"
    documentIds = []
    for document in corpusElements.documents:
        docId = document.attrib["id"]
        assert docId not in documentIds
        documentIds.append(docId)

    print >> sys.stderr, "Calculating document division"
    sample = Split.getFolds(len(documentIds), options.folds)
    division = {}
    for i in range(len(documentIds)):
        division[documentIds[i]] = sample[i]

    print >> sys.stderr, "Dividing documents"
    for document in corpusElements.documents:
        docId = document.attrib["id"]
        outputTrees[division[docId]].append(document)

    for i in range(options.folds):
        if options.output is None:
            filename = options.input + ".fold" + str(i)
        else:
            filename = os.path.join(
                options.output,
import PyPDF2
import Split
from subprocess import call
import sys

if (len(sys.argv) < 2):
    print("Error\nFormat: \n\tpython main.py your-pdf-file")
else:
    filename = sys.argv[1]
    directory = "splitted/" + filename

    Split.split(directory, filename)
    pdfFileObj = open(filename, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    for i in range(pdfReader.numPages):
        splitted_file_name = directory + "/" + repr(i)
        call(["pdftotext", splitted_file_name + ".pdf"])
        # f = open(splitted_file_name + '.txt', 'r')
        # print("Page %s" % repr(i+1))
        # print(f.read())
        # print("====================")
    def _split_bins(self):
        split_clf = Split(feature=self._temp, min_sample=self.min_sample,
                          max_node_number=self._bins)
        split_clf.fit(self._df, self._label)
        self.bins = split_clf.bins