Example #1
def run(is_test=False):
    count = []
    word2idx = {}
    Config.is_test = is_test
    if not os.path.exists(Config.checkpoint_dir):
        os.makedirs(Config.checkpoint_dir)
    if not os.path.exists(Config.vector_dir):
        os.makedirs(Config.vector_dir)

    train_data = read_data(
        '%s/%s.train.txt' % (Config.data_dir, Config.data_name), count,
        word2idx)
    valid_data = read_data(
        '%s/%s.valid.txt' % (Config.data_dir, Config.data_name), count,
        word2idx)
    test_data = read_data(
        '%s/%s.test.txt' % (Config.data_dir, Config.data_name), count,
        word2idx)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    save_obj('%s/idx2word.pkl' % (Config.vector_dir), idx2word)
    save_obj('%s/word2idx.pkl' % (Config.vector_dir), word2idx)
    Config.nwords = len(word2idx)

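    # Build the MemN2N graph, then evaluate on valid/test when is_test is set, otherwise train on train/valid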
    tf.reset_default_graph()
    with tf.Session() as sess:
        model = MemN2N(Config, sess, True)
        model.build_model()

        if Config.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)

        tf.summary.FileWriter("./logs", graph=tf.get_default_graph())
Example #2
def get_embeddings_tokenizer(filename1, filename2, EMBEDDING_DIM):
    """Get the embeddings and the tokenizer for words."""
    data = read_data(filename1, clean=False)[1:] + read_data(filename2, clean=False)[1:]
    texts = []

    for d in data:
        raw = d.split()
        texts.append(raw[0])

    word_tokenizer = Tokenizer()
    word_tokenizer.fit_on_texts(texts)
    word_tokenizer.word_index['<<SPAD>>'] = len(word_tokenizer.word_index) + 1
    word_visit = [0 for i in range(len(word_tokenizer.word_index) + 1)]

    embedding_matrix = np.random.random((len(word_tokenizer.word_index)+1, EMBEDDING_DIM))
    embedding_matrix = embedding_matrix.astype(np.float64)

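    # Populate each word's embedding row from the file vectors; if a word occurs again, average the new vector with the stored row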
    for d in data:
        raw = d.split()
        label = raw[0]

        word_index = word_tokenizer.word_index[label]

        if word_visit[word_index] == 0:
            vector = [float(x) for x in raw[1:EMBEDDING_DIM+1]]
            word_visit[word_index] = 1
        else:
            vector = (embedding_matrix[word_index] + np.array([float(x) for x in raw[1:EMBEDDING_DIM+1]]))/2

        embedding_matrix[word_index] = vector

    return embedding_matrix, word_tokenizer
Example #3
def main():
    trainData = dh.read_data('./movie-data/ratings-train.csv')
    testData = dh.read_data('./movie-data/ratings-test.csv')
    movFeat = dh.read_data('./movie-data/movie-features.csv')

    start = time.time()
    q3a(trainData, testData)
    q3b(movFeat, trainData, testData)
    q3c(movFeat, trainData, testData)
    q3d(movFeat, trainData, testData)
    print "Overall time: ", time.time() - start
Example #4
def q4b():

    print "\n------------------------------------------Question B------------------------------------------"
    trainMatrix = dh.read_data('./data/zip.train')
    testMatrix = dh.read_data('./data/zip.test')

    graph_setup("k", "Error", "PCA degree vs Error")

    PCA = 1

    gammaMin = 0.000
    gammaMax = 0.015
    gammaNum = 10
    cMin = 0.0
    cMax = 25.0
    cNum = 25
    PCAmin = 0
    PCAmax = 100
    PCAnum = 100

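    # Grid-search gamma and C for an RBF-kernel SVM while sweeping the number of PCA components;
    # matrixList1/matrixList2 presumably select the two digit classes being compared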
    rbf, trainSVMY, testSVMY, trainXIn, testXIn, trainY, testY, testError, trainError, cvError, gamma, C, k, runTime, valErrors = ln.margin_svm(
        trainMatrix,
        testMatrix,
        PCA=PCA,
        matrixList1=[2],
        matrixList2=[8],
        gammaMin=gammaMin,
        gammaMax=gammaMax,
        gNum=gammaNum,
        cMin=cMin,
        cMax=cMax,
        cNum=cNum,
        PCAmin=PCAmin,
        PCAmax=PCAmax,
        PCAnum=PCAnum)

    for tmpk, gammaDict in valErrors.items():
        tmpC, tmpgamma = min(gammaDict, key=gammaDict.get)
        print "K: ", tmpk, "\tC: ", tmpC, "\tGamma: ", tmpgamma, "\tcvError: ", gammaDict[
            (tmpC, tmpgamma)], "\ttrainError: ", trainError[
                tmpk], "\ttestError: ", testError[tmpk]

    print "Optimal Setup:\tGamma: ", gamma, "\tRuntime: ", runTime, "\tC: ", C, "\tFeatures: ", k
    print "trainError: ", trainError[k], "\ttestError: ", testError[
        k], "\tcvError: ", cvError[k]

    PCA_graph_add(trainError, "Train Error", 'blue')
    PCA_graph_add(testError, "Test Error", 'green')
    PCA_graph_add(cvError, "CV Error", 'red')

    plt.legend()
    plt.savefig("q3b.eps", format='eps', dpi=1000)
Example #5
    def run_all2(self):

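        # Train incrementally on the nine trial files, then evaluate once on lob_data.csv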
        for i in range(9):
            print("epoch " + str(i + 1) + " out of 9")
            prices = data_handler.read_data(
                "./Data/lob_datatrial000" + str(i + 1) + ".csv", "MIC")
            X, y = data_handler.split_data(prices, self.steps)
            self.train(X, y, 100, 0)

        time = data_handler.read_data("./Data/lob_data.csv", "TIME")
        prices = data_handler.read_data("./Data/lob_data.csv", "MIC")
        X, y = data_handler.split_data(prices, self.steps)
        self.test(X, y, verbose=1)
Example #6
def main():
    print "------------- MEMM based POS-TAGGER -------------------- "
    tagger = MEMMTagger()
    trainset = read_data("oct27.train")
    devset = read_data("oct27.dev")
    testset = read_data("oct27.test")

    tagger.train(trainset)

    print '----------- Dev Set Results ----------------- '
    tagger.test(devset)

    print '----------- Test Set Results ---------------- '

    tagger.test(testset)
Example #7
def main():
    print "----------- Structured Perceptron POS-TAGGER ---------- "
    tagger = StructuredPerceptronTagger()
    trainset = read_data("oct27.train")
    devset = read_data("oct27.dev")
    testset = read_data("oct27.test")

    tagger.train(trainset)

    print '----------- Dev Set Results ----------------- '
    tagger.test(devset)

    print '----------- Test Set Results ---------------- '

    tagger.test(testset)
Example #8
    def run_all(self):

        time = data_handler.read_data("./Data/lob_data.csv", "TIME")
        prices = data_handler.read_data("./Data/lob_data.csv", "MIC")
        X, y = data_handler.split_data(prices, self.steps)

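        # Split the windowed data 9:1 into train and test sets and reshape inputs to (samples, steps, 1)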
        split_ratio = [9, 1]
        train_X, test_X = data_handler.split_train_test_data(X, split_ratio)
        train_X = train_X.reshape((-1, self.steps, 1))
        test_X = test_X.reshape((-1, self.steps, 1))
        train_y, test_y = data_handler.split_train_test_data(y, split_ratio)

        self.train(train_X, train_y, 200, verbose=1)
        self.test(test_X, test_y, verbose=1)
        self.save()
Example #9
def q4a():
    # extract data
    print "------------------------------------------Question A------------------------------------------"
    trainMatrix = dh.read_data('./data/zip.train')
    testMatrix = dh.read_data('./data/zip.test')

    PCA = 0

    gammaMin = 0.0
    gammaMax = 0.05
    gammaNum = 500
    cMin = 0.0
    cMax = 0.1
    cNum = 10
    PCAmin = 0
    PCAmax = 100
    PCAnum = 100

    rbf, trainSVMY, testSVMY, trainXIn, testXIn, trainY, testY, testError, trainError, cvError, gamma, C, k, runTime, valErrors = ln.margin_svm(
        trainMatrix,
        testMatrix,
        PCA=PCA,
        matrixList1=[2],
        matrixList2=[8],
        gammaMin=gammaMin,
        gammaMax=gammaMax,
        gNum=gammaNum,
        cMin=cMin,
        cMax=cMax,
        cNum=cNum,
        PCAmin=PCAmin,
        PCAmax=PCAmax,
        PCAnum=PCAnum)

    for tmpk, gammaDict in valErrors.items():
        tmpC, tmpgamma = min(gammaDict, key=gammaDict.get)
        print "K: ", tmpk, "\tC: ", tmpC, "\tGamma: ", tmpgamma, "\tcvError: ", gammaDict[
            (tmpC, tmpgamma)], "\ttrainError: ", trainError[
                tmpk], "\ttestError: ", testError[tmpk]

    print "Optimal Setup:\tGamma: ", gamma, "\tRuntime: ", runTime, "\tC: ", C, "\tFeatures: ", k
    print "trainError: ", trainError[k], "\ttestError: ", testError[
        k], "\tcvError: ", cvError[k]

    graph_setup("Gamma", "Error", "RBF Kernel SVM")
    graph_add(valErrors, "Train Error", 'red')
    plt.legend()
    plt.savefig("q3a.eps", format='eps', dpi=1000)
Example #10
def get_features_importance():
    seed(47)
    np.random.seed(47)
    datasets = read_data('multiclass')

    dic = {}
    l = []

    for atype in datasets.keys():
        print(atype)
        dic[atype] = {}
        df = datasets[atype]
        X, y = df[df.columns[:-1]], df[df.columns[-1]]
        sss = StratifiedShuffleSplit(n_splits=1, test_size=.2)
        for l in [learners[1]]:
            for train_index, test_index in sss.split(X, y):
                train_df = df.iloc[train_index]
                test_df = df.iloc[test_index]
                tuner = get_tuner(l)
                default_config = tuner.default_config
                clf = tuner.get_clf(default_config)
                x_train, y_train = train_df[train_df.columns[:-1]], train_df[train_df.columns[-1]]
                x_test, y_test = test_df[test_df.columns[:-1]], test_df[test_df.columns[-1]]
                clf.fit(x_train, y_train)
                prediction = clf.predict(x_test)
                cm = confusion_matrix(y_test, prediction)

                key_feats_indices = np.argsort(clf.feature_importances_)[::-1][:5]
                for index in key_feats_indices:
                    print("%s: %s" % (df.columns[index], clf.feature_importances_[index]), end="; ")
                import pdb
                pdb.set_trace()
            print()
Example #11
def cross_validation():
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.2)

    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]

    A: np.ndarray = fill_averages(df_train_data)

    U, Vh = perform_svd(A)

    min_k = 2
    max_k = 50

    print("Starting cross validation")

    ks = []
    errs = []

    # Winning K = 10
    for k in range(min_k, max_k + 1):
        prediction_matrix = make_predictions(k, U, Vh)
        err = calc_rmse(df_test_data, prediction_matrix)
        print("K = {0}, RMSE = {1}".format(k, err))
        ks.append(k)
        errs.append(err)

    plt.plot(ks, errs)
    plt.show()
Example #12
def read_layered_subword(filename):
    """Read data as subwords."""
    text_data = read_data(filename)

    text_layered = break_in_subword(text_data)

    return text_layered
Example #13
def run():
    logging.config.fileConfig("logging_config.ini")
    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)

    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]

    train_samples: np.ndarray = dh.df_as_array(df_train_data)
    test_samples: np.ndarray = dh.df_as_array(df_test_data)

    mean_predictions = calculate_all_means(df_train_data)

    # initialize variables needed for training
    k = 100
    bu = np.zeros(paths.num_users)
    bm = np.zeros(paths.num_movies)
    user_features = np.zeros((paths.num_users, k))
    movie_features = np.zeros((k, paths.num_movies))

    train(k, mean_predictions, user_features, movie_features, bu, bm,
          train_samples, test_samples)

    print("Calculating predictions and writing file")
    prediction_matrix = final_predictions(mean_predictions, user_features,
                                          movie_features, bu, bm)
    dh.write_submission(prediction_matrix)
Example #14
File: main.py Project: maikia/human
def playing_with_data():
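    # Only the last assignment to folder_data, folder_specific and file_data below takes effect;
    # the earlier ones are leftover alternatives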
    #folder_data = '/home/maja/PhDProject/human_data/data/'
    folder_data = '/home/maja/PhDProject/data/'
    folder_data='/home/maja/PhDProject/human_data/data/'
    folder_data ='/home/maja/PhDProject/data/'
    
    folder_specific = '2013_07_31/' #'HT_2013_04_02/'
    folder_specific = 'others/'
    folder_specific = '2013_08_10/'
    
    file_data = folder_specific + '2013_07_31_0002.abf' #2013_04_02_0013.abf'
    file_data = folder_specific + '2013_07_03 PR1_0000.abf'
    file_data = folder_specific + '2013_09_03_0002.abf'
    #file_data = folder_specific + '2013_09_03_0006.abf'
    file_data = folder_specific + '2013_09_05_0009_afterNBQX.abf'
    file_data = folder_specific + '2013_09_05_0019_synch.abf'
    file_data = folder_specific + '2013_09_05_0017.abf'
    file_data = folder_specific + '2013_10_08_0002.abf'
    
    folder_save = '/home/maja/PhDProject/data/2013_07_31/saved/'
    folder_save = '/home/maja/PhDProject/human_data/data/others/'
    
    file_save = folder_save + 'all_data_gabaB.npz'
    #file_save = folder_save + 'data.dat'
    
    data, scale, fs = dh_temp.read_data(folder_data, file_data)   
    dh_temp.save_data(folder_save, file_save, data, scale, fs)
    del data, scale, fs
    
    display.plot_data(folder_save, file_save, x_scale = 'ms')
Example #15
def run():
    logging.config.fileConfig("logging_config.ini")

    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)

    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]

    train(df_train_data, df_test_data)
Example #16
def execute_approach():
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)

    A: np.ndarray = fill_averages(df_data)
    U, Vh = perform_svd(A)

    # K = 9 was the winning value from the cross validation
    k = 10

    prediction_matrix = make_predictions(k, U, Vh)
    assert(prediction_matrix.shape == (paths.num_users, paths.num_movies))
    dh.write_submission(prediction_matrix)
Example #17
def run():
    logging.config.fileConfig("logging_config.ini")

    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)

    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]

    # cross_validation(df_train_data, df_test_data)
    # assign the best result from cross validation to K
    K = 10
    train(K, df_train_data, df_test_data)
Example #18
def run():
    logging.config.fileConfig("logging_config.ini")

    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)

    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]

    print("Calculating initialization data")
    mean_predictions = calculate_all_means(df_train_data)
    train_samples: np.ndarray = dh.df_as_array(df_train_data)

    # Perform either cross validation or a single run using best result
    # cross_validation(df_train_data, train_samples, df_test_data, mean_predictions)
    k = 10
    execute_approach(k, df_train_data, train_samples, df_test_data, mean_predictions)
Example #19
def get_embeddings_tokenizer(filename, EMBEDDING_DIM):
    """Get the embeddings and the tokenizer for words."""
    data = read_data(filename)
    texts = []

    embedding_matrix = np.random.randn(len(data), EMBEDDING_DIM)
    embedding_matrix = embedding_matrix.astype(np.float64)
    for i in range(len(data)):
        raw = data[i].split()
        label = raw[0]
        vector = [float(x) for x in raw[1:EMBEDDING_DIM+1]]
        texts.append(label)
        embedding_matrix[i] = vector

    word_tokenizer = Tokenizer()
    word_tokenizer.fit_on_texts(texts)
    word_tokenizer.word_index['<<SPAD>>'] = len(word_tokenizer.word_index) + 1

    return embedding_matrix, word_tokenizer
Example #20
def execute(res=''):
    seed(47)
    np.random.seed(47)
    datasets = read_data()

    dic = {}
    l = []

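    # For each dataset, metric and learner: tune on the training split, then score both the
    # tuned ('flash') and default configurations over 5 stratified shuffle splits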
    for atype in datasets.keys():
        print(atype)
        dic[atype] = {}
        df = datasets[atype]
        #import pdb
        #pdb.set_trace()
        X, y = df[df.columns[:-1]], df[df.columns[-1]]
        sss = StratifiedShuffleSplit(n_splits=5, test_size=.2)
        for m in metrics:
            print(m)
            dic[atype][m] = {}
            for l in learners:
                dic[atype][m][l] = {'flash': [], 'default': []}
                for train_index, test_index in sss.split(X, y):
                    train_df = df.iloc[train_index]
                    test_df = df.iloc[test_index]
                    tuner = get_tuner(l)
                    best_config = tuning(tuner, train_df, project_name="", metric=m)
                    default_config = tuner.default_config

                    x_train, y_train = train_df[train_df.columns[:-1]], train_df[train_df.columns[-1]]
                    x_test, y_test = test_df[test_df.columns[:-1]], test_df[test_df.columns[-1]]
                    tuned_score = measure_fitness(tuner, x_train, y_train, x_test, y_test, best_config, m)
                    default_score = measure_fitness(tuner, x_train, y_train, x_test, y_test, default_config, m)
                    dic[atype][m][l]['flash'].append(tuned_score)
                    dic[atype][m][l]['default'].append(default_score)
                print(l, dic[atype][m][l])
            print()
        print("*"*10)

    with open('dump/flash.pickle', 'wb') as handle:
        pickle.dump(dic, handle)
Example #21
def init(data=None, files=globe.dic['files'], replot=globe.dic['replots']):
    # avoid the shared-mutable-default pitfall: build a fresh list per call
    if data is None:
        data = []
    dic = globe.dic

    for i, filename in enumerate(files):
        if dic['Verbose'] > 0:
            print "loading", filename
        sys_err = dic['sys_err_default']
        if len(filename.split('#')) == 2:
            sys_err = float(filename.split('#')[1].strip())
            filename = filename.split('#')[0].strip()
        if dic['outputs']:
            output = dic['outputs'].pop()
        else:
            output = '.'.join(filename.split('.')[:-1])
        dic['numbered'] = 0

        # Now read data file
        blocks = make_blocks(read_data(filename))

        if blocks:
            for j, b in enumerate(blocks):
                if dic['GroupBy'] == 'files':
                    data.append([[i, j], filename, output, b, sys_err])
                elif dic['GroupBy'] == 'blocks':
                    data.append([[j, i], filename, output, b, sys_err])

    data.sort(key=lambda x: x[0])
    data = structure(data)

    for i, filename in enumerate(replot):
        if dic['Verbose'] > 0:
            print "reloading data from", filename
        if len(filename.split('#')) == 2:
            filename = filename.split('#')[0].strip()
        data = data + reload_plot(filename)

    return data
Example #22
    """Divide the data into train and test sets."""
    nrmlzd_list, size = find_distribution(data)
    train_size = int(0.8 * size)
    test_size = int(0.2 * size)
    print(train_size, test_size)
    train = []
    test = []
    train_id = []
    test_id = []
    counter = 0
    for each in nrmlzd_list:
        for i in range(0, train_size):
            data[each[i]]['id'] = each[i]
            train.append(data[each[i]])
            train_id.append(each[i])
            #pdb.set_trace()
        for j in range(0, test_size):
            data[each[j]]['id'] = each[j]
            test.append(data[each[j]])
            test_id.append(each[j])
    #pdb.set_trace()
    print(len(train), len(test))
    return train, test


if __name__ == "__main__":
    data = read_data("final_codemixed.json")
    train, test = divde_train_test(data)
    write_to_file(train, "train_data.json")
    write_to_file(test, "test_data.json")
Example #23
loader.load()

from indicnlp.transliterate.unicode_transliterate import ItransTransliterator


def transliterate(data, lang):
    """Transliterator."""
    total = len(data)
    new_data = list()
    for i in range(len(data)):
        printProgressBar(i + 1,
                         total,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)
        new_data.append(ItransTransliterator.to_itrans(data[i], LANG))

    return new_data


if __name__ == "__main__":
    LANG = 'hi'
    INPUT_FILE = "/home/chrizandr/code-mixing/data/IITB.en-hi.hi"
    OUTPUT_FILE = "/home/chrizandr/code-mixing/data/IITB.en-hi.hi.roman"
    print("Reading data")
    original_text = read_data(INPUT_FILE, encoding="UNI")
    print("Transliterating")
    romanized_text = transliterate(original_text, LANG)
    print("Writing to file")
    write_data(OUTPUT_FILE, romanized_text, encoding="UNI")
Example #24
def q4c():
    print "\n------------------------------------------Question C------------------------------------------"

    trainMatrix = dh.read_data('./data/zip.train')
    testMatrix = dh.read_data('./data/zip.test')

    PCA = 2

    print "PCA: "

    gammaMin = 0.0
    gammaMax = 0.02
    gammaNum = 10
    cMin = 0.0
    cMax = 10
    cNum = 20
    PCAmin = 0
    PCAmax = 100
    PCAnum = 100

    rbf, trainSVMY, testSVMY, trainX, testX, trainY, testY, testError, trainError, cvError, gamma, C, k, runTime, valErrors = ln.margin_svm(
        trainMatrix,
        testMatrix,
        PCA=PCA,
        matrixList1=[1],
        matrixList2=[0, 2, 3, 4, 5, 6, 7, 8, 9],
        gammaMin=gammaMin,
        gammaMax=gammaMax,
        gNum=gammaNum,
        cMin=cMin,
        cMax=cMax,
        cNum=cNum,
        PCAmin=PCAmin,
        PCAmax=PCAmax,
        PCAnum=PCAnum)

    for tmpk, gammaDict in valErrors.items():
        tmpC, tmpgamma = min(gammaDict, key=gammaDict.get)
        print "K: ", tmpk, "\tC: ", tmpC, "\tGamma: ", tmpgamma, "\tcvError: ", gammaDict[
            (tmpC, tmpgamma)], "\ttrainError: ", trainError[
                tmpk], "\ttestError: ", testError[tmpk]

    print "gamma: ", gamma, "\ttime: ", runTime, "\tC: ", C, "\tFeatures: ", k
    print "trainError: ", trainError[k], "\ntestError: ", testError[
        k], "\ncvError: ", cvError[k]

    #graph_setup("k", "Error", "graph")
    #PCA_graph_add(trainError, "Train Error", 'blue')
    #PCA_graph_add(testError, "Test Error", 'green')
    #PCA_graph_add(cvError, "CV Error", 'red')

    #plt.legend()
    #plt.savefig("q3c-pca.png")

    graph_setup("Feature 1", "Feature 2", "PCA Test Set Results")
    q4c_graph(testX, testY, rbf, "pca-contour-test.eps")
    graph_setup("feature1", "feature2", "PCA Train Set Results")
    q4c_graph(trainX, trainY, rbf, "pca-contour-train.eps", 1)

    #------------------------------------------------------------------------------
    #               NEXT SECTION
    #------------------------------------------------------------------------------

    trainMatrix = dh.read_data('./data/features.train')
    testMatrix = dh.read_data('./data/features.test')

    PCA = 3

    print "\nFeature: "

    rbf, trainSVMY, testSVMY, trainX, testX, trainY, testY, testError, trainError, cvError, gamma, C, k, runTime, valErrors = ln.margin_svm(
        trainMatrix,
        testMatrix,
        PCA=PCA,
        matrixList1=[1],
        matrixList2=[0, 2, 3, 4, 5, 6, 7, 8, 9],
        gammaMin=gammaMin,
        gammaMax=gammaMax,
        gNum=gammaNum,
        cMin=cMin,
        cMax=cMax,
        cNum=cNum,
        PCAmin=PCAmin,
        PCAmax=PCAmax,
        PCAnum=PCAnum)

    for tmpk, gammaDict in valErrors.items():
        tmpC, tmpgamma = min(gammaDict, key=gammaDict.get)
        print "K: ", tmpk, "\tC: ", tmpC, "\tGamma: ", tmpgamma, "\tcvError: ", gammaDict[
            (tmpC, tmpgamma)], "\ttrainError: ", trainError[
                tmpk], "\ttestError: ", testError[tmpk]

    print "gamma: ", gamma, "\ttime: ", runTime, "\tC: ", C, "\tFeatures: ", k
    print "trainError: ", trainError[k], "\ntestError: ", testError[
        k], "\ncvError: ", cvError[k]

    #PCA_graph_add(trainError, "Train Error", 'blue')
    #PCA_graph_add(testError, "Test Error", 'green')
    #PCA_graph_add(cvError, "CV Error", 'red')

    #plt.legend()
    #plt.savefig("q3c-feature.png")

    graph_setup("Feature 1", "Feature 2", "Feature Data Test Set Results")
    q4c_graph(testX, testY, rbf, "feature-contour-test.eps")
    graph_setup("Feature 1", "Feature 2", "Feature Data Train Set Results")
    q4c_graph(trainX, trainY, rbf, "feature-contour-train.eps", 1)
Example #25
"""."""
from data_handler import read_data, write_data, break_in_subword
import pdb

INPUT = "data/IITB.en-hi.hi.roman.clean"
OUTPUT = "data/IITB.en-hi.hi.syll"
print "Reading"
data = read_data(INPUT, encoding="UNI", clean=True)
print "Breaking"
new_data = break_in_subword(data, sentences=True)
print "Writing"
write_data(OUTPUT, new_data, encoding="UNI")
pdb.set_trace()
Example #26
loader.load()

from indicnlp.transliterate.unicode_transliterate import ItransTransliterator


def transliterate(data, lang):
    """Transliterator."""
    total = len(data)
    new_data = list()
    for i in range(len(data)):
        print(i, len(data))
        # printProgressBar(i+1, total, prefix='Progress:', suffix='Complete', length=50)
        try:
            new_data.append(ItransTransliterator.to_itrans(data[i], LANG))
        except IndexError:
            print(data[i])

    return new_data


if __name__ == "__main__":
    LANG = 'hi'
    INPUT_FILE = "/home/chrizandr/code-mixing/data/IITB.en-hi.hi"
    OUTPUT_FILE = "/home/chrizandr/code-mixing/data/IITB.en-hi.hi.roman"
    print("Reading data")
    original_text = read_data(INPUT_FILE, encoding="UNI", clean=False)
    print("Transliterating")
    romanized_text = transliterate(original_text, LANG)
    print("Writing to file")
    write_data(OUTPUT_FILE, romanized_text, encoding="UNI")
Example #27
import sys

from keras.callbacks import ModelCheckpoint

import data_handler
import model
from data_handler import usable_chars

train_x, train_y = data_handler.read_data("train.txt", 50)
test_x, test_y = data_handler.read_data("test.txt", 50)
lstm = model.Model(y_shape=len(usable_chars), batch_size=50)
weights_file = "weights.hdf5"
checkpoint = ModelCheckpoint(weights_file,
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True,
                             mode='max')
callbacks_list = [checkpoint]
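# Load pre-trained weights if a path is given on the command line; otherwise train and reload the best checkpoint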
if len(sys.argv) > 1:
    lstm.load_weights(sys.argv[1])
else:
    lstm.train_model(train_x,
                     train_y,
                     test_x,
                     test_y,
                     epochs=50,
                     callbacks=callbacks_list)
    lstm.load_weights("weights.hdf5")

acc, loss = lstm.test_model(train_x, train_y)
print("Training Accuaracy:", acc, "Training Loss:", loss)
Example #28
        self.n_features = 1

    def train(self, X, y, epochs, verbose):
        self.model.fit(X, y, epochs=epochs, verbose=verbose)

    def test(self, X, y):
        for i in range(len(X)):
            input = X[i].reshape((1, self.steps, 1))
            yhat = self.model.predict(input, verbose=1)
            print(y[i], yhat[0][0], np.mean(input[0]))
        # model.fit(X, y, epochs=200, verbose=1)


if __name__ == "__main__":
    # numpy.set_printoptions(threshold=sys.maxsize)
    time = data_handler.read_data("lob_datatrial0001.csv", "TIME")
    prices = data_handler.read_data("lob_datatrial0001.csv", "MIC")

    # splitting data into chunks of 4
    steps = 59
    reshape = True
    # X, y = data_handler.split_data(prices, steps, reshape)

    # split_ratio = [9,1]
    # train_X, test_X = data_handler.split_train_test_data(X, split_ratio)

    # train_X = train_X.reshape((-1, steps, 1))
    # test_X = test_X.reshape((-1, steps, 1))

    # train_y, test_y = data_handler.split_train_test_data(y, split_ratio)
Example #29
import data_handler
import general_functions
from evaluation import evaluation
import cPickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt


if __name__ == "__main__":

    # ###
    # #  1 read data into dictionary
    # ###

    raw_data_dictionary = data_handler.read_data("./data")

    ###
    #  2 generate id list and text list from dictionary
    ###

    id_list, raw_text_list = data_handler.generate_lists_from_dictionary(raw_data_dictionary)

    ###
    #  3 stem texts and remove stopwords
    ###

    num_data = 20000
    stemmed_text_list_all = data_handler.stem_text_and_remove_stopwords(raw_text_list, "Krovetz")
    stemmed_text_list = stemmed_text_list_all[:num_data]
Example #30
        vec_spec = vehicle_spec.VehicleSpec(angle_norm=30,
                                            image_crop_vert=[220, 480])
        data_path = '/home/elschuer/data/LaneKeepingE2E/images_train_augmented/'
        desc_file = 'data_labels.csv'
        contains_full_path = True
        model_name = 'nvidia_model.h5'
        convert_image = False
        image_channels = 1

    data_handler = data_handler.DataHandler(
        data_path,
        desc_file,
        vehicle_spec=vec_spec,
        contains_full_path=contains_full_path,
        convert_image=convert_image,
        image_channels=1)
    data_handler.read_data()

    if analyze_data:
        data_analyzer = data_analyzer.DataAnalyzer()
        data_analyzer.showDataDistribution(data_handler.y_data)
        data_analyzer.print_samples_not_equal_zero(data_handler.y_data)

    model_trainer = ModelTrainer(epochs=10,
                                 data_handler=data_handler,
                                 model_name=model_name)
    model_trainer.train_model()

    if shutdown_on_finish:
        os.system("shutdown now -h")