def _get_BLEU_scores(eng_decoded, eng, google_refs, n):
    """
    Parameters
    ----------
    eng_decoded : an array of decoded sentences
    eng : an array of reference Hansard sentences
    google_refs : an array of reference Google-translated sentences
    n : the 'n' in the n-gram model being used

    Returns
    -------
    An array of evaluation (BLEU) scores for the sentences
    """
    BLEU = []
    for i in range(len(eng_decoded)):  # one sentence at a time
        candidate = eng_decoded[i]
        hansard = preprocess(eng[i], 'e')  # convert to lowercase
        google = preprocess(google_refs[i], 'e')  # convert to lowercase
        references = [hansard, google]

        p = []
        for j in range(1, n + 1):
            score = BLEU_score(candidate, references, j, brevity=False)
            p.append(score)

        bp = BLEU_score(candidate, references, 1, brevity=True)
        pn = 1
        for pi in p:
            pn *= pi
        bp_score = bp * pn**(1 / n)
        BLEU.append(bp_score)

    return BLEU
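# Illustrative sketch (not from the original sources): the combination above is the
# standard BLEU formula, BLEU = BP * (p_1 * p_2 * ... * p_n)^(1/n), shown here with
# hard-coded n-gram precisions so the snippet runs on its own.
def combine_bleu(brevity_penalty, precisions):
    product = 1.0
    for p in precisions:
        product *= p
    return brevity_penalty * product ** (1.0 / len(precisions))

# Example: BP = 0.9, unigram/bigram/trigram precisions 0.8, 0.5, 0.25
print(combine_bleu(0.9, [0.8, 0.5, 0.25]))  # ~0.418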
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # TODO
    files = os.listdir(train_dir)
    files = set([f[:-1] for f in files if f[-1] == "e" or f[-1] == "f"])
    # vocab_size = len(LM["uni"])
    english = []
    french = []
    count = 0
    for ffile in files:
        eng_file = open(train_dir + ffile + "e", "r")
        fre_file = open(train_dir + ffile + "f", "r")
        eng_lines = eng_file.readlines()
        fre_lines = fre_file.readlines()
        for i in range(len(eng_lines)):
            if count == num_sentences:
                return english, french
            count += 1
            english.append(preprocess(eng_lines[i].strip(), "e").strip().split()[1:-1])
            french.append(preprocess(fre_lines[i].strip(), "f").strip().split()[1:-1])
    return english, french
def readLine(eContent, fContent, numLineRead, data):
    count = 0
    while count < numLineRead:
        data['e'].append(preprocess(eContent[count], 'e').split())
        data['f'].append(preprocess(fContent[count], 'f').split())
        count += 1
    return data
def test_parameters():
    for min_median_speed in min_median_speed_list:
        for min_travel_distance in min_travel_dist_list:
            for min_travel_time in min_travel_time_list:
                preprocess(minSIZE, maxGAP, min_travel_time, min_median_speed,
                           min_travel_distance, filedate, 'CONSU')
                preprocess(minSIZE, maxGAP, min_travel_time, min_median_speed,
                           min_travel_distance, filedate, 'FLEET')
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    total_sentences = 0
    eng = []
    fre = []
    for subdir, dirs, files in os.walk(train_dir):
        if total_sentences == num_sentences:
            break
        for file in files:
            if total_sentences == num_sentences:
                break
            if file.split(".")[-1] == 'e':
                # print("total sentences")
                # print(total_sentences)
                # print(file)
                fullFile_eng = os.path.join(subdir, file)
                f_eng = open(fullFile_eng)
                fre_file = file[0:-1] + 'f'
                # print(fre_file)
                fullFile_fre = os.path.join(subdir, fre_file)
                f_fre = open(fullFile_fre)
                eng_training = f_eng.readlines()
                fre_training = f_fre.readlines()
                # print(len(eng_training))
                for i in range(len(eng_training)):
                    if eng_training[i].strip() != "" and total_sentences < num_sentences:
                        eng_sen = preprocess(eng_training[i], "e")
                        # eng_sen = eng_training[i]
                        eng.append(re.findall(r"[\S]+", eng_sen))
                        fre_sen = preprocess(fre_training[i], "f")
                        # fre_sen = fre_training[i]
                        fre.append(re.findall(r"[\S]+", fre_sen))
                        total_sentences += 1
    # print(eng)
    # print(fre)
    return eng, fre
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # TODO
    # the blue cat <-> le chat bleu
    # the red dog <-> le chein rouge
    # try to build a dict-list style storage:
    # raw_e_AM = {the: [le, chat, bleu, chein, rouge],
    #             blue: [le, chat, bleu],
    #             cat: [le, chat, bleu],
    #             red: [le, chein, rouge],
    #             dog: [le, chein, rouge]}
    raw_e_AM = {}
    raw_f_AM = {}

    if os.path.exists(train_dir):
        print("Correct path...")
        # train on all of the data files in the data dir that end in either 'e' for English or 'f' for French
        for subdir, dirs, files in os.walk(train_dir):
            for file in files:
                sent_num = 0
                # only 'e' files need the parallel processing
                if os.path.basename(file)[-1] == "e":
                    file1 = os.path.basename(file)
                    file2 = os.path.basename(file)[:-1] + "f"
                    # open the two files in a parallel way
                    if os.path.exists(train_dir + file2):
                        with open(train_dir + file1) as f1, open(train_dir + file2) as f2:
                            # preprocess every line
                            for x, y in zip(f1, f2):
                                if sent_num < num_sentences:
                                    line1 = preprocess(x, "e").split()
                                    line2 = preprocess(y, "f").split()
                                    # block of raw_e_AM[sent_num][list of e]
                                    raw_e_AM[sent_num] = line1
                                    # block of raw_f_AM[sent_num][list of f]
                                    raw_f_AM[sent_num] = line2
                                    sent_num += 1
    else:
        print("Path " + train_dir + " does not exist ...")
    # print(raw_e_AM)
    # print(raw_f_AM)
    return raw_e_AM, raw_f_AM
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Training/'
    num_sentences : (int) the maximum number of training sentences to consider

    OUTPUT:
    aligned_sentences : (list of list of string) aligned_sentences[0][n] is the English
                        sentence aligned with the French sentence aligned_sentences[1][n]

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # we assume that for each .e file, there is a corresponding .f file with the same name in the directory
    aligned_sentences = [[], []]
    e_sentences_read = 0
    f_sentences_read = 0
    for filename in glob.iglob(train_dir + "*.e"):
        base_name = os.path.basename(filename)[:-2]
        f_filename = glob.glob(train_dir + base_name + ".f")[0]

        # English
        with open(filename) as english_file:
            if e_sentences_read >= num_sentences:
                break
            for line in english_file:
                if e_sentences_read >= num_sentences:
                    break
                aligned_sentences[0].append(preprocess(line, "e"))
                e_sentences_read += 1

        # French
        with open(f_filename) as french_file:
            if f_sentences_read >= num_sentences:
                break
            for line in french_file:
                if f_sentences_read >= num_sentences:
                    break
                aligned_sentences[1].append(preprocess(line, "f"))
                f_sentences_read += 1

    return aligned_sentences
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # TODO
    sents_e = []
    sents_f = []
    num_sentences = 1000
    for subdir, dirs, files in os.walk(train_dir):
        for file in sorted(files):
            if file == '.DS_Store':
                continue
            fullFile = os.path.join(subdir, file)
            if file[-1] == 'e':
                read_file_e = open(fullFile, 'r')
                read_data_e = read_file_e.read()
                data_e = read_data_e.split('\n')
                read_file_f = open(fullFile[:-1] + 'f', 'r')
                read_data_f = read_file_f.read()
                data_f = read_data_f.split('\n')
                for i in range(len(data_e)):
                    # print(data)
                    sents_e.append(preprocess(data_e[i], file[-1]))
                    sents_f.append(preprocess(data_f[i], file[-1]))
                    if len(sents_e) == num_sentences:
                        print(num_sentences, ' samples achieved')
                        break
                    else:
                        continue
                    break
                else:
                    continue
                break
            else:
                continue
            break
        else:
            continue
        break

    sents = {'en': sents_e, 'fr': sents_f}
    # print(sents['en'])
    return sents
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Return: (eng, fre) where each of them is a list of lists of preprocessed
    eng or fre words in sentences of the train_dir

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # TODO
    counter = 0
    training_set = {'eng': [], 'fre': []}
    for root, dirs, files in os.walk(train_dir):
        for file in files:
            if not (len(file) > 2 and file[-1] == 'e' and file[-2] == '.'):  # .e
                continue
            e_fullName = os.path.join(train_dir, file)
            f_fullName = e_fullName[:-1] + 'f'
            if not os.path.exists(f_fullName):  # skip English files without a French counterpart
                continue
            e_file = open(e_fullName)
            f_file = open(f_fullName)
            e_readLine = e_file.readline()
            f_readLine = f_file.readline()
            while e_readLine:  # "" is falsy, so this stops at end of file
                if not f_readLine:
                    break
                training_set['eng'].append(preprocess(e_readLine, 'e').split())
                training_set['fre'].append(preprocess(f_readLine, 'f').split())
                counter += 1
                if counter >= num_sentences:
                    e_file.close()
                    f_file.close()
                    return training_set['eng'], training_set['fre']
                e_readLine = e_file.readline()
                f_readLine = f_file.readline()
            e_file.close()
            f_file.close()
    return training_set['eng'], training_set['fre']
def test_parameter():
    minSIZE = 1  # 5
    BARRIER = 100000  # 240
    min_travel_time = 1  # 180; minimum number of seconds for a trip to be considered valid
    min_median_speed = 1  # 5; minimum median speed for a trip to be considered valid
    min_travel_distance = 1  # 1000; travel distance in feet, 1000 ft to 0.5 mi (2 or 3 blocks, Qijian asks)

    # preprocess(minSIZE, BARRIER, min_travel_time, min_median_speed, min_travel_distance, filedate, 'OVERAL')
    preprocess(minSIZE, BARRIER, min_travel_time, min_median_speed, min_travel_distance, filedate, 'CONSU')
    preprocess(minSIZE, BARRIER, min_travel_time, min_median_speed, min_travel_distance, filedate, 'FLEET')
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    sentence = {}
    for subdir, dirs, files in os.walk(train_dir):
        english = []
        french = []
        for file in files:
            filename, file_extension = os.path.splitext(file)
            if file_extension == ".e":
                englishFile = os.path.join(train_dir, file)
                ffilename = filename + ".f"
                frenchFile = os.path.join(train_dir, ffilename)
                with open(englishFile, "r") as e:
                    i = 0
                    for line in e:
                        if i == num_sentences:
                            break
                        i += 1
                        line = preprocess(line, "e")
                        english.append(line.strip("SENTSTART").strip("SENTEND").split())
                with open(frenchFile, "r") as f:
                    i = 0
                    for line in f:
                        if i == num_sentences:
                            break
                        i += 1
                        line = preprocess(line, "f")
                        french.append(line.strip("SENTSTART").strip("SENTEND").split())
        sentence["eng"] = english
        sentence["fre"] = french
    return sentence
def make_corpus(pos, neg, stopword):
    corpus = []
    labels = np.zeros(10662)
    for i in range(5331):
        corpus.append(preprocess(pos[i], stopword))
    for i in range(5331):
        corpus.append(preprocess(neg[i], stopword))
    labels[0:5331] = 1
    return corpus, labels
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # TODO
    # MY NOTE: return two lists of sentences, one for English and one for French
    num_read = 0
    sens_e = []
    sens_f = []
    french_files = []

    # Read num_sentences English sentences from files
    for file in glob.iglob(train_dir + '*.e'):
        french_files.append(file[:-1] + 'f')
        with open(file) as fp:
            for line in fp:
                sens_e.append(preprocess(line, 'e'))
                num_read = num_read + 1
                if num_read >= num_sentences:
                    break
        if num_read >= num_sentences:
            break

    num_read = 0
    # Read num_sentences French sentences from files
    for file in french_files:
        with open(file) as fp:
            for line in fp:
                sens_f.append(preprocess(line, 'f'))
                num_read = num_read + 1
                if num_read >= num_sentences:
                    break
        if num_read >= num_sentences:
            break

    return (sens_e, sens_f)
def evalAlign(max_iter):
    '''
    Translate the 25 French sentences in /u/cs401/A2_SMT/data/Hansard/Testing/Task5.f
    with the decode function and evaluate them using corresponding reference sentences,
    specifically:

    1. /u/cs401/A2_SMT/data/Hansard/Testing/Task5.e, from the Hansards.
    2. /u/cs401/A2_SMT/data/Hansard/Testing/Task5.google.e, Google's translations of the French phrases.

    To evaluate each translation, use the BLEU score from lecture 6.

    Repeat this task with at least four alignment models (trained on 1K, 10K, 15K, and 30K
    sentences, respectively) and with three values of n in the BLEU score (i.e., n = 1, 2, 3).
    You should therefore have 25 x 4 x 3 BLEU scores in your evaluation.
    '''
    bleu = np.zeros(shape=(25, 4, 3))
    train_dir = "/u/cs401/A2_SMT/data/Hansard/Training/"
    LM = lm_train(train_dir, "e", "fn_LM_e")
    num_sentences = [1000, 10000, 15000, 30000]
    for n in range(len(num_sentences)):
        n_s = num_sentences[n]
        AM = align_ibm1(train_dir, n_s, max_iter, "fm_AM_e_{}".format(n_s))
        with open("/u/cs401/A2_SMT/data/Hansard/Testing/Task5.f") as candidate_sentences, \
                open("/u/cs401/A2_SMT/data/Hansard/Testing/Task5.e") as ref_1, \
                open("/u/cs401/A2_SMT/data/Hansard/Testing/Task5.google.e") as ref_2:
            candidate_sentences = candidate_sentences.readlines()
            ref_1 = ref_1.readlines()
            ref_2 = ref_2.readlines()
            for i in range(len(candidate_sentences)):
                sentence = candidate_sentences[i].strip()
                sentence = preprocess(sentence, "f")
                ref_1_sentence = preprocess(ref_1[i].strip(), "e")
                ref_2_sentence = preprocess(ref_2[i].strip(), "e")
                english = decode(sentence, LM, AM)
                bleu[i][n][0] = BLEU_score(english, [ref_1_sentence, ref_2_sentence], 1)
                bleu[i][n][1] = BLEU_score(english, [ref_1_sentence, ref_2_sentence], 2)
                bleu[i][n][2] = BLEU_score(english, [ref_1_sentence, ref_2_sentence], 3)
    return bleu
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    sentences_e = []
    sentences_f = []
    sentence_count = 0
    for filename in os.listdir(train_dir):
        if filename.endswith(".e") or filename.endswith(".f"):
            name, ext = os.path.splitext(filename)
            name = os.path.join(train_dir, name)
            print(name)
            f_e = open(name + '.e', 'r')
            f_f = open(name + '.f', 'r')
            while sentence_count < num_sentences:
                # English: read a line
                e_line = f_e.readline()
                e_line = e_line.rstrip()
                if not e_line:
                    break
                e_line = preprocess(e_line, 'e')
                # French: read a line
                f_line = f_f.readline()
                f_line = f_line.rstrip()
                if not f_line:
                    break
                f_line = preprocess(f_line, 'f')
                # append lines to whatever it is I'm returning
                # print('\t' + e_line)
                # print('\t' + f_line)
                sentences_e.append(e_line)
                sentences_f.append(f_line)
                sentence_count += 1
            if sentence_count >= num_sentences:
                break
    return sentences_e, sentences_f
def get_gram_counts(data_dir, language):
    # A LIST THAT WILL CONTAIN ALL SENTENCES OF EACH .f OR .e FILE AS A LIST
    data_list = []
    uni_dict = {}
    bi_dict = {}
    # ITERATE THROUGH EACH FILE IN THE TRAINING DATA
    for subdir, dirs, files in os.walk(data_dir):
        for file in files:
            if file == ".DS_Store":
                continue
            if file.endswith(language):
                # CHECK IF FILE IS OF CORRECT LANGUAGE
                process = True
            else:
                process = False
            if process:
                # PROCESS SENTENCES IN THE FILE
                print("Processing file: " + file)
                path = data_dir + file
                hansard_file = open(path, 'r')
                for sentence in hansard_file.readlines():
                    processed_sentence = preprocess(sentence, language)
                    data_list.append(processed_sentence)
                    uni_dict, bi_dict = compute_dicts(processed_sentence, uni_dict, bi_dict)
    return data_list, uni_dict, bi_dict
def main():
    data_filepath = '../data/userid-timestamp-artid-artname-traid-traname.tsv'
    user_data_filepath = '../data/userid-profile.tsv'
    user_events, user_info_dict, num_songs, artist_ids, track_ids, country_dict = preprocess(
        data_filepath, user_data_filepath)

    user_ids = list(user_events.keys())
    random.shuffle(user_ids)
    user_ids = np.array(user_ids)

    # Train/Test split
    train_user_ids = user_ids[:int(len(user_ids) * .85)]
    test_user_ids = user_ids[int(len(user_ids) * .85):]

    train_user_events = {}
    train_user_info = {}
    for user_id in train_user_ids:
        train_user_events[user_id] = user_events[user_id]
        train_user_info[user_id] = user_info_dict[user_id]

    test_user_events = {}
    test_user_info = {}
    for user_id in test_user_ids:
        test_user_events[user_id] = user_events[user_id]
        test_user_info[user_id] = user_info_dict[user_id]

    model = Model(num_songs)
    epochs = 5
    for epoch in range(epochs):
        train(model, train_user_events, train_user_info, songs)
        loss = test(model, test_user_events, test_user_info)
        print('Loss after epoch {} = {}'.format(epoch, loss))
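# Illustrative sketch (not part of the original script): the 85/15 train/test split
# used above, shown on a toy list of ids so the snippet runs on its own.
import random

import numpy as np

user_ids = list(range(100))          # stand-in for user_events.keys()
random.shuffle(user_ids)
user_ids = np.array(user_ids)

split = int(len(user_ids) * .85)
train_user_ids = user_ids[:split]
test_user_ids = user_ids[split:]
print(len(train_user_ids), len(test_user_ids))  # 85 15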
def evalAlign(file, references, LM, AM):
    buff = []
    for i in range(len(references)):
        buff.append(open(references[i], "r"))
    with open(file, "r") as f:
        for line in f:
            procFrench = preprocess(line, "f")
            english = decode(procFrench, LM, AM)
            blueRef = []
            for j in range(len(buff)):
                newline = buff[j].readline()
                blueRef.append(newline)
            blue1 = BLEU_score(english, blueRef, 1)
            blue2 = BLEU_score(english, blueRef, 2)
            blue3 = BLEU_score(english, blueRef, 3)
            print(blue1, blue2, blue3)
    for i in buff:
        i.close()
def extract_random_rois(data, dsize, rois_by_image=1000, rng=np.random, flat=True):
    rois = []
    if data is not None:
        for i in range(len(data)):
            img, lung_mask = data.get(i)
            sampled, lce, norm = preprocess(img, lung_mask)
            # Pick LCE images
            side = lce.shape[0]
            assert lung_mask.shape[0] == lce.shape[0]
            # rois = []
            cnt = 0
            while cnt < rois_by_image:
                rx = int(rng.uniform(0, side))
                ry = int(rng.uniform(0, side))
                if lung_mask[rx, ry] > 0:
                    '''
                    print "img shape {}".format(img.shape)
                    print "lce shape {}".format(lce.shape)
                    print "lung_mask shape {}".format(lce.shape)
                    print "lung_mask corner_value {} max_value {}".format(lung_mask[0][0], np.max(lung_mask))
                    print "point {} {}".format(rx, ry)
                    #print 'roi-{}-{}.jpg'.format(i, cnt)
                    #imwrite('roi-{}-{}.jpg'.format(i, cnt), util.extract_roi(lce, (rx, ry, 25), dsize))
                    '''
                    rois.append([util.extract_roi(lce, (rx, ry, 25), dsize)])
                    cnt += 1
            # roi_set.append(rois)
    return np.array(rois)
def processFile(file_path, language):
    with open(file_path) as f:
        content = f.readlines()
    processed_list = []
    for line in content:
        processed_list.append(preprocess(line, language))
    return processed_list
def get_paired_sentences(data_dir):
    """
    Yields the aligned sentences of the documents in data_dir in the
    following form: (english, french)
    """
    for (e_path, f_path) in get_paired_doc_paths(data_dir):
        with open(e_path, "r") as e_file, \
                open(f_path, "r") as f_file:
            for (e_sent, f_sent) in zip(e_file.readlines(), f_file.readlines()):
                e_proc = preprocess(e_sent, "e")
                f_proc = preprocess(f_sent, "f")
                yield (e_proc, f_proc)
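# Illustrative sketch (not from the original sources): zip() keeps the i-th English line
# paired with the i-th French line, which is the alignment property the generator above
# relies on. Shown with in-memory lists standing in for the .e/.f file contents.
english_lines = ["the blue cat", "the red dog"]
french_lines = ["le chat bleu", "le chien rouge"]

for e_sent, f_sent in zip(english_lines, french_lines):
    print((e_sent, f_sent))
# ('the blue cat', 'le chat bleu')
# ('the red dog', 'le chien rouge')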
def __init__(self, train=True):
    self.train = train
    if self.train:
        df = pd.read_csv('./data/training.csv')
        df = preprocess(df)
        self.data = df['Image'].values / 255.0
        self.y = df.drop(['Image'], axis=1).values
    else:
        df = pd.read_csv('./data/testing.csv')
        df = preprocess(df)
        self.data = df['Image'].values / 255.0
        self.y = df['ImageId'].values
    self.samples = []
    for i in range(self.data.shape[0]):
        self.samples.append((self.data[i].reshape(1, 96, 96), self.y[i]))
def good_turing_lm(data_dir, language, fn_LM, usercached=True):
    if usercached:
        with open(fn_LM + '.pickle', 'rb') as input_file:
            LM = pickle.load(input_file)
        return LM

    LM = {}
    LM['uni'] = {}
    LM['bi'] = {}
    for subdir, dirs, files in os.walk(data_dir):
        total = len(files)
        for i in range(total):
            file = files[i]
            fullFile = os.path.join(subdir, file)
            # print("processing:", fullFile, " count:", i+1, '/', total)
            if i % 100 == 0 or i == total - 1:
                print("processed:", i + 1, '/', total)
            if file.endswith(('.' + language)):
                with open(fullFile) as f:
                    f_content = f.readlines()
                content = []  # preprocessed sentences
                for sentence in f_content:
                    # preprocess and strip each sentence
                    content.append(preprocess(sentence.strip(), language))
                LM = construct_GT_LM(content, LM)

    freq = construct_freq(LM)
    LM = GT_smoothing(LM, freq)

    with open(fn_LM + '.pickle', 'wb') as handle:
        pickle.dump(LM, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return LM
def run():
    data = readFromFile(source)
    occipital = []
    for i in range(8, 10):
        occipital.append(data[:, i])

    indices = {'O1': {}, 'O2': {}}
    for i in range(0, 129):
        for key, val in enumerate(occipital):
            current = parse(val[i:])
            PSD_result = preprocess(current)
            peak_array = findPeak(PSD_result)
            if 10 in peak_array:
                peak_pos = peak_array.index(10)
                if peak_pos in indices["O" + str(key + 1)].keys():
                    indices["O" + str(key + 1)][peak_pos] += 1
                else:
                    indices["O" + str(key + 1)][peak_pos] = 1
                print "O" + str(key + 1) + " at second " + str(peak_pos) + " => " + str(peak_array)
    print indices
def cross_validate_hmm(directory, hmm, validation_set):
    sorted_files = sorted(
        os.listdir(directory),
        key=lambda x: (int(re.sub('\D', '', x)), x))

    triplet_list = []  # will contain the 'answers', in order of sorted files
    viterbi_seq = []   # will contain the BIO tag sequence from Viterbi, in order of sorted files

    # loop through sorted files
    for i in xrange(len(sorted_files)):
        # extract only the files in validation_set
        file_name = sorted_files[i]
        if file_name.endswith('.txt') and i in validation_set:
            file_path = os.path.join(directory, file_name)
            # first populate the triplet_list so we have all the info from our validation set
            tags_list = preprocess(file_path)
            triplet_list += tags_list
            if tags_list:
                # next isolate only the tokens and run Viterbi, store the accumulating sequence
                tokens_list = [token for (token, pos_tag, bio_tag) in tags_list]
                viterbi_seq += hmm.viterbi_decode(tokens_list)

    # finally, obtain results metrics: calculate precision, recall, f-score
    correct_tag_seq = [bio_tag for (token, pos_tag, bio_tag) in triplet_list]
    precision = calculate_precision(viterbi_seq, correct_tag_seq)
    recall = calculate_recall(viterbi_seq, correct_tag_seq)
    fscore = calculate_fscore(precision, recall)

    return (precision, recall, fscore)
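# Illustrative sketch (not from the original sources): precision, recall and F1 computed
# directly from a predicted tag sequence and a gold tag sequence, treating every non-'O'
# tag as a positive prediction. This is one common convention, shown on toy data.
predicted = ['B', 'I', 'O', 'B', 'O']
gold = ['B', 'O', 'O', 'B', 'I']

tp = sum(1 for p, g in zip(predicted, gold) if p != 'O' and p == g)
fp = sum(1 for p, g in zip(predicted, gold) if p != 'O' and p != g)
fn = sum(1 for p, g in zip(predicted, gold) if g != 'O' and p != g)

precision = tp / float(tp + fp) if tp + fp else 0.0
recall = tp / float(tp + fn) if tp + fn else 0.0
fscore = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
print(precision, recall, fscore)  # 0.666..., 0.666..., 0.666...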
def preplexity(LM, test_dir, language, smoothing=False, delta=0):
    """
    Computes the perplexity of a language model given a test corpus

    INPUT:

    LM :        (dictionary) the language model trained by lm_train
    test_dir :  (string) The top-level directory name containing data
                e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    language :  (string) either 'e' (English) or 'f' (French)
    smoothing : (boolean) True for add-delta smoothing, False for no smoothing
    delta :     (float) smoothing parameter where 0 < delta <= 1
    """
    files = os.listdir(test_dir)

    pp = 0
    N = 0
    vocab_size = len(LM["uni"])

    for ffile in files:
        if ffile.split(".")[-1] != language:
            continue

        opened_file = open(test_dir + ffile, "r")
        for line in opened_file:
            processed_line = preprocess(line, language)
            tpp = log_prob(processed_line, LM, smoothing, delta, vocab_size)

            if tpp > float("-inf"):
                pp = pp + tpp
                N += len(processed_line.split())
        opened_file.close()

    if N > 0:
        pp = 2**(-pp / N)
    return pp
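# Illustrative sketch (not from the original sources): the perplexity formula used above,
# PP = 2^(-(sum of log2 sentence probabilities) / N), computed on toy values so the
# snippet runs on its own.
log_probs = [-12.0, -9.5, -14.25]   # log2 probabilities of three "sentences"
N = 18                              # total number of tokens scored
perplexity = 2 ** (-sum(log_probs) / N)
print(perplexity)  # ~3.96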
def test_preprocess(self):
    ims, filenames, failed = batch_imread(self.folder_path)
    for i in range(len(ims)):
        im = cv2.imread(os.path.join(self.folder_path, filenames[i]))
        res = preprocess(im)
        if res is not None:
            self.assertEqual(res.shape, (256, 256, 3))
def get_test_data(files, batch_size=4, img_size=(720, 1280), **kwargs):
    """
    Generator for test data

    # Params
    - files : list of files (e.g., output of glob)
    - batch_size : number of images per batch
    - img_size : size to resize images to (height, width)
    - kwargs : passed to the preprocess function

    # Returns
    - batches of (batch_size, 3, img_size[0], img_size[1])
    """
    i = 0
    n = len(files)
    # cycle to avoid batches not lining up with dataset size
    files = files + files
    while True:
        batch = np.zeros((batch_size, 3) + img_size)
        for j in range(batch_size):
            img = load_image(files[i])
            img = preprocess(img, target_size=img_size, augmentation=False,
                             zero_center=True, scale=1. / 255., **kwargs)
            batch[j] = img
            i = (i + 1) % n
        yield batch
def main(args):
    # Load the model.
    model_path = Path(args.model_path)
    model = lgb.Booster(model_file=args.model_path)

    # Load the data.
    csv_path = Path(args.csv_path)
    df = pd.read_csv(csv_path)
    data = preprocess(df, args.sdh)

    data_num_predictors = data.shape[1]
    model_num_predictors = model.num_feature()
    if model_num_predictors != data_num_predictors:
        raise ValueError(f"Model expects {model_num_predictors} predictors, " +
                         f"got {data_num_predictors} predictors.")

    # Run the model.
    prediction = np.clip(model.predict(data), 0, 400000) + COST_ADJUSTMENT
    patient_costs = list(zip(df[PATIENT], prediction))
    print("Patient\tCost")
    for patient, cost in patient_costs:
        print(f"{patient}\t{cost}")
    print()

    print(f"Saving predicted cost data to {args.save_path}.")
    costs_df = pd.DataFrame(patient_costs, columns=[PATIENT, "Cost"])
    costs_df.to_csv(args.save_path, index=False)
def main(args):
    """
    #TODO: Perform outlined tasks in assignment, like loading alignment
    models, computing BLEU scores etc.
    (You may use the helper functions)

    It's entirely up to you how you want to write Task5.txt.
    This is just a (sparse) example.
    """
    max_iter = 100
    num_sent = 1000
    data_dir = "../data/Hansard/Training/"
    language = 'e'
    fn_LM = 'e_temp_lm'
    fn_AM = 'e_temp_am'
    bleu_n = 1
    bleu_score = []

    LM = _getLM(data_dir, language, fn_LM)
    AM = _getAM(data_dir, num_sent, max_iter, fn_AM)

    read_file_f = open('../data/Hansard/Testing/Task5.f', 'r')
    read_data_f = read_file_f.read()
    data_f = read_data_f.split('\n')

    read_file_e = open('../data/Hansard/Testing/Task5.e', 'r')
    read_data_e = read_file_e.read()
    data_e = read_data_e.split('\n')

    read_file_er = open('../data/Hansard/Testing/Task5.google.e', 'r')
    read_data_er = read_file_er.read()
    data_er = read_data_er.split('\n')

    # print(len(data_f), len(data_e), len(data_er))
    e = []
    ref1 = []
    ref2 = []
    for i in range(len(data_f)):
        f_prep = preprocess(data_f[i], 'f')
        e.append(decode(f_prep, LM, AM))
        ref1.append(preprocess(data_e[i], 'e'))
        ref2.append(preprocess(data_er[i], 'e'))

    scores = _get_BLEU_scores(e, ref1, ref2, bleu_n)
    print('done')
def train(input_A, input_B, g_type=g_type, n_epochs=n_epochs, n_features=n_features,
          n_frames=n_frames, log_dir=log_dir, model_dir=model_dir):
    generator_lr = 0.0002
    generator_lr_decay = generator_lr / 200000
    discriminator_lr = 0.0001
    discriminator_lr_decay = discriminator_lr / 200000
    cycle_lambda = 10
    identity_lambda = 5

    # Preprocess the datasets
    A_norm, B_norm = preprocess(input_A, input_B)

    # CycleGAN voice-conversion model
    model = CycleGAN(num_features=n_features, g_type=g_type, log_dir=log_dir)

    print("Start Training...")
    for epoch in range(n_epochs):
        print("Epoch : %d " % epoch)
        start_time = time.time()

        # randomly sample training data
        train_A, train_B = sample_train_data(dataset_A=A_norm, dataset_B=B_norm,
                                             n_frames=n_frames)
        n_samples = train_A.shape[0]

        for i in range(n_samples):  # mini-batch size = 1
            n_iter = n_samples * epoch + i
            if n_iter > 10000:
                identity_lambda = 0
            if n_iter > 200000:
                generator_lr = max(0, generator_lr - generator_lr_decay)
                discriminator_lr = max(0, discriminator_lr - discriminator_lr_decay)

            start = i
            end = start + 1

            generator_loss, discriminator_loss = model.train(
                input_A=train_A[start:end], input_B=train_B[start:end],
                cycle_lambda=cycle_lambda, identity_lambda=identity_lambda,
                generator_lr=generator_lr, discriminator_lr=discriminator_lr)

        end_time = time.time()
        epoch_time = end_time - start_time
        print("Generator Loss : %f, Discriminator Loss : %f, Time : %02d:%02d"
              % (generator_loss, discriminator_loss,
                 (epoch_time % 3600 // 60), (epoch_time % 60 // 1)))

        model.save(directory=model_dir, filename="model")
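# Illustrative sketch (not from the original sources): the linear learning-rate decay
# used above, where the rate shrinks by a fixed step each iteration and is clamped at
# zero. Shown with small toy numbers so the snippet runs on its own.
lr = 0.0002
decay_steps = 10
lr_decay = lr / decay_steps

for step in range(decay_steps + 2):
    print(step, lr)
    lr = max(0, lr - lr_decay)  # never goes below zero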
def detect_plates(image):
    """Detects possible number plate regions in an image."""
    grey, bw = preprocess(image)
    contours = set(find_characters(grey, bw))
    for cluster in find_clusters(contours):
        plate = extract_plate(image, cluster)
        if plate is not None:
            yield plate
def run(data_path, read_as, algorithm):
    # Read the data
    print "Reading the dataset:", data_path
    try:
        data_reading_function = globals()[read_as]
        raw_data = data_reading_function(data_path)
    except KeyError:
        print "Data reading function is not located. Please check it again."
        raise

    random.seed(17)

    # Preprocess the data
    X_train, X_test, y_train, y_test, constraints = preprocess(raw_data)

    # Apply a learning algorithm
    try:
        training_algorithm = globals()[algorithm]
        clf = training_algorithm(X_train, y_train)
    except KeyError:
        print "Training algorithm is not located. Please check it again."
        raise

    # Make a prediction
    y_pred = clf.predict(X_test)
    print "Predicted values:", y_pred

    # Evaluate the prediction
    print "Evaluating results..."
    results = dict()
    results['Precision'] = float(metrics.precision_score(y_test, y_pred))
    results['Recall'] = float(metrics.recall_score(y_test, y_pred))
    results['F1 score'] = float(metrics.f1_score(y_test, y_pred))
    results['Mean accuracy'] = float(clf.score(X_test, y_test))
    print "Precision: \t", results['Precision']
    print "Recall: \t", results['Recall']
    print "F1 score: \t", results['F1 score']
    print "Mean accuracy: \t", results['Mean accuracy']

    return results
opened = codecs.open(p + "/" + file, "r", "utf8", errors="ignore")  # errors due to BOMs
pos = True
for line in opened:
    if line.startswith("#"):
        if "#stance=stance2" in line:
            pos = False
    else:
        if pos == True:
            posts_pos.append(line)
        else:
            posts_neg.append(line)
opened.close()

print("Loaded %d positive posts for topic '%s'" % (len(posts_pos), topic))
print("Loaded %d negative posts for topic '%s'" % (len(posts_neg), topic))
print("Preprocessing...")

preprocessed = preprocess(" ".join(posts_pos))
outFile = "data/poldeb/" + topic + ".pos.txt.tok"
topicFiles.append(outFile)
opened = codecs.open(outFile, "w", "utf8")
opened.write(preprocessed)
opened.close()
print("Wrote posts to file %s" % outFile)

preprocessed = preprocess(" ".join(posts_neg))
outFile = "data/poldeb/" + topic + ".neg.txt.tok"
topicFiles.append(outFile)
opened = codecs.open(outFile, "w", "utf8")
opened.write(preprocessed)
opened.close()
print("Wrote posts to file %s" % outFile)
from preprocess import *
from visual import *

X, y = preprocess('data/ml2013final_train.dat')
dataset = zip(X, y)
output_dataset(dataset)
from preprocess import *
import ipdb

min_maxs, normed_data = preprocess(61, 2, True)  # len(pattern_file_names), True)
ipdb.set_trace()
from preprocess import *
from visual import *
from sklearn import cross_validation, svm

X, y = load_preprocessed_dataset('data/rescaled28x28.dat')
X_test, _ = preprocess('data/test1.dat')

clf = svm.SVC(C=2., kernel='poly', degree=9, gamma=1. / 512., coef0=1).fit(X, y)
yhat = clf.predict(X_test)

# Output the predicted classes, one per line
for i in yhat:
    print i
import numpy as np

from resistance import score
from preprocess import *

positions = preprocess("data/raw_games.dat")

print "scoring positions..."
scores = np.empty((positions.shape[0], boardsize, boardsize))
num_positions = positions.shape[0]
output_interval = num_positions / 100
for i in range(num_positions):
    if (i % output_interval == 0):
        print "completion: ", i / output_interval
    try:
        scores[i] = score(positions[i], 0)
    # if for some reason an uncaught singularity occurs just skip this position
    except np.linalg.linalg.LinAlgError:
        print "singular position at ", str(i), ": ", state_string(positions[i])
        i -= 1

print "saving to file..."
savefile = open("data/scoredPositionsFull.npz", 'w')
np.savez(savefile, positions=positions, scores=scores)
def main():
    # load data
    X_train, Y_train, X_valid, Y_valid, X_test = load_data(training_dir, valid_dir, test_dir, labels, sample)

    # preprocess data by mean subtraction and normalization
    X_train, X_valid, X_test = preprocess(X_train, X_valid, X_test)
    # del X_train
    # del X_test

    # or load pre-processed data from a previously saved hdf5 file:
    '''
    data=h5py.File('imagenet.transpose.individually.augment.hdf5','r')
    X_train=np.asarray(data['X_train'])
    Y_train=np.asarray(data['Y_train'])
    X_valid=np.asarray(data['X_valid'])
    Y_valid=np.asarray(data['Y_valid'])
    X_test=np.asarray(data['X_test'])
    '''
    # print "loaded data from pickle"

    # OPTIONAL: save loaded/pre-processed data to an hdf5 file to save time in the future
    # print "saving preprocessed data to hdf5 file"
    f = h5py.File('imagenet.transpose.individually.augment.contrast.tint.hdf5', 'w')
    dset_xtrain = f.create_dataset("X_train", data=X_train)
    dset_ytrain = f.create_dataset("Y_train", data=Y_train)
    dset_xvalid = f.create_dataset("X_valid", data=X_valid)
    dset_yvalid = f.create_dataset("Y_valid", data=Y_valid)
    dset_xtest = f.create_dataset("X_test", data=X_test)
    f.flush()
    f.close()
    # print "done saving pre-processed data to hdf5 file!"

    pretrained_model = pretrained('pretrained_model.h5', False)
    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
    pretrained_model.compile(optimizer=sgd, loss='categorical_crossentropy', trainLayersIndividually=0)

    # do some training!
    print "compilation finished, fitting model"
    print "pretrained_model.trainLayersIndividually:" + str(pretrained_model.trainLayersIndividually)
    if pretrained_model.trainLayersIndividually == 1:
        train_epochs = 5
    else:
        train_epochs = 5
    history = pretrained_model.fit(X_train, Y_train, 128, train_epochs,
                                   validation_data=tuple([X_valid, Y_valid]),
                                   verbose=1, show_accuracy=True)
    pretrained_model.save_weights("assignment3_weights_nodropout_noregularization_augmenteddata.3epochs.contrast.tint.hdf5", overwrite=True)

    class_predictions = pretrained_model.predict_classes(X_test)
    np.savetxt('assignment3_class_predictions_nodropout_noregularization_augmenteddata.3epochs.contrast.tint.txt',
               class_predictions, fmt='%i', delimiter='\t')

    train_scores = pretrained_evaluate(pretrained_model, X_train, Y_train)
    print "pretrained model training scores:" + str(train_scores)
    valid_scores = pretrained_evaluate(pretrained_model, X_valid, Y_valid)
    print "pretrained validation scores:" + str(valid_scores)

    print "writing out the predictions file"
    predictions = open('assignment3_class_predictions_nodropout_noregularization_augmenteddata.3epochs.contrast.tint.txt', 'r').read().split('\n')
    while '' in predictions:
        predictions.remove('')
    wnids = open(labels, 'r').read().split('\n')
    while '' in wnids:
        wnids.remove('')
    cur_dir = test_dir + "images/"
    onlyfiles = [f for f in listdir(cur_dir) if isfile(join(cur_dir, f))]
    entries = 10000
    outf = open('assignment3_class_predictions_nodropout_noregularization_augmenteddata.formatted.3epochs.contrast.tint.txt', 'w')
    for i in range(entries):
        image_name = onlyfiles[i]
        predict_index = int(predictions[i])
        wnid1 = wnids[predict_index]
        outf.write(image_name + '\t' + str(wnid1) + '\n')
def evaluate_lenet5(learning_rate=.1, n_epochs=200, dataset='ml2013final_train.dat',
                    nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    print "learning_rate = ", learning_rate

    rng = numpy.random.RandomState(23455)

    X, Y = shared_dataset(preprocess(dataset))

    # get validation set
    length = Y.owner.inputs[0].get_value(borrow=True).shape[0]
    kf = KFold(length, n_folds=5)
    # keep 80% for training and 20% for validation (test set = validation set here)
    for i, (train_index, valid_index) in enumerate(kf):
        if i == 0:
            train_set_x, valid_set_x = X[train_index], X[valid_index]
            train_set_y, valid_set_y = Y[train_index], Y[valid_index]
            train_set_x = theano.shared(numpy.asarray(train_set_x.eval(), dtype=theano.config.floatX))
            train_set_y = theano.shared(numpy.asarray(train_set_y.eval(), dtype=theano.config.floatX))
            train_set_y = T.cast(train_set_y, 'int32')
            valid_set_x = theano.shared(numpy.asarray(valid_set_x.eval(), dtype=theano.config.floatX))
            valid_set_y = theano.shared(numpy.asarray(valid_set_y.eval(), dtype=theano.config.floatX))
            valid_set_y = T.cast(valid_set_y, 'int32')

    # set test set = validation set
    test_set_x = valid_set_x
    test_set_y = valid_set_y

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of [int] labels

    ishape = (28, 28)  # this is the size of the image

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4,
                         n_out=500, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=12)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([index], layer3.errors(y),
                                 givens={
                                     x: test_set_x[index * batch_size: (index + 1) * batch_size],
                                     y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function([index], layer3.errors(y),
                                     givens={
                                         x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                                         y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by SGD.
    # Since this model has many parameters, it would be tedious to manually
    # create an update rule for each model parameter. We thus create the
    # updates list by automatically looping over all (params[i], grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function([index], cost, updates=updates,
                                  givens={
                                      x: train_set_x[index * batch_size: (index + 1) * batch_size],
                                      y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000               # look at this many examples regardless
    patience_increase = 2          # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                   # go through this many minibatches before checking the network
                                   # on the validation set; in this case we check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if epoch > 30:
                # Save the model
                print 'saving model..'
                save_file = open('cnn_%s.txt' % epoch, 'wb')
                cPickle.dump(params, save_file, -1)
                save_file.close()

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
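# Illustrative sketch (not from the original sources): the patience-based early stopping
# used in the training loop above, reduced to plain Python with a fake list of
# validation losses so the snippet runs on its own.
patience = 4                  # look at this many checks regardless
patience_increase = 2
improvement_threshold = 0.995
best_loss = float('inf')

fake_validation_losses = [0.90, 0.70, 0.69, 0.695, 0.694, 0.693]
for step, loss in enumerate(fake_validation_losses):
    if loss < best_loss:
        if loss < best_loss * improvement_threshold:
            patience = max(patience, step * patience_increase)
        best_loss = loss
    if patience <= step:
        print("stopping at step", step, "best loss", best_loss)  # stops at step 4
        break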