def _get_BLEU_scores(eng_decoded, eng, google_refs, n):
    """
    Parameters
    ----------
    eng_decoded : an array of decoded sentences
    eng : an array of reference Hansard sentences
    google_refs : an array of reference Google-translated sentences
    n : the 'n' in the n-gram model being used

    Returns
    -------
    An array of evaluation (BLEU) scores for the sentences
    """
    BLEU = []
    for i in range(len(eng_decoded)):  # one sentence at a time
        candidate = eng_decoded[i]
        hansard = preprocess(eng[i], 'e')  # convert to lowercase
        google = preprocess(google_refs[i], 'e')  # convert to lowercase
        references = [hansard, google]

        p = []
        for j in range(1, n + 1):
            score = BLEU_score(candidate, references, j, brevity=False)
            p.append(score)

        bp = BLEU_score(candidate, references, 1, brevity=True)
        pn = 1
        for pi in p:
            pn *= pi
        bp_score = bp * pn**(1 / n)
        BLEU.append(bp_score)

    return BLEU
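# Illustrative sketch (not from the original sources): the combination above is the
# standard BLEU formula, BLEU = BP * (p_1 * p_2 * ... * p_n)^(1/n), shown here with
# hard-coded n-gram precisions so the snippet runs on its own.
def combine_bleu(brevity_penalty, precisions):
    product = 1.0
    for p in precisions:
        product *= p
    return brevity_penalty * product ** (1.0 / len(precisions))

# Example: BP = 0.9, unigram/bigram/trigram precisions 0.8, 0.5, 0.25
print(combine_bleu(0.9, [0.8, 0.5, 0.25]))  # ~0.418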
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # TODO
    files = os.listdir(train_dir)
    files = set([f[:-1] for f in files if f[-1] == "e" or f[-1] == "f"])
    # vocab_size = len(LM["uni"])
    english = []
    french = []
    count = 0
    for ffile in files:
        eng_file = open(train_dir + ffile + "e", "r")
        fre_file = open(train_dir + ffile + "f", "r")
        eng_lines = eng_file.readlines()
        fre_lines = fre_file.readlines()
        for i in range(len(eng_lines)):
            if count == num_sentences:
                return english, french
            count += 1
            english.append(preprocess(eng_lines[i].strip(), "e").strip().split()[1:-1])
            french.append(preprocess(fre_lines[i].strip(), "f").strip().split()[1:-1])
    return english, french
def readLine(eContent, fContent, numLineRead, data):
    count = 0
    while count < numLineRead:
        data['e'].append(preprocess(eContent[count], 'e').split())
        data['f'].append(preprocess(fContent[count], 'f').split())
        count += 1
    return data
def test_parameters():
    for min_median_speed in min_median_speed_list:
        for min_travel_distance in min_travel_dist_list:
            for min_travel_time in min_travel_time_list:
                preprocess(minSIZE, maxGAP, min_travel_time, min_median_speed,
                           min_travel_distance, filedate, 'CONSU')
                preprocess(minSIZE, maxGAP, min_travel_time, min_median_speed,
                           min_travel_distance, filedate, 'FLEET')
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    total_sentences = 0
    eng = []
    fre = []
    for subdir, dirs, files in os.walk(train_dir):
        if total_sentences == num_sentences:
            break
        for file in files:
            if total_sentences == num_sentences:
                break
            if file.split(".")[-1] == 'e':
                # print("total sentences")
                # print(total_sentences)
                # print(file)
                fullFile_eng = os.path.join(subdir, file)
                f_eng = open(fullFile_eng)
                fre_file = file[0:-1] + 'f'
                # print(fre_file)
                fullFile_fre = os.path.join(subdir, fre_file)
                f_fre = open(fullFile_fre)
                eng_training = f_eng.readlines()
                fre_training = f_fre.readlines()
                # print(len(eng_training))
                for i in range(len(eng_training)):
                    if eng_training[i].strip() != "" and total_sentences < num_sentences:
                        eng_sen = preprocess(eng_training[i], "e")
                        # eng_sen = eng_training[i]
                        eng.append(re.findall(r"[\S]+", eng_sen))
                        fre_sen = preprocess(fre_training[i], "f")
                        # fre_sen = fre_training[i]
                        fre.append(re.findall(r"[\S]+", fre_sen))
                        total_sentences += 1
    # print(eng)
    # print(fre)
    return eng, fre
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # TODO
    # the blue cat <-> le chat bleu
    # the red dog <-> le chein rouge
    # try to build a dict-list style storage:
    # raw_e_AM = {the: [le, chat, bleu, chein, rouge],
    #             blue: [le, chat, bleu],
    #             cat: [le, chat, bleu],
    #             red: [le, chein, rouge],
    #             dog: [le, chein, rouge]}
    raw_e_AM = {}
    raw_f_AM = {}

    if os.path.exists(train_dir):
        print("Correct path...")
        # train on all of the data files in the data dir that end in either 'e' for English or 'f' for French
        for subdir, dirs, files in os.walk(train_dir):
            for file in files:
                sent_num = 0
                # only 'e' files need the parallel processing
                if os.path.basename(file)[-1] == "e":
                    file1 = os.path.basename(file)
                    file2 = os.path.basename(file)[:-1] + "f"
                    # open the two files in a parallel way
                    if os.path.exists(train_dir + file2):
                        with open(train_dir + file1) as f1, open(train_dir + file2) as f2:
                            # preprocess every line
                            for x, y in zip(f1, f2):
                                if sent_num < num_sentences:
                                    line1 = preprocess(x, "e").split()
                                    line2 = preprocess(y, "f").split()
                                    # block of raw_e_AM[sent_num][list of e]
                                    raw_e_AM[sent_num] = line1
                                    # block of raw_f_AM[sent_num][list of f]
                                    raw_f_AM[sent_num] = line2
                                    sent_num += 1
    else:
        print("Path " + train_dir + " does not exist ...")
    # print(raw_e_AM)
    # print(raw_f_AM)
    return raw_e_AM, raw_f_AM
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Training/'
    num_sentences : (int) the maximum number of training sentences to consider

    OUTPUT:
    aligned_sentences : (list of list of string) aligned_sentences[0][n] is the English
                        sentence aligned with the French sentence aligned_sentences[1][n]

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # we assume that for each .e file, there is a corresponding .f file with the same name in the directory
    aligned_sentences = [[], []]
    e_sentences_read = 0
    f_sentences_read = 0
    for filename in glob.iglob(train_dir + "*.e"):
        base_name = os.path.basename(filename)[:-2]
        f_filename = glob.glob(train_dir + base_name + ".f")[0]

        # English
        with open(filename) as english_file:
            if e_sentences_read >= num_sentences:
                break
            for line in english_file:
                if e_sentences_read >= num_sentences:
                    break
                aligned_sentences[0].append(preprocess(line, "e"))
                e_sentences_read += 1

        # French
        with open(f_filename) as french_file:
            if f_sentences_read >= num_sentences:
                break
            for line in french_file:
                if f_sentences_read >= num_sentences:
                    break
                aligned_sentences[1].append(preprocess(line, "f"))
                f_sentences_read += 1

    return aligned_sentences
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # TODO
    sents_e = []
    sents_f = []
    num_sentences = 1000
    for subdir, dirs, files in os.walk(train_dir):
        for file in sorted(files):
            if file == '.DS_Store':
                continue
            fullFile = os.path.join(subdir, file)
            if file[-1] == 'e':
                read_file_e = open(fullFile, 'r')
                read_data_e = read_file_e.read()
                data_e = read_data_e.split('\n')
                read_file_f = open(fullFile[:-1] + 'f', 'r')
                read_data_f = read_file_f.read()
                data_f = read_data_f.split('\n')
                for i in range(len(data_e)):
                    # print(data)
                    sents_e.append(preprocess(data_e[i], file[-1]))
                    sents_f.append(preprocess(data_f[i], file[-1]))
                    if len(sents_e) == num_sentences:
                        print(num_sentences, ' samples achieved')
                        break
                    else:
                        continue
                    break
                else:
                    continue
                break
            else:
                continue
            break
        else:
            continue
        break

    sents = {'en': sents_e, 'fr': sents_f}
    # print(sents['en'])
    return sents
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Return: (eng, fre) where each of them is a list of lists of preprocessed
    eng or fre words in sentences of the train_dir

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # TODO
    counter = 0
    training_set = {'eng': [], 'fre': []}
    for root, dirs, files in os.walk(train_dir):
        for file in files:
            if not (len(file) > 2 and file[-1] == 'e' and file[-2] == '.'):  # .e
                continue
            e_fullName = os.path.join(train_dir, file)
            f_fullName = e_fullName[:-1] + 'f'
            if not os.path.exists(f_fullName):  # skip English files without a French counterpart
                continue
            e_file = open(e_fullName)
            f_file = open(f_fullName)
            e_readLine = e_file.readline()
            f_readLine = f_file.readline()
            while e_readLine:  # "" is falsy, so this stops at end of file
                if not f_readLine:
                    break
                training_set['eng'].append(preprocess(e_readLine, 'e').split())
                training_set['fre'].append(preprocess(f_readLine, 'f').split())
                counter += 1
                if counter >= num_sentences:
                    e_file.close()
                    f_file.close()
                    return training_set['eng'], training_set['fre']
                e_readLine = e_file.readline()
                f_readLine = f_file.readline()
            e_file.close()
            f_file.close()
    return training_set['eng'], training_set['fre']
def test_parameter():
    minSIZE = 1  # 5
    BARRIER = 100000  # 240
    min_travel_time = 1  # 180; minimum number of seconds for a trip to be considered valid
    min_median_speed = 1  # 5; minimum median speed for a trip to be considered valid
    min_travel_distance = 1  # 1000; travel distance in feet, 1000 ft to 0.5 mi (2 or 3 blocks, Qijian asks)

    # preprocess(minSIZE, BARRIER, min_travel_time, min_median_speed, min_travel_distance, filedate, 'OVERAL')
    preprocess(minSIZE, BARRIER, min_travel_time, min_median_speed, min_travel_distance, filedate, 'CONSU')
    preprocess(minSIZE, BARRIER, min_travel_time, min_median_speed, min_travel_distance, filedate, 'FLEET')
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    sentence = {}
    for subdir, dirs, files in os.walk(train_dir):
        english = []
        french = []
        for file in files:
            filename, file_extension = os.path.splitext(file)
            if file_extension == ".e":
                englishFile = os.path.join(train_dir, file)
                ffilename = filename + ".f"
                frenchFile = os.path.join(train_dir, ffilename)
                with open(englishFile, "r") as e:
                    i = 0
                    for line in e:
                        if i == num_sentences:
                            break
                        i += 1
                        line = preprocess(line, "e")
                        english.append(line.strip("SENTSTART").strip("SENTEND").split())
                with open(frenchFile, "r") as f:
                    i = 0
                    for line in f:
                        if i == num_sentences:
                            break
                        i += 1
                        line = preprocess(line, "f")
                        french.append(line.strip("SENTSTART").strip("SENTEND").split())
        sentence["eng"] = english
        sentence["fre"] = french
    return sentence
def make_corpus(pos, neg, stopword):
    corpus = []
    labels = np.zeros(10662)
    for i in range(5331):
        corpus.append(preprocess(pos[i], stopword))
    for i in range(5331):
        corpus.append(preprocess(neg[i], stopword))
    labels[0:5331] = 1
    return corpus, labels
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    # TODO
    # MY NOTE: return two lists of sentences, one for English and one for French
    num_read = 0
    sens_e = []
    sens_f = []
    french_files = []

    # Read num_sentences English sentences from files
    for file in glob.iglob(train_dir + '*.e'):
        french_files.append(file[:-1] + 'f')
        with open(file) as fp:
            for line in fp:
                sens_e.append(preprocess(line, 'e'))
                num_read = num_read + 1
                if num_read >= num_sentences:
                    break
        if num_read >= num_sentences:
            break

    num_read = 0
    # Read num_sentences French sentences from files
    for file in french_files:
        with open(file) as fp:
            for line in fp:
                sens_f.append(preprocess(line, 'f'))
                num_read = num_read + 1
                if num_read >= num_sentences:
                    break
        if num_read >= num_sentences:
            break

    return (sens_e, sens_f)
def evalAlign(max_iter):
    '''
    Translate the 25 French sentences in /u/cs401/A2_SMT/data/Hansard/Testing/Task5.f
    with the decode function and evaluate them using corresponding reference sentences,
    specifically:

    1. /u/cs401/A2_SMT/data/Hansard/Testing/Task5.e, from the Hansards.
    2. /u/cs401/A2_SMT/data/Hansard/Testing/Task5.google.e, Google's translations of the French phrases.

    To evaluate each translation, use the BLEU score from lecture 6.

    Repeat this task with at least four alignment models (trained on 1K, 10K, 15K, and 30K
    sentences, respectively) and with three values of n in the BLEU score (i.e., n = 1, 2, 3).
    You should therefore have 25 x 4 x 3 BLEU scores in your evaluation.
    '''
    bleu = np.zeros(shape=(25, 4, 3))
    train_dir = "/u/cs401/A2_SMT/data/Hansard/Training/"
    LM = lm_train(train_dir, "e", "fn_LM_e")
    num_sentences = [1000, 10000, 15000, 30000]
    for n in range(len(num_sentences)):
        n_s = num_sentences[n]
        AM = align_ibm1(train_dir, n_s, max_iter, "fm_AM_e_{}".format(n_s))
        with open("/u/cs401/A2_SMT/data/Hansard/Testing/Task5.f") as candidate_sentences, \
                open("/u/cs401/A2_SMT/data/Hansard/Testing/Task5.e") as ref_1, \
                open("/u/cs401/A2_SMT/data/Hansard/Testing/Task5.google.e") as ref_2:
            candidate_sentences = candidate_sentences.readlines()
            ref_1 = ref_1.readlines()
            ref_2 = ref_2.readlines()
            for i in range(len(candidate_sentences)):
                sentence = candidate_sentences[i].strip()
                sentence = preprocess(sentence, "f")
                ref_1_sentence = preprocess(ref_1[i].strip(), "e")
                ref_2_sentence = preprocess(ref_2[i].strip(), "e")
                english = decode(sentence, LM, AM)
                bleu[i][n][0] = BLEU_score(english, [ref_1_sentence, ref_2_sentence], 1)
                bleu[i][n][1] = BLEU_score(english, [ref_1_sentence, ref_2_sentence], 2)
                bleu[i][n][2] = BLEU_score(english, [ref_1_sentence, ref_2_sentence], 3)
    return bleu
def read_hansard(train_dir, num_sentences):
    """
    Read up to num_sentences from train_dir.

    INPUTS:
    train_dir :     (string) The top-level directory name containing data
                    e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    num_sentences : (int) the maximum number of training sentences to consider

    Make sure to preprocess!
    Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f.

    Make sure to read the files in an aligned manner.
    """
    sentences_e = []
    sentences_f = []
    sentence_count = 0
    for filename in os.listdir(train_dir):
        if filename.endswith(".e") or filename.endswith(".f"):
            name, ext = os.path.splitext(filename)
            name = os.path.join(train_dir, name)
            print(name)
            f_e = open(name + '.e', 'r')
            f_f = open(name + '.f', 'r')
            while sentence_count < num_sentences:
                # English: read a line
                e_line = f_e.readline()
                e_line = e_line.rstrip()
                if not e_line:
                    break
                e_line = preprocess(e_line, 'e')
                # French: read a line
                f_line = f_f.readline()
                f_line = f_line.rstrip()
                if not f_line:
                    break
                f_line = preprocess(f_line, 'f')
                # append lines to whatever it is I'm returning
                # print('\t' + e_line)
                # print('\t' + f_line)
                sentences_e.append(e_line)
                sentences_f.append(f_line)
                sentence_count += 1
            if sentence_count >= num_sentences:
                break
    return sentences_e, sentences_f
def get_gram_counts(data_dir, language):
    # A LIST THAT WILL CONTAIN ALL SENTENCES OF EACH .f OR .e FILE AS A LIST
    data_list = []
    uni_dict = {}
    bi_dict = {}
    # ITERATE THROUGH EACH FILE IN THE TRAINING DATA
    for subdir, dirs, files in os.walk(data_dir):
        for file in files:
            if file == ".DS_Store":
                continue
            if file.endswith(language):
                # CHECK IF FILE IS OF CORRECT LANGUAGE
                process = True
            else:
                process = False
            if process:
                # PROCESS SENTENCES IN THE FILE
                print("Processing file: " + file)
                path = data_dir + file
                hansard_file = open(path, 'r')
                for sentence in hansard_file.readlines():
                    processed_sentence = preprocess(sentence, language)
                    data_list.append(processed_sentence)
                    uni_dict, bi_dict = compute_dicts(processed_sentence, uni_dict, bi_dict)
    return data_list, uni_dict, bi_dict
def main():
    data_filepath = '../data/userid-timestamp-artid-artname-traid-traname.tsv'
    user_data_filepath = '../data/userid-profile.tsv'
    user_events, user_info_dict, num_songs, artist_ids, track_ids, country_dict = preprocess(
        data_filepath, user_data_filepath)

    user_ids = list(user_events.keys())
    random.shuffle(user_ids)
    user_ids = np.array(user_ids)

    # Train/Test split
    train_user_ids = user_ids[:int(len(user_ids) * .85)]
    test_user_ids = user_ids[int(len(user_ids) * .85):]

    train_user_events = {}
    train_user_info = {}
    for user_id in train_user_ids:
        train_user_events[user_id] = user_events[user_id]
        train_user_info[user_id] = user_info_dict[user_id]

    test_user_events = {}
    test_user_info = {}
    for user_id in test_user_ids:
        test_user_events[user_id] = user_events[user_id]
        test_user_info[user_id] = user_info_dict[user_id]

    model = Model(num_songs)
    epochs = 5
    for epoch in range(epochs):
        train(model, train_user_events, train_user_info, songs)
        loss = test(model, test_user_events, test_user_info)
        print('Loss after epoch {} = {}'.format(epoch, loss))
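# Illustrative sketch (not part of the original script): the 85/15 train/test split
# used above, shown on a toy list of ids so the snippet runs on its own.
import random

import numpy as np

user_ids = list(range(100))          # stand-in for user_events.keys()
random.shuffle(user_ids)
user_ids = np.array(user_ids)

split = int(len(user_ids) * .85)
train_user_ids = user_ids[:split]
test_user_ids = user_ids[split:]
print(len(train_user_ids), len(test_user_ids))  # 85 15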
def evalAlign(file, references, LM, AM):
    buff = []
    for i in range(len(references)):
        buff.append(open(references[i], "r"))
    with open(file, "r") as f:
        for line in f:
            procFrench = preprocess(line, "f")
            english = decode(procFrench, LM, AM)
            blueRef = []
            for j in range(len(buff)):
                newline = buff[j].readline()
                blueRef.append(newline)
            blue1 = BLEU_score(english, blueRef, 1)
            blue2 = BLEU_score(english, blueRef, 2)
            blue3 = BLEU_score(english, blueRef, 3)
            print(blue1, blue2, blue3)
    for i in buff:
        i.close()
def extract_random_rois(data, dsize, rois_by_image=1000, rng=np.random, flat=True):
    rois = []
    if data is not None:
        for i in range(len(data)):
            img, lung_mask = data.get(i)
            sampled, lce, norm = preprocess(img, lung_mask)
            # Pick LCE images
            side = lce.shape[0]
            assert lung_mask.shape[0] == lce.shape[0]
            # rois = []
            cnt = 0
            while cnt < rois_by_image:
                rx = int(rng.uniform(0, side))
                ry = int(rng.uniform(0, side))
                if lung_mask[rx, ry] > 0:
                    '''
                    print "img shape {}".format(img.shape)
                    print "lce shape {}".format(lce.shape)
                    print "lung_mask shape {}".format(lce.shape)
                    print "lung_mask corner_value {} max_value {}".format(lung_mask[0][0], np.max(lung_mask))
                    print "point {} {}".format(rx, ry)
                    #print 'roi-{}-{}.jpg'.format(i, cnt)
                    #imwrite('roi-{}-{}.jpg'.format(i, cnt), util.extract_roi(lce, (rx, ry, 25), dsize))
                    '''
                    rois.append([util.extract_roi(lce, (rx, ry, 25), dsize)])
                    cnt += 1
            # roi_set.append(rois)
    return np.array(rois)
def processFile(file_path, language):
    with open(file_path) as f:
        content = f.readlines()
    processed_list = []
    for line in content:
        processed_list.append(preprocess(line, language))
    return processed_list
def get_paired_sentences(data_dir):
    """
    Yields the aligned sentences of the documents in data_dir in the
    following form: (english, french)
    """
    for (e_path, f_path) in get_paired_doc_paths(data_dir):
        with open(e_path, "r") as e_file, \
                open(f_path, "r") as f_file:
            for (e_sent, f_sent) in zip(e_file.readlines(), f_file.readlines()):
                e_proc = preprocess(e_sent, "e")
                f_proc = preprocess(f_sent, "f")
                yield (e_proc, f_proc)
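# Illustrative sketch (not from the original sources): zip() keeps the i-th English line
# paired with the i-th French line, which is the alignment property the generator above
# relies on. Shown with in-memory lists standing in for the .e/.f file contents.
english_lines = ["the blue cat", "the red dog"]
french_lines = ["le chat bleu", "le chien rouge"]

for e_sent, f_sent in zip(english_lines, french_lines):
    print((e_sent, f_sent))
# ('the blue cat', 'le chat bleu')
# ('the red dog', 'le chien rouge')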
def __init__(self, train=True):
    self.train = train
    if self.train:
        df = pd.read_csv('./data/training.csv')
        df = preprocess(df)
        self.data = df['Image'].values / 255.0
        self.y = df.drop(['Image'], axis=1).values
    else:
        df = pd.read_csv('./data/testing.csv')
        df = preprocess(df)
        self.data = df['Image'].values / 255.0
        self.y = df['ImageId'].values
    self.samples = []
    for i in range(self.data.shape[0]):
        self.samples.append((self.data[i].reshape(1, 96, 96), self.y[i]))
def good_turing_lm(data_dir, language, fn_LM, usercached=True):
    if usercached:
        with open(fn_LM + '.pickle', 'rb') as input_file:
            LM = pickle.load(input_file)
        return LM

    LM = {}
    LM['uni'] = {}
    LM['bi'] = {}
    for subdir, dirs, files in os.walk(data_dir):
        total = len(files)
        for i in range(total):
            file = files[i]
            fullFile = os.path.join(subdir, file)
            # print("processing:", fullFile, " count:", i+1, '/', total)
            if i % 100 == 0 or i == total - 1:
                print("processed:", i + 1, '/', total)
            if file.endswith(('.' + language)):
                with open(fullFile) as f:
                    f_content = f.readlines()
                content = []  # preprocessed sentences
                for sentence in f_content:
                    # preprocess and strip each sentence
                    content.append(preprocess(sentence.strip(), language))
                LM = construct_GT_LM(content, LM)

    freq = construct_freq(LM)
    LM = GT_smoothing(LM, freq)

    with open(fn_LM + '.pickle', 'wb') as handle:
        pickle.dump(LM, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return LM
def run():
    data = readFromFile(source)
    occipital = []
    for i in range(8, 10):
        occipital.append(data[:, i])

    indices = {'O1': {}, 'O2': {}}
    for i in range(0, 129):
        for key, val in enumerate(occipital):
            current = parse(val[i:])
            PSD_result = preprocess(current)
            peak_array = findPeak(PSD_result)
            if 10 in peak_array:
                peak_pos = peak_array.index(10)
                if peak_pos in indices["O" + str(key + 1)].keys():
                    indices["O" + str(key + 1)][peak_pos] += 1
                else:
                    indices["O" + str(key + 1)][peak_pos] = 1
                print "O" + str(key + 1) + " at second " + str(peak_pos) + " => " + str(peak_array)
    print indices
def cross_validate_hmm(directory, hmm, validation_set):
    sorted_files = sorted(
        os.listdir(directory),
        key=lambda x: (int(re.sub('\D', '', x)), x))

    triplet_list = []  # will contain the 'answers', in order of sorted files
    viterbi_seq = []   # will contain the BIO tag sequence from Viterbi, in order of sorted files

    # loop through sorted files
    for i in xrange(len(sorted_files)):
        # extract only the files in validation_set
        file_name = sorted_files[i]
        if file_name.endswith('.txt') and i in validation_set:
            file_path = os.path.join(directory, file_name)
            # first populate the triplet_list so we have all the info from our validation set
            tags_list = preprocess(file_path)
            triplet_list += tags_list
            if tags_list:
                # next isolate only the tokens and run Viterbi, store the accumulating sequence
                tokens_list = [token for (token, pos_tag, bio_tag) in tags_list]
                viterbi_seq += hmm.viterbi_decode(tokens_list)

    # finally, obtain results metrics: calculate precision, recall, f-score
    correct_tag_seq = [bio_tag for (token, pos_tag, bio_tag) in triplet_list]
    precision = calculate_precision(viterbi_seq, correct_tag_seq)
    recall = calculate_recall(viterbi_seq, correct_tag_seq)
    fscore = calculate_fscore(precision, recall)

    return (precision, recall, fscore)
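# Illustrative sketch (not from the original sources): precision, recall and F1 computed
# directly from a predicted tag sequence and a gold tag sequence, treating every non-'O'
# tag as a positive prediction. This is one common convention, shown on toy data.
predicted = ['B', 'I', 'O', 'B', 'O']
gold = ['B', 'O', 'O', 'B', 'I']

tp = sum(1 for p, g in zip(predicted, gold) if p != 'O' and p == g)
fp = sum(1 for p, g in zip(predicted, gold) if p != 'O' and p != g)
fn = sum(1 for p, g in zip(predicted, gold) if g != 'O' and p != g)

precision = tp / float(tp + fp) if tp + fp else 0.0
recall = tp / float(tp + fn) if tp + fn else 0.0
fscore = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
print(precision, recall, fscore)  # 0.666..., 0.666..., 0.666...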
def preplexity(LM, test_dir, language, smoothing=False, delta=0):
    """
    Computes the perplexity of a language model given a test corpus

    INPUT:

    LM :        (dictionary) the language model trained by lm_train
    test_dir :  (string) The top-level directory name containing data
                e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    language :  (string) either 'e' (English) or 'f' (French)
    smoothing : (boolean) True for add-delta smoothing, False for no smoothing
    delta :     (float) smoothing parameter where 0 < delta <= 1
    """
    files = os.listdir(test_dir)

    pp = 0
    N = 0
    vocab_size = len(LM["uni"])

    for ffile in files:
        if ffile.split(".")[-1] != language:
            continue

        opened_file = open(test_dir + ffile, "r")
        for line in opened_file:
            processed_line = preprocess(line, language)
            tpp = log_prob(processed_line, LM, smoothing, delta, vocab_size)

            if tpp > float("-inf"):
                pp = pp + tpp
                N += len(processed_line.split())
        opened_file.close()

    if N > 0:
        pp = 2**(-pp / N)
    return pp
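# Illustrative sketch (not from the original sources): the perplexity formula used above,
# PP = 2^(-(sum of log2 sentence probabilities) / N), computed on toy values so the
# snippet runs on its own.
log_probs = [-12.0, -9.5, -14.25]   # log2 probabilities of three "sentences"
N = 18                              # total number of tokens scored
perplexity = 2 ** (-sum(log_probs) / N)
print(perplexity)  # ~3.96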
def test_preprocess(self):
    ims, filenames, failed = batch_imread(self.folder_path)
    for i in range(len(ims)):
        im = cv2.imread(os.path.join(self.folder_path, filenames[i]))
        res = preprocess(im)
        if res is not None:
            self.assertEqual(res.shape, (256, 256, 3))
def get_test_data(files, batch_size=4, img_size=(720, 1280), **kwargs):
    """
    Generator for test data

    # Params
    - files : list of files (e.g., output of glob)
    - batch_size : number of images per batch
    - img_size : size to resize images to (height, width)
    - kwargs : passed to the preprocess function

    # Returns
    - batches of (batch_size, 3, img_size[0], img_size[1])
    """
    i = 0
    n = len(files)
    # cycle to avoid batches not lining up with dataset size
    files = files + files
    while True:
        batch = np.zeros((batch_size, 3) + img_size)
        for j in range(batch_size):
            img = load_image(files[i])
            img = preprocess(img, target_size=img_size, augmentation=False,
                             zero_center=True, scale=1. / 255., **kwargs)
            batch[j] = img
            i = (i + 1) % n
        yield batch
def main(args):
    # Load the model.
    model_path = Path(args.model_path)
    model = lgb.Booster(model_file=args.model_path)

    # Load the data.
    csv_path = Path(args.csv_path)
    df = pd.read_csv(csv_path)
    data = preprocess(df, args.sdh)

    data_num_predictors = data.shape[1]
    model_num_predictors = model.num_feature()
    if model_num_predictors != data_num_predictors:
        raise ValueError(f"Model expects {model_num_predictors} predictors, " +
                         f"got {data_num_predictors} predictors.")

    # Run the model.
    prediction = np.clip(model.predict(data), 0, 400000) + COST_ADJUSTMENT
    patient_costs = list(zip(df[PATIENT], prediction))
    print("Patient\tCost")
    for patient, cost in patient_costs:
        print(f"{patient}\t{cost}")
    print()

    print(f"Saving predicted cost data to {args.save_path}.")
    costs_df = pd.DataFrame(patient_costs, columns=[PATIENT, "Cost"])
    costs_df.to_csv(args.save_path, index=False)
def main(args):
    """
    #TODO: Perform outlined tasks in assignment, like loading alignment
    models, computing BLEU scores etc.
    (You may use the helper functions)

    It's entirely up to you how you want to write Task5.txt.
    This is just a (sparse) example.
    """
    max_iter = 100
    num_sent = 1000
    data_dir = "../data/Hansard/Training/"
    language = 'e'
    fn_LM = 'e_temp_lm'
    fn_AM = 'e_temp_am'
    bleu_n = 1
    bleu_score = []

    LM = _getLM(data_dir, language, fn_LM)
    AM = _getAM(data_dir, num_sent, max_iter, fn_AM)

    read_file_f = open('../data/Hansard/Testing/Task5.f', 'r')
    read_data_f = read_file_f.read()
    data_f = read_data_f.split('\n')

    read_file_e = open('../data/Hansard/Testing/Task5.e', 'r')
    read_data_e = read_file_e.read()
    data_e = read_data_e.split('\n')

    read_file_er = open('../data/Hansard/Testing/Task5.google.e', 'r')
    read_data_er = read_file_er.read()
    data_er = read_data_er.split('\n')

    # print(len(data_f), len(data_e), len(data_er))
    e = []
    ref1 = []
    ref2 = []
    for i in range(len(data_f)):
        f_prep = preprocess(data_f[i], 'f')
        e.append(decode(f_prep, LM, AM))
        ref1.append(preprocess(data_e[i], 'e'))
        ref2.append(preprocess(data_er[i], 'e'))

    scores = _get_BLEU_scores(e, ref1, ref2, bleu_n)
    print('done')
def train(input_A, input_B, g_type=g_type, n_epochs=n_epochs, n_features=n_features,
          n_frames=n_frames, log_dir=log_dir, model_dir=model_dir):
    generator_lr = 0.0002
    generator_lr_decay = generator_lr / 200000
    discriminator_lr = 0.0001
    discriminator_lr_decay = discriminator_lr / 200000
    cycle_lambda = 10
    identity_lambda = 5

    # Preprocess the datasets
    A_norm, B_norm = preprocess(input_A, input_B)

    # CycleGAN voice-conversion model
    model = CycleGAN(num_features=n_features, g_type=g_type, log_dir=log_dir)

    print("Start Training...")
    for epoch in range(n_epochs):
        print("Epoch : %d " % epoch)
        start_time = time.time()

        # randomly sample training data
        train_A, train_B = sample_train_data(dataset_A=A_norm, dataset_B=B_norm,
                                             n_frames=n_frames)
        n_samples = train_A.shape[0]

        for i in range(n_samples):  # mini-batch size = 1
            n_iter = n_samples * epoch + i
            if n_iter > 10000:
                identity_lambda = 0
            if n_iter > 200000:
                generator_lr = max(0, generator_lr - generator_lr_decay)
                discriminator_lr = max(0, discriminator_lr - discriminator_lr_decay)

            start = i
            end = start + 1

            generator_loss, discriminator_loss = model.train(
                input_A=train_A[start:end], input_B=train_B[start:end],
                cycle_lambda=cycle_lambda, identity_lambda=identity_lambda,
                generator_lr=generator_lr, discriminator_lr=discriminator_lr)

        end_time = time.time()
        epoch_time = end_time - start_time
        print("Generator Loss : %f, Discriminator Loss : %f, Time : %02d:%02d"
              % (generator_loss, discriminator_loss,
                 (epoch_time % 3600 // 60), (epoch_time % 60 // 1)))

        model.save(directory=model_dir, filename="model")
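# Illustrative sketch (not from the original sources): the linear learning-rate decay
# used above, where the rate shrinks by a fixed step each iteration and is clamped at
# zero. Shown with small toy numbers so the snippet runs on its own.
lr = 0.0002
decay_steps = 10
lr_decay = lr / decay_steps

for step in range(decay_steps + 2):
    print(step, lr)
    lr = max(0, lr - lr_decay)  # never goes below zero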
def detect_plates(image):
    """Detects possible number plate regions in an image."""
    grey, bw = preprocess(image)
    contours = set(find_characters(grey, bw))
    for cluster in find_clusters(contours):
        plate = extract_plate(image, cluster)
        if plate is not None:
            yield plate
def run(data_path, read_as, algorithm):
    # Read the data
    print "Reading the dataset:", data_path
    try:
        data_reading_function = globals()[read_as]
        raw_data = data_reading_function(data_path)
    except KeyError:
        print "Data reading function is not located. Please check it again."
        raise

    random.seed(17)

    # Preprocess the data
    X_train, X_test, y_train, y_test, constraints = preprocess(raw_data)

    # Apply a learning algorithm
    try:
        training_algorithm = globals()[algorithm]
        clf = training_algorithm(X_train, y_train)
    except KeyError:
        print "Training algorithm is not located. Please check it again."
        raise

    # Make a prediction
    y_pred = clf.predict(X_test)
    print "Predicted values:", y_pred

    # Evaluate the prediction
    print "Evaluating results..."
    results = dict()
    results['Precision'] = float(metrics.precision_score(y_test, y_pred))
    results['Recall'] = float(metrics.recall_score(y_test, y_pred))
    results['F1 score'] = float(metrics.f1_score(y_test, y_pred))
    results['Mean accuracy'] = float(clf.score(X_test, y_test))
    print "Precision: \t", results['Precision']
    print "Recall: \t", results['Recall']
    print "F1 score: \t", results['F1 score']
    print "Mean accuracy: \t", results['Mean accuracy']

    return results
opened = codecs.open(p + "/" + file, "r", "utf8", errors="ignore")  # errors due to BOMs
pos = True
for line in opened:
    if line.startswith("#"):
        if "#stance=stance2" in line:
            pos = False
    else:
        if pos == True:
            posts_pos.append(line)
        else:
            posts_neg.append(line)
opened.close()

print("Loaded %d positive posts for topic '%s'" % (len(posts_pos), topic))
print("Loaded %d negative posts for topic '%s'" % (len(posts_neg), topic))
print("Preprocessing...")

preprocessed = preprocess(" ".join(posts_pos))
outFile = "data/poldeb/" + topic + ".pos.txt.tok"
topicFiles.append(outFile)
opened = codecs.open(outFile, "w", "utf8")
opened.write(preprocessed)
opened.close()
print("Wrote posts to file %s" % outFile)

preprocessed = preprocess(" ".join(posts_neg))
outFile = "data/poldeb/" + topic + ".neg.txt.tok"
topicFiles.append(outFile)
opened = codecs.open(outFile, "w", "utf8")
opened.write(preprocessed)
opened.close()
print("Wrote posts to file %s" % outFile)
from preprocess import *
from visual import *

X, y = preprocess('data/ml2013final_train.dat')
dataset = zip(X, y)
output_dataset(dataset)
from preprocess import *
import ipdb

min_maxs, normed_data = preprocess(61, 2, True)  # len(pattern_file_names), True)
ipdb.set_trace()
from preprocess import *
from visual import *
from sklearn import cross_validation, svm

X, y = load_preprocessed_dataset('data/rescaled28x28.dat')
X_test, _ = preprocess('data/test1.dat')

clf = svm.SVC(C=2., kernel='poly', degree=9, gamma=1. / 512., coef0=1).fit(X, y)
yhat = clf.predict(X_test)

# Output the predicted classes, one per line
for i in yhat:
    print i
import numpy as np

from resistance import score
from preprocess import *

positions = preprocess("data/raw_games.dat")

print "scoring positions..."
scores = np.empty((positions.shape[0], boardsize, boardsize))
num_positions = positions.shape[0]
output_interval = num_positions / 100
for i in range(num_positions):
    if (i % output_interval == 0):
        print "completion: ", i / output_interval
    try:
        scores[i] = score(positions[i], 0)
    # if for some reason an uncaught singularity occurs just skip this position
    except np.linalg.linalg.LinAlgError:
        print "singular position at ", str(i), ": ", state_string(positions[i])
        i -= 1

print "saving to file..."
savefile = open("data/scoredPositionsFull.npz", 'w')
np.savez(savefile, positions=positions, scores=scores)
def main():
    # load data
    X_train, Y_train, X_valid, Y_valid, X_test = load_data(training_dir, valid_dir, test_dir, labels, sample)

    # preprocess data by mean subtraction and normalization
    X_train, X_valid, X_test = preprocess(X_train, X_valid, X_test)
    # del X_train
    # del X_test

    # or load pre-processed data from a previously saved hdf5 file:
    '''
    data=h5py.File('imagenet.transpose.individually.augment.hdf5','r')
    X_train=np.asarray(data['X_train'])
    Y_train=np.asarray(data['Y_train'])
    X_valid=np.asarray(data['X_valid'])
    Y_valid=np.asarray(data['Y_valid'])
    X_test=np.asarray(data['X_test'])
    '''
    # print "loaded data from pickle"

    # OPTIONAL: save loaded/pre-processed data to an hdf5 file to save time in the future
    # print "saving preprocessed data to hdf5 file"
    f = h5py.File('imagenet.transpose.individually.augment.contrast.tint.hdf5', 'w')
    dset_xtrain = f.create_dataset("X_train", data=X_train)
    dset_ytrain = f.create_dataset("Y_train", data=Y_train)
    dset_xvalid = f.create_dataset("X_valid", data=X_valid)
    dset_yvalid = f.create_dataset("Y_valid", data=Y_valid)
    dset_xtest = f.create_dataset("X_test", data=X_test)
    f.flush()
    f.close()
    # print "done saving pre-processed data to hdf5 file!"

    pretrained_model = pretrained('pretrained_model.h5', False)
    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
    pretrained_model.compile(optimizer=sgd, loss='categorical_crossentropy', trainLayersIndividually=0)

    # do some training!
    print "compilation finished, fitting model"
    print "pretrained_model.trainLayersIndividually:" + str(pretrained_model.trainLayersIndividually)
    if pretrained_model.trainLayersIndividually == 1:
        train_epochs = 5
    else:
        train_epochs = 5
    history = pretrained_model.fit(X_train, Y_train, 128, train_epochs,
                                   validation_data=tuple([X_valid, Y_valid]),
                                   verbose=1, show_accuracy=True)
    pretrained_model.save_weights("assignment3_weights_nodropout_noregularization_augmenteddata.3epochs.contrast.tint.hdf5", overwrite=True)

    class_predictions = pretrained_model.predict_classes(X_test)
    np.savetxt('assignment3_class_predictions_nodropout_noregularization_augmenteddata.3epochs.contrast.tint.txt',
               class_predictions, fmt='%i', delimiter='\t')

    train_scores = pretrained_evaluate(pretrained_model, X_train, Y_train)
    print "pretrained model training scores:" + str(train_scores)
    valid_scores = pretrained_evaluate(pretrained_model, X_valid, Y_valid)
    print "pretrained validation scores:" + str(valid_scores)

    print "writing out the predictions file"
    predictions = open('assignment3_class_predictions_nodropout_noregularization_augmenteddata.3epochs.contrast.tint.txt', 'r').read().split('\n')
    while '' in predictions:
        predictions.remove('')
    wnids = open(labels, 'r').read().split('\n')
    while '' in wnids:
        wnids.remove('')
    cur_dir = test_dir + "images/"
    onlyfiles = [f for f in listdir(cur_dir) if isfile(join(cur_dir, f))]
    entries = 10000
    outf = open('assignment3_class_predictions_nodropout_noregularization_augmenteddata.formatted.3epochs.contrast.tint.txt', 'w')
    for i in range(entries):
        image_name = onlyfiles[i]
        predict_index = int(predictions[i])
        wnid1 = wnids[predict_index]
        outf.write(image_name + '\t' + str(wnid1) + '\n')
def evaluate_lenet5(learning_rate=.1, n_epochs=200, dataset='ml2013final_train.dat',
                    nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    print "learning_rate = ", learning_rate

    rng = numpy.random.RandomState(23455)

    X, Y = shared_dataset(preprocess(dataset))

    # get validation set
    length = Y.owner.inputs[0].get_value(borrow=True).shape[0]
    kf = KFold(length, n_folds=5)
    # keep 80% for training and 20% for validation (test set = validation set here)
    for i, (train_index, valid_index) in enumerate(kf):
        if i == 0:
            train_set_x, valid_set_x = X[train_index], X[valid_index]
            train_set_y, valid_set_y = Y[train_index], Y[valid_index]
            train_set_x = theano.shared(numpy.asarray(train_set_x.eval(), dtype=theano.config.floatX))
            train_set_y = theano.shared(numpy.asarray(train_set_y.eval(), dtype=theano.config.floatX))
            train_set_y = T.cast(train_set_y, 'int32')
            valid_set_x = theano.shared(numpy.asarray(valid_set_x.eval(), dtype=theano.config.floatX))
            valid_set_y = theano.shared(numpy.asarray(valid_set_y.eval(), dtype=theano.config.floatX))
            valid_set_y = T.cast(valid_set_y, 'int32')

    # set test set = validation set
    test_set_x = valid_set_x
    test_set_y = valid_set_y

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of [int] labels

    ishape = (28, 28)  # this is the size of the image

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4,
                         n_out=500, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=12)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([index], layer3.errors(y),
                                 givens={
                                     x: test_set_x[index * batch_size: (index + 1) * batch_size],
                                     y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function([index], layer3.errors(y),
                                     givens={
                                         x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                                         y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by SGD.
    # Since this model has many parameters, it would be tedious to manually
    # create an update rule for each model parameter. We thus create the
    # updates list by automatically looping over all (params[i], grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function([index], cost, updates=updates,
                                  givens={
                                      x: train_set_x[index * batch_size: (index + 1) * batch_size],
                                      y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000               # look at this many examples regardless
    patience_increase = 2          # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                   # go through this many minibatches before checking the network
                                   # on the validation set; in this case we check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if epoch > 30:
                # Save the model
                print 'saving model..'
                save_file = open('cnn_%s.txt' % epoch, 'wb')
                cPickle.dump(params, save_file, -1)
                save_file.close()

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
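# Illustrative sketch (not from the original sources): the patience-based early stopping
# used in the training loop above, reduced to plain Python with a fake list of
# validation losses so the snippet runs on its own.
patience = 4                  # look at this many checks regardless
patience_increase = 2
improvement_threshold = 0.995
best_loss = float('inf')

fake_validation_losses = [0.90, 0.70, 0.69, 0.695, 0.694, 0.693]
for step, loss in enumerate(fake_validation_losses):
    if loss < best_loss:
        if loss < best_loss * improvement_threshold:
            patience = max(patience, step * patience_increase)
        best_loss = loss
    if patience <= step:
        print("stopping at step", step, "best loss", best_loss)  # stops at step 4
        break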