def grid_search(self, kwargs):
    """Exhaustively trains and tests a classifier for every combination
    of the hyper-parameter lists in kwargs, appending each result to
    ../evaluations/results.txt.
    """
    # Needs at module level: from itertools import product
    make_dir("../evaluations")
    wordNgrams = kwargs["wordNgrams"]
    bucket = kwargs["bucket"]
    lr = kwargs["lr"]
    dim = kwargs["dim"]
    epoch = kwargs["epoch"]
    loss = kwargs["loss"]
    args = product(wordNgrams, bucket, lr, dim, epoch, loss)
    for combination in args:
        kwargs["wordNgrams"] = combination[0]
        kwargs["bucket"] = int(combination[1])
        kwargs["lr"] = combination[2]
        kwargs["dim"] = combination[3]
        kwargs["epoch"] = combination[4]
        kwargs["loss"] = combination[5]
        parameters = " ".join(
            map(str, [
                kwargs["wordNgrams"], kwargs["bucket"], kwargs["lr"],
                kwargs["dim"], kwargs["epoch"], kwargs["loss"]
            ]))
        self.trainClassifier(**kwargs)
        results = "{}\n{}\n\n".format(
            parameters, self.testClassifier(kwargs["name"]))
        save_data(directory="../evaluations", name="results.txt",
                  docs=results, mode="a")
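# A minimal usage sketch for grid_search. The classifier object and all
# hyper-parameter values below are assumptions for illustration; only the
# kwargs keys come from the method above. This run would train and test
# 2 * 1 * 2 * 2 * 1 * 2 = 16 combinations.
hyper_parameters = {
    "name": "demo",
    "model": "supervised",
    "wordNgrams": [1, 2],
    "bucket": [2000000],
    "lr": [0.1, 0.5],
    "dim": [50, 100],
    "epoch": [5],
    "loss": ["softmax", "hs"],
}
# classifier.grid_search(hyper_parameters)  # classifier is hypothetical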
def __init__(self, path_to_moses, NCPUS, NGRAM, verbose=False):
    """Stores the Moses path and training settings, and creates the lm/
    output directory."""
    self.path_to_moses = path_to_moses
    self.NCPUS = NCPUS
    self.NGRAM = NGRAM
    self.lmdir = "lm/"
    utilities.make_dir(self.lmdir)
    self.verbose = verbose
def __init__(self, path_to_moses, mem_limit, max_len, min_len, verbose=False):
    """Stores splitting settings and creates the data/ destination tree
    (train/, tune/, and test/ subdirectory paths)."""
    self.path_to_moses = path_to_moses
    self.mem_limit = mem_limit
    self.max_len = max_len
    self.min_len = min_len
    self.destdir = "data/"
    self.traindir = self.destdir + "train/"
    self.tunedir = self.destdir + "tune/"
    self.testdir = self.destdir + "test/"
    self.verbose = verbose
    utilities.make_dir(self.destdir)
def trainClassifier(self, **kwargs):
    """Trains a supervised fastText classifier.

    Args:
        kwargs: hyper-parameters for the neural net, plus the model
            name and the fastText subcommand to run.

    Returns:
        None
    """
    # Needs at module level: from os import system
    make_dir("../fastTextModels")
    name = kwargs["name"]
    model = kwargs["model"]
    parameters = self.setParameters(**kwargs)
    system(
        "../fastText/fasttext {} -input ../Dataset/training_set_processed/"
        "training_{}.txt -output ../fastTextModels/model_{} -label __label__ {}"
        .format(model, name, name, parameters))
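# For illustration only: with name="demo" and model="supervised", the
# system() call above expands to roughly the following shell command
# (<parameters> is whatever self.setParameters returns, not shown here):
#
#   ../fastText/fasttext supervised \
#       -input ../Dataset/training_set_processed/training_demo.txt \
#       -output ../fastTextModels/model_demo -label __label__ <parameters>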
def train(self, src_file, tar_file, working_dir):
    """
    Carries out the training. Creates a working directory and extracts
    the root-file and file-extension information necessary for Moses to
    run. Build output is redirected to train.out inside the working
    directory.
    """
    # Needs at module level: import os, subprocess
    if utilities.dir_exists(working_dir):
        return
    self._validate_file(src_file)
    self._validate_file(tar_file)
    cwd = os.getcwd() + "/"
    blm = cwd + "lm/" + utilities.strip_filename_from_path(tar_file) + ".blm"
    shared = self._find_common_beginning(src_file, tar_file)
    file1_ext = src_file[shared + 1:]
    file2_ext = tar_file[shared + 1:]
    fileroot = cwd + src_file[:shared]
    log = "train.out"
    utilities.make_dir(working_dir)
    self._print("Training model at {}. This may take a while... "
                .format(working_dir))
    trainer = self.path_to_moses + "scripts/training/train-model.perl"
    command = "cd {};".format(working_dir) + \
        " nohup nice " + trainer + \
        " -root-dir train -corpus {}".format(fileroot) + \
        " -f {} -e {} -alignment".format(file1_ext, file2_ext) + \
        " grow-diag-final-and -reordering msd-bidirectional-fe" + \
        " -lm 0:3:{}:8".format(blm) + \
        " -cores {}".format(self.NCPUS) + \
        " -mgiza --parallel" + \
        " -external-bin-dir " + self.path_to_moses + "tools/mgizapp/" + \
        " >& {};".format(log) + \
        " cd .."
    subprocess.call(command, shell=True)
    self._print("Done\n")
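# Hedged usage sketch: the class name MosesTrainer and the corpus paths
# are assumptions. train() expects the two corpus files to share a common
# root and differ only in extension, and assumes the binarized language
# model (lm/data.fr.blm here) was built beforehand.
# trainer = MosesTrainer(path_to_moses="/opt/moses/", NCPUS=8, NGRAM=3)
# trainer.train("corpus/data.en", "corpus/data.fr", "working/")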
def split_train_tune_test(self, src_file, src_piv_file, piv_tar_file,
                          tar_file, train_split, test_split):
    """
    Splits the full data files into train, tune, and test sets.

    Receives four parallel files and two decimals giving the fraction of
    data assigned to the first two buckets; the remainder becomes the
    last. Lines stay aligned across languages: if line 1 of the source
    files lands in test, line 1 of the target files does too.
    """
    # Needs at module level: import numpy;
    # from itertools import zip_longest; from pympler import asizeof
    utilities.make_dir(self.traindir)
    utilities.make_dir(self.tunedir)
    utilities.make_dir(self.testdir)
    self._validate_file(src_file)
    self._validate_file(src_piv_file)
    self._validate_file(piv_tar_file)
    self._validate_file(tar_file)
    assert train_split + test_split <= 1, \
        "Invalid size for train, tune, and test splits"
    train_files, tune_files, test_files = self._ttt_filenames(
        src_file, src_piv_file, piv_tar_file, tar_file)
    if utilities.ttt_files_exist(train_files, tune_files, test_files):
        return
    utilities.ttt_wipe_files(train_files, tune_files, test_files)
    self._print("Splitting data into train, tune, and test sets...")
    train, tune, test = [[], [], [], []], [[], [], [], []], [[], [], [], []]
    for src_line, src_piv_line, piv_tar_line, tar_line in \
            zip_longest(open(src_file), open(src_piv_file),
                        open(piv_tar_file), open(tar_file)):
        x = numpy.random.sample()
        if x < train_split:
            self._add_line_to(train[0], src_line)
            self._add_line_to(train[1], src_piv_line)
            self._add_line_to(train[2], piv_tar_line)
            self._add_line_to(train[3], tar_line)
        elif x < train_split + test_split:
            # NOTE: as written, the middle band (width test_split)
            # feeds the tune set; the remainder becomes test.
            self._add_line_to(tune[0], src_line)
            self._add_line_to(tune[1], src_piv_line)
            self._add_line_to(tune[2], piv_tar_line)
            self._add_line_to(tune[3], tar_line)
        else:
            self._add_line_to(test[0], src_line)
            self._add_line_to(test[1], src_piv_line)
            self._add_line_to(test[2], piv_tar_line)
            self._add_line_to(test[3], tar_line)
        # Flush the in-memory buffers to disk once they exceed the limit.
        if asizeof.asizeof(train) + asizeof.asizeof(tune) + \
                asizeof.asizeof(test) > self.mem_limit:
            self._dump_ttt_bufs_to(train, tune, test,
                                   train_files, tune_files, test_files)
    self._dump_ttt_bufs_to(train, tune, test,
                           train_files, tune_files, test_files)
    self._print("Done\n")
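# Usage sketch, assuming a hypothetical DataSplitter class built from the
# __init__ shown earlier; file names are placeholders. As coded above,
# roughly 80% of aligned lines land in train, 10% in tune, and the
# remaining ~10% in test:
# splitter = DataSplitter(path_to_moses="/opt/moses/",
#                         mem_limit=512 * 2**20,  # ~512 MB buffer cap
#                         max_len=80, min_len=1)
# splitter.split_train_tune_test("corpus.en", "corpus.en-piv",
#                                "corpus.piv-fr", "corpus.fr",
#                                train_split=0.8, test_split=0.1)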
def make_dir(self): """bla bla. """ utilities.make_dir(self.get_site_dir_path()) for subdir in json_site_subdirs: utilities.make_dir(self.get_site_dir_subdir_path(subdir))
def make_dir(self): """bla bla. """ utilities.make_dir(self.get_user_dir_path())