def main():
    """Parse the command line and run the test routine of the chosen model.

    One sub-command is registered per model via ``add_subparser``; the
    callable to run is read from ``args.test`` (presumably installed by
    ``add_subparser`` -- confirm against its definition).

    Raises:
        argparse.ArgumentError: if ``--load_path`` was not supplied.
    """
    parser = argparse.ArgumentParser("Test a trained model on SQuAD")
    parent = argparse.ArgumentParser(add_help=False)
    util.add_data_args(parent)
    util.add_test_args(parent)

    subparsers = parser.add_subparsers()
    add_subparser("bidaf", "bidaf", subparsers, parent, bidaf_trainer)
    add_subparser("glove_transformer", "bidaf", subparsers, parent,
                  glove_transformer_trainer)
    add_subparser("roberta_finetune", "bpe", subparsers, parent,
                  roberta_finetune)

    args = parser.parse_args()

    # Require load_path for test.py
    if not args.load_path:
        # ArgumentError's constructor is (argument, message); passing only the
        # message raised TypeError instead of the intended error.
        raise argparse.ArgumentError(
            None, "Missing required argument --load_path")

    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    args.data_dir = util.get_data_dir(args.data_dir, args.data_sub_dir)
    util.build_data_dir_path(args)

    # Detach the callable from the namespace before handing the namespace
    # to it.
    test = args.test
    del args.test
    test(args)
def generate_captcha(num):
    '''Generates labeled images using original captcha perl script from wakaba.

    Runs ``gencaptcha.pl`` (located next to this file) ``num`` times, passing
    the indices ``0 .. num - 1`` as the single argument, with the images
    directory as the script's working directory.  The data and images
    directories are created when missing.
    '''
    script_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = util.get_data_dir()
    image_dir = data_dir + "/images"
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)

    gen_script = os.path.abspath(script_dir + "/gencaptcha.pl")
    for i in range(num):
        # Run the child in image_dir via cwd= instead of os.chdir(): the
        # parent's working directory is never touched, so it cannot be left
        # pointing at image_dir if a call raises.
        subprocess.call(["perl", gen_script, str(i)], cwd=image_dir)
def get_saved_classifier():
    '''Return a classifier, reusing saved artefacts whenever they exist.

    Lookup order:
      1. saved network weights -> build a classifier and load them;
      2. saved training data (X.npy / y.npy) -> train a new network;
      3. otherwise generate captcha images if too few exist, extract
         features from them, save the features, then train.

    A newly trained network has its weights saved and is self-checked
    against the labeled image directory before being returned.
    '''
    storage = util.get_data_dir()
    if not os.path.exists(storage):
        os.mkdir(storage)

    weights_file = storage + "/" + config.network_name + ".npy"
    features_file = storage + "/X.npy"
    labels_file = storage + "/y.npy"

    # Fast path: previously trained weights are available.
    if os.path.exists(weights_file):
        n_inputs = config.sample_h * config.sample_w
        n_outputs = config.character_number
        restored = neural.NeuralClassfier(n_inputs,
                                          config.hidden_layer,
                                          n_outputs,
                                          config.reg,
                                          random_seed=config.seed)
        restored.weights = numpy.load(weights_file)
        return restored

    # Otherwise obtain training data: cached arrays first, fresh extraction
    # as a fallback.
    have_cached_data = (os.path.exists(features_file)
                        and os.path.exists(labels_file))
    if have_cached_data:
        X = numpy.load(features_file)
        y = numpy.load(labels_file)
    else:
        image_dir = util.get_image_dir()
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)
        existing_gifs = glob.glob(image_dir + "/*.gif")
        if len(existing_gifs) < config.gen_train_size / 2:
            debug("Generating recognized captcha images using perl script.")
            generate_captcha(config.gen_train_size)
        debug(
            "Saved training data is not found. Generating new by segmentating images."
        )
        X, y = segment.extract_features()
        numpy.save(features_file, X)
        numpy.save(labels_file, y)

    debug("Network coefficients are not found. Training new neural network.")
    trained = neural.train_network(X, y)
    numpy.save(weights_file, trained.weights)

    debug("Selfchecking full captcha files.")
    accuracy = test.check_labeled_dir(trained,
                                      util.get_image_dir(),
                                      limit=100)
    debug("Accuracy on generated set: {}".format(accuracy))
    return trained
def main():
    """Parse the command line and run the selected pre-processing setup.

    Registers one sub-command per setup routine, resolves the data
    directory on the parsed namespace, and then calls ``args.setup`` with
    that namespace.
    """
    parser = argparse.ArgumentParser("Download and pre-process SQuAD")

    # Options shared by all sub-commands live on a help-less parent parser.
    parent_parser = argparse.ArgumentParser(add_help=False)
    util.add_data_args(parent_parser)

    subparsers = parser.add_subparsers()
    commands = (
        ("bidaf", "bidaf", bidaf_setup),
        ("bpe", "bpe", bpe_setup),
        ("bpe_aug", "bpe", bpe_aug_setup),
    )
    for command, tag, setup_fn in commands:
        add_subparser(command, tag, subparsers, parent_parser, setup_fn)

    args = parser.parse_args()
    args.data_dir = util.get_data_dir(args.data_dir, args.data_sub_dir)
    util.build_data_dir_path(args)

    args.setup(args)
def main():
    """Parse the command line and run the selected training routine.

    Sets ``args.maximize_metric`` from ``args.metric_name`` (``NLL*`` is
    minimized; ``EM``, ``F1`` and ``acc*`` are maximized), resolves the save
    and data directories, and calls the callable stored in ``args.train``.

    Raises:
        ValueError: if the metric name is not one of the recognized forms.
    """
    parser = argparse.ArgumentParser("Train a model on SQuAD")

    # Options shared by all sub-commands live on a help-less parent parser.
    parent = argparse.ArgumentParser(add_help=False)
    util.add_data_args(parent)
    util.add_train_args(parent)

    subparsers = parser.add_subparsers()
    commands = (
        ("bidaf", "bidaf", bidaf_trainer),
        ("glove_transformer", "bidaf", glove_transformer_trainer),
        ("roberta_pretrain", "bpe", roberta_pretrainer),
        ("electra_pretrain", "bpe", electra_pretrainer),
        ("didae_pretrain", "bpe", didae_pretrainer),
        ("roberta_finetune", "bpe", roberta_finetune),
        ("roberta_augment", "bpe", roberta_augment),
    )
    for command, tag, trainer in commands:
        add_subparser(command, tag, subparsers, parent, trainer)

    args = parser.parse_args()

    metric = args.metric_name
    # Negative log-likelihood is better when smaller ...
    lower_is_better = metric.startswith("NLL")
    # ... while EM, F1 and accuracy-style metrics are better when larger.
    higher_is_better = metric in ("EM", "F1") or metric.startswith("acc")
    if not (lower_is_better or higher_is_better):
        raise ValueError(f'Unrecognized metric name: "{args.metric_name}"')
    args.maximize_metric = not lower_is_better

    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    args.data_dir = util.get_data_dir(args.data_dir, args.data_sub_dir)
    util.build_data_dir_path(args)

    # Detach the callable from the namespace before handing the namespace
    # to it.
    train = args.train
    del args.train
    train(args)
from os import listdir, path

from util import (
    get_authenticated_username,
    get_data_dir,
    get_mutual_followship_path,
    get_user_connections_path,
)

data_dir = get_data_dir()
authenticated_username = get_authenticated_username()
connections_path = get_user_connections_path()


def get_users_connections():
    """Return the lines of the connections file as a set of strings."""
    with open(connections_path) as source:
        return set(source.read().splitlines())


def correct_mutual_follwers():
    """Rewrite each account's mutuals file, keeping only known connections.

    For every entry in ``data_dir`` other than the authenticated user, the
    account's mutuals file (if it exists) is replaced by the intersection of
    its lines with the set returned by ``get_users_connections()``.
    """
    for account in listdir(data_dir):
        mutuals_path = get_mutual_followship_path(account)
        # Compare by value: `is` tests object identity, which is not
        # guaranteed for equal strings, so the authenticated user's own
        # entry was not reliably skipped.
        if account == authenticated_username or not path.exists(mutuals_path):
            continue
        with open(mutuals_path) as source:
            mutuals = set(source.read().splitlines())
        corrected = mutuals.intersection(get_users_connections())
        with open(mutuals_path, 'w') as out:
            out.write("\n".join(corrected))


def check_mutual_correctness():
    """Iterate over the accounts that have a mutuals file.

    NOTE(review): in the visible source the loop body ends after the skip
    check, so this function currently performs no verification -- confirm
    whether the remainder is missing.
    """
    for account in listdir(data_dir):
        mutuals_path = get_mutual_followship_path(account)
        if account == authenticated_username or not path.exists(mutuals_path):
            continue
def get_preprocess_img_name(img_file):
    """Return the path of ``img_file``'s PNG inside the "1_preprocess" folder.

    The folder sits under the directory returned by
    ``get_data_dir(img_file)``; the file name comes from
    ``get_png_name_for_jpeg(img_file)``.
    """
    base_dir = get_data_dir(img_file)
    png_name = get_png_name_for_jpeg(img_file)
    return os.path.join(base_dir, "1_preprocess", png_name)
def find_component_files(component_identifiers):
    """
    Accepts list of component identifiers (such as names), filenames, GitHub
    repositories (e.g., "gesso/raspberry-pi-3"), or file paths.

    Each identifier is looked up, in order, in:
      1. the file list returned by ``util.get_file_list()`` (paths are built
         relative to ``util.get_current_dir()``),
      2. the file list of ``util.get_data_dir()``,
      3. GitHub, when the identifier has the form ``username/repository``
         (the repository is cloned; no path is recorded for it yet).

    Returns a list of component file paths, one per input identifier and in
    the same order as the input.

    Raises KeyError if no component file was recorded for an identifier.
    """
    logger = util.logger(__name__, exclude_prefix=True)

    # Matches strings such as:
    # - "gesso/ir-rangefinder"
    # - "gesso/raspberry-pi-3"
    component_github_pattern = re.compile(r'^[A-Za-z0-9-_]+\/[A-Za-z0-9-_]+$')

    logger.info('Component File Paths:')

    component_file_paths = {}
    path_list = []
    current_dir = util.get_current_dir()

    for component_identifier in component_identifiers:

        # Prepare and compile regular expressions for validating and
        # identifying a model file: "<identifier>[-<version>].yaml".
        component_path_pattern = re.compile(
            r'^%s(-(?:(\d+)\.)?(?:(\d+)\.)?(\*|\d+)){0,1}\.(yaml)$' %
            component_identifier)

        # Search local directory for YAML file (based on input argument).
        # There is no early exit, so the last matching file wins.
        if component_identifier not in component_file_paths:
            for file_name in util.get_file_list():
                if component_path_pattern.match(file_name) is not None:
                    component_file_paths[component_identifier] = '%s/%s' % (
                        current_dir, file_name)

        # Search library's data/models folder
        if component_identifier not in component_file_paths:
            data_dir = util.get_data_dir()
            for file_name in util.get_file_list(data_dir):
                if component_path_pattern.match(file_name) is not None:
                    component_file_paths[component_identifier] = '%s/%s' % (
                        data_dir, file_name)

        # Check for GitHub repository identifier
        if component_identifier not in component_file_paths:
            # TODO: Write function to load model file from a GitHub
            # repository specified with format 'username/repo'
            if component_github_pattern.match(
                    component_identifier) is not None:
                # Split before logging: the names were previously used in the
                # log call before being assigned (UnboundLocalError).
                username, repository = component_identifier.split('/')
                logger.info('Cloning %s/%s to %s/%s/%s' %
                            (username, repository, '.gesso/components',
                             username, repository))
                git.clone_github_repository(username, repository)

        # TODO: Log an error/warning when no component file was found for the
        # identifier; currently the lookup below raises KeyError instead.

    for component_identifier in component_identifiers:
        path_list.append(component_file_paths[component_identifier])

    for component_identifier in component_identifiers:
        logger.info(
            '\t%s => Found component file: %s' %
            (component_identifier, component_file_paths[component_identifier]))
    logger.info('')

    return path_list