Example #1
0
def main():
    """Parse the command line and run the selected model's test routine.

    A shared parent parser receives the data and test arguments from
    ``util``; one sub-command is registered per model through
    ``add_subparser``. After parsing, the save and data directories stored on
    ``args`` are resolved via ``util`` and the callable stored in
    ``args.test`` (presumably attached by ``add_subparser`` -- confirm there)
    is invoked with the remaining arguments.

    Raises:
        argparse.ArgumentError: If ``--load_path`` is missing or empty.
    """
    parser = argparse.ArgumentParser("Test a trained model on SQuAD")
    parent = argparse.ArgumentParser(add_help=False)

    util.add_data_args(parent)
    util.add_test_args(parent)
    subparsers = parser.add_subparsers()

    add_subparser("bidaf", "bidaf", subparsers, parent, bidaf_trainer)
    add_subparser("glove_transformer", "bidaf", subparsers, parent,
                  glove_transformer_trainer)
    add_subparser("roberta_finetune", "bpe", subparsers, parent,
                  roberta_finetune)

    args = parser.parse_args()

    # Require load_path for test.py
    if not args.load_path:
        # ArgumentError takes (argument, message). Passing only the message
        # raised TypeError instead of the intended error, so supply None for
        # the argument slot.
        raise argparse.ArgumentError(
            None, "Missing required argument --load_path")

    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    args.data_dir = util.get_data_dir(args.data_dir, args.data_sub_dir)
    util.build_data_dir_path(args)

    # Detach the callable from the namespace before handing the namespace
    # to it.
    test = args.test
    del args.test
    test(args)
Example #2
0
def generate_captcha(num):
    '''Generates labeled images using original captcha perl script from wakaba.

    Runs ``gencaptcha.pl`` (located next to this file) ``num`` times with the
    ``images`` sub-directory of the data directory as its working directory.
    The iteration index (0 .. num - 1) is passed as the script's only
    argument. Missing directories are created first.
    '''
    script_dir = os.path.dirname(os.path.realpath(__file__))
    image_dir = util.get_data_dir() + "/images"
    # Creates the data directory as well when needed, and does not fail if
    # either directory already exists.
    os.makedirs(image_dir, exist_ok=True)
    gen_script = os.path.abspath(script_dir + "/gencaptcha.pl")
    for i in range(num):
        # Run the script inside image_dir via cwd= rather than os.chdir, so
        # this process's working directory is never changed and cannot be
        # left pointing at image_dir if a call raises.
        subprocess.call(["perl", gen_script, str(i)], cwd=image_dir)
Example #3
0
def get_saved_classifier():
    '''Return a classifier, reusing saved artefacts from the data directory.

    Resolution order:
      1. If saved network weights exist, build a classifier from the config
         values, load the weights into it and return it.
      2. Otherwise obtain training data: load X.npy / y.npy when both exist,
         else extract features from the images (generating captcha images
         first when fewer than half of config.gen_train_size gifs are
         present) and save them.
      3. Train a new network, save its weights, log a self-check accuracy
         and return it.
    '''
    storage = util.get_data_dir()
    if not os.path.exists(storage):
        os.mkdir(storage)

    # Fast path: previously saved network coefficients.
    weights_file = storage + "/" + config.network_name + ".npy"
    if os.path.exists(weights_file):
        classifier = neural.NeuralClassfier(
            config.sample_h * config.sample_w,
            config.hidden_layer,
            config.character_number,
            config.reg,
            random_seed=config.seed)
        classifier.weights = numpy.load(weights_file)
        return classifier

    # Training data: reuse the saved arrays or rebuild them from images.
    features_file = storage + "/X.npy"
    labels_file = storage + "/y.npy"
    if not (os.path.exists(features_file) and os.path.exists(labels_file)):
        captcha_dir = util.get_image_dir()
        if not os.path.exists(captcha_dir):
            os.mkdir(captcha_dir)
        gif_count = len(glob.glob(captcha_dir + "/*.gif"))
        if gif_count < config.gen_train_size / 2:
            debug("Generating recognized captcha images using perl script.")
            generate_captcha(config.gen_train_size)
        debug(
            "Saved training data is not found. Generating new by segmentating images."
        )
        features, labels = segment.extract_features()
        numpy.save(features_file, features)
        numpy.save(labels_file, labels)
    else:
        features = numpy.load(features_file)
        labels = numpy.load(labels_file)

    debug("Network coefficients are not found. Training new neural network.")
    classifier = neural.train_network(features, labels)
    numpy.save(storage + "/" + config.network_name + ".npy",
               classifier.weights)

    debug("Selfchecking full captcha files.")
    score = test.check_labeled_dir(classifier,
                                   util.get_image_dir(),
                                   limit=100)
    debug("Accuracy on generated set: {}".format(score))
    return classifier
Example #4
0
def main():
    """Command-line entry point for downloading and pre-processing SQuAD.

    Registers one sub-command per setup routine on top of a shared parent
    parser holding the data arguments, resolves the data directory on the
    parsed namespace, and then calls the ``setup`` callable found on that
    namespace with the namespace itself.
    """
    cli = argparse.ArgumentParser("Download and pre-process SQuAD")
    shared = argparse.ArgumentParser(add_help=False)
    util.add_data_args(shared)

    commands = cli.add_subparsers()
    # (sub-command name, second add_subparser argument, setup routine)
    registrations = (
        ("bidaf", "bidaf", bidaf_setup),
        ("bpe", "bpe", bpe_setup),
        ("bpe_aug", "bpe", bpe_aug_setup),
    )
    for command, variant, routine in registrations:
        add_subparser(command, variant, commands, shared, routine)

    options = cli.parse_args()
    options.data_dir = util.get_data_dir(options.data_dir,
                                         options.data_sub_dir)
    util.build_data_dir_path(options)

    options.setup(options)
Example #5
0
def main():
    """Command-line entry point for training a model on SQuAD.

    Registers one sub-command per trainer on a shared parent parser that
    holds the data and training arguments. After parsing, it sets
    ``maximize_metric`` from the metric name, resolves the save and data
    directories, and calls the ``train`` callable stored on the namespace.

    Raises:
        ValueError: If the metric name is not "EM", "F1", and does not start
            with "NLL" or "acc".
    """
    cli = argparse.ArgumentParser("Train a model on SQuAD")
    shared = argparse.ArgumentParser(add_help=False)

    util.add_data_args(shared)
    util.add_train_args(shared)
    commands = cli.add_subparsers()

    # (sub-command name, second add_subparser argument, trainer)
    registrations = (
        ("bidaf", "bidaf", bidaf_trainer),
        ("glove_transformer", "bidaf", glove_transformer_trainer),
        ("roberta_pretrain", "bpe", roberta_pretrainer),
        ("electra_pretrain", "bpe", electra_pretrainer),
        ("didae_pretrain", "bpe", didae_pretrainer),
        ("roberta_finetune", "bpe", roberta_finetune),
        ("roberta_augment", "bpe", roberta_augment),
    )
    for command, variant, trainer in registrations:
        add_subparser(command, variant, commands, shared, trainer)

    options = cli.parse_args()

    metric = options.metric_name
    # Negative log-likelihood is minimised; EM / F1 / accuracy are maximised.
    lower_is_better = metric.startswith("NLL")
    higher_is_better = metric in ("EM", "F1") or metric.startswith("acc")
    if not (lower_is_better or higher_is_better):
        raise ValueError(f'Unrecognized metric name: "{metric}"')
    options.maximize_metric = not lower_is_better

    options.save_dir = util.get_save_dir(options.save_dir,
                                         options.name,
                                         training=True)
    options.data_dir = util.get_data_dir(options.data_dir,
                                         options.data_sub_dir)
    util.build_data_dir_path(options)

    # Pull the callable off the namespace before passing the namespace on.
    run_training = options.train
    del options.train
    run_training(options)
from os import listdir, path

from util import get_data_dir, get_mutual_followship_path, get_user_connections_path, get_authenticated_username

# Module-level values, each resolved once when this module is imported.
# Directory whose entries are iterated as accounts by the functions below.
data_dir = get_data_dir()
# Account name that the functions below skip while iterating.
authenticated_username = get_authenticated_username()
# Path of the file read line by line in get_users_connections().
connections_path = get_user_connections_path()


def get_users_connections():
    """Return the distinct lines of the connections file as a set.

    The file at the module-level ``connections_path`` is read in full and
    split on line boundaries. The handle is closed before returning.
    """
    with open(connections_path) as connections_file:
        return set(connections_file.read().splitlines())


def correct_mutual_follwers():
    """Rewrite each account's mutuals file, keeping only known connections.

    For every entry in ``data_dir`` other than the authenticated user whose
    mutual-followship file exists, the file's lines are intersected with the
    set returned by ``get_users_connections()`` and the result is written
    back, one entry per line. The output order is unspecified because the
    entries are held in a set.
    """
    # Loaded on first use and then reused. The connections file is the same
    # for every account, so re-reading it per iteration was wasted I/O.
    connections = None
    for account in listdir(data_dir):
        mutuals_path = get_mutual_followship_path(account)
        # Compare by value. `is` tests object identity, which is not
        # guaranteed for equal strings, so the authenticated account was not
        # reliably skipped.
        if account == authenticated_username or not path.exists(mutuals_path):
            continue

        if connections is None:
            connections = get_users_connections()

        with open(mutuals_path) as mutuals_file:
            mutuals = set(mutuals_file.read().splitlines())
        corrected = mutuals.intersection(connections)

        with open(mutuals_path, 'w') as out:
            out.write("\n".join(corrected))


def check_mutual_correctness():
    """Iterate over the accounts in ``data_dir`` that have a mutuals file.

    NOTE(review): the visible body only skips the authenticated user and
    accounts without a mutual-followship file. No check is performed on the
    remaining accounts, so the function looks unfinished -- confirm.
    """
    for account in listdir(data_dir):
        mutuals_path = get_mutual_followship_path(account)
        # Compare by value. `is` tests object identity, which is not
        # guaranteed for equal strings.
        if account == authenticated_username or not path.exists(mutuals_path):
            continue
Example #7
0
def get_preprocess_img_name(img_file):
    """Return the path of the pre-processed PNG that belongs to ``img_file``.

    The result is ``<get_data_dir(img_file)>/1_preprocess/<png name>``, where
    the PNG name comes from ``get_png_name_for_jpeg(img_file)``.
    """
    base_dir = get_data_dir(img_file)
    png_name = get_png_name_for_jpeg(img_file)
    return os.path.join(base_dir, "1_preprocess", png_name)
Example #8
0
def find_component_files(component_identifiers):
    """
    Accepts list of component identifiers (such as names), filenames, GitHub 
    repositories (e.g., "gesso/raspberry-pi-3"), or file paths.

    Each identifier is looked up, in order, as a YAML file named
    "<identifier>[-<version>].yaml" in the current directory and then in the
    library's data directory. If it is still unresolved and looks like
    "username/repository", that GitHub repository is cloned.

    Returns a list with the resolved component file path for each input
    identifier, in input order.

    Raises KeyError if no component file was found for an identifier. Note
    that cloning a repository does not by itself register a path.
    """
    logger = util.logger(__name__, exclude_prefix=True)

    # Matches strings such as:
    # - "gesso/ir-rangefinder"
    # - "gesso/raspberry-pi-3"
    component_github_pattern = re.compile(r'^[A-Za-z0-9-_]+\/[A-Za-z0-9-_]+$')

    logger.info('Component File Paths:')
    component_file_paths = {}
    current_dir = util.get_current_dir()
    for component_identifier in component_identifiers:

        # Pattern for "<identifier>.yaml" with an optional "-<version>"
        # suffix. The identifier is escaped so that characters such as "."
        # or "+" are matched literally instead of being read as regex syntax.
        component_path_pattern = re.compile(
            r'^%s(-(?:(\d+)\.)?(?:(\d+)\.)?(\*|\d+)){0,1}\.(yaml)$' %
            re.escape(component_identifier))

        # Search local directory for YAML file (based on input argument).
        # There is no early exit: when several files match, the last wins.
        if component_identifier not in component_file_paths:
            for file_name in util.get_file_list():
                if component_path_pattern.match(file_name) is not None:
                    component_file_paths[component_identifier] = '%s/%s' % (
                        current_dir, file_name)

        # Search library's data/models folder
        if component_identifier not in component_file_paths:
            data_dir = util.get_data_dir()
            for file_name in util.get_file_list(data_dir):
                if component_path_pattern.match(file_name) is not None:
                    component_file_paths[component_identifier] = '%s/%s' % (
                        data_dir, file_name)

        # Check for GitHub repository identifier
        if component_identifier not in component_file_paths:
            # TODO: Write function to load model file from a GitHub repository specified with format 'username/repo'
            if component_github_pattern.match(
                    component_identifier) is not None:
                # Split before logging: the names were previously used in the
                # log call before being assigned (UnboundLocalError).
                username, repository = component_identifier.split('/')
                logger.info('Cloning %s/%s to %s/%s/%s' %
                            (username, repository, '.gesso/components',
                             username, repository))
                git.clone_github_repository(username, repository)

    # Collect the paths in input order. An unresolved identifier is reported
    # with an explicit message, keeping the KeyError type the plain dict
    # lookup used to raise.
    path_list = []
    for component_identifier in component_identifiers:
        if component_identifier not in component_file_paths:
            raise KeyError(
                "No component file found for '%s'." % component_identifier)
        path_list.append(component_file_paths[component_identifier])

    for component_identifier in component_identifiers:
        logger.info(
            '\t%s => Found component file: %s' %
            (component_identifier, component_file_paths[component_identifier]))
    logger.info('')

    return path_list