Example #1
def test_file_exists(file_name):
    start = "/dev"
    gold = file_name
    wrong = os.path.join(start, file_name)
    path = convert_path(file_name, start)
    assert path == gold
    assert path != wrong
Example #2
def test_no_loc():
    file_name = "test"
    gold = os.path.realpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "mead",
                     file_name))
    path = convert_path(file_name)
    assert path == gold
Example #3
def test_loc():
    file_name = "test"
    start = "/dev"
    gold = os.path.join(start, file_name)
    path = convert_path(file_name, start)
    assert path == gold
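Taken together, these three tests pin down the contract of convert_path: a path that already exists locally is returned untouched, an explicit start location is joined onto the name, and with no location the name is resolved against the installed mead package directory. A minimal sketch consistent with that contract follows; it is only an illustration (the real implementation lives in mead and may differ), and it assumes the mead package is importable.
import os
import mead  # assumption: the mead package is installed and importable

def convert_path(path, loc=None):
    """Sketch of the behaviour the tests above exercise; not the library's actual code."""
    # A path that already exists locally wins over any suggested location (Example #1).
    if os.path.isfile(path):
        return path
    # With no explicit location, resolve against the mead package directory (Example #2).
    if loc is None:
        loc = os.path.dirname(os.path.realpath(mead.__file__))
    # Otherwise join the supplied location with the file name (Example #3).
    return os.path.join(loc, path)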
Example #4
def main():
    parser = argparse.ArgumentParser(description='Train a text classifier')
    parser.add_argument(
        '--config',
        help=
        'JSON/YML Configuration for an experiment: local file or remote URL',
        type=convert_path,
        default="$MEAD_CONFIG")
    parser.add_argument('--settings',
                        help='JSON/YML Configuration for mead',
                        default=DEFAULT_SETTINGS_LOC,
                        type=convert_path)
    parser.add_argument('--task_modules',
                        help='tasks to load, must be local',
                        default=[],
                        nargs='+',
                        required=False)
    parser.add_argument(
        '--datasets',
        help=
        'index of dataset labels: local file, remote URL or mead-ml/hub ref',
        type=convert_path)
    parser.add_argument(
        '--modules',
        help='modules to load: local files, remote URLs or mead-ml/hub refs',
        default=[],
        nargs='+',
        required=False)
    parser.add_argument('--mod_train_file', help='override the training set')
    parser.add_argument('--mod_valid_file', help='override the validation set')
    parser.add_argument('--mod_test_file', help='override the test set')
    parser.add_argument('--fit_func', help='override the fit function')
    parser.add_argument(
        '--embeddings',
        help='index of embeddings: local file, remote URL or mead-ml/hub ref',
        type=convert_path)
    parser.add_argument(
        '--vecs',
        help='index of vectorizers: local file, remote URL or mead-ml/hub ref',
        type=convert_path)
    parser.add_argument('--logging',
                        help='json file for logging',
                        default=DEFAULT_LOGGING_LOC,
                        type=convert_path)
    parser.add_argument('--task',
                        help='task to run',
                        choices=['classify', 'tagger', 'seq2seq', 'lm'])
    parser.add_argument('--gpus',
                        help='Number of GPUs (defaults to number available)',
                        type=int,
                        default=-1)
    parser.add_argument(
        '--basedir',
        help='Override the base directory where models are stored',
        type=str)
    parser.add_argument('--reporting', help='reporting hooks', nargs='+')
    parser.add_argument('--backend', help='The deep learning backend to use')
    parser.add_argument('--checkpoint',
                        help='Restart training from this checkpoint')
    parser.add_argument(
        '--prefer_eager',
        help="If running in TensorFlow, should we prefer eager model",
        type=str2bool)
    args, overrides = parser.parse_known_args()
    config_params = read_config_stream(args.config)
    config_params = parse_and_merge_overrides(config_params,
                                              overrides,
                                              pre='x')
    if args.basedir is not None:
        config_params['basedir'] = args.basedir

    # task_module overrides are not allowed via hub or HTTP, must be defined locally
    for task in args.task_modules:
        import_user_module(task)

    task_name = config_params.get(
        'task', 'classify') if args.task is None else args.task
    args.logging = read_config_stream(args.logging)
    configure_logger(args.logging,
                     config_params.get('basedir', './{}'.format(task_name)))

    try:
        args.settings = read_config_stream(args.settings)
    except Exception:
        logger.warning(
            'Warning: no mead-settings file was found at [{}]'.format(
                args.settings))
        args.settings = {}

    args.datasets = args.settings.get(
        'datasets', convert_path(
            DEFAULT_DATASETS_LOC)) if args.datasets is None else args.datasets
    args.datasets = read_config_stream(args.datasets)
    if args.mod_train_file or args.mod_valid_file or args.mod_test_file:
        logging.warning(
            'Warning: overriding the training/valid/test data with user-specified files'
            ' different from what was specified in the dataset index.  Creating a new key for this entry'
        )
        update_datasets(args.datasets, config_params, args.mod_train_file,
                        args.mod_valid_file, args.mod_test_file)

    args.embeddings = args.settings.get(
        'embeddings', convert_path(DEFAULT_EMBEDDINGS_LOC)
    ) if args.embeddings is None else args.embeddings
    args.embeddings = read_config_stream(args.embeddings)

    args.vecs = args.settings.get('vecs', convert_path(
        DEFAULT_VECTORIZERS_LOC)) if args.vecs is None else args.vecs
    args.vecs = read_config_stream(args.vecs)

    if args.gpus:
        # why does it go to model and not to train?
        config_params['train']['gpus'] = args.gpus
    if args.fit_func:
        config_params['train']['fit_func'] = args.fit_func
    if args.backend:
        config_params['backend'] = normalize_backend(args.backend)

    config_params['modules'] = list(
        set(chain(config_params.get('modules', []), args.modules)))

    cmd_hooks = args.reporting if args.reporting is not None else []
    config_hooks = config_params.get('reporting') if config_params.get(
        'reporting') is not None else []
    reporting = parse_extra_args(set(chain(cmd_hooks, config_hooks)),
                                 overrides)
    config_params['reporting'] = reporting

    logger.info('Task: [{}]'.format(task_name))

    task = mead.Task.get_task_specific(task_name, args.settings)

    task.read_config(config_params,
                     args.datasets,
                     args.vecs,
                     reporting_args=overrides,
                     prefer_eager=args.prefer_eager)
    task.initialize(args.embeddings)
    task.train(args.checkpoint)
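A note on the pattern above: parse_known_args() returns both the declared flags and any leftover tokens, and it is those leftovers (overrides) that parse_and_merge_overrides and parse_extra_args later fold into the config. A self-contained illustration using only the standard library; the override flag name below is made up, since mead's actual override syntax is defined by parse_and_merge_overrides.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--task')
# Unknown flags are not an error with parse_known_args(); they come back untouched.
args, overrides = parser.parse_known_args(['--task', 'classify', '--x.train.epochs', '5'])
print(args.task)   # classify
print(overrides)   # ['--x.train.epochs', '5']  -> handed to the override-merging helpers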
Example #5
def main():
    parser = argparse.ArgumentParser(
        description='Encode a sentence as an embedding')
    parser.add_argument('--subword_model_file', help='Subword model file')
    parser.add_argument('--nctx', default=256, type=int)
    parser.add_argument('--batchsz', default=20, type=int)
    parser.add_argument('--vec_id',
                        default='bert-base-uncased',
                        help='Reference to a specific embedding type')
    parser.add_argument('--embed_id',
                        default='bert-base-uncased',
                        help='What type of embeddings to use')
    parser.add_argument('--file', required=True)
    parser.add_argument('--column', type=str)
    parser.add_argument('--output', default='embeddings.npz')
    parser.add_argument(
        '--pool',
        help=
        "Should a reduction be applied on the embeddings?  Only use if your embeddings aren't already pooled",
        type=str)
    parser.add_argument(
        '--embeddings',
        help='index of embeddings: local file, remote URL or mead-ml/hub ref',
        type=convert_path)
    parser.add_argument(
        '--vecs',
        help='index of vectorizers: local file, remote URL or mead-ml/hub ref',
        type=convert_path)
    parser.add_argument('--cuda', type=baseline.str2bool, default=True)
    parser.add_argument('--has_header', action="store_true")
    parser.add_argument(
        "--tokenizer_type",
        type=str,
        help="Optional tokenizer, default is to use string split")
    parser.add_argument(
        '--faiss_index',
        help="If provided, we will build a FAISS index and store it here")
    parser.add_argument(
        '--quoting',
        default=3,
        help='0=QUOTE_MINIMAL 1=QUOTE_ALL 2=QUOTE_NONNUMERIC 3=QUOTE_NONE',
        type=int)
    parser.add_argument('--sep', default='\t')
    parser.add_argument('--add_columns', nargs='+', default=[])

    args = parser.parse_args()

    if not args.has_header:
        if not args.column:
            args.column = 0
        if args.add_columns:
            args.add_columns = [int(c) for c in args.add_columns]
        column = int(args.column)

    else:
        column = args.column

    args.embeddings = convert_path(
        DEFAULT_EMBEDDINGS_LOC) if args.embeddings is None else args.embeddings
    args.embeddings = read_config_stream(args.embeddings)

    args.vecs = convert_path(
        DEFAULT_VECTORIZERS_LOC) if args.vecs is None else args.vecs

    vecs_index = read_config_stream(args.vecs)
    vecs_set = index_by_label(vecs_index)
    vec_params = vecs_set[args.vec_id]
    vec_params['mxlen'] = args.nctx

    if 'transform' in vec_params:
        vec_params['transform_fn'] = vec_params['transform']

    if 'transform_fn' in vec_params and isinstance(vec_params['transform_fn'],
                                                   str):
        vec_params['transform_fn'] = eval(vec_params['transform_fn'])
    tokenizer = create_tokenizer(args.tokenizer_type)
    vectorizer = create_vectorizer(**vec_params)
    if not isinstance(vectorizer, HasPredefinedVocab):
        raise Exception(
            "We currently require a vectorizer with a pre-defined vocab to run this script"
        )
    embeddings_index = read_config_stream(args.embeddings)
    embeddings_set = index_by_label(embeddings_index)
    embeddings_params = embeddings_set[args.embed_id]
    # If they don't want CUDA, try to get the embedding loader to use CPU
    embeddings_params['cpu_placement'] = not args.cuda
    embeddings = load_embeddings_overlay(embeddings_set, embeddings_params,
                                         vectorizer.vocab)

    vocabs = {'x': embeddings['vocab']}
    embedder = embeddings['embeddings'].cpu()
    embedder.eval()
    if args.cuda:
        embedder = embedder.cuda()

    def _mean_pool(inputs, embeddings):
        mask = (inputs != 0)
        seq_lengths = mask.sum(1).unsqueeze(-1)
        return embeddings.sum(1) / seq_lengths

    def _zero_tok_pool(_, embeddings):
        pooled = embeddings[:, 0]
        return pooled

    def _max_pool(inputs, embeddings):
        mask = (inputs != 0)
        embeddings = embeddings.masked_fill(mask.unsqueeze(-1) == False, -1e8)
        return torch.max(embeddings, 1, False)[0]

    if args.pool:
        if args.pool == 'max':
            pool = _max_pool
        elif args.pool == 'zero' or args.pool == 'cls':
            pool = _zero_tok_pool
        else:
            pool = _mean_pool
    else:
        pool = lambda x, y: y

    def chunks(lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    df = pd.read_csv(args.file,
                     header='infer' if args.has_header else None,
                     sep=args.sep)
    col = df[column]
    batches = []
    as_list = col.tolist()
    extra_col_map = {}
    for extra_col in args.add_columns:
        if isinstance(extra_col, int):
            key = f'column_{extra_col}'
        else:
            key = extra_col
        extra_col_map[key] = df[extra_col].tolist()
    num_batches = math.ceil(len(as_list) / args.batchsz)
    pg = baseline.create_progress_bar(num_batches, name='tqdm')
    for i, batch in enumerate(chunks(as_list, args.batchsz)):
        pg.update()
        with torch.no_grad():
            vecs = []
            for line in batch:
                tokenized = tokenizer(line)
                vec, l = vectorizer.run(tokenized, vocabs['x'])
                vecs.append(vec)
            vecs = torch.tensor(np.stack(vecs))
            if args.cuda:
                vecs = vecs.cuda()
            embedding = embedder(vecs)
            pooled_batch = pool(vecs, embedding).cpu().numpy()
            batches += [x for x in pooled_batch]

    np.savez(args.output, embeddings=batches, text=as_list, **extra_col_map)
    if args.faiss_index:
        import faiss
        index = faiss.IndexFlatIP(batches[0].shape[-1])
        batches = np.stack(batches)
        faiss.normalize_L2(batches)
        index.add(batches)
        faiss.write_index(index, args.faiss_index)
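Consuming the output of the script above is straightforward: the .npz holds one pooled vector per input line alongside the original text, and the optional FAISS index does cosine search over the L2-normalized vectors. A hedged sketch of a consumer, assuming the default --output name embeddings.npz and an assumed --faiss_index value of vectors.index:
import numpy as np
import faiss

data = np.load('embeddings.npz')
vectors = np.stack(data['embeddings']).astype('float32')
texts = data['text']

# Inner product over L2-normalized vectors == cosine similarity, matching how the index was built.
index = faiss.read_index('vectors.index')
query = vectors[:1].copy()
faiss.normalize_L2(query)
scores, ids = index.search(query, 5)
for score, i in zip(scores[0], ids[0]):
    print('{:.3f}\t{}'.format(score, texts[i]))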
Example #6
def main():
    parser = argparse.ArgumentParser(description='Train a text classifier')
    parser.add_argument('--config',
                        help='configuration for an experiment',
                        type=convert_path,
                        default="$MEAD_CONFIG")
    parser.add_argument('--settings',
                        help='configuration for mead',
                        default=DEFAULT_SETTINGS_LOC,
                        type=convert_path)
    parser.add_argument('--datasets',
                        help='index of dataset labels',
                        type=convert_path)
    parser.add_argument('--modules',
                        help='modules to load',
                        default=[],
                        nargs='+',
                        required=False)
    parser.add_argument('--mod_train_file', help='override the training set')
    parser.add_argument('--mod_valid_file', help='override the validation set')
    parser.add_argument('--mod_test_file', help='override the test set')
    parser.add_argument('--embeddings',
                        help='index of embeddings',
                        type=convert_path)
    parser.add_argument('--logging',
                        help='config file for logging',
                        default=DEFAULT_LOGGING_LOC,
                        type=convert_path)
    parser.add_argument('--task',
                        help='task to run',
                        choices=['classify', 'tagger', 'seq2seq', 'lm'])
    parser.add_argument('--gpus',
                        help='Number of GPUs (defaults to number available)',
                        type=int,
                        default=-1)
    parser.add_argument(
        '--basedir',
        help='Override the base directory where models are stored',
        type=str)
    parser.add_argument('--reporting', help='reporting hooks', nargs='+')
    parser.add_argument('--backend', help='The deep learning backend to use')
    parser.add_argument('--checkpoint',
                        help='Restart training from this checkpoint')
    args, reporting_args = parser.parse_known_args()

    config_params = read_config_stream(args.config)

    if args.basedir is not None:
        config_params['basedir'] = args.basedir

    task_name = config_params.get(
        'task', 'classify') if args.task is None else args.task

    args.logging = read_config_stream(args.logging)
    configure_logger(args.logging,
                     config_params.get('basedir', './{}'.format(task_name)))

    try:
        args.settings = read_config_stream(args.settings)
    except Exception:
        logger.warning(
            'Warning: no mead-settings file was found at [{}]'.format(
                args.settings))
        args.settings = {}

    args.datasets = args.datasets if args.datasets else args.settings.get(
        'datasets', convert_path(DEFAULT_DATASETS_LOC))
    args.datasets = read_config_stream(args.datasets)
    if args.mod_train_file or args.mod_valid_file or args.mod_test_file:
        logging.warning(
            'Warning: overriding the training/valid/test data with user-specified files'
            ' different from what was specified in the dataset index.  Creating a new key for this entry'
        )
        update_datasets(args.datasets, config_params, args.mod_train_file,
                        args.mod_valid_file, args.mod_test_file)

    args.embeddings = args.embeddings if args.embeddings else args.settings.get(
        'embeddings', convert_path(DEFAULT_EMBEDDINGS_LOC))
    args.embeddings = read_config_stream(args.embeddings)

    if args.gpus is not None:
        config_params['model']['gpus'] = args.gpus

    if args.backend is None and 'backend' in args.settings:
        args.backend = args.settings['backend']
    if args.backend is not None:
        config_params['backend'] = normalize_backend(args.backend)

    config_params['modules'] = list(
        set(chain(config_params.get('modules', []), args.modules)))

    cmd_hooks = args.reporting if args.reporting is not None else []
    config_hooks = config_params.get('reporting') if config_params.get(
        'reporting') is not None else []
    reporting = parse_extra_args(set(chain(cmd_hooks, config_hooks)),
                                 reporting_args)
    config_params['reporting'] = reporting

    logger.info('Task: [{}]'.format(task_name))
    task = mead.Task.get_task_specific(task_name, args.settings)
    task.read_config(config_params,
                     args.datasets,
                     reporting_args=reporting_args)
    task.initialize(args.embeddings)
    task.train(args.checkpoint)
Example #7
parser.add_argument('--max_len1d', type=int, default=100)
parser.add_argument(
    '--embeddings',
    help='index of embeddings: local file, remote URL or mead-ml/hub ref',
    type=convert_path)
parser.add_argument(
    '--vecs',
    help='index of vectorizers: local file, remote URL or mead-ml/hub ref',
    type=convert_path)
parser.add_argument('--cuda', type=baseline.str2bool, default=True)
parser.add_argument('--has_header', type=baseline.str2bool, default=True)
parser.add_argument('--sep', default='\t')

args = parser.parse_args()

args.embeddings = convert_path(
    DEFAULT_EMBEDDINGS_LOC) if args.embeddings is None else args.embeddings
args.embeddings = read_config_stream(args.embeddings)

args.vecs = convert_path(
    DEFAULT_VECTORIZERS_LOC) if args.vecs is None else args.vecs

vecs_index = read_config_stream(args.vecs)
vecs_set = index_by_label(vecs_index)
vec_params = vecs_set[args.vec_id]
vec_params['mxlen'] = args.nctx

if 'transform' in vec_params:
    vec_params['transform_fn'] = vec_params['transform']

if 'transform_fn' in vec_params and isinstance(vec_params['transform_fn'],
                                               str):
    vec_params['transform_fn'] = eval(vec_params['transform_fn'])
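The transform/transform_fn handling above (the same block appears in Examples #5 and #8) lets a vectorizer index entry name its transform as a plain string, which eval then resolves to a callable. A tiny illustration, using str.lower as a stand-in for whatever the index actually names:
vec_params = {'transform': 'str.lower'}  # hypothetical index entry

if 'transform' in vec_params:
    vec_params['transform_fn'] = vec_params['transform']

# eval() only works if the named callable is already defined/imported in this scope.
if 'transform_fn' in vec_params and isinstance(vec_params['transform_fn'], str):
    vec_params['transform_fn'] = eval(vec_params['transform_fn'])

print(vec_params['transform_fn']('MEAD'))  # mead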
Example #8
def main():
    parser = argparse.ArgumentParser(description='Run senteval harness')
    parser.add_argument('--nctx', default=512, type=int)
    parser.add_argument("--module", default=None, help="Module containing custom tokenizers")
    parser.add_argument('--tasks', nargs="+", default=['sts', 'class', 'probe'])
    parser.add_argument('--batchsz', default=20, type=int)
    parser.add_argument('--tok', help='Optional tokenizer, e.g. "gpt2" or "basic". These can be defined in an extra module')
    parser.add_argument('--pool', help="Should a reduction be applied on the embeddings?  Only use if your embeddings aren't already pooled", type=str)
    parser.add_argument('--vec_id', help='Reference to a specific embedding type')
    parser.add_argument('--embed_id', help='What type of embeddings to use')
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument('--max_len1d', type=int, default=100)
    parser.add_argument('--embeddings', help='index of embeddings: local file, remote URL or mead-ml/hub ref', type=convert_path)
    parser.add_argument('--vecs', help='index of vectorizers: local file, remote URL or mead-ml/hub ref', type=convert_path)
    parser.add_argument('--fast', help="Run fast, but not necessarily as accurate", action='store_true')
    parser.add_argument('--data', help="Path to senteval data",
                        default=os.path.expanduser("~/dev/work/SentEval/data"))
    args = parser.parse_args()

    if args.module:
        logger.warning("Loading custom user module %s for masking rules and tokenizers", args.module)
        baseline.import_user_module(args.module)


    tokenizer = create_tokenizer(args.tok) if args.tok else None

    args.embeddings = convert_path(DEFAULT_EMBEDDINGS_LOC) if args.embeddings is None else args.embeddings
    args.embeddings = read_config_stream(args.embeddings)

    args.vecs = convert_path(DEFAULT_VECTORIZERS_LOC) if args.vecs is None else args.vecs

    vecs_index = read_config_stream(args.vecs)
    vecs_set = index_by_label(vecs_index)
    vec_params = vecs_set[args.vec_id]
    vec_params['mxlen'] = args.nctx

    if 'transform' in vec_params:
        vec_params['transform_fn'] = vec_params['transform']

    if 'transform_fn' in vec_params and isinstance(vec_params['transform_fn'], str):
        vec_params['transform_fn'] = eval(vec_params['transform_fn'])

    vectorizer = create_vectorizer(**vec_params)
    if not isinstance(vectorizer, HasPredefinedVocab):
        raise Exception("We currently require a vectorizer with a pre-defined vocab to run this script")
    embeddings_index = read_config_stream(args.embeddings)
    embeddings_set = index_by_label(embeddings_index)
    embeddings_params = embeddings_set[args.embed_id]
    embeddings = load_embeddings_overlay(embeddings_set, embeddings_params, vectorizer.vocab)

    embedder = embeddings['embeddings']
    embedder.to(args.device).eval()

    def _mean_pool(inputs, embeddings):
        mask = (inputs != 0)
        seq_lengths = mask.sum(1).unsqueeze(-1)
        return embeddings.sum(1)/seq_lengths

    def _zero_tok_pool(_, embeddings):
        pooled = embeddings[:, 0]
        return pooled

    def _max_pool(inputs, embeddings):
        mask = (inputs != 0)
        embeddings = embeddings.masked_fill(mask.unsqueeze(-1) == False, -1e8)
        return torch.max(embeddings, 1, False)[0]

    if args.pool:
        if args.pool == 'max':
            pool = _max_pool
        elif args.pool == 'zero' or args.pool == 'cls':
            pool = _zero_tok_pool
        else:
            pool = _mean_pool
    else:
        pool = lambda x, y: y

    params_senteval = {'task_path': args.data, 'usepytorch': True, 'kfold': 10}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
                                     'tenacity': 5, 'epoch_size': 4}
    if args.fast:
        logging.info("Setting fast params")
        params_senteval['kfold'] = 5
        params_senteval['classifier']['epoch_size'] = 2
        params_senteval['classifier']['tenacity'] = 3
        params_senteval['classifier']['batch_size'] = 128

    # SentEval prepare and batcher
    def prepare(params, samples):
        max_sample = max(len(s) for s in samples)
        vectorizer.mxlen = min(args.nctx, max_sample + SUBWORD_EXTRA)
        logging.info('num_samples %d, mxlen set to %d', max_sample, vectorizer.mxlen)

    def batcher(params, batch):
        if not tokenizer:
            batch = [sent if sent != [] else ['.'] for sent in batch]
        else:
            batch = [tokenizer(' '.join(sent)) for sent in batch]

        vs = []
        for sent in batch:
            v, l = vectorizer.run(sent, vectorizer.vocab)
            vs.append(v)
        vs = np.stack(vs)
        with torch.no_grad():
            inputs = torch.tensor(vs, device=args.device)
            encoding = embedder(inputs)
            encoding = pool(inputs, encoding)
            encoding = encoding.cpu().numpy()
        return encoding

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = []
    if 'sts' in args.tasks:
        transfer_tasks += ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'SICKRelatedness', 'STSBenchmark']
    if 'class' in args.tasks:
        transfer_tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
                           'SICKEntailment']
    if 'probe' in args.tasks:
        transfer_tasks += ['Length', 'WordContent', 'Depth', 'TopConstituents',
                           'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
                           'OddManOut', 'CoordinationInversion']

    results = se.eval(transfer_tasks)
    print(results)
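As a quick sanity check on the pooling helpers used in Examples #5 and #8 (not part of the original scripts): with pad id 0 and a zero pad embedding, the mean pool averages only the real tokens, the zero/cls pool takes position 0, and the max pool ignores padded positions.
import torch

inputs = torch.tensor([[5, 7, 0]])                                   # 2 real tokens + 1 pad
embeddings = torch.tensor([[[1.0, 1.0], [3.0, 3.0], [0.0, 0.0]]])    # toy [B, T, H]

mask = (inputs != 0)
seq_lengths = mask.sum(1).unsqueeze(-1)
print(embeddings.sum(1) / seq_lengths)                               # mean pool  -> [[2., 2.]]
print(embeddings[:, 0])                                              # zero/cls   -> [[1., 1.]]
masked = embeddings.masked_fill(mask.unsqueeze(-1) == False, -1e8)
print(torch.max(masked, 1, False)[0])                                # max pool   -> [[3., 3.]]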