def main():
    """Fine-tune CodeBERT for type prediction on an entity-annotated dataset.

    Reads the dataset file given on the command line, splits it into
    train/test, and runs ``CodeBertModelTrainer`` with a 512-token
    sequence length. Side effects: downloads/loads the pretrained
    "microsoft/codebert-base" weights and trains a model.
    """
    # NOTE(review): `model` is never passed to the trainer in this block —
    # kept because from_pretrained may be a deliberate warm-up/download;
    # confirm whether CodeBertModelTrainer picks it up implicitly.
    model = RobertaModel.from_pretrained("microsoft/codebert-base")
    args = get_type_prediction_arguments()

    # Use a context manager so the data file is closed deterministically
    # (the original leaked the handle via open(...).readlines()).
    with open(args.data_path, "r") as data_file:
        data_lines = data_file.readlines()

    train_data, test_data = read_data(
        data_lines, normalize=True, allowed=None, include_replacements=True,
        include_only="entities",
        min_entity_count=args.min_entity_count, random_seed=args.random_seed
    )

    trainer = CodeBertModelTrainer(train_data, test_data, params={}, seq_len=512)
    trainer.set_type_ann_edges(args.type_ann_edges)
    trainer.train_model()
def main():
    """Train a spaCy text categorizer on a labelled typing dataset.

    CLI: requires a positional ``data_path``; the model comes either from
    ``--model_path`` (an initialized spaCy model) or from ``--vectors``
    plus ``--bpe`` (a fresh BPE tokenizer with pretrained vectors added).
    Raises if neither source is provided.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", "-m", dest="model_path", default=None)
    parser.add_argument("--vectors", "-v", dest="vectors", default=None)
    parser.add_argument("data_path")
    parser.add_argument("--output_model",
                        "-o",
                        dest="output_model",
                        default="spacy-typing-ner")
    parser.add_argument("--epochs", "-e", dest="epochs", default=90, type=int)
    parser.add_argument("--seed",
                        "-s",
                        dest="seed",
                        default=42,
                        type=int,
                        help="Seed for random dataset split")
    parser.add_argument("--bpe", dest="bpe", default=None, type=str, help="")
    args = parser.parse_args()

    # Use a context manager so the data file is closed deterministically
    # (the original leaked the handle via open(...).readlines()).
    with open(args.data_path, "r") as data_file:
        data_lines = data_file.readlines()

    train_data, test_data = read_data(data_lines,
                                      include_only="categories",
                                      random_seed=args.seed)

    # Guard-style branching instead of the original nested if/else.
    if args.model_path is not None:
        model = spacy.load(args.model_path)
    elif args.vectors is not None:
        # NOTE(review): args.bpe may still be None here — confirm that
        # create_tokenizer accepts bpe_path=None in that case.
        model = create_tokenizer("spacy_bpe", bpe_path=args.bpe)
        add_vectors(model, args.vectors)
    else:
        raise Exception(
            "You should provide either an initialized spacy model or pretrained vectors"
        )

    train_spacy_categorizer(train_data,
                            test_data,
                            model=model,
                            output_dir=args.output_model,
                            n_iter=args.epochs)
# Example #3
                        dest='graph_emb_path',
                        default=None,
                        help='Path to the file with edges')
    parser.add_argument('--word_emb_path',
                        dest='word_emb_path',
                        default=None,
                        help='Path to the file with edges')
    parser.add_argument('checkpoint_path', default=None, help='')
    parser.add_argument('--random_seed',
                        dest='random_seed',
                        default=None,
                        type=int,
                        help='')

    args = parser.parse_args()

    train_data, test_data = read_data(open(args.data_path, "r").readlines(),
                                      normalize=True,
                                      allowed=None,
                                      include_replacements=True,
                                      include_only="entities",
                                      random_seed=args.random_seed,
                                      min_entity_count=3)

    apply_to_dataset(test_data,
                     PythonBatcher,
                     TypePredictor,
                     graph_emb_path=args.graph_emb_path,
                     word_emb_path=args.word_emb_path,
                     checkpoint_path=args.checkpoint_path)