def main():
    """Train a CodeBERT-based type prediction model.

    Reads the newline-delimited dataset from the path supplied on the
    command line, splits it into train/test with ``read_data``, and runs
    ``CodeBertModelTrainer`` on the result.

    Relies on project helpers visible elsewhere in the package:
    ``get_type_prediction_arguments``, ``read_data``, ``CodeBertModelTrainer``.
    """
    args = get_type_prediction_arguments()

    # Context manager closes the data file deterministically; the original
    # left the handle open for the GC to reclaim.
    with open(args.data_path, "r") as data_file:
        lines = data_file.readlines()

    # NOTE(review): the original also loaded RobertaModel.from_pretrained(
    # "microsoft/codebert-base") here but never used it — removed to avoid a
    # pointless (and large) model download.
    train_data, test_data = read_data(
        lines,
        normalize=True,
        allowed=None,  # no type whitelist: keep everything above min_entity_count
        include_replacements=True,
        include_only="entities",
        min_entity_count=args.min_entity_count,
        random_seed=args.random_seed,
    )

    trainer = CodeBertModelTrainer(train_data, test_data, params={}, seq_len=512)
    trainer.set_type_ann_edges(args.type_ann_edges)
    trainer.train_model()
def main():
    """Train a spaCy text categorizer on the typing dataset.

    Command line requires a positional ``data_path`` plus either
    ``--model_path`` (an initialized spaCy model) or ``--vectors``
    (pretrained vectors, optionally with a ``--bpe`` tokenizer model).

    Raises:
        ValueError: if neither ``--model_path`` nor ``--vectors`` is given.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", "-m", dest="model_path", default=None,
                        help="Path to an initialized spaCy model")
    parser.add_argument("--vectors", "-v", dest="vectors", default=None,
                        help="Path to pretrained vectors")
    parser.add_argument("data_path")
    parser.add_argument("--output_model", "-o", dest="output_model",
                        default="spacy-typing-ner",
                        help="Output directory for the trained model")
    parser.add_argument("--epochs", "-e", dest="epochs", default=90, type=int,
                        help="Number of training iterations")
    parser.add_argument("--seed", "-s", dest="seed", default=42, type=int,
                        help="Seed for random dataset split")
    parser.add_argument("--bpe", dest="bpe", default=None, type=str,
                        help="Path to a BPE tokenizer model")
    args = parser.parse_args()

    # Close the data file deterministically (the original leaked the handle).
    with open(args.data_path, "r") as data_file:
        lines = data_file.readlines()

    train_data, test_data = read_data(
        lines, include_only="categories", random_seed=args.seed
    )

    if args.model_path is not None:
        model = spacy.load(args.model_path)
    elif args.vectors is not None:
        model = create_tokenizer("spacy_bpe", bpe_path=args.bpe)
        add_vectors(model, args.vectors)
    else:
        # ValueError is a subclass of Exception, so callers catching the
        # original generic Exception still match.
        raise ValueError(
            "You should provide either an initialized spacy model or pretrained vectors"
        )

    train_spacy_categorizer(train_data, test_data, model=model,
                            output_dir=args.output_model, n_iter=args.epochs)
# NOTE(review): this chunk starts mid-statement — the enclosing `def main():`,
# the ArgumentParser construction, and the opening of the first add_argument
# call (presumably `--graph_emb_path`) are outside the visible region, so the
# code is left byte-identical. Two issues to confirm against the full file:
# (1) `open(args.data_path, "r").readlines()` leaks the file handle — wrap in
#     `with open(...)`; (2) several `help=''` strings are empty.
# It parses CLI args (graph/word embedding paths, checkpoint, seed), reads the
# entity dataset, and applies a trained TypePredictor to the test split.
dest='graph_emb_path', default=None, help='Path to the file with edges') parser.add_argument('--word_emb_path', dest='word_emb_path', default=None, help='Path to the file with edges') parser.add_argument('checkpoint_path', default=None, help='') parser.add_argument('--random_seed', dest='random_seed', default=None, type=int, help='') args = parser.parse_args() train_data, test_data = read_data(open(args.data_path, "r").readlines(), normalize=True, allowed=None, include_replacements=True, include_only="entities", random_seed=args.random_seed, min_entity_count=3) apply_to_dataset(test_data, PythonBatcher, TypePredictor, graph_emb_path=args.graph_emb_path, word_emb_path=args.word_emb_path, checkpoint_path=args.checkpoint_path)