from nala.utils.corpora import get_corpus
from nalaf.preprocessing.spliters import NLTKSplitter
from nalaf.preprocessing.tokenizers import TmVarTokenizer
from nalaf.features.embeddings import BinarizedWordEmbeddingsFeatureGenerator

data = get_corpus('nala_training_1')
NLTKSplitter().split(data)
TmVarTokenizer().tokenize(data)

BinarizedWordEmbeddingsFeatureGenerator(
    '/home/abojchevski/projects/nala/nala/data/word_embeddings_2016-03-28/word_embeddings.model'
).generate(data)

for token in data.tokens():
    print(token.features, token.end)
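# A minimal sketch (not part of the original snippet): the same preprocessing and
# word-embedding feature generation wrapped into a helper, so the corpus name and the
# embeddings model path become parameters instead of the hard-coded values above.
# It reuses only the calls already shown in this snippet.
def embed_corpus(corpus_name, we_model_path):
    data = get_corpus(corpus_name)
    NLTKSplitter().split(data)        # sentence splitting
    TmVarTokenizer().tokenize(data)   # tmVar-style tokenization
    BinarizedWordEmbeddingsFeatureGenerator(we_model_path).generate(data)  # add embedding features
    return data

# Example usage (corpus name and path are placeholders):
# data = embed_corpus('nala_training_1', '/path/to/word_embeddings.model')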
def __init__(self):
    self.data = get_corpus('IDP4+')
    NLTKSplitter().split(self.data)
    TmVarTokenizer().tokenize(self.data)
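# Hedged usage sketch (assumed, not from the source): the __init__ above presumably belongs
# to a setup/benchmark class; once constructed, the preprocessed dataset can be inspected
# with the same dataset accessors used elsewhere in these snippets.
#
#   harness = SomeSetupClass()                        # hypothetical class name
#   print(sum(1 for _ in harness.data.sentences()))   # sentences produced by NLTKSplitter
#   print(sum(1 for _ in harness.data.tokens()))      # tokens produced by TmVarTokenizer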
def train(argv):
    parser = argparse.ArgumentParser(description='Train model')

    parser.add_argument('--training_corpus',
                        help='Name of the corpus to train on. Ex: nala_training, IDP4+_training, nala_training_5')
    parser.add_argument('--test_corpus', help='Name of the corpus to test on')
    parser.add_argument('--string', help='String to tag')

    parser.add_argument('--validation', required=False, default="stratified",
                        choices=["cross-validation", "stratified", "none"],
                        help='Type of validation to use when training')
    parser.add_argument('--cv_n', required=False,
                        help='If given, cross validation (instead of stratification) is used for validating the training. '
                             'In this case you must also set `cv_fold` and only that fold number will be run')
    parser.add_argument('--cv_fold', required=False,
                        help='Fold number if cross validation is activated (it starts at 0; i.e. for cv_n=5, you have folds: [0,1,2,3,4])')

    parser.add_argument('--output_folder', required=False,
                        help='Folder where the training model is written to. Otherwise a tmp folder is used')
    parser.add_argument('--model_name_suffix', default='', required=False,
                        help='Optional suffix to add to the generated model name in training mode')
    parser.add_argument('--write_anndoc', required=False, default=False, action='store_true',
                        help='Write anndoc of predicted test_corpus (validation corpus in fact)')
    parser.add_argument('--model_path_1', required=False,
                        help='Path of the first model binary file if evaluation is performed')
    parser.add_argument('--model_path_2', required=False,
                        help='Path of the second model binary file if evaluation is performed with two models')

    parser.add_argument('--labeler', required=False, default="BIEO", choices=["BIEO", "BIO", "IO", "11labels"],
                        help='Labeler to use for training')

    parser.add_argument('--mutations_specific', default='True',
                        help='Apply feature pipelines specific to mutations or otherwise (false) use the general one')

    parser.add_argument('--only_class_id', required=False, default=MUT_CLASS_ID,
                        help="By default, only the mutation entities are read from corpora (assumed to have class_id == '"
                             + MUT_CLASS_ID + "'). Set this class_id to filter the rest out")

    parser.add_argument('--delete_subclasses', required=False, default="",
                        help='Comma-separated subclasses to delete. Example: "2,3"')

    parser.add_argument('--pruner', required=False, default="parts", choices=["parts", "sentences"])
    parser.add_argument('--ps_ST', required=False, default=False, action='store_true')
    parser.add_argument('--ps_NL', required=False, default=False, action='store_true')
    parser.add_argument('--ps_random', required=False, default=0.0, type=float)

    parser.add_argument('--elastic_net', action='store_true',
                        help='Use elastic net regularization')

    parser.add_argument('--word_embeddings', '--we', default='True', help='Use word embeddings features')
    parser.add_argument('--we_additive', type=float, default=0)
    parser.add_argument('--we_multiplicative', type=float, default=1)
    parser.add_argument('--we_model_location', type=str, default=None)

    parser.add_argument('--use_feat_windows', default='True')

    parser.add_argument('--nl', action='store_true', help='Use NLMentionFeatureGenerator')
    parser.add_argument('--nl_threshold', type=int, default=0)
    parser.add_argument('--nl_window', action='store_true', help='Use window feature for NLFeatureGenerator')

    parser.add_argument('--execute_pp', default='True',
                        help='Execute post processing specific to mutations (default) or not')

    parser.add_argument('--keep_silent', default='True',
                        help='Keep silent mutations (default) or not, i.e., delete mentions like `Cys23-Cys`')
    parser.add_argument('--keep_genetic_markers', default='True',
                        help='Keep genetic markers of the form D17S250, true (default) or false')
    parser.add_argument('--keep_unnumbered', default='True',
                        help='Keep unnumbered mentions (default) or not, i.e., delete mentions like `C -> T`')
    parser.add_argument('--keep_rs_ids', default='True',
                        help='Keep rs ids (default) or not, i.e., delete mentions like `rs1801280` or `ss221`')

    parser.add_argument('--dictionaries_paths', default=None,
                        help='Dictionary paths to use for dictionary features. Can be used within hdfs')
    parser.add_argument('--dictionaries_stop_words', default=None,
                        help='Stop words for dictionaries if these are used')

    parser.add_argument('--hdfs_url', required=False, default=None, type=str,
                        help='URL of hdfs if this is used')
    parser.add_argument('--hdfs_user', required=False, default=None, type=str,
                        help='User of hdfs if this is used. Must be given if `hdfs_url` is given')
    FALSE = ['false', 'f', '0', 'no', 'none']

    def arg_bool(arg_value):
        return False if arg_value.lower() in FALSE else True

    args = parser.parse_args(argv)

    start_time = time.time()

    # ------------------------------------------------------------------------------

    delete_subclasses = []
    for c in args.delete_subclasses.split(","):
        c = c.strip()
        if c:
            delete_subclasses.append(int(c))

    args.delete_subclasses = delete_subclasses

    if not args.output_folder:
        args.output_folder = tempfile.mkdtemp()

    str_delete_subclasses = "None" if not args.delete_subclasses else str(args.delete_subclasses).strip('[]').replace(' ', '')

    if args.labeler == "BIEO":
        labeler = BIEOLabeler()
    elif args.labeler == "BIO":
        labeler = BIOLabeler()
    elif args.labeler == "IO":
        labeler = IOLabeler()
    elif args.labeler == "11labels":
        labeler = TmVarLabeler()

    args.word_embeddings = arg_bool(args.word_embeddings)
    if args.word_embeddings:
        args.we_params = {
            'additive': args.we_additive,
            'multiplicative': args.we_multiplicative,
            'location': args.we_model_location
        }
    else:
        args.we_params = {}  # means: do not use word embeddings

    if args.nl:
        args.nl_features = {
            'threshold': args.nl_threshold,  # threshold for neighbour space in dictionaries
            'window': args.nl_window,
        }
    else:
        args.nl_features = None

    if args.elastic_net:
        args.crf_train_params = {
            'c1': 1.0,   # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
        }
    else:
        args.crf_train_params = None

    args.use_feat_windows = arg_bool(args.use_feat_windows)
    args.mutations_specific = arg_bool(args.mutations_specific)
    args.execute_pp = arg_bool(args.execute_pp)
    args.keep_silent = arg_bool(args.keep_silent)
    args.keep_genetic_markers = arg_bool(args.keep_genetic_markers)
    args.keep_unnumbered = arg_bool(args.keep_unnumbered)
    args.keep_rs_ids = arg_bool(args.keep_rs_ids)

    args.do_train = False if args.model_path_1 else True

    if args.cv_n is not None or args.cv_fold is not None:
        args.validation = "cross-validation"

    if args.validation == "cross-validation":
        assert args.cv_n is not None and args.cv_fold is not None, "You must set both cv_n AND cv_fold"

    # ------------------------------------------------------------------------------

    if args.training_corpus:
        # Get the name of the training corpus even if this is given as a folder path,
        # in which case the last folder name is used
        training_corpus_name = list(filter(None, args.training_corpus.split('/')))[-1]

        args.model_name = "{}_{}_del_{}".format(training_corpus_name, args.labeler, str_delete_subclasses)

        if args.validation == "cross-validation":
            args.model_name += "_cvfold_" + str(args.cv_fold)

        args.model_name_suffix = args.model_name_suffix.strip()
        if args.model_name_suffix:
            args.model_name += "_" + str(args.model_name_suffix)

    else:
        args.model_name = args.test_corpus

    # ------------------------------------------------------------------------------

    def stats(dataset, name):
        print('\n\t{} size: {}'.format(name, len(dataset)))
        print('\tsubclass distribution: {}'.format(repr(dataset)))
        # Caveat: the dataset must be passed through the pipeline first
        print('\tnum sentences: {}\n'.format(sum(1 for x in dataset.sentences())))

    definer = ExclusiveNLDefiner()

    if args.training_corpus:
        train_set = get_corpus(args.training_corpus, only_class_id=args.only_class_id,
                               hdfs_url=args.hdfs_url, hdfs_user=args.hdfs_user)

        if args.test_corpus:
            test_set = get_corpus(args.test_corpus, only_class_id=args.only_class_id,
                                  hdfs_url=args.hdfs_url, hdfs_user=args.hdfs_user)
        elif args.string:
            test_set = StringReader(args.string).read()
        elif args.validation == "none":
            test_set = None
        elif args.validation == "cross-validation":
            train_set, test_set = train_set.fold_nr_split(int(args.cv_n), int(args.cv_fold))
        elif args.validation == "stratified":
            definer.define(train_set)
            train_set, test_set = train_set.stratified_split()

    elif args.test_corpus:
        train_set = None
        test_set = get_corpora(args.test_corpus, args.only_class_id)

    elif args.string:
        train_set = None
        test_set = StringReader(args.string).read()

    else:
        raise Exception("You must give at least one of the parameters: training_corpus, test_corpus, or string")

    def verify_corpus(corpus):
        if corpus is not None:
            assert len(corpus) > 0, f"The corpus should have at least one document; had 0: {args.training_corpus}"
            assert next(corpus.entities(), None) is not None, "The corpus should have at least one entity; had 0"

    verify_corpus(train_set)

    # ------------------------------------------------------------------------------

    if args.mutations_specific:
        print("Pipeline specific to mutations")
        features_pipeline = get_prepare_pipeline_for_best_model(args.use_feat_windows, args.we_params, args.nl_features)
    else:
        print("Pipeline is general")
        features_pipeline = get_prepare_pipeline_for_best_model_general(
            args.use_feat_windows, args.we_params, args.dictionaries_paths,
            args.hdfs_url, args.hdfs_user, args.dictionaries_stop_words)

    # ------------------------------------------------------------------------------

    def print_run_args():
        for key, value in sorted(vars(args).items()):
            print("\t{} = {}".format(key, value))
        print()

    print("Running arguments: ")
    print_run_args()

    # ------------------------------------------------------------------------------

    def train(train_set):
        definer.define(train_set)
        train_set.delete_subclass_annotations(args.delete_subclasses)
        features_pipeline.execute(train_set)
        labeler.label(train_set)

        if args.pruner == "parts":
            train_set.prune_empty_parts()
        else:
            try:
                f = HighRecallRegexClassifier(ST=args.ps_ST, NL=args.ps_NL)
            except AssertionError:
                f = (lambda _: False)
            train_set.prune_filtered_sentences(filterin=f, percent_to_keep=args.ps_random)

        stats(train_set, "training")

        model_path = os.path.join(args.output_folder, args.model_name + ".bin")
        PyCRFSuite.train(train_set, model_path, args.crf_train_params)

        return model_path

    # ------------------------------------------------------------------------------

    if args.do_train:
        args.model_path_1 = train(train_set)

    # ------------------------------------------------------------------------------

    def test(tagger, test_set, print_eval=True, print_results=False):
        tagger.tag(test_set)
        definer.define(test_set)
        stats(test_set, "test")

        evaluation = MentionLevelEvaluator(subclass_analysis=True).evaluate(test_set)

        print_run_args()

        if print_eval:
            print(evaluation)
        if print_results:
            ConsoleWriter(ent1_class_id=PRO_CLASS_ID, ent2_class_id=MUT_CLASS_ID, color=True).write(test_set)

    # ------------------------------------------------------------------------------

    assert args.model_path_1 is not None

    if args.model_path_2:
        tagger = NalaMultipleModelTagger(
            st_model=args.model_path_1,
            all3_model=args.model_path_2,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)
    else:
        tagger = NalaSingleModelTagger(
            bin_model=args.model_path_1,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)
    # ------------------------------------------------------------------------------

    print("\n{}".format(args.model_name))

    if train_set:
        stats(train_set, "training")

    if test_set:
        test(tagger, test_set, print_eval=args.string is None, print_results=args.string is not None)

    if args.do_train:
        print("\nThe model is saved to: {}\n".format(args.model_path_1))

    if args.write_anndoc:
        outdir = os.path.join(args.output_folder, args.model_name)
        os.mkdir(outdir)
        print("\nThe predicted test data is saved to: {}\n".format(outdir))
        TagTogFormat(test_set, use_predicted=True, to_save_to=outdir).export(0)

    end_time = time.time()

    print_debug("Elapsed time: ", (end_time - start_time))

    return {
        "tagger": tagger,
        "trained_model_path": args.model_path_1,
        "training_num_docs": 0 if train_set is None else len(train_set.documents),
        "training_num_annotations": 0 if train_set is None else sum(
            1 for e in train_set.entities() if e.class_id == args.only_class_id)
    }
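# Hedged usage sketch (not part of the original source): calling train() directly with an
# argv-style list, using only flags defined by the parser above. The corpus name is a
# placeholder; the returned dict keys match the return statement above.
#
#   result = train([
#       "--training_corpus", "nala_training_1",
#       "--validation", "stratified",
#       "--labeler", "BIEO",
#   ])
#   print(result["trained_model_path"])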
"%NL", "#SST", "%SST", "#NL+SST", "%NL+SST", "u#ann", "u#ST", "u%ST", "u#NL", "u%NL", "u#SST", "u%SST", "u#NL+SS", "u%NL+SS", #here called SS (not SST) for lack of column width space for pretty alignment "%d_u_NL", "%m_u_NL" ] print('\t'.join(header)) for corpus_name in args.corpora: realname, typ = get_corpus_type(corpus_name) corpus = get_corpus(realname, only_class_id=MUT_CLASS_ID) columns = get_stats(corpus_name, corpus, typ) if args.listanns: print('\t'.join(header)) print(*columns, sep='\t') # for count in WordsCounter.most_common()[:-len(WordsCounter)-1:-1]: # print(count)