def setUp(self):
    self.dataset = StringReader(
        'some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text').read()
    NLTKSplitter().split(self.dataset)
    TmVarTokenizer().tokenize(self.dataset)
    part = list(self.dataset.parts())[0]
    part.annotations.append(
        Entity(STUB_ENTITY_CLASS_ID, 15, 'c.2708_2711delTTAG'))
    part.annotations.append(
        Entity(STUB_ENTITY_CLASS_ID, 35, 'p.V903GfsX905'))
def test_generate_patterns_245(self):
    dataset = StringReader('token c.A436C token').read()
    NLTKSplitter().split(dataset)
    TmVarTokenizer().tokenize(dataset)
    TmVarDictionaryFeatureGenerator().generate(dataset)

    # Note: the original used `is not 'O'`, an identity check on a string literal;
    # use `!=` for value comparison
    token_features = [
        {key: value for key, value in token.features.items() if value != 'O'}
        for token in dataset.tokens()
    ]

    self.assertEqual(token_features[0], {})
    self.assertEqual(token_features[1], {'pattern4[0]': 'B', 'pattern2[0]': 'B'})
    self.assertEqual(token_features[2], {'pattern4[0]': 'I', 'pattern2[0]': 'I'})
    self.assertEqual(token_features[3], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'B'})
    self.assertEqual(token_features[4], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'I'})
    self.assertEqual(token_features[5], {'pattern4[0]': 'E', 'pattern2[0]': 'I', 'pattern5[0]': 'E'})
    self.assertEqual(token_features[6], {})
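# A minimal inspection sketch (same pipeline as the test above; nothing new is
# asserted): TmVarDictionaryFeatureGenerator marks each dictionary-pattern match
# with B/I/E positions over the matched token span and 'O' elsewhere, so
# filtering out 'O' shows exactly which tokens fall inside which pattern match:
#
#   dataset = StringReader('token c.A436C token').read()
#   NLTKSplitter().split(dataset)
#   TmVarTokenizer().tokenize(dataset)
#   TmVarDictionaryFeatureGenerator().generate(dataset)
#   for token in dataset.tokens():
#       print(token.word, {k: v for k, v in token.features.items() if v != 'O'})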
class TestLabelers(unittest.TestCase):

    def setUp(self):
        self.dataset = StringReader(
            'some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text').read()
        NLTKSplitter().split(self.dataset)
        TmVarTokenizer().tokenize(self.dataset)
        part = list(self.dataset.parts())[0]
        part.annotations.append(
            Entity(STUB_ENTITY_CLASS_ID, 15, 'c.2708_2711delTTAG'))
        part.annotations.append(
            Entity(STUB_ENTITY_CLASS_ID, 35, 'p.V903GfsX905'))

    def test_bio_labeler(self):
        BIOLabeler().label(self.dataset)
        labels = [token.original_labels[0].value for token in self.dataset.tokens()]
        expected = [
            'O', 'O', 'O', 'O', 'O', 'O',
            'B-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'I-e_x',
            'O',
            'B-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'I-e_x',
            'O', 'O', 'O', 'O', 'O'
        ]
        self.assertEqual(labels, expected)

    def test_bieo_labeler(self):
        BIEOLabeler().label(self.dataset)
        labels = [token.original_labels[0].value for token in self.dataset.tokens()]
        expected = [
            'O', 'O', 'O', 'O', 'O', 'O',
            'B-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'E-e_x',
            'O',
            'B-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'I-e_x', 'E-e_x',
            'O', 'O', 'O', 'O', 'O'
        ]
        self.assertEqual(labels, expected)

    def test_tmvar_labeler(self):
        TmVarLabeler(STUB_ENTITY_CLASS_ID).label(self.dataset)
        labels = [token.original_labels[0].value for token in self.dataset.tokens()]
        expected = [
            'O', 'O', 'O', 'O', 'O', 'O',
            'A', 'I', 'P', 'P', 'P', 'T', 'W',
            'O',
            'A', 'I', 'W', 'P', 'I', 'M', 'P',
            'O', 'O', 'O', 'O', 'O'
        ]
        self.assertEqual(labels, expected)
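# For orientation: the three labelers tested above differ only in their tag
# alphabets. BIOLabeler emits B(egin)/I(nside)/O(utside) tags per entity class,
# BIEOLabeler additionally marks the last token of a mention with E(nd), and
# TmVarLabeler emits tmVar-style single-letter labels that encode the role of
# each token inside a mutation mention (the exact letter semantics are those
# asserted in test_tmvar_labeler).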
def _get_test_data(self, entity_sentence, assumed_tokens_words=None):
    if assumed_tokens_words is None:
        assumed_tokens_words = entity_sentence.split(' ')

    # Create dataset
    dataset = StringReader(entity_sentence).read()
    part = next(dataset.parts())
    entity = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=0, text=entity_sentence)
    part.annotations.append(entity)

    # Apply through pipeline
    NLTKSplitter().split(dataset)
    NLTK_TOKENIZER.tokenize(dataset)
    self.parser.parse(dataset)

    # Rest
    sentences = part.sentences
    assert len(sentences) == 1
    sentence = sentences[0]

    assert len(assumed_tokens_words) == len(sentence)
    for (assumed_token_word, actual_token) in zip(assumed_tokens_words, sentence):
        assert assumed_token_word == actual_token.word

    part.compute_tokens_depth()
    roots = Part.get_sentence_roots(sentence)
    for r in roots:
        self._assert_depth_eq(r, 0)

    part.set_entities_head_tokens()

    return (dataset, sentence, entity, roots)
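# A hedged usage sketch (hypothetical sentence; assumes it parses into a single
# sentence whose whitespace-split words coincide with the tokenizer's output,
# which the helper itself asserts):
#
#   dataset, sentence, entity, roots = self._get_test_data('the BRCA1 mutation')
#   # `entity` now has its head token set via part.set_entities_head_tokens()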
def run_with_argv(argv=[]):
    args = parse_arguments(argv)
    ner, re = read_models(args)

    if args.text:
        corpus = StringReader(args.text).read()
    elif args.pmid:
        corpus = PMIDReader(args.pmid).read()
    # See more possible readers including some NCBI XML files in `nalaf.utils.readers`

    ner.annotate(corpus)
    re.annotate(corpus)

    return corpus
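# A hedged usage sketch (the `--text` / `--pmid` flag names are inferred from
# the attribute accesses above; `parse_arguments` is the authority on the CLI):
#
#   corpus = run_with_argv(['--text', 'The p.V600E mutation in BRAF ...'])
#   for entity in corpus.entities():
#       print(entity.class_id, entity.offset, entity.text)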
def train(argv):
    parser = argparse.ArgumentParser(description='Train model')

    parser.add_argument('--training_corpus',
                        help='Name of the corpus to train on. Ex: nala_training, IDP4+_training, nala_training_5')
    parser.add_argument('--test_corpus', help='Name of the corpus to test on')
    parser.add_argument('--string', help='String to tag')

    parser.add_argument('--validation', required=False, default="stratified",
                        choices=["cross-validation", "stratified", "none"],
                        help='Type of validation to use when training')
    parser.add_argument('--cv_n', required=False,
                        help='if given, cross validation (instead of stratification) is used for validating the training. \
In this case you must also set `cv_fold` and only that fold number will be run')
    parser.add_argument('--cv_fold', required=False,
                        help='fold number if cross validation is activated (it starts at 0; i.e. for cv_n=5, you have folds: [0,1,2,3,4] )')

    parser.add_argument('--output_folder', required=False,
                        help='Folder where the training model is written to. Otherwise a tmp folder is used')
    parser.add_argument('--model_name_suffix', default='', required=False,
                        help='Optional suffix to add to the generated model name in training mode')
    parser.add_argument('--write_anndoc', required=False, default=False, action='store_true',
                        help='Write anndoc of predicted test_corpus (validation corpus in fact)')
    parser.add_argument('--model_path_1', required=False,
                        help='Path of the first model binary file if evaluation is performed')
    parser.add_argument('--model_path_2', required=False,
                        help='Path of the second model binary file if evaluation is performed with two models')

    parser.add_argument('--labeler', required=False, default="BIEO",
                        choices=["BIEO", "BIO", "IO", "11labels"],
                        help='Labeler to use for training')

    parser.add_argument('--mutations_specific', default='True',
                        help='Apply feature pipelines specific to mutations or otherwise (false) use general one')

    parser.add_argument('--only_class_id', required=False, default=MUT_CLASS_ID,
                        help="By default, only the mutation entities are read from corpora "
                             "(assumed to have class_id == '" + MUT_CLASS_ID + "'). "
                             "Set this class_id to filter the rest out")

    parser.add_argument('--delete_subclasses', required=False, default="",
                        help='Comma-separated subclasses to delete. Example: "2,3"')

    parser.add_argument('--pruner', required=False, default="parts", choices=["parts", "sentences"])
    parser.add_argument('--ps_ST', required=False, default=False, action='store_true')
    parser.add_argument('--ps_NL', required=False, default=False, action='store_true')
    parser.add_argument('--ps_random', required=False, default=0.0, type=float)

    parser.add_argument('--elastic_net', action='store_true', help='Use elastic net regularization')

    parser.add_argument('--word_embeddings', '--we', default='True', help='Use word embeddings features')
    parser.add_argument('--we_additive', type=float, default=0)
    parser.add_argument('--we_multiplicative', type=float, default=1)
    parser.add_argument('--we_model_location', type=str, default=None)

    parser.add_argument('--use_feat_windows', default='True')

    parser.add_argument('--nl', action='store_true', help='Use NLMentionFeatureGenerator')
    parser.add_argument('--nl_threshold', type=int, default=0)
    parser.add_argument('--nl_window', action='store_true', help='use window feature for NLFeatureGenerator')

    parser.add_argument('--execute_pp', default='True',
                        help='Execute post processing specific to mutations (default) or not')

    parser.add_argument('--keep_silent', default='True',
                        help='Keep silent mutations (default) or not, i.e., delete mentions like `Cys23-Cys`')
    parser.add_argument('--keep_genetic_markers', default='True',
                        help='Keep genetic markers of the form D17S250, true (default) or false')
    parser.add_argument('--keep_unnumbered', default='True',
                        help='Keep unnumbered mentions (default) or not, i.e., delete mentions like `C -> T`')
    parser.add_argument('--keep_rs_ids', default='True',
                        help='Keep rs ids (default) or not, i.e., delete mentions like `rs1801280` or `ss221`')

    parser.add_argument('--dictionaries_paths', default=None,
                        help='Dictionary paths to use for dictionary features. Can be used within hdfs')
    parser.add_argument('--dictionaries_stop_words', default=None,
                        help='Stop words for dictionaries if these are used')

    parser.add_argument('--hdfs_url', required=False, default=None, type=str,
                        help='URL of hdfs if this is used')
    parser.add_argument('--hdfs_user', required=False, default=None, type=str,
                        help="user of hdfs if this is used. Must be given if `hdfs_url` is given")

    FALSE = ['false', 'f', '0', 'no', 'none']

    def arg_bool(arg_value):
        return False if arg_value.lower() in FALSE else True

    args = parser.parse_args(argv)

    start_time = time.time()

    # ------------------------------------------------------------------------------

    delete_subclasses = []
    for c in args.delete_subclasses.split(","):
        c = c.strip()  # original called c.strip() without assigning the result
        if c:
            delete_subclasses.append(int(c))
    args.delete_subclasses = delete_subclasses

    if not args.output_folder:
        args.output_folder = tempfile.mkdtemp()

    str_delete_subclasses = "None" if not args.delete_subclasses \
        else str(args.delete_subclasses).strip('[]').replace(' ', '')

    if args.labeler == "BIEO":
        labeler = BIEOLabeler()
    elif args.labeler == "BIO":
        labeler = BIOLabeler()
    elif args.labeler == "IO":
        labeler = IOLabeler()
    elif args.labeler == "11labels":
        labeler = TmVarLabeler()

    args.word_embeddings = arg_bool(args.word_embeddings)
    if args.word_embeddings:
        args.we_params = {
            'additive': args.we_additive,
            'multiplicative': args.we_multiplicative,
            'location': args.we_model_location
        }
    else:
        args.we_params = {}  # means: do not use word embeddings

    if args.nl:
        args.nl_features = {
            'threshold': args.nl_threshold,  # threshold for neighbour space in dictionaries
            'window': args.nl_window,
        }
    else:
        args.nl_features = None

    if args.elastic_net:
        args.crf_train_params = {
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
        }
    else:
        args.crf_train_params = None

    args.use_feat_windows = arg_bool(args.use_feat_windows)
    args.mutations_specific = arg_bool(args.mutations_specific)
    args.execute_pp = arg_bool(args.execute_pp)
    args.keep_silent = arg_bool(args.keep_silent)
    args.keep_genetic_markers = arg_bool(args.keep_genetic_markers)
    args.keep_unnumbered = arg_bool(args.keep_unnumbered)
    args.keep_rs_ids = arg_bool(args.keep_rs_ids)

    args.do_train = not args.model_path_1

    if args.cv_n is not None or args.cv_fold is not None:
        args.validation = "cross-validation"

    if args.validation == "cross-validation":
        assert args.cv_n is not None and args.cv_fold is not None, \
            "You must set both cv_n AND cv_fold"

    # ------------------------------------------------------------------------------

    if args.training_corpus:
        # Get the name of the training corpus even if this is given as a folder path,
        # in which case the last folder name is used
        training_corpus_name = list(filter(None, args.training_corpus.split('/')))[-1]

        args.model_name = "{}_{}_del_{}".format(
            training_corpus_name, args.labeler, str_delete_subclasses)

        if args.validation == "cross-validation":
            args.model_name += "_cvfold_" + str(args.cv_fold)

        args.model_name_suffix = args.model_name_suffix.strip()
        if args.model_name_suffix:
            args.model_name += "_" + str(args.model_name_suffix)

    else:
        args.model_name = args.test_corpus

    # ------------------------------------------------------------------------------

    def stats(dataset, name):
        print('\n\t{} size: {}'.format(name, len(dataset)))
        print('\tsubclass distribution: {}'.format(repr(dataset)))
        # Caveat: the dataset must be passed through the pipeline first
        print('\tnum sentences: {}\n'.format(sum(1 for x in dataset.sentences())))

    definer = ExclusiveNLDefiner()

    if args.training_corpus:
        train_set = get_corpus(args.training_corpus, only_class_id=args.only_class_id,
                               hdfs_url=args.hdfs_url, hdfs_user=args.hdfs_user)

        if args.test_corpus:
            test_set = get_corpus(args.test_corpus, only_class_id=args.only_class_id,
                                  hdfs_url=args.hdfs_url, hdfs_user=args.hdfs_user)
        elif args.string:
            test_set = StringReader(args.string).read()
        elif args.validation == "none":
            test_set = None
        elif args.validation == "cross-validation":
            train_set, test_set = train_set.fold_nr_split(int(args.cv_n), int(args.cv_fold))
        elif args.validation == "stratified":
            definer.define(train_set)
            train_set, test_set = train_set.stratified_split()

    elif args.test_corpus:
        train_set = None
        test_set = get_corpora(args.test_corpus, args.only_class_id)

    elif args.string:
        train_set = None
        test_set = StringReader(args.string).read()

    else:
        raise Exception("you must give at least one parameter of: training_corpus, test_corpus, or string")

    def verify_corpus(corpus):
        if corpus is not None:
            assert len(corpus) > 0, \
                f"The corpus should have at least one document; had 0: {args.training_corpus}"
            assert next(corpus.entities(), None) is not None, \
                "The corpus should have at least one entity; had 0"

    verify_corpus(train_set)

    # ------------------------------------------------------------------------------

    if args.mutations_specific:
        print("Pipeline specific to mutations")
        features_pipeline = get_prepare_pipeline_for_best_model(
            args.use_feat_windows, args.we_params, args.nl_features)
    else:
        print("Pipeline is general")
        features_pipeline = get_prepare_pipeline_for_best_model_general(
            args.use_feat_windows, args.we_params, args.dictionaries_paths,
            args.hdfs_url, args.hdfs_user, args.dictionaries_stop_words)

    # ------------------------------------------------------------------------------

    def print_run_args():
        for key, value in sorted(vars(args).items()):
            print("\t{} = {}".format(key, value))
        print()

    print("Running arguments:")
    print_run_args()

    # ------------------------------------------------------------------------------

    def train(train_set):
        definer.define(train_set)
        train_set.delete_subclass_annotations(args.delete_subclasses)
        features_pipeline.execute(train_set)
        labeler.label(train_set)

        if args.pruner == "parts":
            train_set.prune_empty_parts()
        else:
            try:
                f = HighRecallRegexClassifier(ST=args.ps_ST, NL=args.ps_NL)
            except AssertionError:
                f = (lambda _: False)
            train_set.prune_filtered_sentences(filterin=f, percent_to_keep=args.ps_random)

        stats(train_set, "training")

        model_path = os.path.join(args.output_folder, args.model_name + ".bin")
        PyCRFSuite.train(train_set, model_path, args.crf_train_params)

        return model_path

    # ------------------------------------------------------------------------------

    if args.do_train:
        args.model_path_1 = train(train_set)

    # ------------------------------------------------------------------------------

    def test(tagger, test_set, print_eval=True, print_results=False):
        tagger.tag(test_set)
        definer.define(test_set)
        stats(test_set, "test")

        evaluation = MentionLevelEvaluator(subclass_analysis=True).evaluate(test_set)

        print_run_args()
        if print_eval:
            print(evaluation)
        if print_results:
            ConsoleWriter(ent1_class_id=PRO_CLASS_ID, ent2_class_id=MUT_CLASS_ID,
                          color=True).write(test_set)

    # ------------------------------------------------------------------------------

    assert args.model_path_1 is not None

    if args.model_path_2:
        tagger = NalaMultipleModelTagger(
            st_model=args.model_path_1,
            all3_model=args.model_path_2,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)
    else:
        tagger = NalaSingleModelTagger(
            bin_model=args.model_path_1,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)

    # ------------------------------------------------------------------------------

    print("\n{}".format(args.model_name))

    if train_set:
        stats(train_set, "training")

    if test_set:
        test(tagger, test_set,
             print_eval=args.string is None,
             print_results=args.string is not None)

    if args.do_train:
        print("\nThe model is saved to: {}\n".format(args.model_path_1))

    if args.write_anndoc:
        outdir = os.path.join(args.output_folder, args.model_name)
        os.mkdir(outdir)
        print("\nThe predicted test data is saved to: {}\n".format(outdir))
        TagTogFormat(test_set, use_predicted=True, to_save_to=outdir).export(0)

    end_time = time.time()
    print_debug("Elapsed time: ", (end_time - start_time))

    return {
        "tagger": tagger,
        "trained_model_path": args.model_path_1,
        "training_num_docs": 0 if train_set is None else len(train_set.documents),
        "training_num_annotations": 0 if train_set is None else sum(
            1 for e in train_set.entities() if e.class_id == args.only_class_id)
    }
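# A hedged invocation sketch (the corpus name `nala_training` is taken from the
# --training_corpus help text above; valid names depend on the local get_corpus
# configuration):
#
#   result = train(['--training_corpus', 'nala_training',
#                   '--labeler', 'BIEO', '--validation', 'stratified'])
#   print(result["trained_model_path"])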
group.add_argument('-s', '--string', help='string you want to predict for')
group.add_argument('-d', '--dir_or_file', help='directory or file you want to predict for')
group.add_argument('-p', '--pmids', nargs='+',
                   help='a single PMID or a list of PMIDs separated by space')

args = parser.parse_args()

# warning = 'Due to a dependence on GNormPlus, running nala with -s and -d switches might take a long time.'

if args.string:
    # print(warning)
    dataset = StringReader(args.string).read()
elif args.pmids:
    dataset = PMIDReader(args.pmids).read()
elif os.path.exists(args.dir_or_file):
    # print(warning)
    dataset = TextFilesReader(args.dir_or_file).read()
else:
    raise FileNotFoundError('directory or file "{}" does not exist'.format(args.dir_or_file))

bin_model = pkg_resources.resource_filename('nala.data', 'default_model')
tagger = NalaSingleModelTagger(class_id=MUT_CLASS_ID, bin_model=bin_model)
tagger.tag(dataset)

if args.output_dir:
raise Exception("Make sure to add seth.jar to your classpath (use repo https://github.com/juanmirocks/SETH) -- " + e.stderr) else: raise # ------------------------------------------------------------------------------ methodName = sys.argv[1] assert methodName in {"SETH", "MFmodified", "check_performance"}, \ "Method name must be SETH or MFmodified or check_performance" corpusName = sys.argv[2] try: corpus = get_corpora(corpusName) folderName = sys.argv[3] except: corpus = StringReader(corpusName).read() folderName = None # just print out in standard output # ------------------------------------------------------------------------------ # Example calls: # python scripts/SETH.py SETH nala_test resources/predictions/ # predict # python scripts/SETH.py check_performance nala_test resources/predictions/SETH/nala_test &> resources/predictions/SETH/nala_test/oresults.tsv # evaluate if (methodName == 'check_performance'): # folderName is assumed to be the final/leaf predictions folder, e.g., `resources/predictions/SETH/nala_test` BRATPartsAnnotationReader(folderName, is_predicted=True).annotate(corpus) ExclusiveNLDefiner().define(corpus) evaluation = MentionLevelEvaluator(subclass_analysis=True).evaluate(corpus) print(evaluation)