Example 1
    def setUp(self):
        self.dataset = StringReader(
            'some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text').read()
        NLTKSplitter().split(self.dataset)
        TmVarTokenizer().tokenize(self.dataset)
        part = list(self.dataset.parts())[0]
        part.annotations.append(
            Entity(STUB_ENTITY_CLASS_ID, 15, 'c.2708_2711delTTAG'))
        part.annotations.append(
            Entity(STUB_ENTITY_CLASS_ID, 35, 'p.V903GfsX905'))
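
A quick standalone check that the entity offsets used above match the raw string:

# Offsets 15 and 35 correspond to str.index on the input text
text = 'some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text'
assert text.index('c.2708_2711delTTAG') == 15
assert text.index('p.V903GfsX905') == 35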
Example 2
    def test_generate_patterns_245(self):
        dataset = StringReader('token c.A436C token').read()
        NLTKSplitter().split(dataset)
        TmVarTokenizer().tokenize(dataset)
        TmVarDictionaryFeatureGenerator().generate(dataset)

        # Compare with equality; `is not 'O'` tests object identity and only
        # works by accident of string interning
        token_features = [{key: value for key, value in token.features.items() if value != 'O'}
                          for token in dataset.tokens()]
        self.assertEqual(token_features[0], {})
        self.assertEqual(token_features[1], {'pattern4[0]': 'B', 'pattern2[0]': 'B'})
        self.assertEqual(token_features[2], {'pattern4[0]': 'I', 'pattern2[0]': 'I'})
        self.assertEqual(token_features[3], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'B'})
        self.assertEqual(token_features[4], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'I'})
        self.assertEqual(token_features[5], {'pattern4[0]': 'E', 'pattern2[0]': 'I', 'pattern5[0]': 'E'})
        self.assertEqual(token_features[6], {})
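
The assertions above imply seven tokens, with the mutation mention split into character-class pieces tagged B/I/E by the dictionary patterns. A small inspection sketch; the exact token split is inferred from the indices, not confirmed:

dataset = StringReader('token c.A436C token').read()
NLTKSplitter().split(dataset)
TmVarTokenizer().tokenize(dataset)
# Presumably ['token', 'c', '.', 'A', '436', 'C', 'token']
print([token.word for token in dataset.tokens()])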
Example 3
class TestLabelers(unittest.TestCase):
    def setUp(self):
        self.dataset = StringReader(
            'some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text').read(
            )
        NLTKSplitter().split(self.dataset)
        TmVarTokenizer().tokenize(self.dataset)
        part = list(self.dataset.parts())[0]
        part.annotations.append(
            Entity(STUB_ENTITY_CLASS_ID, 15, 'c.2708_2711delTTAG'))
        part.annotations.append(
            Entity(STUB_ENTITY_CLASS_ID, 35, 'p.V903GfsX905'))

    def test_bio_labeler(self):
        BIOLabeler().label(self.dataset)
        labels = [
            token.original_labels[0].value for token in self.dataset.tokens()
        ]
        expected = [
            'O', 'O', 'O', 'O', 'O', 'O', 'B-e_x', 'I-e_x', 'I-e_x', 'I-e_x',
            'I-e_x', 'I-e_x', 'I-e_x', 'O', 'B-e_x', 'I-e_x', 'I-e_x', 'I-e_x',
            'I-e_x', 'I-e_x', 'I-e_x', 'O', 'O', 'O', 'O', 'O'
        ]
        self.assertEqual(labels, expected)

    def test_bieo_labeler(self):
        BIEOLabeler().label(self.dataset)
        labels = [
            token.original_labels[0].value for token in self.dataset.tokens()
        ]
        expected = [
            'O', 'O', 'O', 'O', 'O', 'O', 'B-e_x', 'I-e_x', 'I-e_x', 'I-e_x',
            'I-e_x', 'I-e_x', 'E-e_x', 'O', 'B-e_x', 'I-e_x', 'I-e_x', 'I-e_x',
            'I-e_x', 'I-e_x', 'E-e_x', 'O', 'O', 'O', 'O', 'O'
        ]
        self.assertEqual(labels, expected)

    def test_tmvar_labeler(self):
        TmVarLabeler(STUB_ENTITY_CLASS_ID).label(self.dataset)
        labels = [
            token.original_labels[0].value for token in self.dataset.tokens()
        ]
        expected = [
            'O', 'O', 'O', 'O', 'O', 'O', 'A', 'I', 'P', 'P', 'P', 'T', 'W',
            'O', 'A', 'I', 'W', 'P', 'I', 'M', 'P', 'O', 'O', 'O', 'O', 'O'
        ]
        self.assertEqual(labels, expected)
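
The three labelers encode the same two entity spans differently: BIOLabeler marks begin/inside tokens, BIEOLabeler additionally marks the last token of each span with E, and TmVarLabeler assigns tmVar's fine-grained mutation labels (A, I, P, T, W, M, ...). A debugging sketch to eyeball the token/label alignment, assuming setUp has run:

        BIOLabeler().label(self.dataset)
        for token in self.dataset.tokens():
            print('{:>8} {}'.format(token.word, token.original_labels[0].value))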
Example 4
    def _get_test_data(self, entity_sentence, assumed_tokens_words=None):
        if assumed_tokens_words is None:
            assumed_tokens_words = entity_sentence.split(' ')

        # Create dataset

        dataset = StringReader(entity_sentence).read()
        part = next(dataset.parts())
        entity = Entity(class_id=STUB_ENTITY_CLASS_ID,
                        offset=0,
                        text=entity_sentence)
        part.annotations.append(entity)

        # Apply through pipeline

        NLTKSplitter().split(dataset)
        NLTK_TOKENIZER.tokenize(dataset)
        self.parser.parse(dataset)

        # Check tokens, token depths, and entity head tokens

        sentences = part.sentences
        assert len(sentences) == 1
        sentence = sentences[0]

        assert len(assumed_tokens_words) == len(sentence)
        for (assumed_token_word, actual_token) in zip(assumed_tokens_words,
                                                      sentence):
            assert assumed_token_word == actual_token.word

        part.compute_tokens_depth()
        roots = Part.get_sentence_roots(sentence)
        for r in roots:
            self._assert_depth_eq(r, 0)

        part.set_entities_head_tokens()

        return (dataset, sentence, entity, roots)
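
A hypothetical call site for this helper; the sentence and token split are illustrative, not from the original tests:

        # `self.parser` is assumed to be a configured dependency parser
        dataset, sentence, entity, roots = self._get_test_data(
            'protein kinase', assumed_tokens_words=['protein', 'kinase'])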
Example 5
def run_with_argv(argv=None):
    # Avoid a mutable default argument; fall back to an empty argument list
    args = parse_arguments(argv if argv is not None else [])

    # `re` here is the relation-extraction model, not the regex module
    ner, re = read_models(args)

    if args.text:
        corpus = StringReader(args.text).read()
    elif args.pmid:
        corpus = PMIDReader(args.pmid).read()
    # See more possible readers, including some for NCBI XML files, in `nalaf.utils.readers`
    else:
        raise ValueError('Either a text or a PMID must be given')

    ner.annotate(corpus)
    re.annotate(corpus)

    return corpus
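
A minimal usage sketch; the `--text` flag name is inferred from `args.text` above, and the sample sentence is illustrative:

corpus = run_with_argv(['--text', 'The mutation p.V600E was detected.'])
for entity in corpus.entities():
    print(entity.class_id, entity.offset, entity.text)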
Example 6
def train(argv):
    parser = argparse.ArgumentParser(description='Train model')

    parser.add_argument(
        '--training_corpus',
        help=
        'Name of the corpus to train on. Ex: nala_training, IDP4+_training, nala_training_5'
    )
    parser.add_argument('--test_corpus', help='Name of the corpus to test on')
    parser.add_argument('--string', help='String to tag')

    parser.add_argument('--validation',
                        required=False,
                        default="stratified",
                        choices=["cross-validation", "stratified", "none"],
                        help='Type of validation to use when training')

    parser.add_argument(
        '--cv_n',
        required=False,
        help=
        'If given, cross-validation (instead of stratification) is used for '
        'validating the training. In this case you must also set `cv_fold`, '
        'and only that fold number will be run'
    )
    parser.add_argument(
        '--cv_fold',
        required=False,
        help=
        'fold number if cross validation is activated (it starts at 0; i.e. for cv_n=5, you have folds: [0,1,2,3,4] )'
    )

    parser.add_argument(
        '--output_folder',
        required=False,
        help=
        'Folder where the training model is written to. Otherwise a tmp folder is used'
    )
    parser.add_argument(
        '--model_name_suffix',
        default='',
        required=False,
        help=
        'Optional suffix to add to the generated model name in training mode')
    parser.add_argument(
        '--write_anndoc',
        required=False,
        default=False,
        action='store_true',
        help='Write anndoc of predicted test_corpus (validation corpus in fact)'
    )
    parser.add_argument(
        '--model_path_1',
        required=False,
        help='Path of the first model binary file if evaluation is performed')
    parser.add_argument(
        '--model_path_2',
        required=False,
        help=
        'Path of the second model binary file if evaluation is performed with two models'
    )

    parser.add_argument('--labeler',
                        required=False,
                        default="BIEO",
                        choices=["BIEO", "BIO", "IO", "11labels"],
                        help='Labeler to use for training')

    parser.add_argument(
        '--mutations_specific',
        default='True',
        help=
        'Apply the mutation-specific feature pipeline (default) or, if false, the general one'
    )

    parser.add_argument(
        '--only_class_id',
        required=False,
        default=MUT_CLASS_ID,
        help=
        "By default, only the mutation entities are read from corpora (assumed to have class_id == '"
        + MUT_CLASS_ID + "'). Set this class_id to filter rest out")
    parser.add_argument(
        '--delete_subclasses',
        required=False,
        default="",
        help='Comma-separated subclasses to delete. Example: "2,3"')

    parser.add_argument('--pruner',
                        required=False,
                        default="parts",
                        choices=["parts", "sentences"])
    parser.add_argument('--ps_ST',
                        required=False,
                        default=False,
                        action='store_true')
    parser.add_argument('--ps_NL',
                        required=False,
                        default=False,
                        action='store_true')
    parser.add_argument('--ps_random', required=False, default=0.0, type=float)

    parser.add_argument('--elastic_net',
                        action='store_true',
                        help='Use elastic net regularization')

    parser.add_argument('--word_embeddings',
                        '--we',
                        default='True',
                        help='Use word embeddings features')
    parser.add_argument('--we_additive', type=float, default=0)
    parser.add_argument('--we_multiplicative', type=float, default=1)
    parser.add_argument('--we_model_location', type=str, default=None)

    parser.add_argument('--use_feat_windows', default='True')

    parser.add_argument('--nl',
                        action='store_true',
                        help='Use NLMentionFeatureGenerator')
    parser.add_argument('--nl_threshold', type=int, default=0)
    parser.add_argument('--nl_window',
                        action='store_true',
                        help='use window feature for NLFeatureGenerator')

    parser.add_argument(
        '--execute_pp',
        default='True',
        help='Execute post processing specific to mutations (default) or not')
    parser.add_argument(
        '--keep_silent',
        default='True',
        help=
        'Keep silent mutations (default) or not, i.e., delete mentions like `Cys23-Cys`'
    )
    parser.add_argument(
        '--keep_genetic_markers',
        default='True',
        help='Keep genetic markers of the form D17S250, true (default) or false'
    )
    parser.add_argument(
        '--keep_unnumbered',
        default='True',
        help=
        'Keep unnumbered mentions (default) or not, i.e., delete mentions like `C -> T`'
    )
    parser.add_argument(
        '--keep_rs_ids',
        default='True',
        help=
        'Keep rs-id mentions (default) or not, i.e., delete mentions like `rs1801280` or `ss221`'
    )

    parser.add_argument(
        '--dictionaries_paths',
        default=None,
        help=
        'Dictionary paths to use for dictionary features. Can be used within hdfs'
    )
    parser.add_argument('--dictionaries_stop_words',
                        default=None,
                        help='Stop words for dictionaries if these are used')

    parser.add_argument('--hdfs_url',
                        required=False,
                        default=None,
                        type=str,
                        help='URL of hdfs if this is used')
    parser.add_argument(
        '--hdfs_user',
        required=False,
        default=None,
        type=str,
        help="user of hdfs if this used. Must be given if `hdfs_url` is given")

    FALSE = ['false', 'f', '0', 'no', 'none']

    def arg_bool(arg_value):
        return arg_value.lower() not in FALSE

    args = parser.parse_args(argv)

    start_time = time.time()

    # ------------------------------------------------------------------------------

    delete_subclasses = []
    for c in args.delete_subclasses.split(","):
        c = c.strip()  # keep the stripped value; str.strip() returns a new string
        if c:
            delete_subclasses.append(int(c))

    args.delete_subclasses = delete_subclasses

    if not args.output_folder:
        args.output_folder = tempfile.mkdtemp()

    str_delete_subclasses = "None" if not args.delete_subclasses else str(
        args.delete_subclasses).strip('[]').replace(' ', '')

    if args.labeler == "BIEO":
        labeler = BIEOLabeler()
    elif args.labeler == "BIO":
        labeler = BIOLabeler()
    elif args.labeler == "IO":
        labeler = IOLabeler()
    elif args.labeler == "11labels":
        labeler = TmVarLabeler()

    args.word_embeddings = arg_bool(args.word_embeddings)

    if args.word_embeddings:
        args.we_params = {
            'additive': args.we_additive,
            'multiplicative': args.we_multiplicative,
            'location': args.we_model_location
        }
    else:
        args.we_params = {}  # means: do not use we

    if args.nl:
        args.nl_features = {
            'threshold':
            args.nl_threshold,  # threshold for neighbour space in dictionaries
            'window': args.nl_window,
        }
    else:
        args.nl_features = None

    if args.elastic_net:
        args.crf_train_params = {
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
        }
    else:
        args.crf_train_params = None

    args.use_feat_windows = arg_bool(args.use_feat_windows)
    args.mutations_specific = arg_bool(args.mutations_specific)
    args.execute_pp = arg_bool(args.execute_pp)
    args.keep_silent = arg_bool(args.keep_silent)
    args.keep_genetic_markers = arg_bool(args.keep_genetic_markers)
    args.keep_unnumbered = arg_bool(args.keep_unnumbered)
    args.keep_rs_ids = arg_bool(args.keep_rs_ids)

    args.do_train = not args.model_path_1

    if args.cv_n is not None or args.cv_fold is not None:
        args.validation = "cross-validation"

    if args.validation == "cross-validation":
        assert (args.cv_n is not None
                and args.cv_fold is not None), "You must set both cv_n AND cv_fold"

    # ------------------------------------------------------------------------------

    if args.training_corpus:
        # Get the name of training corpus even if this is given as a folder path, in which case the last folder name is used
        training_corpus_name = list(
            filter(None, args.training_corpus.split('/')))[-1]

        args.model_name = "{}_{}_del_{}".format(training_corpus_name,
                                                args.labeler,
                                                str_delete_subclasses)

        if args.validation == "cross-validation":
            args.model_name += "_cvfold_" + str(args.cv_fold)
        args.model_name_suffix = args.model_name_suffix.strip()
        if args.model_name_suffix:
            args.model_name += "_" + str(args.model_name_suffix)

    else:
        args.model_name = args.test_corpus

    # ------------------------------------------------------------------------------

    def stats(dataset, name):
        print('\n\t{} size: {}'.format(name, len(dataset)))
        print('\tsubclass distribution: {}'.format(repr(dataset)))
        # Caveat: the dataset must be passed through the pipeline first
        print('\tnum sentences: {}\n'.format(
            sum(1 for x in dataset.sentences())))

    definer = ExclusiveNLDefiner()

    if args.training_corpus:
        train_set = get_corpus(args.training_corpus,
                               only_class_id=args.only_class_id,
                               hdfs_url=args.hdfs_url,
                               hdfs_user=args.hdfs_user)

        if args.test_corpus:
            test_set = get_corpus(args.test_corpus,
                                  only_class_id=args.only_class_id,
                                  hdfs_url=args.hdfs_url,
                                  hdfs_user=args.hdfs_user)
        elif args.string:
            test_set = StringReader(args.string).read()
        elif args.validation == "none":
            test_set = None
        elif args.validation == "cross-validation":
            train_set, test_set = train_set.fold_nr_split(
                int(args.cv_n), int(args.cv_fold))
        elif args.validation == "stratified":
            definer.define(train_set)
            train_set, test_set = train_set.stratified_split()

    elif args.test_corpus:
        train_set = None
        test_set = get_corpora(args.test_corpus, args.only_class_id)

    elif args.string:
        train_set = None
        test_set = StringReader(args.string).read()

    else:
        raise Exception(
            "You must give at least one of: training_corpus, test_corpus, or string"
        )

    def verify_corpus(corpus):
        if corpus is not None:
            assert len(corpus) > 0, \
                f"The corpus should have at least one document; had 0: {args.training_corpus}"
            assert next(corpus.entities(), None) is not None, \
                "The corpus should have at least one entity; had 0"

    verify_corpus(train_set)

    # ------------------------------------------------------------------------------

    if args.mutations_specific:
        print("Pipeline specific to mutations")
        features_pipeline = get_prepare_pipeline_for_best_model(
            args.use_feat_windows, args.we_params, args.nl_features)
    else:
        print("Pipeline is general")
        features_pipeline = get_prepare_pipeline_for_best_model_general(
            args.use_feat_windows, args.we_params, args.dictionaries_paths,
            args.hdfs_url, args.hdfs_user, args.dictionaries_stop_words)

    # ------------------------------------------------------------------------------

    def print_run_args():
        for key, value in sorted((vars(args)).items()):
            print("\t{} = {}".format(key, value))
        print()

    print("Running arguments: ")

    print_run_args()

    # ------------------------------------------------------------------------------

    def train(train_set):
        definer.define(train_set)
        train_set.delete_subclass_annotations(args.delete_subclasses)
        features_pipeline.execute(train_set)
        labeler.label(train_set)

        if args.pruner == "parts":
            train_set.prune_empty_parts()
        else:
            try:
                f = HighRecallRegexClassifier(ST=args.ps_ST, NL=args.ps_NL)
            except AssertionError:
                f = (lambda _: False)
            train_set.prune_filtered_sentences(filterin=f,
                                               percent_to_keep=args.ps_random)

        stats(train_set, "training")

        model_path = os.path.join(args.output_folder, args.model_name + ".bin")
        PyCRFSuite.train(train_set, model_path, args.crf_train_params)

        return model_path

    # ------------------------------------------------------------------------------

    if args.do_train:
        args.model_path_1 = train(train_set)

    # ------------------------------------------------------------------------------

    def test(tagger, test_set, print_eval=True, print_results=False):
        tagger.tag(test_set)
        definer.define(test_set)
        stats(test_set, "test")
        evaluation = MentionLevelEvaluator(
            subclass_analysis=True).evaluate(test_set)

        print_run_args()

        if print_eval:
            print(evaluation)
        if print_results:
            ConsoleWriter(ent1_class_id=PRO_CLASS_ID,
                          ent2_class_id=MUT_CLASS_ID,
                          color=True).write(test_set)

    # ------------------------------------------------------------------------------

    assert args.model_path_1 is not None

    if args.model_path_2:
        tagger = NalaMultipleModelTagger(
            st_model=args.model_path_1,
            all3_model=args.model_path_2,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)
    else:
        tagger = NalaSingleModelTagger(
            bin_model=args.model_path_1,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)

    # ------------------------------------------------------------------------------

    print("\n{}".format(args.model_name))

    if train_set:
        stats(train_set, "training")

    if test_set:
        test(tagger,
             test_set,
             print_eval=args.string is None,
             print_results=args.string is not None)

    if args.do_train:
        print("\nThe model is saved to: {}\n".format(args.model_path_1))

    if args.write_anndoc:
        outdir = os.path.join(args.output_folder, args.model_name)
        os.mkdir(outdir)
        print("\nThe predicted test data is saved to: {}\n".format(outdir))
        TagTogFormat(test_set, use_predicted=True, to_save_to=outdir).export(0)

    end_time = time.time()

    print_debug("Elapsed time: ", (end_time - start_time))

    return {
        "tagger": tagger,
        "trained_model_path": args.model_path_1,
        "training_num_docs": 0 if train_set is None else len(train_set.documents),
        "training_num_annotations": 0 if train_set is None else sum(
            1 for e in train_set.entities() if e.class_id == args.only_class_id),
    }
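
A sketch of a typical invocation; the corpus name is a placeholder that get_corpus would need to resolve:

result = train(['--training_corpus', 'nala_training', '--validation', 'stratified'])
print(result['trained_model_path'])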
Example 7
    group.add_argument('-s', '--string', help='string you want to predict for')
    group.add_argument('-d',
                       '--dir_or_file',
                       help='directory or file you want to predict for')
    group.add_argument(
        '-p',
        '--pmids',
        nargs='+',
        help='a single PMID or a list of PMIDs separated by space')
    args = parser.parse_args()

    # warning = 'Due to a dependence on GNormPlus, running nala with -s and -d switches might take a long time.'
    if args.string:
        # print(warning)
        dataset = StringReader(args.string).read()
    elif args.pmids:
        dataset = PMIDReader(args.pmids).read()
    elif os.path.exists(args.dir_or_file):
        # print(warning)
        dataset = TextFilesReader(args.dir_or_file).read()
    else:
        raise FileNotFoundError('directory or file "{}" does not exist'.format(
            args.dir_or_file))

    bin_model = pkg_resources.resource_filename('nala.data', 'default_model')
    tagger = NalaSingleModelTagger(class_id=MUT_CLASS_ID, bin_model=bin_model)

    tagger.tag(dataset)

    if args.output_dir:
Example 8
            raise Exception("Make sure to add seth.jar to your classpath (use repo https://github.com/juanmirocks/SETH) -- " + e.stderr)
        else:
            raise

# ------------------------------------------------------------------------------

methodName = sys.argv[1]
assert methodName in {"SETH", "MFmodified", "check_performance"}, \
    "Method name must be SETH or MFmodified or check_performance"
corpusName = sys.argv[2]
try:
    corpus = get_corpora(corpusName)
    folderName = sys.argv[3]

except Exception:  # not a known corpus name: treat the argument as raw text
    corpus = StringReader(corpusName).read()
    folderName = None  # just print out in standard output

# ------------------------------------------------------------------------------

# Example calls:
# python scripts/SETH.py SETH nala_test resources/predictions/  # predict
# python scripts/SETH.py check_performance nala_test resources/predictions/SETH/nala_test &> resources/predictions/SETH/nala_test/oresults.tsv  # evaluate

if methodName == 'check_performance':
    # folderName is assumed to be the final/leaf predictions folder, e.g., `resources/predictions/SETH/nala_test`
    BRATPartsAnnotationReader(folderName, is_predicted=True).annotate(corpus)
    ExclusiveNLDefiner().define(corpus)
    evaluation = MentionLevelEvaluator(subclass_analysis=True).evaluate(corpus)
    print(evaluation)