Example No. 1
def benchmark_nala(member1, member2):
    itrs = []

    # Read the IAA iterations in blocks so that the plain documents are not deleted with the AnnJsonAnnotationReader's
    for itr in IterationRound.all():
        if itr.is_IAA():
            dataset = itr.read(read_annotations=False)
            AnnJsonAnnotationReader(
                os.path.join(itr.path, "reviewed", member1),
                read_only_class_id=MUT_CLASS_ID,
                delete_incomplete_docs=False).annotate(dataset)
            AnnJsonAnnotationReader(os.path.join(itr.path, "reviewed",
                                                 member2),
                                    read_only_class_id=MUT_CLASS_ID,
                                    delete_incomplete_docs=False,
                                    is_predicted=True).annotate(dataset)
            itrs.append(dataset)
            dataset = None

    # Then merge the IAA iterations
    all_itrs_dataset = Dataset()
    for itr_dataset in itrs:
        all_itrs_dataset.extend_dataset(itr_dataset)

    ExclusiveNLDefiner().define(all_itrs_dataset)

    return (all_itrs_dataset, MentionLevelEvaluator(
        subclass_analysis=True).evaluate(all_itrs_dataset))
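
A minimal sketch of consuming the returned pair, assuming two reviewer folder names (placeholders here) that exist under each IAA iteration's "reviewed" directory:

# Hypothetical annotator folder names; substitute the real ones.
corpus, evaluation = benchmark_nala('annotator_A', 'annotator_B')
print(evaluation)  # mention-level scores, including per-subclass analysis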
Example No. 2
    def filter(self, documents):
        pycrf = PyCRFSuite(self.binary_model)
        for pmid, doc in documents:
            dataset = Dataset()
            dataset.documents[pmid] = doc
            self.pipeline.execute(dataset)
            self.labeler.label(dataset)
            pycrf.tag(dataset, MUT_CLASS_ID)
            PostProcessing().process(dataset)
            ExclusiveNLDefiner().define(dataset)
            total_nl_mentions = []
            for part in doc:
                # print(part.annotations)
                print_verbose('predicted_annotations:',
                              part.predicted_annotations)
                nl_mentions = [
                    (ann.text, ann.subclass, ann.confidence)
                    for ann in part.predicted_annotations
                    if ann.subclass != 0 and ann.confidence <= self.threshold
                ]
                total_nl_mentions += nl_mentions
            if any(total_nl_mentions):
                print('nl mentions', json.dumps(total_nl_mentions, indent=4))
                yield pmid, doc
            print_verbose('nothing found')
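
The filter above is a generator over (pmid, doc) pairs, so a caller drives it lazily. A small usage sketch, assuming `docfilter` is an instance of the enclosing class and `candidates` is any iterable of (pmid, nalaf.structures.data.Document) pairs; both names are placeholders:

# Placeholder names: `docfilter` (instance of the class above) and
# `candidates` (an iterable of (pmid, Document) pairs, e.g. from a corpus reader).
kept = {}
for pmid, doc in docfilter.filter(candidates):
    kept[pmid] = doc  # documents containing low-confidence NL mentions
print(len(kept), 'documents kept')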
Example No. 3
def evaluate():
    from nalaf.utils.annotation_readers import AnnJsonAnnotationReader
    size_before = len(data)
    AnnJsonAnnotationReader(os.path.join(folder_name, "annjson"),
                            is_predicted=True,
                            delete_incomplete_docs=False).annotate(data)
    assert (size_before == len(data))

    ExclusiveNLDefiner().define(data)
    e = MentionLevelEvaluator(subclass_analysis=True).evaluate(data)
    print(e)
Example No. 4
def find_number_of_documents():
    data = read_data(39, read_base=False)
    train, test = data.stratified_split()
    del data
    del train

    pipeline = get_prepare_pipeline_for_best_model()
    pipeline.execute(test)
    BIEOLabeler().label(test)
    PyCRFSuite().tag(test, 'idp4_model')
    PostProcessing().process(test)
    ExclusiveNLDefiner().define(test)

    keys = list(test.documents.keys())  # random.sample needs a sequence, not a dict view
    for test_size in range(30, 101, 10):
        sample = Dataset()
        random_keys = random.sample(keys, test_size)
        sample.documents = {key: test.documents[key] for key in random_keys}

        print('============== {} =============='.format(test_size))
        calculate_standard_error(sample)
Example No. 5
def benchmark_IDP4(member1, member2):
    itr = IterationRound(0)
    IDP4_corpus = itr.read(read_annotations=False)

    IAA_IDP4_corpus = Dataset()
    for docid, document in IDP4_corpus.documents.items():
        if docid in IDP4_IAA_docs:
            IAA_IDP4_corpus.documents[docid] = document

    AnnJsonAnnotationReader(
        os.path.join(itr.path, "base", "annjson", "members", member1),
        read_only_class_id=MUT_CLASS_ID,
        delete_incomplete_docs=True).annotate(IAA_IDP4_corpus)
    AnnJsonAnnotationReader(os.path.join(itr.path, "base", "annjson",
                                         "members", member2),
                            read_only_class_id=MUT_CLASS_ID,
                            delete_incomplete_docs=True,
                            is_predicted=True).annotate(IAA_IDP4_corpus)

    ExclusiveNLDefiner().define(IAA_IDP4_corpus)

    return (IAA_IDP4_corpus, MentionLevelEvaluator(
        subclass_analysis=True).evaluate(IAA_IDP4_corpus))
Example No. 6
def train(argv):
    parser = argparse.ArgumentParser(description='Train model')

    parser.add_argument(
        '--training_corpus',
        help=
        'Name of the corpus to train on. Ex: nala_training, IDP4+_training, nala_training_5'
    )
    parser.add_argument('--test_corpus', help='Name of the corpus to test on')
    parser.add_argument('--string', help='String to tag')

    parser.add_argument('--validation',
                        required=False,
                        default="stratified",
                        choices=["cross-validation", "stratified", "none"],
                        help='Type of validation to use when training')

    parser.add_argument(
        '--cv_n',
        required=False,
        help=
        'If given, cross-validation (instead of stratification) is used for validating the training. '
        'In this case you must also set `cv_fold`, and only that fold number will be run'
    )
    parser.add_argument(
        '--cv_fold',
        required=False,
        help=
        'fold number if cross validation is activated (it starts at 0; i.e. for cv_n=5, you have folds: [0,1,2,3,4] )'
    )

    parser.add_argument(
        '--output_folder',
        required=False,
        help=
        'Folder where the training model is written to. Otherwise a tmp folder is used'
    )
    parser.add_argument(
        '--model_name_suffix',
        default='',
        required=False,
        help=
        'Optional suffix to add to the generated model name in training mode')
    parser.add_argument(
        '--write_anndoc',
        required=False,
        default=False,
        action='store_true',
        help='Write anndoc of predicted test_corpus (validation corpus in fact)'
    )
    parser.add_argument(
        '--model_path_1',
        required=False,
        help='Path of the first model binary file if evaluation is performed')
    parser.add_argument(
        '--model_path_2',
        required=False,
        help=
        'Path of the second model binary file if evaluation is performed with two models'
    )

    parser.add_argument('--labeler',
                        required=False,
                        default="BIEO",
                        choices=["BIEO", "BIO", "IO", "11labels"],
                        help='Labeler to use for training')

    parser.add_argument(
        '--mutations_specific',
        default='True',
        help=
        'Apply feature pipelines specific to mutations or otherwise (false) use general one'
    )

    parser.add_argument(
        '--only_class_id',
        required=False,
        default=MUT_CLASS_ID,
        help=
        "By default, only the mutation entities are read from corpora (assumed to have class_id == '"
        + MUT_CLASS_ID + "'). Set this class_id to filter out the rest")
    parser.add_argument(
        '--delete_subclasses',
        required=False,
        default="",
        help='Comma-separated subclasses to delete. Example: "2,3"')

    parser.add_argument('--pruner',
                        required=False,
                        default="parts",
                        choices=["parts", "sentences"])
    parser.add_argument('--ps_ST',
                        required=False,
                        default=False,
                        action='store_true')
    parser.add_argument('--ps_NL',
                        required=False,
                        default=False,
                        action='store_true')
    parser.add_argument('--ps_random', required=False, default=0.0, type=float)

    parser.add_argument('--elastic_net',
                        action='store_true',
                        help='Use elastic net regularization')

    parser.add_argument('--word_embeddings',
                        '--we',
                        default='True',
                        help='Use word embeddings features')
    parser.add_argument('--we_additive', type=float, default=0)
    parser.add_argument('--we_multiplicative', type=float, default=1)
    parser.add_argument('--we_model_location', type=str, default=None)

    parser.add_argument('--use_feat_windows', default='True')

    parser.add_argument('--nl',
                        action='store_true',
                        help='Use NLMentionFeatureGenerator')
    parser.add_argument('--nl_threshold', type=int, default=0)
    parser.add_argument('--nl_window',
                        action='store_true',
                        help='use window feature for NLFeatureGenerator')

    parser.add_argument(
        '--execute_pp',
        default='True',
        help='Execute post processing specific to mutations (default) or not')
    parser.add_argument(
        '--keep_silent',
        default='True',
        help=
        'Keep silent mutations (default) or not, i.e., delete mentions like `Cys23-Cys`'
    )
    parser.add_argument(
        '--keep_genetic_markers',
        default='True',
        help='Keep genetic markers of the form D17S250, true (default) or false'
    )
    parser.add_argument(
        '--keep_unnumbered',
        default='True',
        help=
        'Keep unnumbered mentions (default) or not, i.e., delete mentions like `C -> T`'
    )
    parser.add_argument(
        '--keep_rs_ids',
        default='True',
        help=
        'Keep rs/ss ids (default) or not, i.e., delete mentions like `rs1801280` or `ss221`'
    )

    parser.add_argument(
        '--dictionaries_paths',
        default=None,
        help=
        'Dictionary paths to use for dictionary features. Can be used within hdfs'
    )
    parser.add_argument('--dictionaries_stop_words',
                        default=None,
                        help='Stop words for dictionaries if these are used')

    parser.add_argument('--hdfs_url',
                        required=False,
                        default=None,
                        type=str,
                        help='URL of hdfs if this is used')
    parser.add_argument(
        '--hdfs_user',
        required=False,
        default=None,
        type=str,
        help="user of hdfs if this used. Must be given if `hdfs_url` is given")

    FALSE = ['false', 'f', '0', 'no', 'none']

    def arg_bool(arg_value):
        return False if arg_value.lower() in FALSE else True

    args = parser.parse_args(argv)

    start_time = time.time()

    # ------------------------------------------------------------------------------

    delete_subclasses = []
    for c in args.delete_subclasses.split(","):
        c = c.strip()
        if c:
            delete_subclasses.append(int(c))

    args.delete_subclasses = delete_subclasses

    if not args.output_folder:
        args.output_folder = tempfile.mkdtemp()

    str_delete_subclasses = "None" if not args.delete_subclasses else str(
        args.delete_subclasses).strip('[]').replace(' ', '')

    if args.labeler == "BIEO":
        labeler = BIEOLabeler()
    elif args.labeler == "BIO":
        labeler = BIOLabeler()
    elif args.labeler == "IO":
        labeler = IOLabeler()
    elif args.labeler == "11labels":
        labeler = TmVarLabeler()

    args.word_embeddings = arg_bool(args.word_embeddings)

    if args.word_embeddings:
        args.we_params = {
            'additive': args.we_additive,
            'multiplicative': args.we_multiplicative,
            'location': args.we_model_location
        }
    else:
        args.we_params = {}  # means: do not use we

    if args.nl:
        args.nl_features = {
            'threshold':
            args.nl_threshold,  # threshold for neighbour space in dictionaries
            'window': args.nl_window,
        }
    else:
        args.nl_features = None

    if args.elastic_net:
        args.crf_train_params = {
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
        }
    else:
        args.crf_train_params = None

    args.use_feat_windows = arg_bool(args.use_feat_windows)
    args.mutations_specific = arg_bool(args.mutations_specific)
    args.execute_pp = arg_bool(args.execute_pp)
    args.keep_silent = arg_bool(args.keep_silent)
    args.keep_genetic_markers = arg_bool(args.keep_genetic_markers)
    args.keep_unnumbered = arg_bool(args.keep_unnumbered)
    args.keep_rs_ids = arg_bool(args.keep_rs_ids)

    args.do_train = False if args.model_path_1 else True

    if args.cv_n is not None or args.cv_fold is not None:
        args.validation = "cross-validation"

    if args.validation == "cross-validation":
        assert (args.cv_n is not None and args.cv_fold
                is not None), "You must set both cv_n AND cv_fold"

    # ------------------------------------------------------------------------------

    if args.training_corpus:
        # Get the name of training corpus even if this is given as a folder path, in which case the last folder name is used
        training_corpus_name = list(
            filter(None, args.training_corpus.split('/')))[-1]

        args.model_name = "{}_{}_del_{}".format(training_corpus_name,
                                                args.labeler,
                                                str_delete_subclasses)

        if args.validation == "cross-validation":
            args.model_name += "_cvfold_" + str(args.cv_fold)
        args.model_name_suffix = args.model_name_suffix.strip()
        if args.model_name_suffix:
            args.model_name += "_" + str(args.model_name_suffix)

    else:
        args.model_name = args.test_corpus

    # ------------------------------------------------------------------------------

    def stats(dataset, name):
        print('\n\t{} size: {}'.format(name, len(dataset)))
        print('\tsubclass distribution: {}'.format(repr(dataset)))
        # Caveat: the dataset must be passed through the pipeline first
        print('\tnum sentences: {}\n'.format(
            sum(1 for x in dataset.sentences())))

    definer = ExclusiveNLDefiner()

    if args.training_corpus:
        train_set = get_corpus(args.training_corpus,
                               only_class_id=args.only_class_id,
                               hdfs_url=args.hdfs_url,
                               hdfs_user=args.hdfs_user)

        if args.test_corpus:
            test_set = get_corpus(args.test_corpus,
                                  only_class_id=args.only_class_id,
                                  hdfs_url=args.hdfs_url,
                                  hdfs_user=args.hdfs_user)
        elif args.string:
            test_set = StringReader(args.string).read()
        elif args.validation == "none":
            test_set = None
        elif args.validation == "cross-validation":
            train_set, test_set = train_set.fold_nr_split(
                int(args.cv_n), int(args.cv_fold))
        elif args.validation == "stratified":
            definer.define(train_set)
            train_set, test_set = train_set.stratified_split()

    elif args.test_corpus:
        train_set = None
        test_set = get_corpora(args.test_corpus, args.only_class_id)

    elif args.string:
        train_set = None
        test_set = StringReader(args.string).read()

    else:
        raise Exception(
            "you must give at least a parameter of: training_corpus, test_corpus, or string"
        )

    def verify_corpus(corpus):
        if corpus is not None:
            assert len(
                corpus
            ) > 0, f"The corpus should have at least one document; had 0: {args.training_corpus}"
            assert next(
                corpus.entities(), None
            ) is not None, "The corpus should have at least one entity; had 0"

    verify_corpus(train_set)

    # ------------------------------------------------------------------------------

    if args.mutations_specific:
        print("Pipeline specific to mutations")
        features_pipeline = get_prepare_pipeline_for_best_model(
            args.use_feat_windows, args.we_params, args.nl_features)
    else:
        print("Pipeline is general")
        features_pipeline = get_prepare_pipeline_for_best_model_general(
            args.use_feat_windows, args.we_params, args.dictionaries_paths,
            args.hdfs_url, args.hdfs_user, args.dictionaries_stop_words)

    # ------------------------------------------------------------------------------

    def print_run_args():
        for key, value in sorted((vars(args)).items()):
            print("\t{} = {}".format(key, value))
        print()

    print("Running arguments: ")

    print_run_args()

    # ------------------------------------------------------------------------------

    def train(train_set):
        definer.define(train_set)
        train_set.delete_subclass_annotations(args.delete_subclasses)
        features_pipeline.execute(train_set)
        labeler.label(train_set)

        if args.pruner == "parts":
            train_set.prune_empty_parts()
        else:
            try:
                f = HighRecallRegexClassifier(ST=args.ps_ST, NL=args.ps_NL)
            except AssertionError:
                f = (lambda _: False)
            train_set.prune_filtered_sentences(filterin=f,
                                               percent_to_keep=args.ps_random)

        stats(train_set, "training")

        model_path = os.path.join(args.output_folder, args.model_name + ".bin")
        PyCRFSuite.train(train_set, model_path, args.crf_train_params)

        return model_path

    # ------------------------------------------------------------------------------

    if args.do_train:
        args.model_path_1 = train(train_set)

    # ------------------------------------------------------------------------------

    def test(tagger, test_set, print_eval=True, print_results=False):
        tagger.tag(test_set)
        definer.define(test_set)
        stats(test_set, "test")
        evaluation = MentionLevelEvaluator(
            subclass_analysis=True).evaluate(test_set)

        print_run_args()

        if print_eval:
            print(evaluation)
        if print_results:
            ConsoleWriter(ent1_class_id=PRO_CLASS_ID,
                          ent2_class_id=MUT_CLASS_ID,
                          color=True).write(test_set)

    # ------------------------------------------------------------------------------

    assert (args.model_path_1 is not None)

    if args.model_path_2:
        tagger = NalaMultipleModelTagger(
            st_model=args.model_path_1,
            all3_model=args.model_path_2,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)
    else:
        tagger = NalaSingleModelTagger(
            bin_model=args.model_path_1,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)

    # ------------------------------------------------------------------------------

    print("\n{}".format(args.model_name))

    if train_set:
        stats(train_set, "training")

    if test_set:
        test(tagger,
             test_set,
             print_eval=args.string is None,
             print_results=args.string is not None)

    if args.do_train:
        print("\nThe model is saved to: {}\n".format(args.model_path_1))

    if args.write_anndoc:
        outdir = os.path.join(args.output_folder, args.model_name)
        os.mkdir(outdir)
        print("\nThe predicted test data is saved to: {}\n".format(outdir))
        TagTogFormat(test_set, use_predicted=True, to_save_to=outdir).export(0)

    end_time = time.time()

    print_debug("Elapsed time: ", (end_time - start_time))

    return {
        "tagger":
        tagger,
        "trained_model_path":
        args.model_path_1,
        "training_num_docs":
        0 if train_set is None else len(train_set.documents),
        "training_num_annotations":
        0 if train_set is None else sum(1 for e in train_set.entities()
                                        if e.class_id == args.only_class_id)
    }
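
Because train receives an argv list instead of reading sys.argv itself, it can also be invoked programmatically. A sketch of one possible call, using only flags defined above; the corpus name is an assumption and must be resolvable by get_corpus:

# The corpus name below is an assumption; use any name that get_corpus resolves.
result = train([
    '--training_corpus', 'nala_training',
    '--validation', 'stratified',
    '--labeler', 'BIEO',
    '--delete_subclasses', '2',
])
print(result['trained_model_path'])
print(result['training_num_docs'], 'training documents')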
Example No. 7
def pattern_stats(dataset):
    """
    Testing ground (Carsten) for developing the high-recall pattern creation method, ported here.
    :param dataset: dataset to perform pattern evaluation on (must include annotations)
    :type dataset: nala.structures.Dataset
    :return: nothing (print statements for the moment)
    """
    ExclusiveNLDefiner().define(dataset)

    # PubTatorFormat(dataset, no_annotations=False).export()

    print(dataset)

    nl_annotations = []

    # import connecting_words.json
    with open('nala/data/connecting_words.json', 'r') as f:
        regexs = json.load(f)

    # print(regexs)
    compiled_regexs = [re.compile(x) for x in regexs]

    nr_word_regex = re.compile(
        '\\b(one|two|three|four|five|six|seven|eight|nine|ten)\\b')
    aa_short_regex = re.compile(
        '\\b(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr)\\b'
    )
    aa_long_regex = re.compile(
        '\\b(glutamine|glutamic acid|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine|threonine|histidine|aspartic acid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine|serine|glutamate|tyrosine)\\b'
    )
    bp_code = re.compile('\\b\\w\\b')

    wordlist = []

    # for ann in dataset.annotations():
    #     if ann.subclass == 1 or ann.subclass == 2:
    #         new_text = ann.text.lower()
    #         for reg in compiled_regexs:
    #             new_text = reg.sub('_TT_', new_text)
    #         # re.sub('\\b\\d+\\b]', '_NR_', new_text)
    #         new_text = re.sub('\\b\\w*\\d+\\w*\\b', '_CODE_', new_text)
    #         new_text = nr_word_regex.sub('_TT_', new_text)
    #         new_text = aa_short_regex.sub('_AA_', new_text)
    #         new_text = aa_long_regex.sub('_AA_', new_text)
    #         new_text = bp_code.sub('_TT_', new_text)
    #         new_text = re.sub('\\W', ' ', new_text)
    #         # new_text = re.sub('\\b(\\w{1,3})\\b', '_TT_', new_text)
    #
    #         wordlist.extend(new_text.split(' '))
    #         # print(new_text)
    #         nl_annotations.append(new_text)
    #
    # wordset = set(wordlist)
    # wordlist = sorted(list(wordset))
    # print(json.dumps(wordlist, indent=2, sort_keys=True))
    # print(json.dumps(nl_annotations, indent=2, sort_keys=True))

    # todo provide method to create new pattern on an automated base
    # read in nl_patterns
    with open('nala/data/nl_patterns.json', 'r') as f:
        regexs = json.load(f)

    patterns = [re.compile(x) for x in regexs]

    # f-measure pattern-based
    _perf_patterns = {}
    for reg in patterns:
        _perf_patterns[reg.pattern] = [0, 0, -1]

    # check for annotations

    # for part in dataset.parts():
    #     print(part.text)

    # dataset with tmVar
    # TODO change if idp4 then those results otherwise use tmvartagger and caching
    dataset_high_recall = TmVarReader(
        'resources/corpora/idp4/pubtator_tmvar.txt').read()
    TP = 0
    FP = 0
    _length = len(dataset.documents.keys())
    _progress = 0
    _timestart = time.time()

    _time_avg_per_pattern = 0
    _pattern_calls = 0
    _time_reg_pattern_total = 0
    _time_max_pattern = 0
    _low_performant_pattern = ""
    _avg_chars_per_doc = dataset.get_size_chars() / len(
        dataset.documents.keys())

    # NLDefiners init
    exclusive_definer = ExclusiveNLDefiner()
    _e_array = [0, 0, 0]
    inclusive_definer = InclusiveNLDefiner()
    _i_array = [0, 0]

    # todo param file to save to
    with open('results/testing_ground_carsten.txt', 'w',
              encoding='utf-8') as f:
        for did, doc in dataset.documents.items():
            part_offset = 0
            for i, x in enumerate(doc.parts):
                # print("Part", i)
                sent_offset = 0
                cur_part = doc.parts.get(x)
                sentences = cur_part.sentences
                # new_text = cur_part.text.lower()
                # new_text = re.sub('\s+', ' ', new_text)
                # sentences = new_text.split('. ')
                for sent in sentences:
                    sent_len = len(sent)
                    new_text = sent.lower()
                    new_text = re.sub('[\./\\-(){}\[\],%]', '', new_text)
                    new_text = re.sub('\W+', ' ', new_text)
                    for i, reg in enumerate(patterns):

                        _lasttime = time.time()  # time start var
                        match = reg.search(new_text)

                        # debug bottleneck patterns
                        _time_current_reg = time.time(
                        ) - _lasttime  # time end var
                        _pattern_calls += 1  # pattern calls already occured
                        _time_reg_pattern_total += _time_current_reg  # total time spent on searching with patterns
                        if _time_reg_pattern_total > 0:
                            _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg spent time per pattern call

                        # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                        #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                        # if _time_max_pattern < _time_current_reg:
                        #     _time_max_pattern = _time_current_reg
                        #     _low_performant_pattern = reg.pattern
                        #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                        # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                        #     if _time_current_reg > _time_avg_per_pattern * 10:
                        #         # print(_time_avg_per_pattern, _time_current_reg)
                        #         f.write("BAD_PATTERN\n")
                        #         f.write(sent + "\n")
                        #         f.write(new_text + "\n")

                        if match:
                            if did in dataset_high_recall.documents:
                                anti_doc = dataset_high_recall.documents.get(
                                    did)
                                start = part_offset + sent_offset + match.span(
                                )[0]
                                end = part_offset + sent_offset + match.span(
                                )[1]
                                if not anti_doc.overlaps_with_mention(
                                        start, end):
                                    _e_result = exclusive_definer.define_string(
                                        new_text[match.span()[0]:match.span(
                                        )[1]])
                                    _e_array[_e_result] += 1
                                    _i_result = inclusive_definer.define_string(
                                        new_text[match.span()[0]:match.span(
                                        )[1]])
                                    _i_array[_i_result] += 1
                                    if doc.overlaps_with_mention(start, end):
                                        TP += 1
                                        f.write(
                                            "{}\tTP\te{}\ti{}\t{}\t{}\t{}\n".
                                            format(did, _e_result, _i_result,
                                                   sent, match, reg.pattern))
                                        _perf_patterns[reg.pattern][0] += 1
                                    else:
                                        FP += 1
                                        f.write(
                                            "{}\tFP\te{}\ti{}\t{}\t{}\t{}\n".
                                            format(did, _e_result, _i_result,
                                                   sent, match, reg.pattern))
                                        _perf_patterns[reg.pattern][1] += 1

                                    if _perf_patterns[reg.pattern][1] > 0:
                                        _perf_patterns[
                                            reg.pattern][2] = _perf_patterns[
                                                reg.
                                                pattern][0] / _perf_patterns[
                                                    reg.pattern][1]
                        if time.time() - _lasttime > 1:
                            print(i)
                    sent_offset += 2 + sent_len
                part_offset += sent_offset
            _progress += doc.get_size() / _avg_chars_per_doc
            _time_progressed = time.time() - _timestart
            _time_per_doc = _time_progressed / _progress
            _time_req_time = _time_per_doc * _length
            _time_eta = _time_req_time - _time_progressed
            print("PROGRESS: {:.3%} PROGRESS: {:.2f} secs ETA: {:.2f} secs".
                  format(_progress / _length, _time_progressed, _time_eta))
            if TP + FP > 0:
                print(
                    'STATS: TP:{}, FP:{}, TP+FP:{} %containingNLmentions:{:.4%}'
                    .format(TP, FP, TP + FP, TP / (TP + FP)))

    print("Exclusive Definer:", _e_array)
    print("Inclusive Definer:", _i_array)

    for key, value in _perf_patterns.items():
        if value[2] != -1:
            print(value, key)
Example No. 8
    def filter(self, documents, min_found=1, use_nala=False):
        """
        :type documents: collections.Iterable[(str, nalaf.structures.data.Document)]
        """

        _progress = 1
        _start_time = time.time()
        _total_time = 0

        _time_avg_per_pattern = 0
        _pattern_calls = 0
        _time_reg_pattern_total = 0
        _time_max_pattern = 0
        _low_performant_pattern = ""

        # NLDefiners init
        exclusive_definer = ExclusiveNLDefiner()
        _e_array = [0, 0, 0]
        inclusive_definer = InclusiveNLDefiner()
        _i_array = [0, 0]

        last_found = 0
        crf = PyCRFSuite(self.location_binary_model)

        # counter_to_stop_for_caching = 0

        for pmid, doc in documents:
            # if any part of the document contains any of the keywords
            # yield that document

            # if counter_to_stop_for_caching > 400:
            #     break
            # counter_to_stop_for_caching += 1
            # print(counter_to_stop_for_caching)

            part_offset = 0
            data_tmp = Dataset()
            data_tmp.documents[pmid] = doc
            data_nala = deepcopy(data_tmp)
            NLTKSplitter().split(data_tmp)
            # data_tmvar = TmVarTagger().generate_abstracts([pmid])
            if use_nala:
                self.pipeline.execute(data_nala)
                self.labeler.label(data_nala)
                crf.tag(data_nala, MUT_CLASS_ID)
                PostProcessing().process(data_nala)
                ExclusiveNLDefiner().define(data_nala)

            used_regexs = {}

            positive_sentences = 0
            for i, x in enumerate(doc.parts):
                # print("Part", i)
                sent_offset = 0
                cur_part = doc.parts.get(x)
                sentences = cur_part.sentences_

                for sent in sentences:
                    sent_length = len(sent)
                    new_text = sent.lower()
                    new_text = re.sub('[\./\\-(){}\[\],%]', ' ', new_text)
                    # new_text = re.sub('\W+', ' ', new_text)

                    found_in_sentence = False

                    for i, reg in enumerate(self.patterns):
                        _lasttime = time.time()  # time start var
                        match = reg.search(new_text)

                        # debug bottleneck patterns
                        _time_current_reg = time.time(
                        ) - _lasttime  # time end var
                        _pattern_calls += 1  # pattern calls already occured
                        _time_reg_pattern_total += _time_current_reg  # total time spent on searching with patterns
                        if _time_reg_pattern_total > 0:
                            _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg spent time per pattern call
                        # todo create pattern performance eval for descending amount of recognized patterns
                        # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                        #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                        # if _time_max_pattern < _time_current_reg:
                        #     _time_max_pattern = _time_current_reg
                        #     _low_performant_pattern = reg.pattern
                        #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                        # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                        #     if _time_current_reg > _time_avg_per_pattern * 10:
                        #         # print(_time_avg_per_pattern, _time_current_reg)
                        #         f.write("BAD_PATTERN\n")
                        #         f.write(sent + "\n")
                        #         f.write(new_text + "\n")
                        if match:
                            # if pmid in data_tmvar.documents:
                            #     anti_doc = data_tmvar.documents.get(pmid)
                            nala_doc = data_nala.documents.get(pmid)

                            start = part_offset + sent_offset + match.span()[0]
                            end = part_offset + sent_offset + match.span()[1]
                            # print("TmVar is not overlapping?:", not anti_doc.overlaps_with_mention(start, end))
                            # print(not nala_doc.overlaps_with_mention(start, end, annotated=False))

                            if reg.pattern in used_regexs:
                                used_regexs[reg.pattern] += 1
                            else:
                                used_regexs[reg.pattern] = 1
                            print(color.PURPLE + new_text.replace(
                                match.group(), color.BOLD + color.DARKCYAN +
                                color.UNDERLINE + match.group() + color.END +
                                color.PURPLE) + color.END)
                            if not found_in_sentence:
                                positive_sentences += 1
                                found_in_sentence = True
                                # if not anti_doc.overlaps_with_mention(start,
                                #                                       end) \
                                #         and not nala_doc.overlaps_with_mention(start, end, annotated=False):
                                #     _e_result = exclusive_definer.define_string(
                                #         new_text[match.span()[0]:match.span()[1]])
                                #     _e_array[_e_result] += 1
                                #     _i_result = inclusive_definer.define_string(
                                #         new_text[match.span()[0]:match.span()[1]])
                                #     _i_array[_i_result] += 1
                                # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                                # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))

                                # last_found += 1
                                # found_in_sentence = True
                                # else:
                                #     # if nala not used only tmvar considered
                                #     if not anti_doc.overlaps_with_mention(start, end):
                                #         _e_result = exclusive_definer.define_string(
                                #             new_text[match.span()[0]:match.span()[1]])
                                #         _e_array[_e_result] += 1
                                #         _i_result = inclusive_definer.define_string(
                                #             new_text[match.span()[0]:match.span()[1]])
                                #         _i_array[_i_result] += 1
                                #         # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                                #         # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                                #         last_found += 1
                                #         found_in_sentence = True

                            if use_nala:
                                nala_found_mention = nala_doc.overlaps_with_mention(
                                    start, end, annotated=False)
                                if nala_found_mention:
                                    print_verbose(nala_found_mention)
                                    if nala_found_mention.subclass > 0 and nala_found_mention.confidence <= self.threshold:
                                        yield pmid, doc

                        if time.time() - _lasttime > 1:
                            print_verbose('time intensive regex', i)
                    sent_offset += 2 + sent_length

                    # for per sentence positives
                    if found_in_sentence:
                        positive_sentences += 1

                part_offset += sent_offset
            if use_nala:
                for part in nala_doc:
                    for ann in part.predicted_annotations:
                        if ann.subclass > 0:
                            print_verbose(part.text[:ann.offset] + color.BOLD +
                                          ann.text + color.END +
                                          part.text[ann.offset +
                                                    len(ann.text):])
                            positive_sentences += min_found
            _old_time = _start_time
            _start_time = time.time()
            _one_time = _start_time - _old_time

            if _one_time > 0.3 and positive_sentences > min_found:
                _progress += 1
                _total_time += _one_time

            _time_per_doc = _total_time / _progress
            print_verbose(
                "PROGRESS: {:.2f} secs ETA per one positive document:"
                " {:.2f} secs".format(_total_time, _time_per_doc))
            print_debug('used regular expressions:',
                        json.dumps(used_regexs, indent=4))
            if positive_sentences >= min_found:
                last_found = 0
                print_verbose('YEP', pmid)
                yield pmid, doc
            else:
                print_verbose('NOPE', pmid)
Example No. 9
    folderName = sys.argv[3]

except:
    corpus = StringReader(corpusName).read()
    folderName = None  # just print out in standard output

# ------------------------------------------------------------------------------

# Example calls:
# python scripts/SETH.py SETH nala_test resources/predictions/  # predict
# python scripts/SETH.py check_performance nala_test resources/predictions/SETH/nala_test &> resources/predictions/SETH/nala_test/oresults.tsv  # evaluate

if (methodName == 'check_performance'):
    # folderName is assumed to be the final/leaf predictions folder, e.g., `resources/predictions/SETH/nala_test`
    BRATPartsAnnotationReader(folderName, is_predicted=True).annotate(corpus)
    ExclusiveNLDefiner().define(corpus)
    evaluation = MentionLevelEvaluator(subclass_analysis=True).evaluate(corpus)
    print(evaluation)

else:
    if folderName:
        # folderName is assumed to be the root predictions folder, e.g., `resources/predictions/`
        folderName = os.path.join(folderName, methodName, corpusName)
        if not os.path.exists(folderName):
            os.makedirs(folderName)

    useMutationFinderOnly = "true" if methodName == "MFmodified" else "false"

    run_set_server(useMutationFinderOnly)

    run_seth_on_corpus(corpus, folderName, useMutationFinderOnly)
Example No. 10
parser.add_argument('--counttokens',
                    help='Count the tokens. Note, this is considerably slower',
                    action='store_true')

args = parser.parse_args()

if args.corpora[0] == "*" or args.corpora[0] == 'all':
    args.corpora = ALL_CORPORA

if args.listanns == '*' or args.listanns == 'all':
    args.listanns = '0,1,2'
args.listanns = set(int(c) for c in args.listanns.split(",") if c)

# ------------------------------------------------------------------------------

nldefiner = ExclusiveNLDefiner()

pipeline = PrepareDatasetPipeline(feature_generators=[])

ST = 0  # Standard
NL = 1  # Natural Language
SST = 2  # Semi-Standard -- also often denoted before as 'SS'

MARKER = ['        ', '@@@@@@@@', '********']

PROB = "{0:.3f}"  # FORMAT

# ------------------------------------------------------------------------------


def get_corpus_type(name):
Example No. 11
    def setUpClass(self):
        self.definer = ExclusiveNLDefiner()
Example No. 12
class TestExclusiveNLDefiner(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.definer = ExclusiveNLDefiner()

    def test_on_empty_string(self):
        try:
            self.definer.define_string("")
        except Exception:
            self.fail(
                "empty string result is undefined but should not throw an exception"
            )

    def test_define_string(self):
        f = self.definer.define_string
        testEqual = self.assertEqual

        testEqual(0, f("rs206437"))  # rsid
        testEqual(0, f("ss469415642"))  # ssid

        testEqual(2, f("C226 to T"))
        testEqual(2, f("G446 to A"))
        testEqual(2, f("C821 to T"))
        testEqual(2, f("Arg76 to Trp"))
        testEqual(2, f("Arg149 to Gln"))
        testEqual(2, f("Pro274 to Leu"))
        testEqual(2, f("T320 to C"))
        testEqual(2, f("Leu107 to Pro"))
        testEqual(2, f("C631 to T"))
        testEqual(2, f("Arg211 to Cys"))
        testEqual(2, f("Ala215 to Thr"))
        testEqual(1, f("deletion of its cytoplasmic tail"))
        testEqual(1, f("nonsense mutation Q3X"))
        testEqual(0, f("R142Q"))
        testEqual(1, f("G-->A transition of a CpG dinucleotide"))
        testEqual(1, f("A C-->T transition of the same CpG"))
        testEqual(0, f("R142X"))
        testEqual(0, f("R142X"))
        testEqual(0, f("R142Q"))
        testEqual(1, f("replacement of this CpG hotspot by CpA"))
        testEqual(0, f("R142X"))
        testEqual(1, f("caused skipping of the exon"))
        testEqual(1, f("Absence of exon 5"))
        testEqual(0, f("Asp8Asn"))
        testEqual(1, f("G to A transition at nt22"))
        testEqual(1, f("asparagine for aspartic acid at codon 8"))
        testEqual(0, f("Asp8Asn"))
        testEqual(
            1,
            f("substitution of neutral asparagine for anionic aspartic acid"))
        testEqual(1, f("G to A transition is at a CpG dinucleotide"))
        testEqual(1,
                  f("codon CAA encoding glutamine-2153 to UAA, a stop codon"))
        testEqual(
            1,
            f("attaching an epitope tag sequence to the C terminus of the editing protein"
              ))
        testEqual(0, f("H15D"))
        testEqual(0, f("A83D"))
        testEqual(0, f("A179D"))
        testEqual(0, f("573 + IG-->A"))
        testEqual(0, f("H15D"))
        testEqual(0, f("A83D"))
        testEqual(0, f("A179D"))
        testEqual(1, f("skipping of exon 5"))
        testEqual(0, f("H15D"))
        testEqual(
            1,
            f("Replacement of these small hydrophobic Ala residues with the charged, more bulky Asp side chain"
              ))
        testEqual(0, f("G20R"))
        testEqual(1, f("G to A transition at a CpG"))
        testEqual(1, f("glycine to arginine substitution at codon 20"))
        testEqual(0, f("26delA"))

        testEqual(0, f("delPhe1388"))
        testEqual(1, f("deleted C1 domain"))

        testEqual(0, f("Q115P"))
        testEqual(0, f("g.3912G>C"))
        testEqual(0, f("c.925delA"))
        testEqual(0, f("c.388+3insT"))

        testEqual(0, f("3992-9g-->a"))
        testEqual(2, f("3992-9g-->a mutation"))
        testEqual(2, f("G643 to A"))
        testEqual(2, f("leucine for arginine 90"))

        testEqual(1, f("deletion of aa 527-534"))
        testEqual(
            1, f("deletion of 10 and 8 residues from the N- and C-terminals"))
        testEqual(1, f("143 from alanine to glycine"))
        testEqual(
            1,
            f("alterations of amino acid residue 143 from alanine to glycine"))

        testEqual(1, f("trinucleotide deletion"))

        testEqual(1, f("arginine-141 to serine substitution"))
        testEqual(1, f("mutations at Arg885"))
        testEqual(1, f("point mutation at Cys93"))
        testEqual(1, f("heterozygous missense 3035G>T"))
        testEqual(2, f("synonymous 696T>C"))
        testEqual(2, f("missense Glu285Ala"))
        testEqual(1, f("somatic 16-bp deletion"))
        testEqual(1, f("serine 749 is phosphorylated"))
        testEqual(1, f("Ser58 to Glu substitution"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(0, f("GAT-->GTT, Asp-->Val"))
        testEqual(2, f("codon 98 GAT-->GTT, Asp-->Val"))
        testEqual(2, f("codon 92, TAC-->TAT"))
        testEqual(
            1,
            f("arginine-127 into glutamine and arginine-469 into tryptophan"))
        testEqual(2, f("arginine-127 into glutamine"))
        testEqual(2, f("arginine-469 into tryptophan"))

        testEqual(0, f("TP73Δex2/3"))
        testEqual(1, f("abrogated loss of Chr19"))

        # More difficult

        testEqual(2, f("chromothripsis"))
        testEqual(2, f("Morpholino knockdown"))
        testEqual(2, f("methionine replaces lysine 27"))
        testEqual(2, f("lysine(27)-to-methionine"))

        testEqual(1, f("C-tail displacement"))
        testEqual(1, f("22q11 deletion syndrome"))
        testEqual(1, f("hippocampal neuron L1 insertions"))
        testEqual(1, f("copy-number variants"))
Example No. 13
class PostProcessing:
    def __init__(self,
                 keep_silent=True,
                 keep_genetic_markers=True,
                 keep_unnumbered=True,
                 keep_rs_ids=True):

        amino_acids = [
            'alanine', 'ala', 'arginine', 'arg', 'asparagine', 'asn',
            'aspartic acid', 'aspartate', 'asp', 'cysteine', 'cys',
            'glutamine', 'gln', 'glutamic acid', 'glutamate', 'glu', 'glycine',
            'gly', 'histidine', 'his', 'isoleucine', 'ile', 'leucine', 'leu',
            'lysine', 'lys', 'methionine', 'met', 'phenylalanine', 'phe',
            'proline', 'pro', 'serine', 'ser', 'threonine', 'thr',
            'tryptophan', 'trp', 'tyrosine', 'tyr', 'valine', 'val',
            'aspartic acid', 'asparagine', 'asx', 'glutamine', 'glutamic acid',
            'glx'
        ]

        nucleotides = ['adenine', 'guanine', 'thymine', 'cytosine', 'uracil']

        keywords = [
            'substit\w*', 'lead\w*', 'exchang\w*', 'chang\w*', 'mutant\w*',
            'mutate\w*', 'devia\w*', 'modif\w*', 'alter\w*', 'switch\w*',
            'variat\w*', 'instead\w*', 'replac\w*', 'in place', 'convert\w*',
            'becom\w*'
        ]

        # AA = '|'.join(amino_acids)
        AA_NN = '|'.join(amino_acids + nucleotides)
        AA_LL = '|'.join(amino_acids + list('CISQMNPKDTFAGHLRWVEYX'))
        KK = '|'.join(keywords)

        genetic_marker_regex = re.compile(r'\bD\d+([A-Z]\d+)?S\d{2,}\b')
        rs_id_regex = re.compile(r'\b\[?rs\]? *\d{3,}(,\d+)*\b')
        ss_id_regex = re.compile(r'\b\[?ss\]? *\d{3,}(,\d+)*\b')

        self.patterns = [
            re.compile(
                '({SS})[- ]*[1-9][0-9]* +(in|to|into|for|of|by|with|at) +({SS})( *(,|,?or|,?and) +({SS}))*'
                .format(SS=AA_NN), re.IGNORECASE),
            re.compile(
                '({SS}) +(in|to|into|for|of|by|with|at) +({SS})[- ]*[1-9][0-9]*'
                '( *(,|,?or|,?and) +({SS})[- ]*[1-9][0-9]*)*'.format(SS=AA_NN),
                re.IGNORECASE),
            re.compile(
                '({SS})(( (({KK})) (in|to|into|for|of|by|with|at) (a|an|the|) '
                '*({SS})[1-9]\d*( *(,|or|and|, and|, or) ({SS})[1-9]\d*)*)'
                '|([- ]*[1-9]\d*( +((has|have|had) +been|is|are|was|were|) '
                '+(({KK})))? +(in|to|into|for|of|by|with|at) +({SS})( *(,|or|and|, and|, or) +({SS}))*))'
                .format(SS=AA_NN, KK=KK), re.IGNORECASE),
            re.compile(r'\bp\. *({SS}) *[-+]*\d+ *({SS})\b'.format(SS=AA_NN),
                       re.IGNORECASE),
            re.compile(
                r'\b({SS})[-to ]*[-+]*\d+[-to ]*({SS})\b'.format(SS=AA_NN),
                re.IGNORECASE),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX](/|-|-*>|→|-to-)[CISQMNPKDTFAGHLRWVEYX] *[-+]*\d+\b'
            ),
            re.compile(
                r'((?<!\w)[-+]*\d+:? *?)??[CISQMNPKDTFAGHLRWVEYX] *(/|-|-*>|→|-*to-*) *[CISQMNPKDTFAGHLRWVEYX]\b'
            ),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX]{3,}/-(?<!\w)'),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX] *\d{2,} *[CISQMNPKDTFAGHLRWVEYX]( *(/) *[CISQMNPKDTFAGHLRWVEYX])*\b'
            ), genetic_marker_regex, rs_id_regex, ss_id_regex,
            re.compile(
                r'\b(\d+-)?\d*[D|d]elta(\d{2,}|[CISQMNPKDTFAGHLRWVEYX])\b'),
            re.compile(r'\b(c\. *)?[ATCG] *([-+]|\d)\d+ *[ATCG]\b'),
            re.compile(r'\b(c\.|E(X|x)\d+) *([-+]|\d)\d+[ATCG] *> *[ATCG]\b'),
            re.compile(r'\b[ATCG][-+]*\d+[ATCG]/[ATCG]\b'),
            re.compile(
                r'(?<!\w)[-+]?\d+ *\d* *(b|bp|N|ntb|p|BP|B) *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'
            ),
            re.compile(
                r'(?<!\w)[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)[0-9CISQMNPKDTFAGHLRWVEYX]+\b'
            ),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX]+ *[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'
            ),
            re.compile(
                r'\b(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup) *(\d+(b|bp|N|ntb|p|BP|B)|[ATCG]{1,})\b'
            ),
            re.compile(
                r'(?<!\w)[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)[CISQMNPKDTFAGHLRWVEYX]+\b'
            ),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX]+ *[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'
            )
        ]

        self.negative_patterns = [
            # single AAs
            re.compile(r'^({SS}) *\d+$'.format(SS=AA_NN), re.IGNORECASE),
            re.compile(r'^[CISQMNPKDTFAGHLRWVEYX]+ *\d+$'),
            re.compile(r'^({SS})([-/>]({SS}))*$'.format(SS=AA_LL),
                       re.IGNORECASE),
            # just numbers
            re.compile(r'^[-+]?\d+([-+/ ]+\d+)*( *(b|bp|N|ntb|p|BP|B))?$')
        ]

        if not keep_genetic_markers:
            self.negative_patterns.append(genetic_marker_regex)

        if not keep_rs_ids:
            self.negative_patterns.append(rs_id_regex)
            self.negative_patterns.append(ss_id_regex)

        self.keep_unnumbered = keep_unnumbered

        self.at_least_one_letter_n_number_letter_n_number = re.compile(
            '(?=.*[A-Za-z])(?=.*[0-9])[A-Za-z0-9]+')
        self.keep_silent = keep_silent
        self.definer = ExclusiveNLDefiner()

    def process(self, dataset, class_id=MUT_CLASS_ID):
        for doc_id, doc in dataset.documents.items():
            for part_id, part in doc.parts.items():
                self.__fix_issues(part)
                for regex in self.patterns:
                    for match in regex.finditer(part.text):
                        start = match.start()
                        end = match.end()
                        matched_text = part.text[start:end]
                        ann = Entity(class_id, start, matched_text)

                        Entity.equality_operator = 'exact_or_overlapping'
                        if ann not in part.predicted_annotations:
                            part.predicted_annotations.append(
                                Entity(class_id, start, matched_text))
                        Entity.equality_operator = 'overlapping'
                        if ann in part.predicted_annotations:
                            for index, ann_b in enumerate(
                                    part.predicted_annotations):
                                if ann == ann_b and len(matched_text) > len(
                                        ann_b.text):
                                    part.predicted_annotations[index] = ann

                to_delete = [
                    index
                    for index, ann in enumerate(part.predicted_annotations)
                    if any(r.search(ann.text) for r in self.negative_patterns)
                    or (not self.keep_silent and self.__is_silent(ann)) or
                    (not self.keep_unnumbered and not self._is_numbered(ann))
                ]

                part.predicted_annotations = [
                    ann for index, ann in enumerate(part.predicted_annotations)
                    if index not in to_delete
                ]

        # sanity check, make sure annotations match their offset
        for part in dataset.parts():
            for ann in part.predicted_annotations:
                assert ann.text == part.text[ann.offset:ann.offset +
                                             len(ann.text)]
                while ann.text[0] == ' ':
                    ann.offset += 1
                    ann.text = ann.text[1:]
                while ann.text[-1] == ' ':
                    ann.text = ann.text[:-1]
                # assert ann.text == ann.text.strip(), ("'" + ann.text + "'")

    def __is_silent(self, ann):
        split = re.split('[^A-Za-z]+', ann.text)
        return len(split) == 2 and split[0] == split[1]

    def _is_numbered(self, ann):
        return any(c.isdigit()
                   for c in ann.text) or self.definer.define_string(
                       ann.text) == 1

    def __fix_issues(self, part):
        """
        :type part: nalaf.structures.data.Part
        """
        to_be_removed = []
        for index, ann in enumerate(part.predicted_annotations):
            start = ann.offset
            end = ann.offset + len(ann.text)

            # split multiple mentions
            split = re.split(r' *(?:\band\b|/|\\|,|;|\bor\b) *', ann.text)
            if len(split) > 1:
                # for each split part calculate the offsets and the constraints
                offset = 0
                split_info = []
                for text in split:
                    split_info.append(
                        (text, self.definer.define_string(text),
                         ann.text.find(text, offset),
                         self.at_least_one_letter_n_number_letter_n_number.
                         search(text)))
                    offset += len(text)

                split_parts = [
                    split_part for split_part in split_info
                    if split_part[0] != ''
                ]
                lens = [len(split_part[0]) for split_part in split_parts]
                patterns = [
                    re.sub(
                        r'\W+', '',
                        re.sub(r'[0-9]', '0', re.sub(r'[a-zA-Z]', 'a',
                                                     parts[0])))
                    for parts in split_parts
                ]

                # split only if all the non-empty parts are of standard class (ST, 0) and each
                # contains at least one letter and one number, or if the split parts all have
                # the same length or follow the same letter/digit pattern
                if all(split_part[1] == 0 and split_part[3]
                       for split_part in split_parts) or max(lens) == min(
                           lens) or len(set(patterns)) == 1:
                    to_be_removed.append(index)

                    # add each non-empty split part as its own predicted annotation
                    for split_text, split_class, split_offset, aonanl in split_info:
                        if split_text != '':
                            part.predicted_annotations.append(
                                Entity(ann.class_id, ann.offset + split_offset,
                                       split_text))

            # fix boundary, 1858C>T --> +1858C>T
            if re.search(r'^[0-9]', ann.text) and re.search(
                    r'([\-\+])', part.text[start - 1]):
                ann.offset -= 1
                ann.text = part.text[start - 1] + ann.text
                start -= 1

            # fix boundary delete (
            if ann.text[0] == '(' and ')' not in ann.text:
                ann.offset += 1
                ann.text = ann.text[1:]
                start += 1

            # fix boundary delete )
            if ann.text[-1] == ')' and '(' not in ann.text:
                ann.text = ann.text[:-1]

            # fix boundary add missing (
            if part.text[start - 1] == '(' and ')' in ann.text:
                ann.offset -= 1
                ann.text = '(' + ann.text
                start -= 1

            # fix boundary add missing )
            try:
                if part.text[end] == ')' and '(' in ann.text:
                    ann.text += ')'
            except IndexError:
                pass

            # fix boundary add missing number after fsX
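            # e.g. an annotation 'K618' directly followed by 'fsX25' in the text becomes 'K618fsX25'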
            try:
                found_missing_fsx = False
                if part.text[end:end + 2] == 'fs':
                    ann.text += 'fs'
                    end += 2
                    found_missing_fsx = True
                if ann.text.endswith('fs') and part.text[end] == 'X':
                    ann.text += 'X'
                    end += 1
                    found_missing_fsx = True
                if found_missing_fsx:
                    while part.text[end].isnumeric():
                        ann.text += part.text[end]
                        end += 1
            except IndexError:
                pass

            # fix boundary: add a missing 'c.', 'p.' (or 'rt') prefix before the mention
            try:
                if ann.text.startswith('.'):
                    if part.text[start - 1] in ('c', 'p'):
                        ann.offset -= 1
                        ann.text = part.text[start - 1] + ann.text
                        start -= 1
                elif part.text[start - 2:start] in ('c.', 'p.', 'rt'):
                    ann.offset -= 2
                    ann.text = part.text[start - 2:start] + ann.text
                    start -= 2
            except IndexError:
                pass

            # fix boundary add missing \d+ at the beginning
            if ann.text[0] == '-' or part.text[start - 1] == '-':
                tmp = start
                while tmp - 1 > 0 and (part.text[tmp - 1].isnumeric()
                                       or part.text[tmp - 1] == '-'):
                    tmp -= 1
                if part.text[tmp - 1] == ' ':
                    ann.offset = tmp
                    ann.text = part.text[ann.offset:start] + ann.text
                    start = tmp

            isword = re.compile(r'\w')

            # Extend the mention to the left over word characters until whitespace,
            # but not onto a preceding 'and'/'or' (e.g. the 'and' in 'and+2740 A>G' is not absorbed)
            if isword.search(ann.text[0]) and \
                (not (ann.offset >= 3 and part.text[ann.offset - 3: ann.offset] == "and"
                or (ann.offset >= 2 and part.text[ann.offset - 2: ann.offset] == "or"))):

                while ann.offset > 0 and isword.search(
                        part.text[ann.offset - 1]):
                    ann.text = part.text[ann.offset - 1] + ann.text
                    ann.offset -= 1

            veryend = len(part.text)
            end = ann.offset + len(ann.text)

            # Extend the mention to the right over word characters until whitespace
            while end < veryend and isword.search(part.text[end]):
                ann.text = ann.text + part.text[end]
                end += 1

            # Strip the surrounding parentheses when the mention is fully wrapped in a
            # single pair and contains no other parentheses, e.g. '(R432T)' -> 'R432T'
            if (ann.text[0] == '(' and ann.text[-1] == ')'
                    and ann.text.count('(') < 2 and ann.text.count(')') < 2):
                ann.offset += 1
                ann.text = ann.text[1:-1]

            # Split mentions of the form "description (abbreviation)" into two annotations,
            # following the convention of stating the gene-level mutation first and the
            # protein-level one in parentheses
            if ((ann.text[-1] == ')' or
                 (end < veryend and part.text[end] == ")"))
                    and ann.text[:-1].count('(') == 1):
                # Requirement 1: must be space to the left of (, not to match things like in Arg407(AGG) or IVS3(+1)
                p = re.compile("\\s+\\(")
                split = p.split(ann.text)
                if len(split) == 2:

                    # Requirement 2: both parts must contain a number (== position, they can stand alone)
                    def req2():
                        return any(c.isdigit() for c in split[0]) and any(
                            c.isdigit() for c in split[1])

                    # Other Reqs on left part
                    def req3():
                        return any(c.isalpha() for c in split[0].replace(
                            'and', '').replace('or', ''))

                    # Other Reqs on right part
                    def req4():
                        return any(c.isalpha() for c in split[1].replace(
                            'and', '').replace('or', ''))

                    if req2() and len(split[0]) > 2 and req3() and req4():
                        # Neg.: Arg407(AGG) - single amino acid substitution (Phe for Val) - nonsense mutation (286T)
                        # Neg.: deletion (229bp) -  nonsense mutation (glycine 568 to stop)
                        # Neg.: one insertion mutation (698insC) - AChR epsilon (CHRNE E376K)
                        # Neg. (other reqs): M1 (Val213) - 207 and 208 (207-HA)
                        # Neg. (other reqs): located 14 amino acids toward the amino-terminal end from the (682)
                        #
                        # Pos.: serine to arginine at the codon 113 (p. S113R)
                        # Pos.: mutagenesis of the initial ATG codon to ACG (Met 1 to Thr) - H2A at position 105 (Q105)
                        # Pos.: Trp replacing Gln in position 156 (A*2406) - A-1144-to-C transversion (K382Q)
                        # Pos: deletion of 123 bases (41 codons) - exon 12 (R432T)

                        ann1text = split[0]
                        to_be_removed.append(index)
                        part.predicted_annotations.append(
                            Entity(ann.class_id, ann.offset, ann1text))
                        ann2text = split[1] if ann.text[-1] != ')' else split[
                            1][:-1]
                        # the second part's offset skips the whitespace and '(' consumed by the split
                        ann2offset = ann.offset + len(ann1text) + (
                            len(ann.text) - sum(len(x) for x in split))
                        part.predicted_annotations.append(
                            Entity(ann.class_id, ann2offset, ann2text))

        part.predicted_annotations = [
            ann for index, ann in enumerate(part.predicted_annotations)
            if index not in to_be_removed
        ]
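A minimal standalone sketch of the coordinated-mention split performed in __fix_issues above, assuming only the standard library: the helper name split_mention, the sample text and the offset 120 are invented for illustration, and the guards the real code applies before splitting (same definer class, equal lengths, or matching letter/digit patterns) are omitted here.

import re

# Same coordinating-token splitter used in __fix_issues above.
SPLITTER = re.compile(r' *(?:\band\b|/|\\|,|;|\bor\b) *')


def split_mention(text, offset):
    """Yield (sub_text, sub_offset) pairs for a possibly coordinated mention.

    `offset` is the mention's start offset within its part, so the yielded
    offsets are part-level offsets, as in the post-processor above.
    """
    search_from = 0
    for sub_text in SPLITTER.split(text):
        if sub_text == '':
            continue
        sub_start = text.find(sub_text, search_from)
        search_from = sub_start + len(sub_text)
        yield sub_text, offset + sub_start


# A coordinated mention that starts at offset 120 within its part:
print(list(split_mention('E6V/E7K and E8A', 120)))
# -> [('E6V', 120), ('E7K', 124), ('E8A', 132)]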
Ejemplo n.º 14
0
    def __init__(self,
                 keep_silent=True,
                 keep_genetic_markers=True,
                 keep_unnumbered=True,
                 keep_rs_ids=True):

        amino_acids = [
            'alanine', 'ala', 'arginine', 'arg', 'asparagine', 'asn',
            'aspartic acid', 'aspartate', 'asp', 'cysteine', 'cys',
            'glutamine', 'gln', 'glutamic acid', 'glutamate', 'glu', 'glycine',
            'gly', 'histidine', 'his', 'isoleucine', 'ile', 'leucine', 'leu',
            'lysine', 'lys', 'methionine', 'met', 'phenylalanine', 'phe',
            'proline', 'pro', 'serine', 'ser', 'threonine', 'thr',
            'tryptophan', 'trp', 'tyrosine', 'tyr', 'valine', 'val',
            'aspartic acid', 'asparagine', 'asx', 'glutamine', 'glutamic acid',
            'glx'
        ]

        nucleotides = ['adenine', 'guanine', 'thymine', 'cytosine', 'uracil']

        keywords = [
            'substit\w*', 'lead\w*', 'exchang\w*', 'chang\w*', 'mutant\w*',
            'mutate\w*', 'devia\w*', 'modif\w*', 'alter\w*', 'switch\w*',
            'variat\w*', 'instead\w*', 'replac\w*', 'in place', 'convert\w*',
            'becom\w*'
        ]

        # AA = '|'.join(amino_acids)
        AA_NN = '|'.join(amino_acids + nucleotides)
        AA_LL = '|'.join(amino_acids + list('CISQMNPKDTFAGHLRWVEYX'))
        KK = '|'.join(keywords)

        genetic_marker_regex = re.compile(r'\bD\d+([A-Z]\d+)?S\d{2,}\b')
        rs_id_regex = re.compile(r'\b\[?rs\]? *\d{3,}(,\d+)*\b')
        ss_id_regex = re.compile(r'\b\[?ss\]? *\d{3,}(,\d+)*\b')
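        # Genetic markers (e.g. D17S250) and dbSNP-style rs/ss identifiers get dedicated
        # patterns so they can also be filtered out below when the corresponding
        # keep_genetic_markers / keep_rs_ids flags are False.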

        self.patterns = [
            re.compile(
                '({SS})[- ]*[1-9][0-9]* +(in|to|into|for|of|by|with|at) +({SS})( *(,|,?or|,?and) +({SS}))*'
                .format(SS=AA_NN), re.IGNORECASE),
            re.compile(
                '({SS}) +(in|to|into|for|of|by|with|at) +({SS})[- ]*[1-9][0-9]*'
                '( *(,|,?or|,?and) +({SS})[- ]*[1-9][0-9]*)*'.format(SS=AA_NN),
                re.IGNORECASE),
            re.compile(
                r'({SS})(( (({KK})) (in|to|into|for|of|by|with|at) (a|an|the|) '
                r'*({SS})[1-9]\d*( *(,|or|and|, and|, or) ({SS})[1-9]\d*)*)'
                r'|([- ]*[1-9]\d*( +((has|have|had) +been|is|are|was|were|) '
                r'+(({KK})))? +(in|to|into|for|of|by|with|at) +({SS})( *(,|or|and|, and|, or) +({SS}))*))'
                .format(SS=AA_NN, KK=KK), re.IGNORECASE),
            re.compile(r'\bp\. *({SS}) *[-+]*\d+ *({SS})\b'.format(SS=AA_NN),
                       re.IGNORECASE),
            re.compile(
                r'\b({SS})[-to ]*[-+]*\d+[-to ]*({SS})\b'.format(SS=AA_NN),
                re.IGNORECASE),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX](/|-|-*>|→|-to-)[CISQMNPKDTFAGHLRWVEYX] *[-+]*\d+\b'
            ),
            re.compile(
                r'((?<!\w)[-+]*\d+:? *?)??[CISQMNPKDTFAGHLRWVEYX] *(/|-|-*>|→|-*to-*) *[CISQMNPKDTFAGHLRWVEYX]\b'
            ),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX]{3,}/-(?<!\w)'),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX] *\d{2,} *[CISQMNPKDTFAGHLRWVEYX]( *(/) *[CISQMNPKDTFAGHLRWVEYX])*\b'
            ), genetic_marker_regex, rs_id_regex, ss_id_regex,
            re.compile(
                r'\b(\d+-)?\d*[D|d]elta(\d{2,}|[CISQMNPKDTFAGHLRWVEYX])\b'),
            re.compile(r'\b(c\. *)?[ATCG] *([-+]|\d)\d+ *[ATCG]\b'),
            re.compile(r'\b(c\.|E(X|x)\d+) *([-+]|\d)\d+[ATCG] *> *[ATCG]\b'),
            re.compile(r'\b[ATCG][-+]*\d+[ATCG]/[ATCG]\b'),
            re.compile(
                r'(?<!\w)[-+]?\d+ *\d* *(b|bp|N|ntb|p|BP|B) *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'
            ),
            re.compile(
                r'(?<!\w)[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)[0-9CISQMNPKDTFAGHLRWVEYX]+\b'
            ),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX]+ *[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'
            ),
            re.compile(
                r'\b(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup) *(\d+(b|bp|N|ntb|p|BP|B)|[ATCG]{1,})\b'
            ),
            re.compile(
                r'(?<!\w)[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)[CISQMNPKDTFAGHLRWVEYX]+\b'
            ),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX]+ *[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'
            )
        ]

        self.negative_patterns = [
            # single AAs
            re.compile(r'^({SS}) *\d+$'.format(SS=AA_NN), re.IGNORECASE),
            re.compile(r'^[CISQMNPKDTFAGHLRWVEYX]+ *\d+$'),
            re.compile(r'^({SS})([-/>]({SS}))*$'.format(SS=AA_LL),
                       re.IGNORECASE),
            # just numbers
            re.compile(r'^[-+]?\d+([-+/ ]+\d+)*( *(b|bp|N|ntb|p|BP|B))?$')
        ]

        if not keep_genetic_markers:
            self.negative_patterns.append(genetic_marker_regex)

        if not keep_rs_ids:
            self.negative_patterns.append(rs_id_regex)
            self.negative_patterns.append(ss_id_regex)

        self.keep_unnumbered = keep_unnumbered

        self.at_least_one_letter_n_number_letter_n_number = re.compile(
            '(?=.*[A-Za-z])(?=.*[0-9])[A-Za-z0-9]+')
        self.keep_silent = keep_silent
        self.definer = ExclusiveNLDefiner()
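
A quick, self-contained smoke test of the patterns defined above: two positive patterns and the "just numbers" negative pattern are copied verbatim and run over an invented sample sentence. The variable names and the sentence are illustrative only; the real post-processor additionally applies the boundary fixes and the silent/unnumbered filters shown in the previous snippet.

import re

# Two positive patterns and one negative pattern copied verbatim from __init__ above.
protein_sub = re.compile(
    r'\b[CISQMNPKDTFAGHLRWVEYX] *\d{2,} *[CISQMNPKDTFAGHLRWVEYX]( *(/) *[CISQMNPKDTFAGHLRWVEYX])*\b')
dna_sub = re.compile(r'\b(c\.|E(X|x)\d+) *([-+]|\d)\d+[ATCG] *> *[ATCG]\b')
just_numbers = re.compile(r'^[-+]?\d+([-+/ ]+\d+)*( *(b|bp|N|ntb|p|BP|B))?$')

sentence = 'The T315I substitution and the c.35G>T change were both reported.'

for regex in (protein_sub, dna_sub):
    for match in regex.finditer(sentence):
        candidate = match.group()
        # A candidate is dropped if a negative pattern matches it
        # (the negative patterns are anchored, so search() amounts to a full match).
        kept = not just_numbers.search(candidate)
        print(repr(candidate), 'kept' if kept else 'dropped')
# -> 'T315I' kept
# -> 'c.35G>T' kept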