def benchmark_nala(member1, member2):
    itrs = []

    # Read the IAA iterations in blocks so that the plain documents are not deleted with the AnnJsonAnnotationReader's
    for itr in IterationRound.all():
        if itr.is_IAA():
            dataset = itr.read(read_annotations=False)

            AnnJsonAnnotationReader(
                os.path.join(itr.path, "reviewed", member1),
                read_only_class_id=MUT_CLASS_ID,
                delete_incomplete_docs=False).annotate(dataset)

            AnnJsonAnnotationReader(
                os.path.join(itr.path, "reviewed", member2),
                read_only_class_id=MUT_CLASS_ID,
                delete_incomplete_docs=False,
                is_predicted=True).annotate(dataset)

            itrs.append(dataset)
            dataset = None

    # Then merge the IAA iterations
    all_itrs_dataset = Dataset()
    for itr_dataset in itrs:
        all_itrs_dataset.extend_dataset(itr_dataset)

    ExclusiveNLDefiner().define(all_itrs_dataset)

    return (all_itrs_dataset,
            MentionLevelEvaluator(subclass_analysis=True).evaluate(all_itrs_dataset))
def filter(self, documents):
    pycrf = PyCRFSuite(self.binary_model)

    for pmid, doc in documents:
        dataset = Dataset()
        dataset.documents[pmid] = doc

        self.pipeline.execute(dataset)
        self.labeler.label(dataset)
        pycrf.tag(dataset, MUT_CLASS_ID)
        PostProcessing().process(dataset)
        ExclusiveNLDefiner().define(dataset)

        total_nl_mentions = []
        for part in doc:
            # print(part.annotations)
            print_verbose('predicted_annotations:', part.predicted_annotations)

            nl_mentions = [
                (ann.text, ann.subclass, ann.confidence)
                for ann in part.predicted_annotations
                if ann.subclass != 0 and ann.confidence <= self.threshold
            ]
            total_nl_mentions += nl_mentions

        if any(total_nl_mentions):
            print('nl mentions', json.dumps(total_nl_mentions, indent=4))
            yield pmid, doc

    print_verbose('nothing found')
def evaluate():
    from nalaf.utils.annotation_readers import AnnJsonAnnotationReader

    size_before = len(data)
    AnnJsonAnnotationReader(
        os.path.join(folder_name, "annjson"),
        is_predicted=True,
        delete_incomplete_docs=False).annotate(data)
    assert size_before == len(data)

    ExclusiveNLDefiner().define(data)
    e = MentionLevelEvaluator(subclass_analysis=True).evaluate(data)
    print(e)
def find_number_of_documents():
    data = read_data(39, read_base=False)
    train, test = data.stratified_split()
    del data
    del train

    pipeline = get_prepare_pipeline_for_best_model()
    pipeline.execute(test)
    BIEOLabeler().label(test)
    PyCRFSuite().tag(test, 'idp4_model')
    PostProcessing().process(test)
    ExclusiveNLDefiner().define(test)

    # random.sample needs a sequence, so materialize the document ids
    keys = list(test.documents.keys())
    for test_size in range(30, 101, 10):
        sample = Dataset()
        random_keys = random.sample(keys, test_size)
        sample.documents = {key: test.documents[key] for key in random_keys}

        print('============== {} =============='.format(test_size))
        calculate_standard_error(sample)
def benchmark_IDP4(member1, member2):
    itr = IterationRound(0)
    IDP4_corpus = itr.read(read_annotations=False)

    IAA_IDP4_corpus = Dataset()
    for docid, document in IDP4_corpus.documents.items():
        if docid in IDP4_IAA_docs:
            IAA_IDP4_corpus.documents[docid] = document

    AnnJsonAnnotationReader(
        os.path.join(itr.path, "base", "annjson", "members", member1),
        read_only_class_id=MUT_CLASS_ID,
        delete_incomplete_docs=True).annotate(IAA_IDP4_corpus)

    AnnJsonAnnotationReader(
        os.path.join(itr.path, "base", "annjson", "members", member2),
        read_only_class_id=MUT_CLASS_ID,
        delete_incomplete_docs=True,
        is_predicted=True).annotate(IAA_IDP4_corpus)

    ExclusiveNLDefiner().define(IAA_IDP4_corpus)

    return (IAA_IDP4_corpus,
            MentionLevelEvaluator(subclass_analysis=True).evaluate(IAA_IDP4_corpus))
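# A minimal, hypothetical driver for the two IAA benchmark helpers above; the
# reviewer names 'member1'/'member2' are placeholders, not real annotator ids.
# Both helpers return the corpus together with the MentionLevelEvaluator result.
idp4_corpus, idp4_evaluation = benchmark_IDP4('member1', 'member2')
print(idp4_evaluation)

nala_corpus, nala_evaluation = benchmark_nala('member1', 'member2')
print(nala_evaluation)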
def train(argv):
    parser = argparse.ArgumentParser(description='Train model')

    parser.add_argument('--training_corpus',
                        help='Name of the corpus to train on. Ex: nala_training, IDP4+_training, nala_training_5')
    parser.add_argument('--test_corpus', help='Name of the corpus to test on')
    parser.add_argument('--string', help='String to tag')

    parser.add_argument('--validation', required=False, default="stratified",
                        choices=["cross-validation", "stratified", "none"],
                        help='Type of validation to use when training')
    parser.add_argument('--cv_n', required=False,
                        help='if given, cross validation (instead of stratification) is used for validating the training. '
                             'In this case you must also set `cv_fold` and only that fold number will be run')
    parser.add_argument('--cv_fold', required=False,
                        help='fold number if cross validation is activated (it starts at 0; i.e. for cv_n=5, you have folds: [0,1,2,3,4] )')

    parser.add_argument('--output_folder', required=False,
                        help='Folder where the training model is written to. Otherwise a tmp folder is used')
    parser.add_argument('--model_name_suffix', default='', required=False,
                        help='Optional suffix to add to the generated model name in training mode')
    parser.add_argument('--write_anndoc', required=False, default=False, action='store_true',
                        help='Write anndoc of predicted test_corpus (validation corpus in fact)')
    parser.add_argument('--model_path_1', required=False,
                        help='Path of the first model binary file if evaluation is performed')
    parser.add_argument('--model_path_2', required=False,
                        help='Path of the second model binary file if evaluation is performed with two models')

    parser.add_argument('--labeler', required=False, default="BIEO", choices=["BIEO", "BIO", "IO", "11labels"],
                        help='Labeler to use for training')

    parser.add_argument('--mutations_specific', default='True',
                        help='Apply feature pipelines specific to mutations or otherwise (false) use general one')

    parser.add_argument('--only_class_id', required=False, default=MUT_CLASS_ID,
                        help="By default, only the mutation entities are read from corpora (assumed to have class_id == '"
                             + MUT_CLASS_ID + "'). Set this class_id to filter rest out")

    parser.add_argument('--delete_subclasses', required=False, default="",
                        help='Comma-separated subclasses to delete. Example: "2,3"')

    parser.add_argument('--pruner', required=False, default="parts", choices=["parts", "sentences"])
    parser.add_argument('--ps_ST', required=False, default=False, action='store_true')
    parser.add_argument('--ps_NL', required=False, default=False, action='store_true')
    parser.add_argument('--ps_random', required=False, default=0.0, type=float)

    parser.add_argument('--elastic_net', action='store_true', help='Use elastic net regularization')

    parser.add_argument('--word_embeddings', '--we', default='True', help='Use word embeddings features')
    parser.add_argument('--we_additive', type=float, default=0)
    parser.add_argument('--we_multiplicative', type=float, default=1)
    parser.add_argument('--we_model_location', type=str, default=None)

    parser.add_argument('--use_feat_windows', default='True')

    parser.add_argument('--nl', action='store_true', help='Use NLMentionFeatureGenerator')
    parser.add_argument('--nl_threshold', type=int, default=0)
    parser.add_argument('--nl_window', action='store_true', help='use window feature for NLFeatureGenerator')

    parser.add_argument('--execute_pp', default='True',
                        help='Execute post processing specific to mutations (default) or not')
    parser.add_argument('--keep_silent', default='True',
                        help='Keep silent mutations (default) or not, i.e., delete mentions like `Cys23-Cys`')
    parser.add_argument('--keep_genetic_markers', default='True',
                        help='Keep genetic markers of the form D17S250, true (default) or false')
    parser.add_argument('--keep_unnumbered', default='True',
                        help='Keep unnumbered mentions (default) or not, i.e., delete mentions like `C -> T`')
    parser.add_argument('--keep_rs_ids', default='True',
                        help='Keep rs ids (default) or not, i.e., delete mentions like `rs1801280` or `ss221`')

    parser.add_argument('--dictionaries_paths', default=None,
                        help='Dictionary paths to use for dictionary features. Can be used within hdfs')
    parser.add_argument('--dictionaries_stop_words', default=None,
                        help='Stop words for dictionaries if these are used')

    parser.add_argument('--hdfs_url', required=False, default=None, type=str,
                        help='URL of hdfs if this is used')
    parser.add_argument('--hdfs_user', required=False, default=None, type=str,
                        help="user of hdfs if this used. Must be given if `hdfs_url` is given")

    FALSE = ['false', 'f', '0', 'no', 'none']

    def arg_bool(arg_value):
        return False if arg_value.lower() in FALSE else True

    args = parser.parse_args(argv)

    start_time = time.time()

    # ------------------------------------------------------------------------------

    delete_subclasses = []
    for c in args.delete_subclasses.split(","):
        c = c.strip()
        if c:
            delete_subclasses.append(int(c))

    args.delete_subclasses = delete_subclasses

    if not args.output_folder:
        args.output_folder = tempfile.mkdtemp()

    str_delete_subclasses = "None" if not args.delete_subclasses else str(args.delete_subclasses).strip('[]').replace(' ', '')

    if args.labeler == "BIEO":
        labeler = BIEOLabeler()
    elif args.labeler == "BIO":
        labeler = BIOLabeler()
    elif args.labeler == "IO":
        labeler = IOLabeler()
    elif args.labeler == "11labels":
        labeler = TmVarLabeler()

    args.word_embeddings = arg_bool(args.word_embeddings)

    if args.word_embeddings:
        args.we_params = {
            'additive': args.we_additive,
            'multiplicative': args.we_multiplicative,
            'location': args.we_model_location
        }
    else:
        args.we_params = {}  # means: do not use we

    if args.nl:
        args.nl_features = {
            'threshold': args.nl_threshold,  # threshold for neighbour space in dictionaries
            'window': args.nl_window,
        }
    else:
        args.nl_features = None

    if args.elastic_net:
        args.crf_train_params = {
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
        }
    else:
        args.crf_train_params = None

    args.use_feat_windows = False if args.use_feat_windows.lower() in FALSE else True
    args.mutations_specific = False if args.mutations_specific.lower() in FALSE else True
    args.execute_pp = False if args.execute_pp.lower() in FALSE else True
    args.keep_silent = False if args.keep_silent.lower() in FALSE else True
    args.keep_genetic_markers = False if args.keep_genetic_markers.lower() in FALSE else True
    args.keep_unnumbered = False if args.keep_unnumbered.lower() in FALSE else True
    args.keep_rs_ids = False if args.keep_rs_ids.lower() in FALSE else True

    args.do_train = False if args.model_path_1 else True

    if args.cv_n is not None or args.cv_fold is not None:
        args.validation = "cross-validation"

    if args.validation == "cross-validation":
        assert (args.cv_n is not None and args.cv_fold is not None), "You must set both cv_n AND cv_fold"

    # ------------------------------------------------------------------------------

    if args.training_corpus:
        # Get the name of training corpus even if this is given as a folder path, in which case the last folder name is used
        training_corpus_name = list(filter(None, args.training_corpus.split('/')))[-1]

        args.model_name = "{}_{}_del_{}".format(training_corpus_name, args.labeler, str_delete_subclasses)

        if args.validation == "cross-validation":
            args.model_name += "_cvfold_" + str(args.cv_fold)

        args.model_name_suffix = args.model_name_suffix.strip()
        if args.model_name_suffix:
            args.model_name += "_" + str(args.model_name_suffix)

    else:
        args.model_name = args.test_corpus

    # ------------------------------------------------------------------------------

    def stats(dataset, name):
        print('\n\t{} size: {}'.format(name, len(dataset)))
        print('\tsubclass distribution: {}'.format(repr(dataset)))
        # Caveat: the dataset must be passed through the pipeline first
        print('\tnum sentences: {}\n'.format(sum(1 for x in dataset.sentences())))

    definer = ExclusiveNLDefiner()

    if args.training_corpus:
        train_set = get_corpus(args.training_corpus, only_class_id=args.only_class_id,
                               hdfs_url=args.hdfs_url, hdfs_user=args.hdfs_user)

        if args.test_corpus:
            test_set = get_corpus(args.test_corpus, only_class_id=args.only_class_id,
                                  hdfs_url=args.hdfs_url, hdfs_user=args.hdfs_user)
        elif args.string:
            test_set = StringReader(args.string).read()
        elif args.validation == "none":
            test_set = None
        elif args.validation == "cross-validation":
            train_set, test_set = train_set.fold_nr_split(int(args.cv_n), int(args.cv_fold))
        elif args.validation == "stratified":
            definer.define(train_set)
            train_set, test_set = train_set.stratified_split()

    elif args.test_corpus:
        train_set = None
        test_set = get_corpora(args.test_corpus, args.only_class_id)

    elif args.string:
        train_set = None
        test_set = StringReader(args.string).read()

    else:
        raise Exception("you must give at least a parameter of: training_corpus, test_corpus, or string")

    def verify_corpus(corpus):
        if corpus is not None:
            assert len(corpus) > 0, f"The corpus should have at least one document; had 0: {args.training_corpus}"
            assert next(corpus.entities(), None) is not None, "The corpus should have at least one entity; had 0"

    verify_corpus(train_set)

    # ------------------------------------------------------------------------------

    if args.mutations_specific:
        print("Pipeline specific to mutations")
        features_pipeline = get_prepare_pipeline_for_best_model(args.use_feat_windows, args.we_params, args.nl_features)
    else:
        print("Pipeline is general")
        features_pipeline = get_prepare_pipeline_for_best_model_general(
            args.use_feat_windows, args.we_params, args.dictionaries_paths,
            args.hdfs_url, args.hdfs_user, args.dictionaries_stop_words)

    # ------------------------------------------------------------------------------

    def print_run_args():
        for key, value in sorted((vars(args)).items()):
            print("\t{} = {}".format(key, value))
        print()

    print("Running arguments: ")
    print_run_args()

    # ------------------------------------------------------------------------------

    def train(train_set):
        definer.define(train_set)
        train_set.delete_subclass_annotations(args.delete_subclasses)
        features_pipeline.execute(train_set)
        labeler.label(train_set)

        if args.pruner == "parts":
            train_set.prune_empty_parts()
        else:
            try:
                f = HighRecallRegexClassifier(ST=args.ps_ST, NL=args.ps_NL)
            except AssertionError:
                f = (lambda _: False)
            train_set.prune_filtered_sentences(filterin=f, percent_to_keep=args.ps_random)

        stats(train_set, "training")

        model_path = os.path.join(args.output_folder, args.model_name + ".bin")
        PyCRFSuite.train(train_set, model_path, args.crf_train_params)

        return model_path

    # ------------------------------------------------------------------------------

    if args.do_train:
        args.model_path_1 = train(train_set)

    # ------------------------------------------------------------------------------

    def test(tagger, test_set, print_eval=True, print_results=False):
        tagger.tag(test_set)
        definer.define(test_set)
        stats(test_set, "test")
        evaluation = MentionLevelEvaluator(subclass_analysis=True).evaluate(test_set)

        print_run_args()

        if print_eval:
            print(evaluation)
        if print_results:
            ConsoleWriter(ent1_class_id=PRO_CLASS_ID, ent2_class_id=MUT_CLASS_ID, color=True).write(test_set)

    # ------------------------------------------------------------------------------

    assert args.model_path_1 is not None

    if args.model_path_2:
        tagger = NalaMultipleModelTagger(
            st_model=args.model_path_1,
            all3_model=args.model_path_2,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)
    else:
        tagger = NalaSingleModelTagger(
            bin_model=args.model_path_1,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)

    # ------------------------------------------------------------------------------

    print("\n{}".format(args.model_name))

    if train_set:
        stats(train_set, "training")

    if test_set:
        test(tagger, test_set, print_eval=args.string is None, print_results=args.string is not None)

    if args.do_train:
        print("\nThe model is saved to: {}\n".format(args.model_path_1))

    if args.write_anndoc:
        outdir = os.path.join(args.output_folder, args.model_name)
        os.mkdir(outdir)
        print("\nThe predicted test data is saved to: {}\n".format(outdir))
        TagTogFormat(test_set, use_predicted=True, to_save_to=outdir).export(0)

    end_time = time.time()
    print_debug("Elapsed time: ", (end_time - start_time))

    return {
        "tagger": tagger,
        "trained_model_path": args.model_path_1,
        "training_num_docs": 0 if train_set is None else len(train_set.documents),
        "training_num_annotations": 0 if train_set is None else sum(
            1 for e in train_set.entities() if e.class_id == args.only_class_id)
    }
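# Hypothetical invocation of train() above with a CLI-style argument vector.
# The corpus name mirrors the examples in the --training_corpus help text, and
# the returned dict keys are the ones defined at the end of train().
result = train([
    '--training_corpus', 'nala_training',
    '--validation', 'stratified',
    '--labeler', 'BIEO',
])
print(result['trained_model_path'])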
def pattern_stats(dataset):
    """
    Testing Ground Carsten - High Recall Patterns creation method development method ported here.
    :type nala.structures.Dataset: dataset to perform pattern evaluation on (must include annotations)
    :return: nothing (print statements for the moment)
    """
    ExclusiveNLDefiner().define(dataset)
    # PubTatorFormat(dataset, no_annotations=False).export()

    print(dataset)

    nl_annotations = []

    # import connecting_words.json
    with open('nala/data/connecting_words.json', 'r') as f:
        regexs = json.load(f)

    # print(regexs)
    compiled_regexs = [re.compile(x) for x in regexs]

    nr_word_regex = re.compile('\\b(one|two|three|four|five|six|seven|eight|nine|ten)\\b')
    aa_short_regex = re.compile(
        '\\b(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr)\\b')
    aa_long_regex = re.compile(
        '\\b(glutamine|glutamic acid|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine|threonine|histidine|aspartic acid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine|serine|glutamate|tyrosine)\\b')
    bp_code = re.compile('\\b\\w\\b')

    wordlist = []

    # for ann in dataset.annotations():
    #     if ann.subclass == 1 or ann.subclass == 2:
    #         new_text = ann.text.lower()
    #         for reg in compiled_regexs:
    #             new_text = reg.sub('_TT_', new_text)
    #         # re.sub('\\b\\d+\\b]', '_NR_', new_text)
    #         new_text = re.sub('\\b\\w*\\d+\\w*\\b', '_CODE_', new_text)
    #         new_text = nr_word_regex.sub('_TT_', new_text)
    #         new_text = aa_short_regex.sub('_AA_', new_text)
    #         new_text = aa_long_regex.sub('_AA_', new_text)
    #         new_text = bp_code.sub('_TT_', new_text)
    #         new_text = re.sub('\\W', ' ', new_text)
    #         # new_text = re.sub('\\b(\\w{1,3})\\b', '_TT_', new_text)
    #
    #         wordlist.extend(new_text.split(' '))
    #         # print(new_text)
    #         nl_annotations.append(new_text)
    #
    # wordset = set(wordlist)
    # wordlist = sorted(list(wordset))
    # print(json.dumps(wordlist, indent=2, sort_keys=True))
    # print(json.dumps(nl_annotations, indent=2, sort_keys=True))

    # todo provide method to create new pattern on an automated base
    # read in nl_patterns
    with open('nala/data/nl_patterns.json', 'r') as f:
        regexs = json.load(f)

    patterns = [re.compile(x) for x in regexs]

    # f-measure pattern-based
    _perf_patterns = {}
    for reg in patterns:
        _perf_patterns[reg.pattern] = [0, 0, -1]

    # check for annotations
    # for part in dataset.parts():
    #     print(part.text)

    # dataset with tmVar
    # TODO change if idp4 then those results otherwise use tmvartagger and caching
    dataset_high_recall = TmVarReader('resources/corpora/idp4/pubtator_tmvar.txt').read()
    TP = 0
    FP = 0
    _length = len(dataset.documents.keys())
    _progress = 0
    _timestart = time.time()

    _time_avg_per_pattern = 0
    _pattern_calls = 0
    _time_reg_pattern_total = 0
    _time_max_pattern = 0
    _low_performant_pattern = ""

    _avg_chars_per_doc = dataset.get_size_chars() / len(dataset.documents.keys())

    # NLDefiners init
    exclusive_definer = ExclusiveNLDefiner()
    _e_array = [0, 0, 0]
    inclusive_definer = InclusiveNLDefiner()
    _i_array = [0, 0]

    # todo param file to save to
    with open('results/testing_ground_carsten.txt', 'w', encoding='utf-8') as f:
        for did, doc in dataset.documents.items():
            part_offset = 0
            for i, x in enumerate(doc.parts):
                # print("Part", i)
                sent_offset = 0
                cur_part = doc.parts.get(x)
                sentences = cur_part.sentences
                # new_text = cur_part.text.lower()
                # new_text = re.sub('\s+', ' ', new_text)
                # sentences = new_text.split('. ')

                for sent in sentences:
                    sent_len = len(sent)
                    new_text = sent.lower()
                    new_text = re.sub('[\./\\-(){}\[\],%]', '', new_text)
                    new_text = re.sub('\W+', ' ', new_text)

                    for i, reg in enumerate(patterns):
                        _lasttime = time.time()  # time start var
                        match = reg.search(new_text)

                        # debug bottleneck patterns
                        _time_current_reg = time.time() - _lasttime  # time end var
                        _pattern_calls += 1  # pattern calls already occured
                        _time_reg_pattern_total += _time_current_reg  # total time spent on searching with patterns
                        if _time_reg_pattern_total > 0:
                            _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg spent time per pattern call

                        # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                        #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                        # if _time_max_pattern < _time_current_reg:
                        #     _time_max_pattern = _time_current_reg
                        #     _low_performant_pattern = reg.pattern
                        #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                        # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                        #     if _time_current_reg > _time_avg_per_pattern * 10:
                        #         # print(_time_avg_per_pattern, _time_current_reg)
                        #         f.write("BAD_PATTERN\n")
                        #         f.write(sent + "\n")
                        #         f.write(new_text + "\n")

                        if match:
                            if did in dataset_high_recall.documents:
                                anti_doc = dataset_high_recall.documents.get(did)
                                start = part_offset + sent_offset + match.span()[0]
                                end = part_offset + sent_offset + match.span()[1]
                                if not anti_doc.overlaps_with_mention(start, end):
                                    _e_result = exclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                                    _e_array[_e_result] += 1
                                    _i_result = inclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                                    _i_array[_i_result] += 1
                                    if doc.overlaps_with_mention(start, end):
                                        TP += 1
                                        f.write("{}\tTP\te{}\ti{}\t{}\t{}\t{}\n".format(
                                            did, _e_result, _i_result, sent, match, reg.pattern))
                                        _perf_patterns[reg.pattern][0] += 1
                                    else:
                                        FP += 1
                                        f.write("{}\tFP\te{}\ti{}\t{}\t{}\t{}\n".format(
                                            did, _e_result, _i_result, sent, match, reg.pattern))
                                        _perf_patterns[reg.pattern][1] += 1

                                    if _perf_patterns[reg.pattern][1] > 0:
                                        _perf_patterns[reg.pattern][2] = \
                                            _perf_patterns[reg.pattern][0] / _perf_patterns[reg.pattern][1]

                        if _lasttime - time.time() > 1:
                            print(i)

                    sent_offset += 2 + sent_len

                part_offset += sent_offset

            _progress += doc.get_size() / _avg_chars_per_doc
            _time_progressed = time.time() - _timestart
            _time_per_doc = _time_progressed / _progress
            _time_req_time = _time_per_doc * _length
            _time_eta = _time_req_time - _time_progressed
            print("PROGRESS: {:.3%} PROGRESS: {:.2f} secs ETA: {:.2f} secs".format(
                _progress / _length, _time_progressed, _time_eta))
            if TP + FP > 0:
                print('STATS: TP:{}, FP:{}, TP+FP:{} %containingNLmentions:{:.4%}'.format(
                    TP, FP, TP + FP, TP / (TP + FP)))

    print("Exclusive Definer:", _e_array)
    print("Inclusive Definer:", _i_array)

    for key, value in _perf_patterns.items():
        if value[2] != -1:
            print(value, key)
def filter(self, documents, min_found=1, use_nala=False):
    """
    :type documents: collections.Iterable[(str, nalaf.structures.data.Document)]
    """

    _progress = 1
    _start_time = time.time()
    _total_time = 0

    _time_avg_per_pattern = 0
    _pattern_calls = 0
    _time_reg_pattern_total = 0
    _time_max_pattern = 0
    _low_performant_pattern = ""

    # NLDefiners init
    exclusive_definer = ExclusiveNLDefiner()
    _e_array = [0, 0, 0]
    inclusive_definer = InclusiveNLDefiner()
    _i_array = [0, 0]

    last_found = 0
    crf = PyCRFSuite(self.location_binary_model)

    # counter_to_stop_for_caching = 0

    for pmid, doc in documents:
        # if any part of the document contains any of the keywords, yield that document

        # if counter_to_stop_for_caching > 400:
        #     break
        # counter_to_stop_for_caching += 1
        # print(counter_to_stop_for_caching)

        part_offset = 0
        data_tmp = Dataset()
        data_tmp.documents[pmid] = doc
        data_nala = deepcopy(data_tmp)
        NLTKSplitter().split(data_tmp)
        # data_tmvar = TmVarTagger().generate_abstracts([pmid])

        if use_nala:
            self.pipeline.execute(data_nala)
            self.labeler.label(data_nala)
            crf.tag(data_nala, MUT_CLASS_ID)
            PostProcessing().process(data_nala)
            ExclusiveNLDefiner().define(data_nala)

        used_regexs = {}

        positive_sentences = 0
        for i, x in enumerate(doc.parts):
            # print("Part", i)
            sent_offset = 0
            cur_part = doc.parts.get(x)
            sentences = cur_part.sentences_

            for sent in sentences:
                sent_length = len(sent)
                new_text = sent.lower()
                new_text = re.sub('[\./\\-(){}\[\],%]', ' ', new_text)
                # new_text = re.sub('\W+', ' ', new_text)

                found_in_sentence = False

                for i, reg in enumerate(self.patterns):
                    _lasttime = time.time()  # time start var
                    match = reg.search(new_text)

                    # debug bottleneck patterns
                    _time_current_reg = time.time() - _lasttime  # time end var
                    _pattern_calls += 1  # pattern calls already occured
                    _time_reg_pattern_total += _time_current_reg  # total time spent on searching with patterns
                    if _time_reg_pattern_total > 0:
                        _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg spent time per pattern call

                    # todo create pattern performance eval for descending amount of recognized patterns
                    # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                    #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                    # if _time_max_pattern < _time_current_reg:
                    #     _time_max_pattern = _time_current_reg
                    #     _low_performant_pattern = reg.pattern
                    #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                    # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                    #     if _time_current_reg > _time_avg_per_pattern * 10:
                    #         # print(_time_avg_per_pattern, _time_current_reg)
                    #         f.write("BAD_PATTERN\n")
                    #         f.write(sent + "\n")
                    #         f.write(new_text + "\n")

                    if match:
                        # if pmid in data_tmvar.documents:
                        #     anti_doc = data_tmvar.documents.get(pmid)
                        nala_doc = data_nala.documents.get(pmid)

                        start = part_offset + sent_offset + match.span()[0]
                        end = part_offset + sent_offset + match.span()[1]
                        # print("TmVar is not overlapping?:", not anti_doc.overlaps_with_mention(start, end))
                        # print(not nala_doc.overlaps_with_mention(start, end, annotated=False))

                        if reg.pattern in used_regexs:
                            used_regexs[reg.pattern] += 1
                        else:
                            used_regexs[reg.pattern] = 1

                        print(color.PURPLE + new_text.replace(
                            match.group(),
                            color.BOLD + color.DARKCYAN + color.UNDERLINE + match.group() + color.END + color.PURPLE)
                            + color.END)

                        if not found_in_sentence:
                            positive_sentences += 1
                            found_in_sentence = True

                        # if not anti_doc.overlaps_with_mention(start, end) \
                        #         and not nala_doc.overlaps_with_mention(start, end, annotated=False):
                        #     _e_result = exclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #     _e_array[_e_result] += 1
                        #     _i_result = inclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #     _i_array[_i_result] += 1
                        #     # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                        #     # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                        #     last_found += 1
                        #     found_in_sentence = True
                        # else:
                        #     # if nala not used only tmvar considered
                        #     if not anti_doc.overlaps_with_mention(start, end):
                        #         _e_result = exclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #         _e_array[_e_result] += 1
                        #         _i_result = inclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #         _i_array[_i_result] += 1
                        #         # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                        #         # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                        #         last_found += 1
                        #         found_in_sentence = True

                        if use_nala:
                            nala_found_mention = nala_doc.overlaps_with_mention(start, end, annotated=False)
                            if nala_found_mention:
                                print_verbose(nala_found_mention)
                                if nala_found_mention.subclass > 0 and nala_found_mention.confidence <= self.threshold:
                                    yield pmid, doc

                    if _lasttime - time.time() > 1:
                        print_verbose('time intensive regex', i)

                sent_offset += 2 + sent_length

                # for per sentence positives
                if found_in_sentence:
                    positive_sentences += 1

            part_offset += sent_offset

        if use_nala:
            for part in nala_doc:
                for ann in part.predicted_annotations:
                    if ann.subclass > 0:
                        print_verbose(part.text[:ann.offset] + color.BOLD + ann.text + color.END
                                      + part.text[ann.offset + len(ann.text):])
                        positive_sentences += min_found

        _old_time = _start_time
        _start_time = time.time()
        _one_time = _start_time - _old_time

        if _one_time > 0.3 and positive_sentences > min_found:
            _progress += 1
            _total_time += _one_time
            _time_per_doc = _total_time / _progress
            print_verbose("PROGRESS: {:.2f} secs ETA per one positive document:"
                          " {:.2f} secs".format(_total_time, _time_per_doc))

        print_debug('used regular expressions:', json.dumps(used_regexs, indent=4))

        if positive_sentences >= min_found:
            last_found = 0
            print_verbose('YEP', pmid)
            yield pmid, doc
        else:
            print_verbose('NOPE', pmid)
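# Hypothetical consumption of the pattern-based filter() generator above;
# `nl_filter` stands in for an instance of the enclosing (unnamed here) filter
# class and `documents` for an iterable of (pmid, Document) pairs.
selected = Dataset()
for pmid, doc in nl_filter.filter(documents, min_found=1, use_nala=False):
    selected.documents[pmid] = doc
print_verbose('selected documents:', len(selected.documents))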
    folderName = sys.argv[3]
except:
    corpus = StringReader(corpusName).read()
    folderName = None  # just print out in standard output

# ------------------------------------------------------------------------------

# Example calls:
# python scripts/SETH.py SETH nala_test resources/predictions/  # predict
# python scripts/SETH.py check_performance nala_test resources/predictions/SETH/nala_test &> resources/predictions/SETH/nala_test/oresults.tsv  # evaluate

if methodName == 'check_performance':
    # folderName is assumed to be the final/leaf predictions folder, e.g., `resources/predictions/SETH/nala_test`
    BRATPartsAnnotationReader(folderName, is_predicted=True).annotate(corpus)
    ExclusiveNLDefiner().define(corpus)
    evaluation = MentionLevelEvaluator(subclass_analysis=True).evaluate(corpus)
    print(evaluation)

else:
    if folderName:
        # folderName is assumed to be the root predictions folder, e.g., `resources/predictions/`
        folderName = os.path.join(folderName, methodName, corpusName)
        if not os.path.exists(folderName):
            os.makedirs(folderName)

    useMutationFinderOnly = "true" if methodName == "MFmodified" else "false"

    run_set_server(useMutationFinderOnly)
    run_seth_on_corpus(corpus, folderName, useMutationFinderOnly)
parser.add_argument('--counttokens', help='Count the tokens. Note, this is considerably slower', action='store_true')

args = parser.parse_args()

if args.corpora[0] == "*" or args.corpora[0] == 'all':
    args.corpora = ALL_CORPORA

if args.listanns == '*' or args.listanns == 'all':
    args.listanns = '0,1,2'
args.listanns = set(int(c) for c in args.listanns.split(",") if c)

# ------------------------------------------------------------------------------

nldefiner = ExclusiveNLDefiner()

pipeline = PrepareDatasetPipeline(feature_generators=[])

ST = 0  # Standard
NL = 1  # Natural Language
SST = 2  # Semi-Standard -- also often denoted before as 'SS'

MARKER = [' ', '@@@@@@@@', '********']

PROB = "{0:.3f}"  # FORMAT

# ------------------------------------------------------------------------------

def get_corpus_type(name):
class TestExclusiveNLDefiner(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.definer = ExclusiveNLDefiner()

    def test_on_empty_string(self):
        try:
            self.definer.define_string("")
        except Exception:
            self.fail("empty string result is undefined but should not throw an exception")

    def test_define_string(self):
        f = self.definer.define_string
        testEqual = self.assertEqual

        testEqual(0, f("rs206437"))  # rsid
        testEqual(0, f("ss469415642"))  # ssid
        testEqual(2, f("C226 to T"))
        testEqual(2, f("G446 to A"))
        testEqual(2, f("C821 to T"))
        testEqual(2, f("Arg76 to Trp"))
        testEqual(2, f("Arg149 to Gln"))
        testEqual(2, f("Pro274 to Leu"))
        testEqual(2, f("T320 to C"))
        testEqual(2, f("Leu107 to Pro"))
        testEqual(2, f("C631 to T"))
        testEqual(2, f("Arg211 to Cys"))
        testEqual(2, f("Ala215 to Thr"))
        testEqual(1, f("deletion of its cytoplasmic tail"))
        testEqual(1, f("nonsense mutation Q3X"))
        testEqual(0, f("R142Q"))
        testEqual(1, f("G-->A transition of a CpG dinucleotide"))
        testEqual(1, f("A C-->T transition of the same CpG"))
        testEqual(0, f("R142X"))
        testEqual(0, f("R142X"))
        testEqual(0, f("R142Q"))
        testEqual(1, f("replacement of this CpG hotspot by CpA"))
        testEqual(0, f("R142X"))
        testEqual(1, f("caused skipping of the exon"))
        testEqual(1, f("Absence of exon 5"))
        testEqual(0, f("Asp8Asn"))
        testEqual(1, f("G to A transition at nt22"))
        testEqual(1, f("asparagine for aspartic acid at codon 8"))
        testEqual(0, f("Asp8Asn"))
        testEqual(1, f("substitution of neutral asparagine for anionic aspartic acid"))
        testEqual(1, f("G to A transition is at a CpG dinucleotide"))
        testEqual(1, f("codon CAA encoding glutamine-2153 to UAA, a stop codon"))
        testEqual(1, f("attaching an epitope tag sequence to the C terminus of the editing protein"))
        testEqual(0, f("H15D"))
        testEqual(0, f("A83D"))
        testEqual(0, f("A179D"))
        testEqual(0, f("573 + IG-->A"))
        testEqual(0, f("H15D"))
        testEqual(0, f("A83D"))
        testEqual(0, f("A179D"))
        testEqual(1, f("skipping of exon 5"))
        testEqual(0, f("H15D"))
        testEqual(1, f("Replacement of these small hydrophobic Ala residues with the charged, more bulky Asp side chain"))
        testEqual(0, f("G20R"))
        testEqual(1, f("G to A transition at a CpG"))
        testEqual(1, f("glycine to arginine substitution at codon 20"))
        testEqual(0, f("26delA"))
        testEqual(0, f("delPhe1388"))
        testEqual(1, f("deleted C1 domain"))
        testEqual(0, f("Q115P"))
        testEqual(0, f("g.3912G>C"))
        testEqual(0, f("c.925delA"))
        testEqual(0, f("c.388+3insT"))
        testEqual(0, f("3992-9g-->a"))
        testEqual(2, f("3992-9g-->a mutation"))
        testEqual(2, f("G643 to A"))
        testEqual(2, f("leucine for arginine 90"))
        testEqual(1, f("deletion of aa 527-534"))
        testEqual(1, f("deletion of 10 and 8 residues from the N- and C-terminals"))
        testEqual(1, f("143 from alanine to glycine"))
        testEqual(1, f("alterations of amino acid residue 143 from alanine to glycine"))
        testEqual(1, f("trinucleotide deletion"))
        testEqual(1, f("arginine-141 to serine substitution"))
        testEqual(1, f("mutations at Arg885"))
        testEqual(1, f("point mutation at Cys93"))
        testEqual(1, f("heterozygous missense 3035G>T"))
        testEqual(2, f("synonymous 696T>C"))
        testEqual(2, f("missense Glu285Ala"))
        testEqual(1, f("somatic 16-bp deletion"))
        testEqual(1, f("serine 749 is phosphorylated"))
        testEqual(1, f("Ser58 to Glu substitution"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(0, f("GAT-->GTT, Asp-->Val"))
        testEqual(2, f("codon 98 GAT-->GTT, Asp-->Val"))
        testEqual(2, f("codon 92, TAC-->TAT"))
        testEqual(1, f("arginine-127 into glutamine and arginine-469 into tryptophan"))
        testEqual(2, f("arginine-127 into glutamine"))
        testEqual(2, f("arginine-469 into tryptophan"))
        testEqual(0, f("TP73Δex2/3"))
        testEqual(1, f("abrogated loss of Chr19"))

        # More difficult
        testEqual(2, f("chromothripsis"))
        testEqual(2, f("Morpholino knockdown"))
        testEqual(2, f("methionine replaces lysine 27"))
        testEqual(2, f("lysine(27)-to-methionine"))
        testEqual(1, f("C-tail displacement"))
        testEqual(1, f("22q11 deletion syndrome"))
        testEqual(1, f("hippocampal neuron L1 insertions"))
        testEqual(1, f("copy-number variants"))
class PostProcessing:

    def __init__(self, keep_silent=True, keep_genetic_markers=True, keep_unnumbered=True, keep_rs_ids=True):

        amino_acids = [
            'alanine', 'ala', 'arginine', 'arg', 'asparagine', 'asn', 'aspartic acid', 'aspartate', 'asp',
            'cysteine', 'cys', 'glutamine', 'gln', 'glutamic acid', 'glutamate', 'glu', 'glycine', 'gly',
            'histidine', 'his', 'isoleucine', 'ile', 'leucine', 'leu', 'lysine', 'lys', 'methionine', 'met',
            'phenylalanine', 'phe', 'proline', 'pro', 'serine', 'ser', 'threonine', 'thr', 'tryptophan', 'trp',
            'tyrosine', 'tyr', 'valine', 'val', 'aspartic acid', 'asparagine', 'asx', 'glutamine',
            'glutamic acid', 'glx'
        ]

        nucleotides = ['adenine', 'guanine', 'thymine', 'cytosine', 'uracil']

        keywords = [
            'substit\w*', 'lead\w*', 'exchang\w*', 'chang\w*', 'mutant\w*', 'mutate\w*', 'devia\w*', 'modif\w*',
            'alter\w*', 'switch\w*', 'variat\w*', 'instead\w*', 'replac\w*', 'in place', 'convert\w*', 'becom\w*'
        ]

        # AA = '|'.join(amino_acids)
        AA_NN = '|'.join(amino_acids + nucleotides)
        AA_LL = '|'.join(amino_acids + list('CISQMNPKDTFAGHLRWVEYX'))
        KK = '|'.join(keywords)

        genetic_marker_regex = re.compile(r'\bD\d+([A-Z]\d+)?S\d{2,}\b')
        rs_id_regex = re.compile(r'\b\[?rs\]? *\d{3,}(,\d+)*\b')
        ss_id_regex = re.compile(r'\b\[?ss\]? *\d{3,}(,\d+)*\b')

        self.patterns = [
            re.compile('({SS})[- ]*[1-9][0-9]* +(in|to|into|for|of|by|with|at) +({SS})( *(,|,?or|,?and) +({SS}))*'
                       .format(SS=AA_NN), re.IGNORECASE),
            re.compile('({SS}) +(in|to|into|for|of|by|with|at) +({SS})[- ]*[1-9][0-9]*'
                       '( *(,|,?or|,?and) +({SS})[- ]*[1-9][0-9]*)*'.format(SS=AA_NN), re.IGNORECASE),
            re.compile('({SS})(( (({KK})) (in|to|into|for|of|by|with|at) (a|an|the|) '
                       '*({SS})[1-9]\d*( *(,|or|and|, and|, or) ({SS})[1-9]\d*)*)'
                       '|([- ]*[1-9]\d*( +((has|have|had) +been|is|are|was|were|) '
                       '+(({KK})))? +(in|to|into|for|of|by|with|at) +({SS})( *(,|or|and|, and|, or) +({SS}))*))'
                       .format(SS=AA_NN, KK=KK), re.IGNORECASE),
            re.compile(r'\bp\. *({SS}) *[-+]*\d+ *({SS})\b'.format(SS=AA_NN), re.IGNORECASE),
            re.compile(r'\b({SS})[-to ]*[-+]*\d+[-to ]*({SS})\b'.format(SS=AA_NN), re.IGNORECASE),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX](/|-|-*>|→|-to-)[CISQMNPKDTFAGHLRWVEYX] *[-+]*\d+\b'),
            re.compile(r'((?<!\w)[-+]*\d+:? *?)??[CISQMNPKDTFAGHLRWVEYX] *(/|-|-*>|→|-*to-*) *[CISQMNPKDTFAGHLRWVEYX]\b'),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX]{3,}/-(?<!\w)'),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX] *\d{2,} *[CISQMNPKDTFAGHLRWVEYX]( *(/) *[CISQMNPKDTFAGHLRWVEYX])*\b'),
            genetic_marker_regex,
            rs_id_regex,
            ss_id_regex,
            re.compile(r'\b(\d+-)?\d*[D|d]elta(\d{2,}|[CISQMNPKDTFAGHLRWVEYX])\b'),
            re.compile(r'\b(c\. *)?[ATCG] *([-+]|\d)\d+ *[ATCG]\b'),
            re.compile(r'\b(c\.|E(X|x)\d+) *([-+]|\d)\d+[ATCG] *> *[ATCG]\b'),
            re.compile(r'\b[ATCG][-+]*\d+[ATCG]/[ATCG]\b'),
            re.compile(r'(?<!\w)[-+]?\d+ *\d* *(b|bp|N|ntb|p|BP|B) *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'),
            re.compile(r'(?<!\w)[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)[0-9CISQMNPKDTFAGHLRWVEYX]+\b'),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX]+ *[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'),
            re.compile(r'\b(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup) *(\d+(b|bp|N|ntb|p|BP|B)|[ATCG]{1,})\b'),
            re.compile(r'(?<!\w)[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)[CISQMNPKDTFAGHLRWVEYX]+\b'),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX]+ *[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b')
        ]

        self.negative_patterns = [
            # single AAs
            re.compile(r'^({SS}) *\d+$'.format(SS=AA_NN), re.IGNORECASE),
            re.compile(r'^[CISQMNPKDTFAGHLRWVEYX]+ *\d+$'),
            re.compile(r'^({SS})([-/>]({SS}))*$'.format(SS=AA_LL), re.IGNORECASE),
            # just numbers
            re.compile(r'^[-+]?\d+([-+/ ]+\d+)*( *(b|bp|N|ntb|p|BP|B))?$')
        ]

        if not keep_genetic_markers:
            self.negative_patterns.append(genetic_marker_regex)

        if not keep_rs_ids:
            self.negative_patterns.append(rs_id_regex)
            self.negative_patterns.append(ss_id_regex)

        self.keep_unnumbered = keep_unnumbered

        self.at_least_one_letter_n_number_letter_n_number = re.compile('(?=.*[A-Za-z])(?=.*[0-9])[A-Za-z0-9]+')
        self.keep_silent = keep_silent
        self.definer = ExclusiveNLDefiner()

    def process(self, dataset, class_id=MUT_CLASS_ID):
        for doc_id, doc in dataset.documents.items():
            for part_id, part in doc.parts.items():
                self.__fix_issues(part)
                for regex in self.patterns:
                    for match in regex.finditer(part.text):
                        start = match.start()
                        end = match.end()
                        matched_text = part.text[start:end]
                        ann = Entity(class_id, start, matched_text)

                        Entity.equality_operator = 'exact_or_overlapping'
                        if ann not in part.predicted_annotations:
                            part.predicted_annotations.append(Entity(class_id, start, matched_text))

                        Entity.equality_operator = 'overlapping'
                        if ann in part.predicted_annotations:
                            for index, ann_b in enumerate(part.predicted_annotations):
                                if ann == ann_b and len(matched_text) > len(ann_b.text):
                                    part.predicted_annotations[index] = ann

                to_delete = [
                    index for index, ann in enumerate(part.predicted_annotations)
                    if any(r.search(ann.text) for r in self.negative_patterns)
                    or (not self.keep_silent and self.__is_silent(ann))
                    or (not self.keep_unnumbered and not self._is_numbered(ann))
                ]

                part.predicted_annotations = [
                    ann for index, ann in enumerate(part.predicted_annotations)
                    if index not in to_delete
                ]

        # sanity check, make sure annotations match their offset
        for part in dataset.parts():
            for ann in part.predicted_annotations:
                assert ann.text == part.text[ann.offset:ann.offset + len(ann.text)]

                while ann.text[0] == ' ':
                    ann.offset += 1
                    ann.text = ann.text[1:]
                while ann.text[-1] == ' ':
                    ann.text = ann.text[:-1]
                # assert ann.text == ann.text.strip(), ("'" + ann.text + "'")

    def __is_silent(self, ann):
        split = re.split('[^A-Za-z]+', ann.text)
        return len(split) == 2 and split[0] == split[1]

    def _is_numbered(self, ann):
        return any(c.isdigit() for c in ann.text) or self.definer.define_string(ann.text) == 1

    def __fix_issues(self, part):
        """
        :type part: nalaf.structures.data.Part
        """
        to_be_removed = []
        for index, ann in enumerate(part.predicted_annotations):
            start = ann.offset
            end = ann.offset + len(ann.text)

            # split multiple mentions
            split = re.split(r' *(?:\band\b|/|\\|,|;|\bor\b) *', ann.text)
            if len(split) > 1:
                # for each split part calculate the offsets and the constraints
                offset = 0
                split_info = []
                for text in split:
                    split_info.append(
                        (text,
                         self.definer.define_string(text),
                         ann.text.find(text, offset),
                         self.at_least_one_letter_n_number_letter_n_number.search(text)))
                    offset += len(text)

                split_parts = [split_part for split_part in split_info if split_part[0] != '']
                lens = [len(split_part[0]) for split_part in split_parts]
                patterns = [
                    re.sub('\W+', '', re.sub('[0-9]', '0', re.sub('[a-zA-Z]', 'a', parts[0])))
                    for parts in split_parts
                ]

                # if all the non empty parts are from class ST (0) and also contain at least one number and one letter
                # or if the lengths of the splitted parts are the same or follow the same pattern
                if all(split_part[1] == 0 and split_part[3] for split_part in split_parts) \
                        or max(lens) == min(lens) or len(set(patterns)) == 1:
                    to_be_removed.append(index)

                    # add them to
                    for split_text, split_class, split_offset, aonanl in split_info:
                        if split_text != '':
                            part.predicted_annotations.append(
                                Entity(ann.class_id, ann.offset + split_offset, split_text))

            # fix boundary, 1858C>T --> +1858C>T
            if re.search('^[0-9]', ann.text) and re.search('([\-\+])', part.text[start - 1]):
                ann.offset -= 1
                ann.text = part.text[start - 1] + ann.text
                start -= 1

            # fix boundary delete (
            if ann.text[0] == '(' and ')' not in ann.text:
                ann.offset += 1
                ann.text = ann.text[1:]
                start += 1

            # fix boundary delete )
            if ann.text[-1] == ')' and '(' not in ann.text:
                ann.text = ann.text[:-1]

            # fix boundary add missing (
            if part.text[start - 1] == '(' and ')' in ann.text:
                ann.offset -= 1
                ann.text = '(' + ann.text
                start -= 1

            # fix boundary add missing )
            try:
                if part.text[end] == ')' and '(' in ann.text:
                    ann.text += ')'
            except IndexError:
                pass

            # fix boundary add missing number after fsX
            try:
                found_missing_fsx = False
                if part.text[end:end + 2] == 'fs':
                    ann.text += 'fs'
                    end += 2
                    found_missing_fsx = True
                if ann.text.endswith('fs') and part.text[end] == 'X':
                    ann.text += 'X'
                    end += 1
                    found_missing_fsx = True
                if found_missing_fsx:
                    while part.text[end].isnumeric():
                        ann.text += part.text[end]
                        end += 1
            except IndexError:
                pass

            # fix boundary add missing c. or p. before ann
            try:
                if ann.text.startswith('.'):
                    if part.text[start - 1] in ('c', 'p'):
                        ann.offset -= 1
                        ann.text = part.text[start - 1] + ann.text
                        start -= 1
                elif part.text[start - 2:start] in ('c.', 'p.', 'rt'):
                    ann.offset -= 2
                    ann.text = part.text[start - 2:start] + ann.text
                    start -= 2
            except IndexError:
                pass

            # fix boundary add missing \d+ at the beginning
            if ann.text[0] == '-' or part.text[start - 1] == '-':
                tmp = start
                while tmp - 1 > 0 and (part.text[tmp - 1].isnumeric() or part.text[tmp - 1] == '-'):
                    tmp -= 1
                if part.text[tmp - 1] == ' ':
                    ann.offset = tmp
                    ann.text = part.text[ann.offset:start] + ann.text
                    start = tmp

            isword = re.compile(r'\w')

            # The word must end in space to the left
            # not matched: 'and+2740 A>G'
            if isword.search(ann.text[0]) and \
                    (not (ann.offset >= 3 and part.text[ann.offset - 3:ann.offset] == "and"
                          or (ann.offset >= 2 and part.text[ann.offset - 2:ann.offset] == "or"))):
                while ann.offset > 0 and isword.search(part.text[ann.offset - 1]):
                    ann.text = part.text[ann.offset - 1] + ann.text
                    ann.offset -= 1

            veryend = len(part.text)
            end = ann.offset + len(ann.text)

            # The word must end in space to the right
            while end < veryend and isword.search(part.text[end]):
                ann.text = ann.text + part.text[end]
                end += 1

            # Remove parenthesis if within parenthesis but no parentesis either in between
            if ann.text[0] in ['('] and ann.text[-1] in [')'] \
                    and (ann.text.count('(') < 2 and ann.text.count(')') < 2):
                ann.offset += 1
                ann.text = ann.text[1:-1]

            # Follow the rule of abbreviations + first gene mutation (then protein mutation)
            if ((ann.text[-1] == ')' or (end < veryend and part.text[end] == ")"))
                    and ann.text[:-1].count('(') == 1):
                # Requirement 1: must be space to the left of (, not to match things like in Arg407(AGG) or IVS3(+1)
                p = re.compile("\\s+\\(")
                split = p.split(ann.text)

                if len(split) == 2:

                    # Requirement 2: both parths must contain a number (== position, they can stand alone)
                    def req2():
                        return any(c.isdigit() for c in split[0]) and any(c.isdigit() for c in split[1])

                    # Other Reqs on left part
                    def req3():
                        return any(c.isalpha() for c in split[0].replace('and', '').replace('or', ''))

                    # Other Reqs on right part
                    def req4():
                        return any(c.isalpha() for c in split[1].replace('and', '').replace('or', ''))

                    if req2() and len(split[0]) > 2 and req3() and req4():
                        # Neg.: Arg407(AGG) - single amino acid substitution (Phe for Val) - nonsense mutation (286T)
                        # Neg.: deletion (229bp) - nonsense mutation (glycine 568 to stop)
                        # Neg.: one insertion mutation (698insC) - AChR epsilon (CHRNE E376K)
                        # Neg. (other reqs): M1 (Val213) - 207 and 208 (207-HA)
                        # Neg. (other reqs): located 14 amino acids toward the amino-terminal end from the (682)
                        #
                        # Pos.: serine to arginine at the codon 113 (p. S113R)
                        # Pos.: mutagenesis of the initial ATG codon to ACG (Met 1 to Thr) - H2A at position 105 (Q105)
                        # Pos.: Trp replacing Gln in position 156 (A*2406) - A-1144-to-C transversion (K382Q)
                        # Pos: deletion of 123 bases (41 codons) - exon 12 (R432T)

                        ann1text = split[0]
                        to_be_removed.append(index)
                        part.predicted_annotations.append(Entity(ann.class_id, ann.offset, ann1text))

                        ann2text = split[1] if ann.text[-1] != ')' else split[1][:-1]
                        # last part is number of spaces + (
                        ann2offset = ann.offset + len(ann1text) + (len(ann.text) - sum(len(x) for x in split))
                        part.predicted_annotations.append(Entity(ann.class_id, ann2offset, ann2text))

        part.predicted_annotations = [
            ann for index, ann in enumerate(part.predicted_annotations)
            if index not in to_be_removed
        ]
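# Hedged usage sketch of PostProcessing, mirroring the tagging flow used in
# find_number_of_documents() above; the corpus and model names are placeholders.
dataset = get_corpus('nala_test', only_class_id=MUT_CLASS_ID)  # assumed corpus name
get_prepare_pipeline_for_best_model().execute(dataset)
BIEOLabeler().label(dataset)
PyCRFSuite().tag(dataset, 'idp4_model')  # assumed model name, as in find_number_of_documents()
PostProcessing(keep_silent=True, keep_rs_ids=True).process(dataset)
ExclusiveNLDefiner().define(dataset)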