def _create_instances(num_features, corpus, setting_function):
    """
    :rtype: Tuple[scipy.sparse.csr_matrix, numpy.ndarray, List]
    """
    start = time.time()

    num_instances = sum(1 for _ in corpus.edges())

    # We first construct the X matrix of features with the sparse lil_matrix, which is efficient in reshaping its structure dynamically
    # At the end, we convert this to csr_matrix, which is efficient for algebra operations
    # See https://docs.scipy.org/doc/scipy-0.18.1/reference/generated/scipy.sparse.lil_matrix.html#scipy.sparse.lil_matrix
    # See http://scikit-learn.org/stable/modules/svm.html#svm
    X = scipy.sparse.lil_matrix((num_instances, num_features), dtype=np.float64)
    y = np.zeros(num_instances, order='C')  # see: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

    X, y, groups = setting_function(X, y, corpus)

    X = X.tocsr()

    end = time.time()
    print_debug("SVC convert instances, running time: ", (end - start))

    return (X, y, groups)
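# A minimal sketch (not from the original module; shapes and values are illustrative
# only) of the lil_matrix -> csr_matrix pattern used above: LIL is cheap to fill in
# incrementally, while CSR is efficient for the algebra done later by the classifier.

import numpy as np
import scipy.sparse

X_demo = scipy.sparse.lil_matrix((2, 3), dtype=np.float64)  # fast incremental writes
X_demo[0, 1] = 0.5
X_demo[1, 2] = 1.0
X_demo = X_demo.tocsr()  # fast row slicing / matrix products, e.g. for scikit-learn's SVC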
def execute(self, dataset, only_features=False):
    # Note: the order of splitter/tokenizer/edger/parser is important
    # Note: we could avoid the re-splitting & tokenization (see c3d320f08ed8893460d5a68b1b5c87aab6ea0c27)
    #       yet that may later create unforeseen problems and re-doing has no significant impact on running time
    start = time.time()

    if not only_features:
        self.splitter.split(dataset)
        self.tokenizer.tokenize(dataset)
        self.parser.parse(dataset)

    # Note: percolate_tokens_to_entities should go before the edge generator due to sentence adjustments
    self.edge_generator.generate(dataset)

    # The labels are always re-generated
    dataset.label_edges()

    for feature_generator in self.feature_generators:
        feature_generator.generate(dataset, self.feature_set, use_gold=self.edge_generator.use_gold, use_pred=self.edge_generator.use_pred)

    end = time.time()
    print_debug("Relation pipeline (only_features: {}), running time: {}".format(only_features, str(end - start)))
def __init__(self):
    self.jar_path = pkg_resources.resource_filename('nalaf.data', "biolemmatizer-core-1.2-jar-with-dependencies.jar")
    if not os.path.isfile(self.jar_path):
        raise Exception("Couldn't find biolemmatizer jar: " + self.jar_path)

    self.program = ["java", "-Xmx1G", "-jar", self.jar_path, "-l", "-t"]

    self.p = Popen(self.program, universal_newlines=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, bufsize=1)

    BioLemmatizer.__setNonBlocking(self.p.stdout)
    BioLemmatizer.__setNonBlocking(self.p.stderr)

    # Initialize the java program
    print_debug("BioLemmatizer: INIT START")
    out = None
    while not out:
        try:
            out = self.p.stdout.read()
        except TypeError:
            continue
        else:
            if "Running BioLemmatizer in interactive mode" in out:
                break
            else:
                out = None
    print_debug("BioLemmatizer: INIT END")
def __init__(self, model_path=None, classification_threshold=0.0, use_tree_kernel=False, svmlight_dir_path=''):
    self.model_path = model_path if model_path is not None else tempfile.NamedTemporaryFile().name
    """the model (path) to read from / write to"""
    print_debug("SVM-Light model file path: " + self.model_path)

    self.classification_threshold = classification_threshold

    self.use_tree_kernel = use_tree_kernel
    """whether to use tree kernels or not"""

    self.svmlight_dir_path = svmlight_dir_path
    """
    The directory where the executables svm_classify and svm_learn are located.
    Defaults to the empty string '', which means that the svmlight executables must be in your binary path
    """

    executables_extension = '' if sys.platform.startswith('linux') or sys.platform.startswith('darwin') else '.exe'
    self.svm_learn_call = os.path.join(self.svmlight_dir_path, ('svm_learn' + executables_extension))
    self.svm_classify_call = os.path.join(self.svmlight_dir_path, ('svm_classify' + executables_extension))

    self.verbosity_level = str(0)  # for now, verbosity=0 -- alternative: str(1 if is_verbose_mode else 0)
def __read_dictionaries(dic_paths, read_function, string_tokenizer, case_sensitive, stop_words):
    stop_words = DictionaryFeatureGenerator.__normalize_stop_words(stop_words)

    ret = []
    for dic_path in dic_paths:
        try:
            reader = read_function(dic_path)
            try:
                name = DictionaryFeatureGenerator.__get_filename(dic_path)
                words_set = DictionaryFeatureGenerator.construct_words_set(reader, string_tokenizer, case_sensitive, stop_words)
                generator = DictionaryFeatureGenerator(name, words_set, case_sensitive)
                ret.append(generator)
            finally:
                reader.close()
        except Exception as e:
            traceback.print_exc()
            print_debug("Could not read dictionary: {}".format(dic_path), e)
            continue

    print_verbose("Using dictionaries: {}".format(", ".join(repr(x) for x in ret)))

    return ret
def annotate(self, corpus):
    X, y = self.__convert_edges_features_to_vector_instances(corpus)

    if X.shape[0] == 0:
        # No instances at all (corpus with no edges) --> nothing to do with the corpus
        return corpus
    else:
        X = self.preprocess.transform(X)
        print_debug("SVC after preprocessing, #features: {} && max value: {}".format(X.shape[1], max(sklearn.utils.sparsefuncs.min_max_axis(X, axis=0)[1])))

        # Pure classification prediction
        y_pred = self.model.predict(X)
        print_debug("Mean accuracy: {}".format(sum(real == pred for real, pred in zip(y, y_pred)) / len(y)))  # same as self.model.score(X, y)

        for edge, target_pred in zip(corpus.edges(), y_pred):
            edge.pred_target = target_pred

        return corpus.form_predicted_relations()
def evaluate(self, dataset):
    """
    :type dataset: nala.structures.data.Dataset
    :returns Evaluations
    """
    subcounts = ['tp', 'fp', 'fn']
    counts = {docid: dict.fromkeys(subcounts, 0) for docid in dataset.documents.keys()}

    print_verbose()

    for docid, doc in dataset.documents.items():

        if self.evaluate_only_on_edges_plausible_relations:
            # A set would be better, but so far Relation is unhashable
            relations_search_space = list(dataset.plausible_relations_from_generated_edges())
        else:
            relations_search_space = None

        gold = doc.map_relations(use_predicted=False, relation_type=self.rel_type, entity_map_fun=self.entity_map_fun, relations_search_space=relations_search_space).keys()
        pred = doc.map_relations(use_predicted=True, relation_type=self.rel_type, entity_map_fun=self.entity_map_fun).keys()

        for r_pred in pred:
            accept_decisions = {self.relation_accept_fun(r_gold, r_pred) for r_gold in gold}
            assert set.issubset(accept_decisions, {True, False, None}), "`relation_accept_fun` cannot return: " + str(accept_decisions)

            if True in accept_decisions:
                # Count the true positives while iterating on gold
                pass
            elif None in accept_decisions:
                # Ignore as documented
                pass
            else:
                # Either False or the set is empty, meaning that there are no gold annotations
                print_debug(" ", docid, ": FALSE POSITIVE", r_pred)
                counts[docid]['fp'] += 1

        for r_gold in gold:
            r_preds = [r_pred for r_pred in pred if self.relation_accept_fun(r_gold, r_pred)]

            if len(r_preds) > 0:  # we could also do any(...); we have this in place only for debugging purposes
                print_verbose(" ", docid, ": true positive", r_gold)
                counts[docid]['tp'] += 1
            else:
                print_debug(" ", docid, ": FALSE NEGATIVE", r_gold)
                counts[docid]['fn'] += 1

    print_verbose()

    evaluations = Evaluations()
    evaluations.add(EvaluationWithStandardError(self.rel_type, counts))
    return evaluations
def __init__(self, tokens, name="", is_edge_type_constant=False, there_is_target=True, default_n_grams=None):
    self.tokens = tokens
    self.nodes = []
    self.name = name
    self.is_edge_type_constant = is_edge_type_constant
    self.default_n_grams = default_n_grams if default_n_grams is not None else []

    for u_token, v_token in zip(tokens, tokens[1:]):
        # Note: the last one is not added yet, see below

        if is_edge_type_constant:
            edge_type = ""
            is_forward = None
        else:
            parser_defined = __class__._get_dep_edges(
                u_token, v_token,
                __class__.__mk_list_rm_None(u_token.features['dependency_from']),
                __class__.__mk_list_rm_None(v_token.features['dependency_from']))

            user_defined = __class__._get_dep_edges(
                u_token, v_token,
                u_token.features['user_dependency_from'],
                v_token.features['user_dependency_from'])

            all_dep_edges = parser_defined + user_defined

            assert len(all_dep_edges) > 0, ("One must be a dependency of the other", u_token, v_token, tokens)
            if len(all_dep_edges) > 1:
                print_debug("Multiple dependencies are not handled yet; defaulted to first. This should strictly only happen with user-defined dependencies")

            edge_type = all_dep_edges[0][0]
            is_forward = all_dep_edges[0][1]

        self.nodes.append(PathNode(u_token, edge_type, is_forward))

    if len(self.tokens) == 0:
        self.exists = False
        self.source = self.target = self.middle = []
    else:
        self.exists = True
        self.nodes.append(PathNode(self.tokens[-1], edge_type="", is_forward=None, is_target=there_is_target))
        self.nodes[0].is_source = True

        self.source = [self.nodes[0]]
        if there_is_target:
            self.middle = self.nodes[1:-1]
            self.target = [self.nodes[-1]]
        else:
            self.middle = self.nodes[1:]
            self.target = []
def _get_spacy_nlp_english(load_parser):
    import spacy
    start = time.time()
    print_debug("Spacy NLP English, Parser: {} -- INIT START".format(str(load_parser)))

    if load_parser is True:
        nlp = spacy.load('en', entity=False)
    else:
        nlp = spacy.load('en', parser=False, entity=False)

    print_debug("Spacy NLP English, Parser: {} -- INIT END : ".format(str(load_parser)), (time.time() - start))
    return nlp
def generate(self, dataset):
    """
    :type dataset: nalaf.structures.data.Dataset
    """
    for sentence in dataset.sentences():
        try:
            sentence[0].features['BOS'] = 1
            sentence[-1].features['EOS'] = 1
        except IndexError as e:
            if isinstance(sentence, str):
                raise Exception("Could not index the following sentence; likely the sentence was not tokenized: {}".format(sentence), e)
            else:
                print_debug("ERROR: {}. Ignoring this sentence (type: {}); it is either empty or not tokenized: {}".format(e, type(sentence), sentence))
def classify(self, instancesfile):
    predictionsfile = tempfile.NamedTemporaryFile('r+', delete=False)
    print_debug("predict: svm predictions file: " + predictionsfile.name)

    callv = [self.svm_classify_call, '-v', '1', instancesfile.name, self.model_path, predictionsfile.name]
    print_debug("svm light classify parameters: " + ' '.join(callv) + "\n")

    exitcode = subprocess.call(callv)
    if exitcode != 0:
        raise Exception("Error when tagging: " + ' '.join(callv))

    predictionsfile.flush()
    # Note: we do not close the file

    return predictionsfile
def learn(self, instancesfile, c=None):
    with instancesfile:
        if self.use_tree_kernel:
            callv = [
                self.svm_learn_call,
                '-v', self.verbosity_level,
                '-t', '5',
                '-T', '1',
                '-W', 'S',
                '-V', 'S',
                '-C', '+',
                '-c', str(c),
                instancesfile.name,
                self.model_path]
        else:
            callv = [self.svm_learn_call, '-v', self.verbosity_level]
            if c is not None:
                callv = callv + ['-c', str(c)]
            callv = callv + [instancesfile.name, self.model_path]

        print_debug("svm light learn parameters: " + ' '.join(callv) + "\n")
        subprocess.call(callv)

        return self.model_path
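# For orientation (a hedged summary from the public SVM-Light documentation, not from
# the original source): in plain SVM-Light, `-v` sets the verbosity level and `-c` the
# trade-off between training error and margin. The tree-kernel branch targets
# SVM-Light-TK, where `-t 5` selects the tree-kernel machinery and `-C +` (as best
# recalled from its docs) sums the tree (`|BT| ... |ET|`) and feature-vector parts of
# each instance. A resulting non-tree-kernel call looks roughly like:
#
#     svm_learn -v 0 -c 0.5 /tmp/instances.svmlight /tmp/model.bin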
def cross_validate(annotator_gen_fun, corpus, evaluator, k_num_folds, use_validation_set=True):
    merged_evaluations = []

    print_debug("Cross-Validation")

    for training_set, evaluation_set in corpus.cv_kfold_splits(k_num_folds, validation_set=use_validation_set):
        annotator_apply = annotator_gen_fun(training_set)
        annotator_apply(evaluation_set)

        r = evaluator.evaluate(evaluation_set)
        print_debug(r)
        merged_evaluations.append(r)

    ret = Evaluations.merge(merged_evaluations)
    print_debug("\n" + str(ret) + "\n")

    return ret
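# A minimal usage sketch (all names below are hypothetical, not from the original
# source): `annotator_gen_fun` must be a factory that, given a fold's training split,
# returns a callable which annotates the corresponding evaluation split.

def example_annotator_gen_fun(training_set):
    model = train_relation_model(training_set)      # hypothetical training step

    def apply(evaluation_set):
        annotate_with_model(model, evaluation_set)  # hypothetical tagging step

    return apply

# evaluations = cross_validate(example_annotator_gen_fun, corpus, evaluator, k_num_folds=5)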
def train(self, training_corpus):
    X, y = self.__convert_edges_features_to_vector_instances(training_corpus)
    X = self.preprocess.fit_transform(X)

    print_debug("SVC after preprocessing, #features: {} && max value: {}".format(X.shape[1], max(sklearn.utils.sparsefuncs.min_max_axis(X, axis=0)[1])))
    print_debug("Train SVC with #samples {} - #features {} - params: {}".format(X.shape[0], X.shape[1], str(self.model.get_params())))

    start = time.time()
    self.model.fit(X, y)
    end = time.time()
    print_debug("SVC train, running time: ", (end - start))

    return self
def tag(self, data):
    """
    :type data: nalaf.structures.data.Dataset
    """
    for doc_id, doc in data.documents.items():
        if doc_id in self.cache:
            print_debug("Use cached response", doc_id)
            response_text = self.cache[doc_id]

        elif len(doc.parts) == 2 and self._is_pmid(doc_id):
            print_debug("Use PMID-based API", doc_id)
            r = requests.get(self.url_tmvar_pmids.format(doc_id))
            if r.status_code == 200:
                response_text = r.text
                self.cache[doc_id] = response_text
            else:
                continue

        else:
            print_debug("Use free-text API", doc_id)
            r = requests.post(self.url_tmvar_freetext, self._doc_to_json(doc))
            if 'Receive' in r.url:
                s = 501
                while s == 501:
                    time.sleep(5)
                    s = requests.get(r.url)
                    response_text = s.text
                    s = s.status_code
                response_text = '[' + response_text + ']'
                self.cache[doc_id] = response_text
            else:
                continue

        if response_text.startswith('[Error]'):
            warnings.warn(response_text)
        else:
            if response_text.startswith("["):
                self._parse_json(doc_id, doc, response_text)
            else:
                self._parse_pubtator(doc_id, doc, response_text)
def _clean_predictions(self, dataset, name="tagger"):
    for part in dataset.parts():
        print_debug(name, [ann.text for ann in part.predicted_annotations])
        part.predicted_annotations = []
def train(argv):
    parser = argparse.ArgumentParser(description='Train model')

    parser.add_argument('--training_corpus',
                        help='Name of the corpus to train on. Ex: nala_training, IDP4+_training, nala_training_5')
    parser.add_argument('--test_corpus', help='Name of the corpus to test on')
    parser.add_argument('--string', help='String to tag')

    parser.add_argument('--validation', required=False, default="stratified",
                        choices=["cross-validation", "stratified", "none"],
                        help='Type of validation to use when training')

    parser.add_argument('--cv_n', required=False,
                        help='if given, cross validation (instead of stratification) is used for validating the training. '
                             'In this case you must also set `cv_fold` and only that fold number will be run')
    parser.add_argument('--cv_fold', required=False,
                        help='fold number if cross validation is activated (it starts at 0; i.e. for cv_n=5, you have folds: [0,1,2,3,4])')

    parser.add_argument('--output_folder', required=False,
                        help='Folder where the training model is written to. Otherwise a tmp folder is used')
    parser.add_argument('--model_name_suffix', default='', required=False,
                        help='Optional suffix to add to the generated model name in training mode')
    parser.add_argument('--write_anndoc', required=False, default=False, action='store_true',
                        help='Write anndoc of predicted test_corpus (validation corpus in fact)')
    parser.add_argument('--model_path_1', required=False,
                        help='Path of the first model binary file if evaluation is performed')
    parser.add_argument('--model_path_2', required=False,
                        help='Path of the second model binary file if evaluation is performed with two models')

    parser.add_argument('--labeler', required=False, default="BIEO", choices=["BIEO", "BIO", "IO", "11labels"],
                        help='Labeler to use for training')

    parser.add_argument('--mutations_specific', default='True',
                        help='Apply feature pipelines specific to mutations or otherwise (false) use the general one')

    parser.add_argument('--only_class_id', required=False, default=MUT_CLASS_ID,
                        help="By default, only the mutation entities are read from corpora (assumed to have class_id == '" + MUT_CLASS_ID + "'). Set this class_id to filter the rest out")

    parser.add_argument('--delete_subclasses', required=False, default="",
                        help='Comma-separated subclasses to delete. Example: "2,3"')

    parser.add_argument('--pruner', required=False, default="parts", choices=["parts", "sentences"])
    parser.add_argument('--ps_ST', required=False, default=False, action='store_true')
    parser.add_argument('--ps_NL', required=False, default=False, action='store_true')
    parser.add_argument('--ps_random', required=False, default=0.0, type=float)

    parser.add_argument('--elastic_net', action='store_true',
                        help='Use elastic net regularization')

    parser.add_argument('--word_embeddings', '--we', default='True',
                        help='Use word embeddings features')
    parser.add_argument('--we_additive', type=float, default=0)
    parser.add_argument('--we_multiplicative', type=float, default=1)
    parser.add_argument('--we_model_location', type=str, default=None)

    parser.add_argument('--use_feat_windows', default='True')

    parser.add_argument('--nl', action='store_true', help='Use NLMentionFeatureGenerator')
    parser.add_argument('--nl_threshold', type=int, default=0)
    parser.add_argument('--nl_window', action='store_true', help='use window feature for NLFeatureGenerator')

    parser.add_argument('--execute_pp', default='True',
                        help='Execute post processing specific to mutations (default) or not')

    parser.add_argument('--keep_silent', default='True',
                        help='Keep silent mutations (default) or not, i.e., delete mentions like `Cys23-Cys`')
    parser.add_argument('--keep_genetic_markers', default='True',
                        help='Keep genetic markers of the form D17S250, true (default) or false')
    parser.add_argument('--keep_unnumbered', default='True',
                        help='Keep unnumbered mentions (default) or not, i.e., delete mentions like `C -> T`')
    parser.add_argument('--keep_rs_ids', default='True',
                        help='Keep rs-id mentions (default) or not, i.e., delete mentions like `rs1801280` or `ss221`')

    parser.add_argument('--dictionaries_paths', default=None,
                        help='Dictionary paths to use for dictionary features. Can be used within hdfs')
    parser.add_argument('--dictionaries_stop_words', default=None,
                        help='Stop words for dictionaries if these are used')

    parser.add_argument('--hdfs_url', required=False, default=None, type=str,
                        help='URL of hdfs if this is used')
    parser.add_argument('--hdfs_user', required=False, default=None, type=str,
                        help="user of hdfs if this is used. Must be given if `hdfs_url` is given")

    FALSE = ['false', 'f', '0', 'no', 'none']

    def arg_bool(arg_value):
        return False if arg_value.lower() in FALSE else True

    args = parser.parse_args(argv)

    start_time = time.time()

    # ------------------------------------------------------------------------------

    delete_subclasses = []
    for c in args.delete_subclasses.split(","):
        c = c.strip()
        if c:
            delete_subclasses.append(int(c))

    args.delete_subclasses = delete_subclasses

    if not args.output_folder:
        args.output_folder = tempfile.mkdtemp()

    str_delete_subclasses = "None" if not args.delete_subclasses else str(args.delete_subclasses).strip('[]').replace(' ', '')

    if args.labeler == "BIEO":
        labeler = BIEOLabeler()
    elif args.labeler == "BIO":
        labeler = BIOLabeler()
    elif args.labeler == "IO":
        labeler = IOLabeler()
    elif args.labeler == "11labels":
        labeler = TmVarLabeler()

    args.word_embeddings = arg_bool(args.word_embeddings)
    if args.word_embeddings:
        args.we_params = {
            'additive': args.we_additive,
            'multiplicative': args.we_multiplicative,
            'location': args.we_model_location
        }
    else:
        args.we_params = {}  # means: do not use word embeddings

    if args.nl:
        args.nl_features = {
            'threshold': args.nl_threshold,  # threshold for neighbour space in dictionaries
            'window': args.nl_window,
        }
    else:
        args.nl_features = None

    if args.elastic_net:
        args.crf_train_params = {
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
        }
    else:
        args.crf_train_params = None

    args.use_feat_windows = arg_bool(args.use_feat_windows)
    args.mutations_specific = arg_bool(args.mutations_specific)
    args.execute_pp = arg_bool(args.execute_pp)
    args.keep_silent = arg_bool(args.keep_silent)
    args.keep_genetic_markers = arg_bool(args.keep_genetic_markers)
    args.keep_unnumbered = arg_bool(args.keep_unnumbered)
    args.keep_rs_ids = arg_bool(args.keep_rs_ids)

    args.do_train = False if args.model_path_1 else True

    if args.cv_n is not None or args.cv_fold is not None:
        args.validation = "cross-validation"

    if args.validation == "cross-validation":
        assert args.cv_n is not None and args.cv_fold is not None, "You must set both cv_n AND cv_fold"

    # ------------------------------------------------------------------------------

    if args.training_corpus:
        # Get the name of the training corpus even if this is given as a folder path, in which case the last folder name is used
        training_corpus_name = list(filter(None, args.training_corpus.split('/')))[-1]

        args.model_name = "{}_{}_del_{}".format(training_corpus_name, args.labeler, str_delete_subclasses)

        if args.validation == "cross-validation":
            args.model_name += "_cvfold_" + str(args.cv_fold)

        args.model_name_suffix = args.model_name_suffix.strip()
        if args.model_name_suffix:
            args.model_name += "_" + str(args.model_name_suffix)

    else:
        args.model_name = args.test_corpus

    # ------------------------------------------------------------------------------

    def stats(dataset, name):
        print('\n\t{} size: {}'.format(name, len(dataset)))
        print('\tsubclass distribution: {}'.format(repr(dataset)))
        # Caveat: the dataset must be passed through the pipeline first
        print('\tnum sentences: {}\n'.format(sum(1 for x in dataset.sentences())))

    definer = ExclusiveNLDefiner()

    if args.training_corpus:
        train_set = get_corpus(args.training_corpus, only_class_id=args.only_class_id, hdfs_url=args.hdfs_url, hdfs_user=args.hdfs_user)

        if args.test_corpus:
            test_set = get_corpus(args.test_corpus, only_class_id=args.only_class_id, hdfs_url=args.hdfs_url, hdfs_user=args.hdfs_user)
        elif args.string:
            test_set = StringReader(args.string).read()
        elif args.validation == "none":
            test_set = None
        elif args.validation == "cross-validation":
            train_set, test_set = train_set.fold_nr_split(int(args.cv_n), int(args.cv_fold))
        elif args.validation == "stratified":
            definer.define(train_set)
            train_set, test_set = train_set.stratified_split()

    elif args.test_corpus:
        train_set = None
        test_set = get_corpora(args.test_corpus, args.only_class_id)

    elif args.string:
        train_set = None
        test_set = StringReader(args.string).read()

    else:
        raise Exception("you must give at least one parameter of: training_corpus, test_corpus, or string")

    def verify_corpus(corpus):
        if corpus is not None:
            assert len(corpus) > 0, f"The corpus should have at least one document; had 0: {args.training_corpus}"
            assert next(corpus.entities(), None) is not None, "The corpus should have at least one entity; had 0"

    verify_corpus(train_set)

    # ------------------------------------------------------------------------------

    if args.mutations_specific:
        print("Pipeline specific to mutations")
        features_pipeline = get_prepare_pipeline_for_best_model(args.use_feat_windows, args.we_params, args.nl_features)
    else:
        print("Pipeline is general")
        features_pipeline = get_prepare_pipeline_for_best_model_general(args.use_feat_windows, args.we_params, args.dictionaries_paths, args.hdfs_url, args.hdfs_user, args.dictionaries_stop_words)

    # ------------------------------------------------------------------------------

    def print_run_args():
        for key, value in sorted((vars(args)).items()):
            print("\t{} = {}".format(key, value))
        print()

    print("Running arguments: ")
    print_run_args()

    # ------------------------------------------------------------------------------

    def train(train_set):
        definer.define(train_set)
        train_set.delete_subclass_annotations(args.delete_subclasses)
        features_pipeline.execute(train_set)
        labeler.label(train_set)

        if args.pruner == "parts":
            train_set.prune_empty_parts()
        else:
            try:
                f = HighRecallRegexClassifier(ST=args.ps_ST, NL=args.ps_NL)
            except AssertionError:
                f = (lambda _: False)
            train_set.prune_filtered_sentences(filterin=f, percent_to_keep=args.ps_random)

        stats(train_set, "training")

        model_path = os.path.join(args.output_folder, args.model_name + ".bin")
        PyCRFSuite.train(train_set, model_path, args.crf_train_params)

        return model_path

    # ------------------------------------------------------------------------------

    if args.do_train:
        args.model_path_1 = train(train_set)

    # ------------------------------------------------------------------------------

    def test(tagger, test_set, print_eval=True, print_results=False):
        tagger.tag(test_set)
        definer.define(test_set)
        stats(test_set, "test")

        evaluation = MentionLevelEvaluator(subclass_analysis=True).evaluate(test_set)

        print_run_args()

        if print_eval:
            print(evaluation)
        if print_results:
            ConsoleWriter(ent1_class_id=PRO_CLASS_ID, ent2_class_id=MUT_CLASS_ID, color=True).write(test_set)

    # ------------------------------------------------------------------------------

    assert args.model_path_1 is not None

    if args.model_path_2:
        tagger = NalaMultipleModelTagger(
            st_model=args.model_path_1,
            all3_model=args.model_path_2,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)
    else:
        tagger = NalaSingleModelTagger(
            bin_model=args.model_path_1,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)

    # ------------------------------------------------------------------------------

    print("\n{}".format(args.model_name))

    if train_set:
        stats(train_set, "training")

    if test_set:
        test(tagger, test_set, print_eval=args.string is None, print_results=args.string is not None)

    if args.do_train:
        print("\nThe model is saved to: {}\n".format(args.model_path_1))

    if args.write_anndoc:
        outdir = os.path.join(args.output_folder, args.model_name)
        os.mkdir(outdir)
        print("\nThe predicted test data is saved to: {}\n".format(outdir))
        TagTogFormat(test_set, use_predicted=True, to_save_to=outdir).export(0)

    end_time = time.time()

    print_debug("Elapsed time: ", (end_time - start_time))

    return {
        "tagger": tagger,
        "trained_model_path": args.model_path_1,
        "training_num_docs": 0 if train_set is None else len(train_set.documents),
        "training_num_annotations": 0 if train_set is None else sum(1 for e in train_set.entities() if e.class_id == args.only_class_id)
    }
def evaluate(self, dataset):
    """
    :type dataset: nalaf.structures.data.Dataset
    :returns (tp, fp, fn, tp_overlapping, precision, recall, f_measure): (int, int, int, int, float, float, float)

    Calculates precision, recall and subsequently F1 measure, defined as:

    * precision: number of correctly predicted items as a percentage of the total number of predicted items
        len(predicted items that are also real) / len(predicted)
        or in other words: tp / (tp + fp)
    * recall: number of correctly predicted items as a percentage of the total number of correct items
        len(real items that are also predicted) / len(real)
        or in other words: tp / (tp + fn)
    * possibly considers overlapping matches as well
    """
    TOTAL = MentionLevelEvaluator.TOTAL_LABEL
    labels = [TOTAL]

    def labelize(e):
        """
        Use this to represent an entity subclass as a string and, if this is None or False (but not 0!),
        represent the entity with its class_id instead.

        Convert the subclass / class ids to strings to avoid the misstep of comparing the possible subclass '0'
        with False, which in python breaks the universe --> info: https://twitter.com/juanmirocks/status/802209750612054016
        """
        return str(e.subclass) if str(e.subclass) not in ['None', 'False'] else str(e.class_id)

    if self.subclass_analysis:
        # Find all possible subclasses or otherwise full classes
        subclasses = set(labelize(e) for e in dataset.entities())
        subclasses.update(set(labelize(e) for e in dataset.predicted_entities()))
        for x in subclasses:
            labels.append(x)

    docids = dataset.documents.keys()
    subcounts = ['tp', 'fp', 'fn', 'fp_ov', 'fn_ov']
    counts = {label: {docid: dict.fromkeys(subcounts, 0) for docid in docids} for label in labels}

    for docid, doc in dataset.documents.items():
        for partid, part in doc.parts.items():

            overlap_real = {label: [] for label in labels}
            overlap_predicted = {label: [] for label in labels}

            Entity.equality_operator = 'overlapping'
            for ann_a in part.annotations:
                for ann_b in part.predicted_annotations:
                    if ann_a == ann_b:  # equal according to the exclusive overlapping equality (not exact)
                        overlap_real[TOTAL].append(ann_a)
                        overlap_predicted[TOTAL].append(ann_b)

                        if self.subclass_analysis:
                            if labelize(ann_a) != labelize(ann_b):
                                print_debug('overlapping subclasses do not match', ann_a.subclass, ann_b.subclass)
                                ann_b.subclass = ann_a.subclass

                            overlap_real[labelize(ann_a)].append(ann_a)
                            overlap_predicted[labelize(ann_b)].append(ann_b)

            Entity.equality_operator = 'exact'
            for ann in part.predicted_annotations:
                if ann in part.annotations:
                    counts[TOTAL][docid]['tp'] += 1
                    print_verbose(" ", docid, ": TRUE POSITIVE", ann)
                    if self.subclass_analysis:
                        counts[labelize(ann)][docid]['tp'] += 1
                else:
                    counts[TOTAL][docid]['fp'] += 1
                    if ann in overlap_predicted[TOTAL]:
                        counts[TOTAL][docid]['fp_ov'] += 1
                    else:
                        print_debug(" ", docid, ": FALSE POSITIVE", ann)

                    if self.subclass_analysis:
                        counts[labelize(ann)][docid]['fp'] += 1
                        if ann in overlap_predicted[labelize(ann)]:
                            counts[labelize(ann)][docid]['fp_ov'] += 1

            for ann in part.annotations:
                if ann not in part.predicted_annotations:
                    counts[TOTAL][docid]['fn'] += 1
                    if ann in overlap_real[TOTAL]:
                        counts[TOTAL][docid]['fn_ov'] += 1
                    else:
                        print_debug(" ", docid, ": FALSE NEGATIVE", ann)

                    if self.subclass_analysis:
                        counts[labelize(ann)][docid]['fn'] += 1
                        if ann in overlap_real[labelize(ann)]:
                            counts[labelize(ann)][docid]['fn_ov'] += 1

    evaluations = Evaluations()

    for label in labels:
        evaluations.add(EvaluationWithStandardError(label, counts[label]))

    return evaluations
def evaluate(self, dataset):
    """
    :type dataset: nalaf.structures.data.Dataset
    :returns (tp, fp, fn, precision, recall, f_measure): (int, int, int, float, float, float)

    Calculates precision, recall and subsequently F1 measure, defined as:

    * precision: number of correctly predicted items as a percentage of the total number of predicted items
        len(predicted items that are also real) / len(predicted)
        or in other words: tp / (tp + fp)
    * recall: number of correctly predicted items as a percentage of the total number of correct items
        len(real items that are also predicted) / len(real)
        or in other words: tp / (tp + fn)
    """
    TOTAL = EntityEvaluator.TOTAL_LABEL
    labels = [TOTAL]

    # Find all possible subclasses or otherwise full classes
    labels += list(set(__class__._labelize(e) for e in dataset.entities()))
    labels += list(set(__class__._labelize(e) for e in dataset.predicted_entities()))

    docids = dataset.documents.keys()
    subcounts = ['tp', 'fp', 'fn']
    counts = {label: {docid: dict.fromkeys(subcounts, 0) for docid in docids} for label in labels}

    for docid, doc in dataset.documents.items():
        for partid, part in doc.parts.items():

            gold_anns = set(filter(None, (self.entity_map_fun(e) for e in part.annotations)))
            pred_anns = set(filter(None, (self.entity_map_fun(e) for e in part.predicted_annotations)))

            for pred in pred_anns:
                accept_decisions = {self.entity_accept_fun(gold, pred) for gold in gold_anns}
                assert set.issubset(accept_decisions, {True, False, None}), "did not expect: " + str(accept_decisions)

                if True in accept_decisions:
                    # Count the true positives while iterating on gold
                    pass
                elif None in accept_decisions:
                    pass
                else:
                    # Either False or the set is empty, meaning that there are no gold annotations
                    print_debug(" ", docid, ": FALSE POSITIVE", pred)
                    counts[TOTAL][docid]['fp'] += 1
                    counts[__class__._labelize(pred)][docid]['fp'] += 1

            for gold in gold_anns:
                accept_decisions = {self.entity_accept_fun(gold, pred) for pred in pred_anns}

                if True in accept_decisions:
                    print_verbose(" ", docid, ": true positive", gold)
                    counts[TOTAL][docid]['tp'] += 1
                    counts[__class__._labelize(gold)][docid]['tp'] += 1
                elif "UNKNOWN:" in gold:
                    # Pass when the normalization is unknown
                    pass
                else:
                    print_debug(" ", docid, ": FALSE NEGATIVE", gold)
                    counts[TOTAL][docid]['fn'] += 1
                    counts[__class__._labelize(gold)][docid]['fn'] += 1

    evaluations = Evaluations()

    for label in labels:
        evaluations.add(EvaluationWithStandardError(label, counts[label]))

    return evaluations
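# A tiny worked example (illustrative numbers only) of the measures defined in the
# evaluators' docstrings above: with tp=8, fp=2, fn=4, precision = 8 / (8 + 2) = 0.80,
# recall = 8 / (8 + 4) = 0.67, and F1 = 2 * 0.80 * 0.67 / (0.80 + 0.67) = 0.73.

tp, fp, fn = 8, 2, 4
precision = tp / (tp + fp)                           # 0.8
recall = tp / (tp + fn)                              # 0.666...
f1 = 2 * precision * recall / (precision + recall)   # 0.727...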
def filter(self, documents, min_found=1, use_nala=False):
    """
    :type documents: collections.Iterable[(str, nalaf.structures.data.Document)]
    """
    _progress = 1
    _start_time = time.time()
    _total_time = 0

    _time_avg_per_pattern = 0
    _pattern_calls = 0
    _time_reg_pattern_total = 0
    _time_max_pattern = 0
    _low_performant_pattern = ""

    # NLDefiners init
    exclusive_definer = ExclusiveNLDefiner()
    _e_array = [0, 0, 0]
    inclusive_definer = InclusiveNLDefiner()
    _i_array = [0, 0]

    last_found = 0
    crf = PyCRFSuite(self.location_binary_model)

    # counter_to_stop_for_caching = 0

    for pmid, doc in documents:
        # If any part of the document contains any of the keywords, yield that document

        # if counter_to_stop_for_caching > 400:
        #     break
        # counter_to_stop_for_caching += 1
        # print(counter_to_stop_for_caching)

        part_offset = 0
        data_tmp = Dataset()
        data_tmp.documents[pmid] = doc
        data_nala = deepcopy(data_tmp)
        NLTKSplitter().split(data_tmp)
        # data_tmvar = TmVarTagger().generate_abstracts([pmid])

        if use_nala:
            self.pipeline.execute(data_nala)
            self.labeler.label(data_nala)
            crf.tag(data_nala, MUT_CLASS_ID)
            PostProcessing().process(data_nala)
            ExclusiveNLDefiner().define(data_nala)

        used_regexs = {}

        positive_sentences = 0
        for i, x in enumerate(doc.parts):
            # print("Part", i)
            sent_offset = 0
            cur_part = doc.parts.get(x)
            sentences = cur_part.sentences_

            for sent in sentences:
                sent_length = len(sent)
                new_text = sent.lower()
                new_text = re.sub('[\./\\-(){}\[\],%]', ' ', new_text)
                # new_text = re.sub('\W+', ' ', new_text)

                found_in_sentence = False

                for i, reg in enumerate(self.patterns):
                    _lasttime = time.time()  # time start var
                    match = reg.search(new_text)

                    # Debug bottleneck patterns
                    _time_current_reg = time.time() - _lasttime  # time end var
                    _pattern_calls += 1  # pattern calls that already occurred
                    _time_reg_pattern_total += _time_current_reg  # total time spent on searching with patterns
                    if _time_reg_pattern_total > 0:
                        _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg time spent per pattern call

                    # todo create pattern performance eval for descending amount of recognized patterns
                    # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                    #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                    # if _time_max_pattern < _time_current_reg:
                    #     _time_max_pattern = _time_current_reg
                    #     _low_performant_pattern = reg.pattern
                    #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                    # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                    #     if _time_current_reg > _time_avg_per_pattern * 10:
                    #         # print(_time_avg_per_pattern, _time_current_reg)
                    #         f.write("BAD_PATTERN\n")
                    #         f.write(sent + "\n")
                    #         f.write(new_text + "\n")

                    if match:
                        # if pmid in data_tmvar.documents:
                        #     anti_doc = data_tmvar.documents.get(pmid)
                        nala_doc = data_nala.documents.get(pmid)

                        start = part_offset + sent_offset + match.span()[0]
                        end = part_offset + sent_offset + match.span()[1]
                        # print("TmVar is not overlapping?:", not anti_doc.overlaps_with_mention(start, end))
                        # print(not nala_doc.overlaps_with_mention(start, end, annotated=False))

                        if reg.pattern in used_regexs:
                            used_regexs[reg.pattern] += 1
                        else:
                            used_regexs[reg.pattern] = 1

                        print(color.PURPLE + new_text.replace(match.group(), color.BOLD + color.DARKCYAN + color.UNDERLINE + match.group() + color.END + color.PURPLE) + color.END)

                        if not found_in_sentence:
                            positive_sentences += 1
                            found_in_sentence = True

                        # if not anti_doc.overlaps_with_mention(start, end) \
                        #         and not nala_doc.overlaps_with_mention(start, end, annotated=False):
                        #     _e_result = exclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #     _e_array[_e_result] += 1
                        #     _i_result = inclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #     _i_array[_i_result] += 1
                        #     # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                        #     # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                        #     last_found += 1
                        #     found_in_sentence = True
                        # else:
                        #     # if nala is not used, only tmvar is considered
                        #     if not anti_doc.overlaps_with_mention(start, end):
                        #         _e_result = exclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #         _e_array[_e_result] += 1
                        #         _i_result = inclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #         _i_array[_i_result] += 1
                        #         # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                        #         # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                        #         last_found += 1
                        #         found_in_sentence = True

                        if use_nala:
                            nala_found_mention = nala_doc.overlaps_with_mention(start, end, annotated=False)
                            if nala_found_mention:
                                print_verbose(nala_found_mention)
                                if nala_found_mention.subclass > 0 and nala_found_mention.confidence <= self.threshold:
                                    yield pmid, doc

                    if time.time() - _lasttime > 1:
                        print_verbose('time intensive regex', i)

                sent_offset += 2 + sent_length

                # For per-sentence positives
                if found_in_sentence:
                    positive_sentences += 1

            part_offset += sent_offset

        if use_nala:
            for part in nala_doc:
                for ann in part.predicted_annotations:
                    if ann.subclass > 0:
                        print_verbose(part.text[:ann.offset] + color.BOLD + ann.text + color.END + part.text[ann.offset + len(ann.text):])
                        positive_sentences += min_found

        _old_time = _start_time
        _start_time = time.time()
        _one_time = _start_time - _old_time

        if _one_time > 0.3 and positive_sentences > min_found:
            _progress += 1
            _total_time += _one_time
            _time_per_doc = _total_time / _progress
            print_verbose("PROGRESS: {:.2f} secs ETA per one positive document: {:.2f} secs".format(_total_time, _time_per_doc))

        print_debug('used regular expressions:', json.dumps(used_regexs, indent=4))

        if positive_sentences >= min_found:
            last_found = 0
            print_verbose('YEP', pmid)
            yield pmid, doc
        else:
            print_verbose('NOPE', pmid)
def create_input_file(self, dataset, mode, features, minority_class=None, majority_class_undersampling=1.0):
    string = ''

    # Real counts vs used ones after undersampling is applied
    num_pos_instances = [0, 0]
    num_neg_instances = [0, 0]
    num_unl_instances = [0, 0]

    allowed_features_keys = set(features.values())

    for edge in dataset.edges():
        if edge.real_target == +1:
            num_pos_instances[0] += 1
        elif edge.real_target == -1:
            num_neg_instances[0] += 1
        else:
            num_unl_instances[0] += 1

        if mode != 'train' or minority_class is None or edge.real_target == minority_class or random() <= majority_class_undersampling:

            if edge.real_target == +1:
                num_pos_instances[1] += 1
            elif edge.real_target == -1:
                num_neg_instances[1] += 1
            else:
                num_unl_instances[1] += 1

            # (Estimation) Writing any dummy target/class (0 in particular) or the actual target is irrelevant
            # Yet, with the actual target, svmlight can throw useful evaluation performance numbers
            instance_label = str(edge.real_target)

            string += instance_label

            if self.use_tree_kernel:
                string += ' |BT| '
                string += edge.same_part.sentence_parse_trees[edge.same_sentence_id]
                string += ' |ET|'

            for key in sorted(edge.features.keys()):
                if key in allowed_features_keys:
                    value = edge.features[key]
                    string += ' ' + str(key) + ':' + str(value)

            string += '\n'

    instancesfile = tempfile.NamedTemporaryFile('w', delete=False)
    print_debug("{}: svmlight instances file: {}".format(mode, instancesfile.name))

    instancesfile.write(string)
    instancesfile.flush()
    # Note: we do not close the file

    total_real = (num_pos_instances[0] + num_neg_instances[0] + num_unl_instances[0])
    total_used = (num_pos_instances[1] + num_neg_instances[1] + num_unl_instances[1])

    print_line = "{}: instances, #REAL: {} == P: {} vs N: {} vs ?: {} || vs. #USED: {} == P {} vs N: {} vs ?: {}"
    print_debug(print_line.format(mode, total_real, num_pos_instances[0], num_neg_instances[0], num_unl_instances[0], total_used, num_pos_instances[1], num_neg_instances[1], num_unl_instances[1]))

    return instancesfile
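# For orientation (format per the public SVM-Light conventions; the concrete feature
# ids and values below are made up): each line written above is
# `<target> <feature_id>:<value> ...`, with feature ids sorted in ascending order, e.g.:
#
#     +1 3:0.5 17:1.0 42:2.0
#     -1 5:1.0 17:0.25
#
# and, when tree kernels are enabled (SVM-Light-TK), the sentence parse tree is
# embedded between the `|BT|` and `|ET|` markers right after the target.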