Example #1
    def _create_instances(num_features, corpus, setting_function):
        """
        :rtype: Tuple[scipy.sparse.csr_matrix, List[int]]
        """
        start = time.time()

        num_instances = sum(1 for _ in corpus.edges())

        # We first construct the X matrix of features as a sparse lil_matrix, which is efficient for dynamically changing its structure
        # At the end, we convert this to csr_matrix, which is efficient for algebra operations
        # See https://docs.scipy.org/doc/scipy-0.18.1/reference/generated/scipy.sparse.lil_matrix.html#scipy.sparse.lil_matrix
        # See http://scikit-learn.org/stable/modules/svm.html#svm
        X = scipy.sparse.lil_matrix((num_instances, num_features),
                                    dtype=np.float64)
        y = np.zeros(
            num_instances, order='C'
        )  # -- see: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

        X, y, groups = setting_function(X, y, corpus)

        X = X.tocsr()

        end = time.time()
        print_debug("SVC convert instances, running time: ", (end - start))

        return (X, y, groups)
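
The lil_matrix-then-csr_matrix pattern used above can be shown in isolation. The following is a minimal, self-contained sketch (independent of the corpus and setting_function) of why the feature matrix is assembled in LIL format and converted to CSR only at the end:

import numpy as np
import scipy.sparse

# LIL format is cheap to fill cell by cell while features are generated
X = scipy.sparse.lil_matrix((3, 5), dtype=np.float64)
X[0, 1] = 1.0
X[2, 4] = 0.5

# CSR format is what the algebra (and scikit-learn's SVC) wants; convert once at the end
X = X.tocsr()
print(X.shape, X.nnz)  # (3, 5) 2
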
Example #2
    def execute(self, dataset, only_features=False):
        # Note: the order of splitter/tokenizer/edger/parser is important
        # Note: we could avoid the re-splitting & tokenization (see c3d320f08ed8893460d5a68b1b5c87aab6ea0c27)
        #   yet that may later create unforeseen problems and re-doing it has no significant impact on running time

        start = time.time()

        if not only_features:
            self.splitter.split(dataset)
            self.tokenizer.tokenize(dataset)
            self.parser.parse(
                dataset
            )  # Note: the percolate_tokens_to_entities should go before the edge generator due to sentence adjustments
            self.edge_generator.generate(dataset)

        # The labels are always re-generated
        dataset.label_edges()

        for feature_generator in self.feature_generators:
            feature_generator.generate(dataset,
                                       self.feature_set,
                                       use_gold=self.edge_generator.use_gold,
                                       use_pred=self.edge_generator.use_pred)

        end = time.time()
        print_debug(
            "Relation pipeline (only_features: {}), running time: {}".format(
                only_features, str(end - start)))
Example #3
    def __init__(self):
        self.jar_path = pkg_resources.resource_filename(
            'nalaf.data', "biolemmatizer-core-1.2-jar-with-dependencies.jar")
        if not os.path.isfile(self.jar_path):
            raise Exception("Could't find biolemmatizer jar: " + self.jar_path)

        self.program = ["java", "-Xmx1G", "-jar", self.jar_path, "-l", "-t"]
        self.p = Popen(self.program,
                       universal_newlines=True,
                       stdin=PIPE,
                       stdout=PIPE,
                       stderr=PIPE,
                       bufsize=1)
        BioLemmatizer.__setNonBlocking(self.p.stdout)
        BioLemmatizer.__setNonBlocking(self.p.stderr)

        # Initialize java program
        print_debug("BioLemmatizer: INIT START")
        out = None
        while not out:
            try:
                out = self.p.stdout.read()
            except TypeError as e:
                continue
            else:
                if ("Running BioLemmatizer in interactive mode" in out):
                    break
                else:
                    out = None

        print_debug("BioLemmatizer: INIT END")
Example #4
    def __init__(self,
                 model_path=None,
                 classification_threshold=0.0,
                 use_tree_kernel=False,
                 svmlight_dir_path=''):

        self.model_path = model_path if model_path is not None else tempfile.NamedTemporaryFile().name
        """the model (path) to read from / write to"""
        print_debug("SVM-Light model file path: " + self.model_path)

        self.classification_threshold = classification_threshold

        self.use_tree_kernel = use_tree_kernel
        """whether to use tree kernels or not"""

        self.svmlight_dir_path = svmlight_dir_path
        """
        The directory where the executables svm_classify and svm_learn are located.
        Defaults to the empty string '', which then means that the svmlight executables must be in your binary path
        """

        executables_extension = '' if sys.platform.startswith(
            'linux') or sys.platform.startswith('darwin') else '.exe'
        self.svm_learn_call = os.path.join(
            self.svmlight_dir_path, ('svm_learn' + executables_extension))
        self.svm_classify_call = os.path.join(
            self.svmlight_dir_path, ('svm_classify' + executables_extension))

        self.verbosity_level = str(0)  # for now, verbosity=0 -- alternative: str(1 if is_verbose_mode else 0)
Example #5
    def __read_dictionaries(dic_paths, read_function, string_tokenizer,
                            case_sensitive, stop_words):
        stop_words = DictionaryFeatureGenerator.__normalize_stop_words(
            stop_words)

        ret = []

        for dic_path in dic_paths:
            try:
                reader = read_function(dic_path)
                try:
                    name = DictionaryFeatureGenerator.__get_filename(dic_path)
                    words_set = DictionaryFeatureGenerator.construct_words_set(
                        reader, string_tokenizer, case_sensitive, stop_words)
                    generator = DictionaryFeatureGenerator(
                        name, words_set, case_sensitive)
                    ret.append(generator)
                finally:
                    reader.close()
            except Exception as e:
                traceback.print_exc()
                print_debug("Could not read dictionary: {}".format(dic_path),
                            e)
                continue

        print_verbose("Using dictionaries: {}".format(", ".join(
            (repr(x) for x in ret))))

        return ret
Example #6
    def annotate(self, corpus):
        X, y = self.__convert_edges_features_to_vector_instances(corpus)

        if X.shape[0] == 0:
            # no instances at all (corpus with no edges) --> nothing to do with the corpus
            return corpus

        else:
            X = self.preprocess.transform(X)
            print_debug(
                "SVC after preprocessing, #features: {} && max value: {}".
                format(
                    X.shape[1],
                    max(sklearn.utils.sparsefuncs.min_max_axis(X, axis=0)[1])))

            # Pure classification prediction
            y_pred = self.model.predict(X)
            print_debug("Mean accuracy: {}".format(
                sum(real == pred for real, pred in zip(y, y_pred)) /
                len(y)))  # same as == self.model.score(X, y))

            for edge, target_pred in zip(corpus.edges(), y_pred):
                edge.pred_target = target_pred

            return corpus.form_predicted_relations()
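
As the inline comment notes, the manual mean-accuracy computation equals scikit-learn's model.score(X, y) for a fitted classifier. A toy check of that equivalence (illustrative data; any scikit-learn classifier would do):

import numpy as np
from sklearn.svm import SVC

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

model = SVC().fit(X, y)
y_pred = model.predict(X)

manual = sum(real == pred for real, pred in zip(y, y_pred)) / len(y)
assert manual == model.score(X, y)
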
Example #7
    def evaluate(self, dataset):
        """
        :type dataset: nala.structures.data.Dataset
        :returns Evaluations
        """

        subcounts = ['tp', 'fp', 'fn']
        counts = {docid: dict.fromkeys(subcounts, 0) for docid in dataset.documents.keys()}

        print_verbose()

        for docid, doc in dataset.documents.items():
            if self.evaluate_only_on_edges_plausible_relations:
                # a set would be better, but so far Relation is unhashable
                relations_search_space = list(dataset.plausible_relations_from_generated_edges())
            else:
                relations_search_space = None

            gold = doc.map_relations(use_predicted=False, relation_type=self.rel_type, entity_map_fun=self.entity_map_fun, relations_search_space=relations_search_space).keys()
            pred = doc.map_relations(use_predicted=True, relation_type=self.rel_type, entity_map_fun=self.entity_map_fun).keys()

            for r_pred in pred:

                accept_decisions = {self.relation_accept_fun(r_gold, r_pred) for r_gold in gold}
                assert set.issubset(accept_decisions, {True, False, None}), "`relation_accept_fun` cannot return: " + str(accept_decisions)

                if True in accept_decisions:
                    # Count the true positives while iterating on gold
                    pass

                elif None in accept_decisions:
                    # Ignore as documented
                    pass

                else:
                    # either False or the set is empty, meaning that there are no gold annotations
                    print_debug("    ", docid, ": FALSE POSITIV", r_pred)
                    counts[docid]['fp'] += 1

            for r_gold in gold:

                r_preds = [r_pred for r_pred in pred if self.relation_accept_fun(r_gold, r_pred)]

                if len(r_preds) > 0:  # we could also do any(...); we have this in place only for debugging purposes
                    print_verbose("    ", docid, ": true positive", r_gold)
                    counts[docid]['tp'] += 1

                else:
                    print_debug("    ", docid, ": FALSE NEGATIV", r_gold)
                    counts[docid]['fn'] += 1

        print_verbose()

        evaluations = Evaluations()
        evaluations.add(EvaluationWithStandardError(self.rel_type, counts))
        return evaluations
Example #8
    def __init__(self, tokens, name="", is_edge_type_constant=False, there_is_target=True, default_n_grams=None):
        self.tokens = tokens
        self.nodes = []
        self.name = name
        self.is_edge_type_constant = is_edge_type_constant
        self.default_n_grams = default_n_grams if default_n_grams is not None else []

        for u_token, v_token in zip(tokens, tokens[1:]):  # Note: the last one is not added yet, see below
            if is_edge_type_constant:
                edge_type = ""
                is_forward = None

            else:
                parser_defined = __class__._get_dep_edges(
                    u_token, v_token,
                    __class__.__mk_list_rm_None(u_token.features['dependency_from']),
                    __class__.__mk_list_rm_None(v_token.features['dependency_from']))

                user_defined = __class__._get_dep_edges(
                    u_token, v_token,
                    u_token.features['user_dependency_from'],
                    v_token.features['user_dependency_from'])

                all_dep_edges = parser_defined + user_defined

                assert len(all_dep_edges) > 0, \
                    ("One must be a dependency of the other", u_token, v_token, tokens)

                if len(all_dep_edges) > 1:
                    print_debug("Multiple dependencies are not handled yet; defaulted to first. This should strictly only happen with user-defined dependencies")

                edge_type = all_dep_edges[0][0]
                is_forward = all_dep_edges[0][1]

            self.nodes.append(PathNode(u_token, edge_type, is_forward))

        if len(self.tokens) == 0:
            self.exists = False
            self.source = self.target = self.middle = []
        else:
            self.exists = True

            self.nodes.append(PathNode(self.tokens[-1], edge_type="", is_forward=None, is_target=there_is_target))
            self.nodes[0].is_source = True

            self.source = [self.nodes[+0]]

            if there_is_target:
                self.middle = self.nodes[1:-1]
                self.target = [self.nodes[-1]]
            else:
                self.middle = self.nodes[1:]
                self.target = []
Example #9
def _get_spacy_nlp_english(load_parser):
    import spacy

    start = time.time()
    print_debug("Spacy NLP English, Parser: {} -- INIT START".format(str(load_parser)))

    if load_parser is True:
        nlp = spacy.load('en', entity=False)
    else:
        nlp = spacy.load('en', parser=False, entity=False)

    print_debug("Spacy NLP English, Parser: {} -- INIT END   : ".format(str(load_parser)), (time.time() - start))

    return nlp
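
A usage sketch of the loader above (assuming a spaCy 1.x installation with the 'en' model downloaded, which is what the spacy.load('en', ...) call implies; the sentence is made up):

nlp = _get_spacy_nlp_english(load_parser=False)

doc = nlp("BRCA1 mutations increase breast cancer risk.")
print([token.text for token in doc])
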
Example #10
    def generate(self, dataset):
        """
        :type dataset: nalaf.structures.data.Dataset
        """
        for sentence in dataset.sentences():
            try:
                sentence[0].features['BOS'] = 1
                sentence[-1].features['EOS'] = 1
            except IndexError as e:
                if isinstance(sentence, str):
                    raise Exception(
                        "Could not index the following sentence; likely the sentence was not tokenized: {}"
                        .format(sentence), e)
                else:
                    print_debug(
                        "ERROR: {}. Ignoring this sentence (type: {}); it is either empty or not tokenized: {}"
                        .format(e, type(sentence), sentence))
Example #11
    def classify(self, instancesfile):

        predictionsfile = tempfile.NamedTemporaryFile('r+', delete=False)
        print_debug("predict: svm predictions file: " + predictionsfile.name)

        callv = [
            self.svm_classify_call, '-v', '1', instancesfile.name,
            self.model_path, predictionsfile.name
        ]

        print_debug("svm light classify parameters: " + ' '.join(callv) + "\n")
        exitcode = subprocess.call(callv)

        if exitcode != 0:
            raise Exception("Error when tagging: " + ' '.join(call))

        predictionsfile.flush()
        # Note, we do not close the file

        return predictionsfile
Example #12
    def learn(self, instancesfile, c=None):

        with instancesfile:

            if self.use_tree_kernel:
                callv = [
                    self.svm_learn_call, '-v', self.verbosity_level, '-t', '5',
                    '-T', '1', '-W', 'S', '-V', 'S', '-C', '+', '-c',
                    str(c), instancesfile.name, self.model_path
                ]

            else:
                callv = [self.svm_learn_call, '-v', self.verbosity_level]

                if c is not None:
                    callv = callv + ['-c', str(c)]

                callv = callv + [instancesfile.name, self.model_path]

            print_debug("svm light learn parameters: " + ' '.join(callv) +
                        "\n")
            subprocess.call(callv)

            return self.model_path
Example #13
    def cross_validate(annotator_gen_fun, corpus, evaluator, k_num_folds, use_validation_set=True):
        merged_evaluations = []

        print_debug("Cross-Validation")
        for training_set, evaluation_set in corpus.cv_kfold_splits(k_num_folds, validation_set=use_validation_set):

            annotator_apply = annotator_gen_fun(training_set)
            annotator_apply(evaluation_set)

            r = evaluator.evaluate(evaluation_set)
            print_debug(r)
            merged_evaluations.append(r)

        ret = Evaluations.merge(merged_evaluations)
        print_debug("\n" + str(ret) + "\n")

        return ret
Example #14
    def train(self, training_corpus):
        X, y = self.__convert_edges_features_to_vector_instances(
            training_corpus)
        X = self.preprocess.fit_transform(X)
        print_debug(
            "SVC after preprocessing, #features: {} && max value: {}".format(
                X.shape[1],
                max(sklearn.utils.sparsefuncs.min_max_axis(X, axis=0)[1])))

        print_debug(
            "Train SVC with #samples {} - #features {} - params: {}".format(
                X.shape[0], X.shape[1], str(self.model.get_params())))
        start = time.time()
        self.model.fit(X, y)
        end = time.time()
        print_debug("SVC train, running time: ", (end - start))

        return self
Example #15
    def tag(self, data):
        """
        :type data: nalaf.structures.data.Dataset
        """
        for doc_id, doc in data.documents.items():
            if doc_id in self.cache:
                print_debug("Use cached response", doc_id)

                response_text = self.cache[doc_id]
            elif len(doc.parts) == 2 and self._is_pmid(doc_id):
                print_debug("Use PMID-based API", doc_id)

                r = requests.get(self.url_tmvar_pmids.format(doc_id))
                if r.status_code == 200:
                    response_text = r.text
                    self.cache[doc_id] = response_text
                else:
                    continue
            else:
                print_debug("Use free-text API", doc_id)

                r = requests.post(self.url_tmvar_freetext,
                                  self._doc_to_json(doc))

                if 'Receive' in r.url:
                    s = 501
                    while s == 501:
                        time.sleep(5)
                        s = requests.get(r.url)
                        response_text = s.text
                        s = s.status_code
                    response_text = '[' + response_text + ']'
                    self.cache[doc_id] = response_text
                else:
                    continue

            if response_text.startswith('[Error]'):
                warnings.warn(response_text)
            else:
                if response_text.startswith("["):
                    self._parse_json(doc_id, doc, response_text)
                else:
                    self._parse_pubtator(doc_id, doc, response_text)
Example #16
    def _clean_predictions(self, dataset, name="tagger"):
        for part in dataset.parts():
            print_debug(name, [ann.text for ann in part.predicted_annotations])
            part.predicted_annotations = []
Example #17
def train(argv):
    parser = argparse.ArgumentParser(description='Train model')

    parser.add_argument(
        '--training_corpus',
        help=
        'Name of the corpus to train on. Ex: nala_training, IDP4+_training, nala_training_5'
    )
    parser.add_argument('--test_corpus', help='Name of the corpus to test on')
    parser.add_argument('--string', help='String to tag')

    parser.add_argument('--validation',
                        required=False,
                        default="stratified",
                        choices=["cross-validation", "stratified", "none"],
                        help='Type of validation to use when training')

    parser.add_argument(
        '--cv_n',
        required=False,
        help=
        'if given, cross validation (instead of stratification) is used for validating the training. \
                        In this case you must also set `cv_fold` and only that fold number will be run'
    )
    parser.add_argument(
        '--cv_fold',
        required=False,
        help=
        'fold number if cross validation is activated (it starts at 0; i.e. for cv_n=5, you have folds: [0,1,2,3,4] )'
    )

    parser.add_argument(
        '--output_folder',
        required=False,
        help=
        'Folder where the training model is written to. Otherwise a tmp folder is used'
    )
    parser.add_argument(
        '--model_name_suffix',
        default='',
        required=False,
        help=
        'Optional suffix to add to the generated model name in training mode')
    parser.add_argument(
        '--write_anndoc',
        required=False,
        default=False,
        action='store_true',
        help='Write anndoc of predicted test_corpus (validation corpus in fact)'
    )
    parser.add_argument(
        '--model_path_1',
        required=False,
        help='Path of the first model binary file if evaluation is performed')
    parser.add_argument(
        '--model_path_2',
        required=False,
        help=
        'Path of the second model binary file if evaluation is performed with two models'
    )

    parser.add_argument('--labeler',
                        required=False,
                        default="BIEO",
                        choices=["BIEO", "BIO", "IO", "11labels"],
                        help='Labeler to use for training')

    parser.add_argument(
        '--mutations_specific',
        default='True',
        help=
        'Apply feature pipelines specific to mutations or otherwise (false) use general one'
    )

    parser.add_argument(
        '--only_class_id',
        required=False,
        default=MUT_CLASS_ID,
        help=
        "By default, only the mutation entities are read from corpora (assumed to have class_id == '"
        + MUT_CLASS_ID + "'). Set this class_id to filter rest out")
    parser.add_argument(
        '--delete_subclasses',
        required=False,
        default="",
        help='Comma-separated subclasses to delete. Example: "2,3"')

    parser.add_argument('--pruner',
                        required=False,
                        default="parts",
                        choices=["parts", "sentences"])
    parser.add_argument('--ps_ST',
                        required=False,
                        default=False,
                        action='store_true')
    parser.add_argument('--ps_NL',
                        required=False,
                        default=False,
                        action='store_true')
    parser.add_argument('--ps_random', required=False, default=0.0, type=float)

    parser.add_argument('--elastic_net',
                        action='store_true',
                        help='Use elastic net regularization')

    parser.add_argument('--word_embeddings',
                        '--we',
                        default='True',
                        help='Use word embeddings features')
    parser.add_argument('--we_additive', type=float, default=0)
    parser.add_argument('--we_multiplicative', type=float, default=1)
    parser.add_argument('--we_model_location', type=str, default=None)

    parser.add_argument('--use_feat_windows', default='True')

    parser.add_argument('--nl',
                        action='store_true',
                        help='Use NLMentionFeatureGenerator')
    parser.add_argument('--nl_threshold', type=int, default=0)
    parser.add_argument('--nl_window',
                        action='store_true',
                        help='use window feature for NLFeatureGenerator')

    parser.add_argument(
        '--execute_pp',
        default='True',
        help='Execute post processing specific to mutations (default) or not')
    parser.add_argument(
        '--keep_silent',
        default='True',
        help=
        'Keep silent mutations (default) or not, i.e., delete mentions like `Cys23-Cys`'
    )
    parser.add_argument(
        '--keep_genetic_markers',
        default='True',
        help='Keep genetic markers of the form D17S250, true (default) or false'
    )
    parser.add_argument(
        '--keep_unnumbered',
        default='True',
        help=
        'Keep unnumbered mentions (default) or not, i.e., delete mentions like `C -> T`'
    )
    parser.add_argument(
        '--keep_rs_ids',
        default='True',
        help=
        'Keep rs/ss id mentions (default) or not, i.e., delete mentions like `rs1801280` or `ss221`'
    )

    parser.add_argument(
        '--dictionaries_paths',
        default=None,
        help=
        'Dictionary paths to use for dictionary features. Can be used within hdfs'
    )
    parser.add_argument('--dictionaries_stop_words',
                        default=None,
                        help='Stop words for dictionaries if these are used')

    parser.add_argument('--hdfs_url',
                        required=False,
                        default=None,
                        type=str,
                        help='URL of hdfs if this is used')
    parser.add_argument(
        '--hdfs_user',
        required=False,
        default=None,
        type=str,
        help="user of hdfs if this used. Must be given if `hdfs_url` is given")

    FALSE = ['false', 'f', '0', 'no', 'none']

    def arg_bool(arg_value):
        return False if arg_value.lower() in FALSE else True

    args = parser.parse_args(argv)

    start_time = time.time()

    # ------------------------------------------------------------------------------

    delete_subclasses = []
    for c in args.delete_subclasses.split(","):
        c = c.strip()
        if c:
            delete_subclasses.append(int(c))

    args.delete_subclasses = delete_subclasses

    if not args.output_folder:
        args.output_folder = tempfile.mkdtemp()

    str_delete_subclasses = "None" if not args.delete_subclasses else str(
        args.delete_subclasses).strip('[]').replace(' ', '')

    if args.labeler == "BIEO":
        labeler = BIEOLabeler()
    elif args.labeler == "BIO":
        labeler = BIOLabeler()
    elif args.labeler == "IO":
        labeler = IOLabeler()
    elif args.labeler == "11labels":
        labeler = TmVarLabeler()

    args.word_embeddings = arg_bool(args.word_embeddings)

    if args.word_embeddings:
        args.we_params = {
            'additive': args.we_additive,
            'multiplicative': args.we_multiplicative,
            'location': args.we_model_location
        }
    else:
        args.we_params = {}  # means: do not use we

    if args.nl:
        args.nl_features = {
            'threshold':
            args.nl_threshold,  # threshold for neighbour space in dictionaries
            'window': args.nl_window,
        }
    else:
        args.nl_features = None

    if args.elastic_net:
        args.crf_train_params = {
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
        }
    else:
        args.crf_train_params = None

    args.use_feat_windows = arg_bool(args.use_feat_windows)
    args.mutations_specific = arg_bool(args.mutations_specific)
    args.execute_pp = arg_bool(args.execute_pp)
    args.keep_silent = arg_bool(args.keep_silent)
    args.keep_genetic_markers = arg_bool(args.keep_genetic_markers)
    args.keep_unnumbered = arg_bool(args.keep_unnumbered)
    args.keep_rs_ids = arg_bool(args.keep_rs_ids)

    args.do_train = not args.model_path_1

    if args.cv_n is not None or args.cv_fold is not None:
        args.validation = "cross-validation"

    if args.validation == "cross-validation":
        assert (args.cv_n is not None and args.cv_fold is not None), \
            "You must set both cv_n AND cv_fold"

    # ------------------------------------------------------------------------------

    if args.training_corpus:
        # Get the name of training corpus even if this is given as a folder path, in which case the last folder name is used
        training_corpus_name = list(
            filter(None, args.training_corpus.split('/')))[-1]

        args.model_name = "{}_{}_del_{}".format(training_corpus_name,
                                                args.labeler,
                                                str_delete_subclasses)

        if args.validation == "cross-validation":
            args.model_name += "_cvfold_" + str(args.cv_fold)
        args.model_name_suffix = args.model_name_suffix.strip()
        if args.model_name_suffix:
            args.model_name += "_" + str(args.model_name_suffix)

    else:
        args.model_name = args.test_corpus

    # ------------------------------------------------------------------------------

    def stats(dataset, name):
        print('\n\t{} size: {}'.format(name, len(dataset)))
        print('\tsubclass distribution: {}'.format(repr(dataset)))
        # Caveat: the dataset must be passed through the pipeline first
        print('\tnum sentences: {}\n'.format(
            sum(1 for x in dataset.sentences())))

    definer = ExclusiveNLDefiner()

    if args.training_corpus:
        train_set = get_corpus(args.training_corpus,
                               only_class_id=args.only_class_id,
                               hdfs_url=args.hdfs_url,
                               hdfs_user=args.hdfs_user)

        if args.test_corpus:
            test_set = get_corpus(args.test_corpus,
                                  only_class_id=args.only_class_id,
                                  hdfs_url=args.hdfs_url,
                                  hdfs_user=args.hdfs_user)
        elif args.string:
            test_set = StringReader(args.string).read()
        elif args.validation == "none":
            test_set = None
        elif args.validation == "cross-validation":
            train_set, test_set = train_set.fold_nr_split(
                int(args.cv_n), int(args.cv_fold))
        elif args.validation == "stratified":
            definer.define(train_set)
            train_set, test_set = train_set.stratified_split()

    elif args.test_corpus:
        train_set = None
        test_set = get_corpora(args.test_corpus, args.only_class_id)

    elif args.string:
        train_set = None
        test_set = StringReader(args.string).read()

    else:
        raise Exception(
            "you must give at least a parameter of: training_corpus, test_corpus, or string"
        )

    def verify_corpus(corpus):
        if corpus is not None:
            assert len(
                corpus
            ) > 0, f"The corpus should have at least one document; had 0: {args.training_corpus}"
            assert next(
                corpus.entities(), None
            ) is not None, "The corpus should have at least one entity; had 0"

    verify_corpus(train_set)

    # ------------------------------------------------------------------------------

    if args.mutations_specific:
        print("Pipeline specific to mutations")
        features_pipeline = get_prepare_pipeline_for_best_model(
            args.use_feat_windows, args.we_params, args.nl_features)
    else:
        print("Pipeline is general")
        features_pipeline = get_prepare_pipeline_for_best_model_general(
            args.use_feat_windows, args.we_params, args.dictionaries_paths,
            args.hdfs_url, args.hdfs_user, args.dictionaries_stop_words)

    # ------------------------------------------------------------------------------

    def print_run_args():
        for key, value in sorted((vars(args)).items()):
            print("\t{} = {}".format(key, value))
        print()

    print("Running arguments: ")

    print_run_args()

    # ------------------------------------------------------------------------------

    def train(train_set):
        definer.define(train_set)
        train_set.delete_subclass_annotations(args.delete_subclasses)
        features_pipeline.execute(train_set)
        labeler.label(train_set)

        if args.pruner == "parts":
            train_set.prune_empty_parts()
        else:
            try:
                f = HighRecallRegexClassifier(ST=args.ps_ST, NL=args.ps_NL)
            except AssertionError:
                f = (lambda _: False)
            train_set.prune_filtered_sentences(filterin=f,
                                               percent_to_keep=args.ps_random)

        stats(train_set, "training")

        model_path = os.path.join(args.output_folder, args.model_name + ".bin")
        PyCRFSuite.train(train_set, model_path, args.crf_train_params)

        return model_path

    # ------------------------------------------------------------------------------

    if args.do_train:
        args.model_path_1 = train(train_set)

    # ------------------------------------------------------------------------------

    def test(tagger, test_set, print_eval=True, print_results=False):
        tagger.tag(test_set)
        definer.define(test_set)
        stats(test_set, "test")
        evaluation = MentionLevelEvaluator(
            subclass_analysis=True).evaluate(test_set)

        print_run_args()

        if print_eval:
            print(evaluation)
        if print_results:
            ConsoleWriter(ent1_class_id=PRO_CLASS_ID,
                          ent2_class_id=MUT_CLASS_ID,
                          color=True).write(test_set)

    # ------------------------------------------------------------------------------

    assert (args.model_path_1 is not None)

    if args.model_path_2:
        tagger = NalaMultipleModelTagger(
            st_model=args.model_path_1,
            all3_model=args.model_path_2,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)
    else:
        tagger = NalaSingleModelTagger(
            bin_model=args.model_path_1,
            features_pipeline=features_pipeline,
            execute_pp=args.execute_pp,
            keep_silent=args.keep_silent,
            keep_genetic_markers=args.keep_genetic_markers,
            keep_unnumbered=args.keep_unnumbered,
            keep_rs_ids=args.keep_rs_ids)

    # ------------------------------------------------------------------------------

    print("\n{}".format(args.model_name))

    if train_set:
        stats(train_set, "training")

    if test_set:
        test(tagger,
             test_set,
             print_eval=args.string is None,
             print_results=args.string is not None)

    if args.do_train:
        print("\nThe model is saved to: {}\n".format(args.model_path_1))

    if args.write_anndoc:
        outdir = os.path.join(args.output_folder, args.model_name)
        os.mkdir(outdir)
        print("\nThe predicted test data is saved to: {}\n".format(outdir))
        TagTogFormat(test_set, use_predicted=True, to_save_to=outdir).export(0)

    end_time = time.time()

    print_debug("Elapsed time: ", (end_time - start_time))

    return {
        "tagger": tagger,
        "trained_model_path": args.model_path_1,
        "training_num_docs": 0 if train_set is None else len(train_set.documents),
        "training_num_annotations": 0 if train_set is None else sum(
            1 for e in train_set.entities() if e.class_id == args.only_class_id)
    }
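
The string-valued boolean flags above (e.g. --keep_silent, --execute_pp) all go through the FALSE list and the arg_bool helper defined inside train: any value in the list counts as false, everything else as true. A quick standalone illustration of that convention (arg_bool written here in an equivalent one-line form):

FALSE = ['false', 'f', '0', 'no', 'none']

def arg_bool(arg_value):
    return arg_value.lower() not in FALSE

assert arg_bool('True') and arg_bool('yes') and arg_bool('anything-else')
assert not arg_bool('False') and not arg_bool('0') and not arg_bool('none')
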
Example #18
0
    def evaluate(self, dataset):
        """
        :type dataset: nalaf.structures.data.Dataset
        :returns Evaluations

        Calculates precision, recall and subsequently F1 measure, defined as:
            * precision: number of correctly predicted items as a percentage of the total number of predicted items
                len(predicted items that are also real)/len(predicted)
                or in other words tp / (tp + fp)
            * recall: number of correctly predicted items as a percentage of the total number of correct items
                len(real items that are also predicted)/len(real)
                or in other words tp / (tp + fn)
            * possibly considers overlapping matches as well
        """

        TOTAL = MentionLevelEvaluator.TOTAL_LABEL
        labels = [TOTAL]

        def labelize(e):
            """
            Represent an entity's subclass as a string; if the subclass is None or False (but not 0!), represent the entity with its class_id instead

            Subclass / class ids are converted to strings to avoid the misstep of comparing a possible subclass 0 with False, which in Python compare as equal
            --> info: https://twitter.com/juanmirocks/status/802209750612054016
            """
            return str(e.subclass) if str(
                e.subclass) not in ['None', 'False'] else str(e.class_id)

        if self.subclass_analysis:
            # find all possible subclasses or otherwise full classes

            subclasses = set(labelize(e) for e in dataset.entities())
            subclasses.update(
                set(labelize(e) for e in dataset.predicted_entities()))

            for x in subclasses:
                labels.append(x)

        docids = dataset.documents.keys()
        subcounts = ['tp', 'fp', 'fn', 'fp_ov', 'fn_ov']
        counts = {
            label: {docid: dict.fromkeys(subcounts, 0)
                    for docid in docids}
            for label in labels
        }

        for docid, doc in dataset.documents.items():
            for partid, part in doc.parts.items():

                overlap_real = {label: [] for label in labels}
                overlap_predicted = {label: [] for label in labels}

                Entity.equality_operator = 'overlapping'
                for ann_a in part.annotations:
                    for ann_b in part.predicted_annotations:
                        if ann_a == ann_b:  # equal according to the exclusive overlapping equality (not exact)
                            overlap_real[TOTAL].append(ann_a)
                            overlap_predicted[TOTAL].append(ann_b)

                            if self.subclass_analysis:
                                if labelize(ann_a) != labelize(ann_b):
                                    print_debug(
                                        'overlapping subclasses do not match',
                                        ann_a.subclass, ann_b.subclass)
                                    ann_b.subclass = ann_a.subclass

                                overlap_real[labelize(ann_a)].append(ann_a)
                                overlap_predicted[labelize(ann_b)].append(
                                    ann_b)

                Entity.equality_operator = 'exact'
                for ann in part.predicted_annotations:
                    if ann in part.annotations:
                        counts[TOTAL][docid]['tp'] += 1
                        print_verbose("    ", docid, ": TRUE POSITVE", ann)

                        if self.subclass_analysis:
                            counts[labelize(ann)][docid]['tp'] += 1

                    else:
                        counts[TOTAL][docid]['fp'] += 1

                        if ann in overlap_predicted[TOTAL]:
                            counts[TOTAL][docid]['fp_ov'] += 1
                        else:
                            print_debug("    ", docid, ": FALSE POSITIV", ann)

                        if self.subclass_analysis:
                            counts[labelize(ann)][docid]['fp'] += 1
                            if ann in overlap_predicted[labelize(ann)]:
                                counts[labelize(ann)][docid]['fp_ov'] += 1

                for ann in part.annotations:
                    if ann not in part.predicted_annotations:
                        counts[TOTAL][docid]['fn'] += 1

                        if ann in overlap_real[TOTAL]:
                            counts[TOTAL][docid]['fn_ov'] += 1
                        else:
                            print_debug("    ", docid, ": FALSE NEGATIV", ann)

                        if self.subclass_analysis:
                            counts[labelize(ann)][docid]['fn'] += 1
                            if ann in overlap_real[labelize(ann)]:
                                counts[labelize(ann)][docid]['fn_ov'] += 1

        evaluations = Evaluations()

        for label in labels:
            evaluations.add(EvaluationWithStandardError(label, counts[label]))

        return evaluations
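
The precision/recall definitions in the docstring map directly onto these per-document counts. The helper below only illustrates the formulas over one set of counts; it is not how EvaluationWithStandardError aggregates them internally:

def precision_recall_f1(tp, fp, fn):
    # precision = tp / (tp + fp), recall = tp / (tp + fn), F1 = their harmonic mean
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

print(precision_recall_f1(tp=8, fp=2, fn=4))  # (0.8, 0.666..., 0.727...)
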
Example #19
    def evaluate(self, dataset):
        """
        :type dataset: nalaf.structures.data.Dataset
        :returns Evaluations

        Calculates precision, recall and subsequently F1 measure, defined as:
            * precision: number of correctly predicted items as a percentage of the total number of predicted items
                len(predicted items that are also real)/len(predicted)
                or in other words tp / (tp + fp)
            * recall: number of correctly predicted items as a percentage of the total number of correct items
                len(real items that are also predicted)/len(real)
                or in other words tp / (tp + fn)
        """
        TOTAL = EntityEvaluator.TOTAL_LABEL
        labels = [TOTAL]

        # find all possible subclasses or otherwise full classes
        labels += list(set(__class__._labelize(e) for e in dataset.entities()))
        labels += list(
            set(__class__._labelize(e) for e in dataset.predicted_entities()))

        docids = dataset.documents.keys()
        subcounts = ['tp', 'fp', 'fn']
        counts = {
            label: {docid: dict.fromkeys(subcounts, 0)
                    for docid in docids}
            for label in labels
        }

        for docid, doc in dataset.documents.items():
            for partid, part in doc.parts.items():

                gold_anns = set(
                    filter(None,
                           (self.entity_map_fun(e) for e in part.annotations)))
                pred_anns = set(
                    filter(None, (self.entity_map_fun(e)
                                  for e in part.predicted_annotations)))

                for pred in pred_anns:
                    accept_decisions = {
                        self.entity_accept_fun(gold, pred)
                        for gold in gold_anns
                    }
                    assert set.issubset(
                        accept_decisions,
                        {True, False, None
                         }), "did not expect: " + str(accept_decisions)

                    if True in accept_decisions:
                        # Count the true positives while iterating on gold
                        pass

                    elif None in accept_decisions:
                        pass

                    else:
                        # either False or the set is empty, meaning that there are no gold annotations
                        print_debug("    ", docid, ": FALSE POSITIV", pred)
                        counts[TOTAL][docid]['fp'] += 1
                        counts[__class__._labelize(pred)][docid]['fp'] += 1

                for gold in gold_anns:

                    accept_decisions = {
                        self.entity_accept_fun(gold, pred)
                        for pred in pred_anns
                    }

                    if True in accept_decisions:
                        print_verbose("    ", docid, ": true positive", gold)
                        counts[TOTAL][docid]['tp'] += 1
                        counts[__class__._labelize(gold)][docid]['tp'] += 1

                    elif "UNKNOWN:" in gold:  # Pass when unknown normalization
                        pass

                    else:
                        print_debug("    ", docid, ": FALSE NEGATIV", gold)
                        counts[TOTAL][docid]['fn'] += 1
                        counts[__class__._labelize(gold)][docid]['fn'] += 1

        evaluations = Evaluations()

        for label in labels:
            evaluations.add(EvaluationWithStandardError(label, counts[label]))

        return evaluations
Example #20
    def filter(self, documents, min_found=1, use_nala=False):
        """
        :type documents: collections.Iterable[(str, nalaf.structures.data.Document)]
        """

        _progress = 1
        _start_time = time.time()
        _total_time = 0

        _time_avg_per_pattern = 0
        _pattern_calls = 0
        _time_reg_pattern_total = 0
        _time_max_pattern = 0
        _low_performant_pattern = ""

        # NLDefiners init
        exclusive_definer = ExclusiveNLDefiner()
        _e_array = [0, 0, 0]
        inclusive_definer = InclusiveNLDefiner()
        _i_array = [0, 0]

        last_found = 0
        crf = PyCRFSuite(self.location_binary_model)

        # counter_to_stop_for_caching = 0

        for pmid, doc in documents:
            # if any part of the document contains any of the keywords
            # yield that document

            # if counter_to_stop_for_caching > 400:
            #     break
            # counter_to_stop_for_caching += 1
            # print(counter_to_stop_for_caching)

            part_offset = 0
            data_tmp = Dataset()
            data_tmp.documents[pmid] = doc
            data_nala = deepcopy(data_tmp)
            NLTKSplitter().split(data_tmp)
            # data_tmvar = TmVarTagger().generate_abstracts([pmid])
            if use_nala:
                self.pipeline.execute(data_nala)
                self.labeler.label(data_nala)
                crf.tag(data_nala, MUT_CLASS_ID)
                PostProcessing().process(data_nala)
                ExclusiveNLDefiner().define(data_nala)

            used_regexs = {}

            positive_sentences = 0
            for i, x in enumerate(doc.parts):
                # print("Part", i)
                sent_offset = 0
                cur_part = doc.parts.get(x)
                sentences = cur_part.sentences_

                for sent in sentences:
                    sent_length = len(sent)
                    new_text = sent.lower()
                    new_text = re.sub(r'[\./\-(){}\[\],%]', ' ', new_text)
                    # new_text = re.sub('\W+', ' ', new_text)

                    found_in_sentence = False

                    for i, reg in enumerate(self.patterns):
                        _lasttime = time.time()  # time start var
                        match = reg.search(new_text)

                        # debug bottleneck patterns
                        _time_current_reg = time.time() - _lasttime  # time end var
                        _pattern_calls += 1  # pattern calls already occured
                        _time_reg_pattern_total += _time_current_reg  # total time spent on searching with patterns
                        if _time_reg_pattern_total > 0:
                            _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg spent time per pattern call
                        # todo create pattern performance eval for descending amount of recognized patterns
                        # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                        #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                        # if _time_max_pattern < _time_current_reg:
                        #     _time_max_pattern = _time_current_reg
                        #     _low_performant_pattern = reg.pattern
                        #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                        # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                        #     if _time_current_reg > _time_avg_per_pattern * 10:
                        #         # print(_time_avg_per_pattern, _time_current_reg)
                        #         f.write("BAD_PATTERN\n")
                        #         f.write(sent + "\n")
                        #         f.write(new_text + "\n")
                        if match:
                            # if pmid in data_tmvar.documents:
                            #     anti_doc = data_tmvar.documents.get(pmid)
                            nala_doc = data_nala.documents.get(pmid)

                            start = part_offset + sent_offset + match.span()[0]
                            end = part_offset + sent_offset + match.span()[1]
                            # print("TmVar is not overlapping?:", not anti_doc.overlaps_with_mention(start, end))
                            # print(not nala_doc.overlaps_with_mention(start, end, annotated=False))

                            if reg.pattern in used_regexs:
                                used_regexs[reg.pattern] += 1
                            else:
                                used_regexs[reg.pattern] = 1
                            print(color.PURPLE + new_text.replace(
                                match.group(), color.BOLD + color.DARKCYAN +
                                color.UNDERLINE + match.group() + color.END +
                                color.PURPLE) + color.END)
                            if not found_in_sentence:
                                positive_sentences += 1
                                found_in_sentence = True
                                # if not anti_doc.overlaps_with_mention(start,
                                #                                       end) \
                                #         and not nala_doc.overlaps_with_mention(start, end, annotated=False):
                                #     _e_result = exclusive_definer.define_string(
                                #         new_text[match.span()[0]:match.span()[1]])
                                #     _e_array[_e_result] += 1
                                #     _i_result = inclusive_definer.define_string(
                                #         new_text[match.span()[0]:match.span()[1]])
                                #     _i_array[_i_result] += 1
                                # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                                # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))

                                # last_found += 1
                                # found_in_sentence = True
                                # else:
                                #     # if nala not used only tmvar considered
                                #     if not anti_doc.overlaps_with_mention(start, end):
                                #         _e_result = exclusive_definer.define_string(
                                #             new_text[match.span()[0]:match.span()[1]])
                                #         _e_array[_e_result] += 1
                                #         _i_result = inclusive_definer.define_string(
                                #             new_text[match.span()[0]:match.span()[1]])
                                #         _i_array[_i_result] += 1
                                #         # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                                #         # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                                #         last_found += 1
                                #         found_in_sentence = True

                            if use_nala:
                                nala_found_mention = nala_doc.overlaps_with_mention(
                                    start, end, annotated=False)
                                if nala_found_mention:
                                    print_verbose(nala_found_mention)
                                    if nala_found_mention.subclass > 0 and nala_found_mention.confidence <= self.threshold:
                                        yield pmid, doc

                        if time.time() - _lasttime > 1:
                            print_verbose('time intensive regex', i)
                    sent_offset += 2 + sent_length

                    # for per sentence positives
                    if found_in_sentence:
                        positive_sentences += 1

                part_offset += sent_offset
            if use_nala:
                for part in nala_doc:
                    for ann in part.predicted_annotations:
                        if ann.subclass > 0:
                            print_verbose(part.text[:ann.offset] + color.BOLD +
                                          ann.text + color.END +
                                          part.text[ann.offset +
                                                    len(ann.text):])
                            positive_sentences += min_found
            _old_time = _start_time
            _start_time = time.time()
            _one_time = _start_time - _old_time

            if _one_time > 0.3 and positive_sentences > min_found:
                _progress += 1
                _total_time += _one_time

            _time_per_doc = _total_time / _progress
            print_verbose(
                "PROGRESS: {:.2f} secs ETA per one positive document:"
                " {:.2f} secs".format(_total_time, _time_per_doc))
            print_debug('used regular expressions:',
                        json.dumps(used_regexs, indent=4))
            if positive_sentences >= min_found:
                last_found = 0
                print_verbose('YEP', pmid)
                yield pmid, doc
            else:
                print_verbose('NOPE', pmid)
Example #21
    def create_input_file(self,
                          dataset,
                          mode,
                          features,
                          minority_class=None,
                          majority_class_undersampling=1.0):
        string = ''

        # Real counts vs Used ones after undersampling is applied
        num_pos_instances = [0, 0]
        num_neg_instances = [0, 0]
        num_unl_instances = [0, 0]

        allowed_features_keys = set(features.values())

        for edge in dataset.edges():
            if edge.real_target == +1:
                num_pos_instances[0] += 1
            elif edge.real_target == -1:
                num_neg_instances[0] += 1
            else:
                num_unl_instances[0] += 1

            if mode != 'train' or minority_class is None or edge.real_target == minority_class or random(
            ) <= majority_class_undersampling:
                if edge.real_target == +1:
                    num_pos_instances[1] += 1
                elif edge.real_target == -1:
                    num_neg_instances[1] += 1
                else:
                    num_unl_instances[1] += 1

                # (Estimation) Writing a dummy target/class (0 in particular) or the actual target makes no difference for prediction
                # Yet, with the actual target, svmlight can report useful evaluation performance numbers
                instance_label = str(edge.real_target)

                string += instance_label

                if self.use_tree_kernel:
                    string += ' |BT| '
                    string += edge.same_part.sentence_parse_trees[
                        edge.same_sentence_id]
                    string += ' |ET|'

                for key in sorted(edge.features.keys()):
                    if key in allowed_features_keys:
                        value = edge.features[key]
                        string += ' ' + str(key) + ':' + str(value)

                string += '\n'

        instancesfile = tempfile.NamedTemporaryFile('w', delete=False)
        print_debug("{}: svmlight instances file: {}".format(
            mode, instancesfile.name))
        instancesfile.write(string)
        instancesfile.flush()
        # Note, we do not close the file

        total_real = (num_pos_instances[0] + num_neg_instances[0] +
                      num_unl_instances[0])
        total_used = (num_pos_instances[1] + num_neg_instances[1] +
                      num_unl_instances[1])
        print_line = "{}: instances, #REAL: {} == P: {} vs N: {} vs ?: {} || vs. #USED: {} == P {} vs N: {} vs ?: {}"
        print_debug(
            print_line.format(mode, total_real, num_pos_instances[0],
                              num_neg_instances[0], num_unl_instances[0],
                              total_used, num_pos_instances[1],
                              num_neg_instances[1], num_unl_instances[1]))

        return instancesfile
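
Each line written to the instances file follows SVMLight's sparse input format: the target label followed by space-separated feature_index:value pairs, sorted by feature index (plus the |BT| ... |ET| parse-tree markup when tree kernels are enabled). A tiny illustration of the plain format, with made-up feature indices and values:

features = {3: 1.0, 17: 0.5, 42: 2.0}   # made-up feature_index -> value pairs
target = +1

line = str(target) + ''.join(' {}:{}'.format(k, features[k]) for k in sorted(features))
print(line)  # 1 3:1.0 17:0.5 42:2.0
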