def test_remove_entity_overlapping_empty(self):
        """Overlap removal on two empty collections yields two empty collections."""
        sentence_tokens = ["There", "is", "really", "nothing"]
        empty_input = [EntityCollection([]), EntityCollection([])]

        result = remove_entity_overlapping(empty_input, tokens_words=sentence_tokens)

        wanted = [EntityCollection([]), EntityCollection([])]

        # Compare the entity lists position by position, ignoring order inside each list.
        for position in range(2):
            self.assertCountEqual(wanted[position].entities, result[position].entities)
    def test_remove_entity_overlapping_2(self):
        """Check overlap removal on a sentence with bacteria, nutrient and disease entities.

        The input holds duplicate "M. tuberculosis" and "tuberculosis" entities plus
        the nested pair "chronic obstructive syndrome" / "obstructive syndrome".
        The expectations keep both bacteria entities, two of the disease entities
        (indices 1 and 3) and the single nutrient entity.
        """
        bacteria_collection = EntityCollection(
            [Entity("M. tuberculosis", "111", BACTERIA_TAG) for _ in range(2)],
            tag=BACTERIA_TAG,
        )
        nutrient_collection = EntityCollection([Entity("propionic", "123", NUTRIENT_TAG)], NUTRIENT_TAG)
        disease_entities = [Entity("tuberculosis", "a", DISEASE_TAG) for _ in range(3)]
        disease_entities += [
            Entity("chronic obstructive syndrome", "a1", DISEASE_TAG),
            Entity("obstructive syndrome", "b1", DISEASE_TAG),
        ]
        disease_collection = EntityCollection(disease_entities, DISEASE_TAG)

        # Punctuation is written as stand-alone words so that a plain whitespace
        # split yields one token per word / punctuation mark.
        sentence_tokens = (
            "M. tuberculosis is the cause of tuberculosis and chronic obstructive syndrome , "
            "also M. tuberculosis is a propionic acid producer ."
        ).split()

        result = remove_entity_overlapping(
            [bacteria_collection, nutrient_collection, disease_collection],
            tokens_words=sentence_tokens,
        )

        # NOTE(review): the input order is bacteria, nutrients, diseases, while the
        # expectations below are indexed bacteria, diseases, nutrients. Presumably
        # remove_entity_overlapping returns the collections in that order - confirm.
        wanted = [
            EntityCollection(
                [bacteria_collection.entities[0], bacteria_collection.entities[1]], BACTERIA_TAG
            ),
            EntityCollection(
                [disease_collection.entities[1], disease_collection.entities[3]], DISEASE_TAG
            ),
            EntityCollection([nutrient_collection.entities[0]], NUTRIENT_TAG),
        ]

        # Order inside each entity list is irrelevant; only the multiset must match.
        for position, wanted_collection in enumerate(wanted):
            self.assertCountEqual(wanted_collection.entities, result[position].entities)
    def get_sentence(self, sentence_text, article):
        """Build a ``Sentence`` for ``sentence_text``, or return ``None`` if it is rejected.

        The sentence is rejected when:

        * ``self.check_if_title(article.title)`` is falsy;
        * the text is longer than ``SENTENCE_LENGTH_THRESHOLD``;
        * ``self.check_if_tags`` rejects the set of tags present in the sentence,
          checked once right after the catalog lookup and once more after
          overlapping and excluded entities have been dropped.

        Side effect: the ``name`` of every kept entity is rewritten in place with
        spaces replaced by underscores, and the same replacement is applied to the
        sentence text that is parsed and stored.

        Args:
            sentence_text: Text of the sentence to process.
            article: Source article; ``article.title`` is read here and the object
                itself is stored on the resulting ``Sentence``.

        Returns:
            A ``Sentence`` carrying the rewritten text, the remaining entity
            collections, the parser output and the paths produced by
            ``self.sentence_analyzer``; ``None`` if the sentence was rejected.
        """
        if not self.check_if_title(article.title):
            return None

        if len(sentence_text) > SENTENCE_LENGTH_THRESHOLD:
            return None

        # One collection of found entities per catalog, in catalog order.
        entities_collections = [catalog.find(sentence_text) for catalog in self.catalog_list]

        tags_in_sentence = {collection.tag for collection in entities_collections if collection.entities}

        if not self.check_if_tags(tags_in_sentence):
            return None

        tokens = self.nlp(sentence_text)
        tokens_words = [token.orth_ for token in tokens]

        # todo: check - if we found bacteria both in gut_catalog and all_bacteria_catalog - which we would keep?
        # Logging arguments are passed lazily so the collections are only
        # stringified when the INFO level is actually enabled.
        logger.info("entities before remove overlapping: %s", entities_collections)
        entities_collections = remove_entity_overlapping(entities_collections, tokens_words)
        logger.info("entities after remove overlapping: %s", entities_collections)

        # Join multi-word entity names with underscores (_), both in the sentence
        # text and on the entity objects themselves.
        for collection in entities_collections:
            for entity in collection.entities:
                dashed_name = entity.name.replace(' ', '_')
                sentence_text = sentence_text.replace(entity.name, dashed_name)
                entity.name = dashed_name

        # Drop entities whose own tag or any additional tag is excluded. The list
        # is filtered in place (slice assignment) so the collection keeps the same
        # list object, and each entity is judged by the predicate itself rather
        # than by an equality-based ``list.remove`` lookup.
        for collection in entities_collections:
            collection.entities[:] = [
                entity for entity in collection.entities
                if not (any(extra_tag in self.tags_to_exclude for extra_tag in entity.additional_tags)
                        or entity.tag in self.tags_to_exclude)
            ]

        # Only non-empty collections survive, so their tags are exactly the tags
        # still present in the sentence.
        entities_collections = [collection for collection in entities_collections if collection.entities]
        tags_in_sentence = {collection.tag for collection in entities_collections}

        logger.info("entities after excluding: %s", entities_collections)
        if not self.check_if_tags(tags_in_sentence):
            return None

        # Re-tokenize: the text changed when multi-word names were joined above.
        tokens = self.nlp(sentence_text)

        # Flat list of all remaining entities for the parser.
        all_entities_list = [
            entity for collection in entities_collections for entity in collection.entities
        ]

        parser_output = self.sentence_parser.parse_sentence(sentence_text, all_entities_list, tokens)

        paths = self.sentence_analyzer.analyze_sentence(parser_output, tags_in_sentence)

        return Sentence(text=sentence_text,
                        article=article,
                        entities_collections=entities_collections,
                        parser_output=parser_output,
                        shortest_paths=paths)