Example 1
class MedCatProcessor(NlpProcessor):
    """"
    MedCAT Processor class is wrapper over MedCAT that implements annotations extractions functionality
    (both single and bulk processing) that can be easily exposed for an API.
    """
    def __init__(self):
        super().__init__()

        self.log.info('Initializing MedCAT processor ...')

        self.app_name = 'MedCAT'
        self.app_lang = 'en'
        self.app_version = MedCatProcessor._get_medcat_version()
        self.app_model = os.getenv("APP_MODEL_NAME", 'unknown')

        self.vocab = Vocab()
        self.cdb = CDB()

        self.cdb.load_dict(
            os.getenv("APP_MODEL_CDB_PATH", '/cat/models/cdb.dat'))
        self.vocab.load_dict(
            path=os.getenv("APP_MODEL_VOCAB_PATH", '/cat/models/vocab.dat'))
        self.cat = CAT(self.cdb, vocab=self.vocab)

        # os.getenv returns a string, so parse it explicitly instead of relying on
        # truthiness (any non-empty string, including "False", would be truthy)
        self.cat.spacy_cat.train = os.getenv("APP_TRAINING_MODE", "False").lower() in ("true", "1")
        self.bulk_nproc = int(os.getenv('APP_BULK_NPROC', 8))

        self.log.info('MedCAT processor is ready')

    def get_app_info(self):
        """
        Returns general information about the application
        :return: application information stored as KVPs
        """
        return {
            'name': self.app_name,
            'language': self.app_lang,
            'version': self.app_version,
            'model': self.app_model
        }

    def process_content(self, content):
        """
        Processes a single document extracting the annotations.
        :param content: document to be processed, containing a 'text' field.
        :return: processing result containing the document with extracted annotations stored as KVPs.
        """
        if 'text' not in content:
            error_msg = "'text' field missing in the payload content."
            nlp_result = {
                'success': False,
                'errors': [error_msg],
                'timestamp': NlpProcessor._get_timestamp()
            }
            return nlp_result

        text = content['text']

        # assume that a blank document is a valid document and process it only
        # when it contains any non-blank characters
        if text is not None and len(text.strip()) > 0:
            entities = self.cat.get_entities(text)
        else:
            entities = []

        nlp_result = {
            'text': text,
            'annotations': entities,
            'success': True,
            'timestamp': NlpProcessor._get_timestamp()
        }

        # append the footer
        if 'footer' in content:
            nlp_result['footer'] = content['footer']

        return nlp_result
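
    # Illustrative result shape (assumption: 'annotations' holds whatever
    # cat.get_entities returned for the text):
    #   {'text': '...', 'annotations': [...], 'success': True,
    #    'timestamp': '...', 'footer': '...'}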

    def process_content_bulk(self, content):
        """
        Processes an array of documents extracting the annotations.
        :param content: array of documents to be processed, each containing a 'text' field.
        :return: processing results containing the documents with extracted annotations, stored as KVPs.
        """

        # process at least 10 docs per thread and don't bother starting
        # additional threads when fewer documents were provided
        min_doc_per_thread = 10
        num_slices = max(1, len(content) // min_doc_per_thread)
        batch_size = min(300, num_slices)

        if batch_size >= self.bulk_nproc:
            nproc = self.bulk_nproc
        else:
            batch_size = min_doc_per_thread
            nproc = max(1, num_slices)
            if len(content) > batch_size * nproc:
                nproc += 1
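
        # Worked examples of the sizing above (assuming the default bulk_nproc = 8):
        #   100 docs -> num_slices = 10, batch_size = 10 >= 8, so nproc = 8
        #    25 docs -> num_slices = 2, batch_size = 2 < 8, so batch_size = 10,
        #               nproc = 2; since 25 > 10 * 2, one extra process: nproc = 3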

        # use generators both to provide the input documents and to return the
        # resulting annotations, avoiding unnecessary memory copies
        invalid_doc_ids = []
        ann_res = self.cat.multi_processing(
            MedCatProcessor._generate_input_doc(content, invalid_doc_ids),
            nproc=nproc,
            batch_size=batch_size)

        return MedCatProcessor._generate_result(content, ann_res,
                                                invalid_doc_ids)
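
    # Hedged usage sketch (hypothetical payload): the method returns a generator,
    # so callers typically materialize it, e.g.:
    #   results = list(processor.process_content_bulk(
    #       [{'text': 'doc one'}, {'text': ''}]))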

    # helper generator functions to avoid multiple copies of data
    #
    @staticmethod
    def _generate_input_doc(documents, invalid_doc_idx):
        """
        Generator function returning documents to be processed as a list of tuples:
          (idx, text), (idx, text), ...
        Skips empty documents and reports their indices to the invalid_doc_idx array.
        :param documents: array of input documents that contain a 'text' field
        :param invalid_doc_idx: array that will collect the indices of invalid documents
        :return: consecutive tuples of (idx, document text)
        """
        for i, doc in enumerate(documents):
            # process the document only when its text is present and not blank
            text = doc.get('text')
            if text is not None and len(text.strip()) > 0:
                yield i, text
            else:
                invalid_doc_idx.append(i)
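
    # e.g. for documents = [{'text': 'kidney failure'}, {'text': ''}] this yields
    # (0, 'kidney failure') and records index 1 in invalid_doc_idx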

    @staticmethod
    def _generate_result(in_documents, annotations, invalid_doc_idx):
        """
        Generator function merging the resulting annotations with the input documents.
        The result for documents that were invalid will not contain any annotations.
        :param in_documents: array of input documents that contain 'text' field
        :param annotations: array of annotations extracted from documents
        :param invalid_doc_idx: array of invalid document idx
        :return: consecutive result dicts, one per input document
        """
        # generate output for valid annotations
        for res_idx, res_doc in annotations:
            in_ct = in_documents[res_idx]

            # parse the result
            out_res = {
                'text': res_doc["text"],
                'annotations': res_doc["entities"],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']

            yield out_res

        # generate output for invalid documents
        for i in invalid_doc_idx:
            in_ct = in_documents[i]

            out_res = {
                # the 'text' field may be absent for invalid documents
                'text': in_ct.get('text'),
                'annotations': [],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']

            yield out_res

    @staticmethod
    def _get_medcat_version():
        """
        Returns the version string of the MedCAT module as reported by pip.
        :return: the MedCAT version string
        """
        try:
            import subprocess
            result = subprocess.check_output(['pip', 'show', 'medcat'],
                                             universal_newlines=True)
            version_line = list(
                filter(lambda v: 'Version' in v, result.split('\n')))
            return version_line[0].split(' ')[1]
        except Exception as e:
            raise Exception("Cannot read the MedCAT library version") from e
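
# Minimal usage sketch (assumption: APP_MODEL_CDB_PATH and APP_MODEL_VOCAB_PATH
# point at valid model files; the document text is hypothetical):
#
#   processor = MedCatProcessor()
#   print(processor.get_app_info())
#   result = processor.process_content(
#       {'text': 'Patient presents with kidney failure.'})
#   print(result['annotations'])
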
# Save the new CDB for later
cdb.save_dict(os.path.join(medcat_path, 'simple_cdb.cdb'))

# To annotate documents we do
doc = "My simple document with kidney failure"
cat = CAT(cdb=cdb, vocab=vocab)
cat.train = False
doc_spacy = cat(doc)
# Entities are in
doc_spacy._.ents
# Or, to get JSON output
doc_json = cat.get_json(doc)

# To have a look at the results:
from spacy import displacy
# Note that this will not show all entities, but only the longest ones
displacy.serve(doc_spacy, style='ent')

# To run cat on a large number of documents
data = [] # [(<doc_id>, <text>), (<doc_id>, <text>), ...]
docs = cat.multi_processing(data)
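# a hypothetical filled-in example of the input format above:
#   data = [(0, "Patient with kidney failure"), (1, "No acute distress")]
# each entry of docs pairs a doc_id with the annotations extracted for that document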

# Training and Fine-tuning
# To fine-tune or train everything from the ground up (excluding word-vectors), you can use the following:

# Loading a CDB or creating a new one is done as above.

# To run the training, open the corpus and pass the file object; a context
# manager ensures the file is closed afterwards
with open("<some file with a lot of medical text>", 'r') as f:
    # If you want to fine-tune, set fine_tune=True; the old training will be preserved
    cat.run_training(f, fine_tune=False)
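
# After training, the updated CDB can be saved again for later use (assumption:
# the same path convention as the save_dict call at the top of this snippet):
cdb.save_dict(os.path.join(medcat_path, 'simple_cdb.cdb'))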