Ejemplo n.º 1
0
def setup(args):
    """
    Build the (dataset, model) pair described by the parsed CLI arguments.

    :param args: parsed arguments with `dataset`, `pipeline` and
        `word_embeddings` attributes.
    :return: tuple of (Dataset, model) — for the 'spacy' pipeline the
        SpacyModel class itself is returned instead of a Model instance.
    """
    dataset = Dataset(args.dataset)

    # 'spacy' is a special case: hand back the SpacyModel class directly.
    if args.pipeline == 'spacy':
        return dataset, SpacyModel

    entity_labels = list(dataset.get_labels())

    # Treat the argument as the name of a pipeline class inside
    # medacy.ner.pipelines and resolve it dynamically.
    pipelines_module = importlib.import_module("medacy.ner.pipelines")
    pipeline_cls = getattr(pipelines_module, args.pipeline)

    # Only forward word_embeddings when it was actually supplied.
    pipeline_kwargs = {'entities': entity_labels}
    if args.word_embeddings is not None:
        pipeline_kwargs['word_embeddings'] = args.word_embeddings
    pipeline = pipeline_cls(**pipeline_kwargs)

    return dataset, Model(pipeline)
Ejemplo n.º 2
0
 def test_init_training(self):
     """
     Verify that a Dataset built from the training directory is a Dataset
     instance and is flagged as training data.
     """
     training_dataset = Dataset(self.training_directory)
     self.assertIsInstance(training_dataset, Dataset)
     self.assertTrue(training_dataset.is_training())
Ejemplo n.º 3
0
    def test_init_with_data_limit(self):
        """
        Verify that a data limit passed to Dataset restricts the number of
        data files exposed by get_data_files().
        """
        limited = Dataset(self.training_directory, data_limit=6)
        file_count = len(limited.get_data_files())
        self.assertEqual(file_count, 6)
Ejemplo n.º 4
0
    def test_init_prediction(self):
        """
        Verify that a Dataset built from the prediction directory is a
        Dataset instance and is not flagged as training data.
        """
        prediction_dataset = Dataset(self.prediction_directory)

        self.assertIsInstance(prediction_dataset, Dataset)
        self.assertFalse(prediction_dataset.is_training())
Ejemplo n.º 5
0
    def setUpClass(cls):
        """Loads END dataset and writes files to temp directory.

        Populates a temp directory with: a deliberately malformed .ann file,
        two distinct valid annotation files (ann1.ann / ann2.ann), a modified
        copy of the first annotation set, and the source text file the
        annotations refer to.
        """
        cls.test_dir = tempfile.mkdtemp()  # set up temp directory
        cls.dataset, _, meta_data = Dataset.load_external('medacy_dataset_end')
        cls.entities = meta_data['entities']
        cls.ann_files = []
        # fill directory of training files
        for data_file in cls.dataset.get_data_files():
            file_name, raw_text, ann_text = (data_file.file_name,
                                             data_file.raw_path,
                                             data_file.ann_path)
            cls.ann_files.append(file_name + '.ann')

        with open(join(cls.test_dir, "broken_ann_file.ann"), 'w') as f:
            f.write("This is clearly not a valid ann file")

        cls.ann_file_path_one = join(cls.test_dir, "ann1.ann")
        with open(cls.ann_file_path_one, "w+") as f:
            f.write(ann_text_one)

        # BUG FIX: previously this reused the "ann1.ann" path and wrote
        # ann_text_two into ann_file_path_one, clobbering ann_text_one and
        # leaving path_two aliased to path_one. Use a distinct file and
        # write to it.
        cls.ann_file_path_two = join(cls.test_dir, "ann2.ann")
        with open(cls.ann_file_path_two, "w+") as f:
            f.write(ann_text_two)

        cls.ann_file_path_modified = join(cls.test_dir, "ann_mod.ann")
        with open(cls.ann_file_path_modified, "w+") as f:
            f.write(ann_text_one_modified)

        cls.ann_file_path_source = join(cls.test_dir, "ann_source.txt")
        with open(cls.ann_file_path_source, "w+") as f:
            f.write(ann_text_one_source)
Ejemplo n.º 6
0
    def setUpClass(cls):
        """
        Create temp training/prediction directories populated from the
        external END dataset: a .txt/.ann pair per document for training,
        and only the .txt files for prediction.

        NOTE(review): data_file.raw_path / data_file.ann_path are written out
        as file *contents* here — presumably these attributes hold the text
        in this medaCy version; confirm against the DataFile class.
        """
        # Fail fast with a clear message when the test dataset is missing.
        if importlib.util.find_spec('medacy_dataset_end') is None:
            raise ImportError(
                "medacy_dataset_end was not automatically installed for testing. See testing instructions for details."
            )
        cls.training_directory = tempfile.mkdtemp()  # train directory
        cls.prediction_directory = tempfile.mkdtemp()  # predict directory
        dataset, entities = Dataset.load_external('medacy_dataset_end')
        cls.entities = entities
        cls.ann_files = []

        for data_file in dataset.get_data_files():
            name = data_file.file_name
            raw_text = data_file.raw_path
            ann_text = data_file.ann_path
            cls.ann_files.append(name + '.ann')

            # Training directory gets both the text and the annotations.
            train_txt = os.path.join(cls.training_directory, "%s.txt" % name)
            with open(train_txt, 'w') as f:
                f.write(raw_text)
            train_ann = os.path.join(cls.training_directory, "%s.ann" % name)
            with open(train_ann, 'w') as f:
                f.write(ann_text)

            # Prediction directory gets only the text files.
            predict_txt = os.path.join(cls.prediction_directory,
                                       "%s.txt" % name)
            with open(predict_txt, 'w') as f:
                f.write(raw_text)
Ejemplo n.º 7
0
    def setUpClass(cls):
        """
        Load train/test copies of the external END dataset (limited to 1 and
        2 documents to keep tests fast) and create a temp directory to hold
        predictions.
        """
        # Fail fast with a clear message when the test dataset is missing.
        if importlib.util.find_spec('medacy_dataset_end') is None:
            raise ImportError(
                "medacy_dataset_end was not automatically installed for testing. See testing instructions for details."
            )

        cls.train_dataset, cls.entities = Dataset.load_external(
            'medacy_dataset_end')
        cls.train_dataset.set_data_limit(1)

        cls.test_dataset, _ = Dataset.load_external('medacy_dataset_end')
        cls.test_dataset.set_data_limit(2)

        # directory to store predictions
        cls.prediction_directory = tempfile.mkdtemp()
Ejemplo n.º 8
0
    def setUpClass(cls):
        """Load the external END dataset once for every test in this case."""
        # Fail fast with a clear message when the test dataset is missing.
        if importlib.util.find_spec('medacy_dataset_end') is None:
            raise ImportError(
                "medacy_dataset_end was not automatically installed for testing. See testing instructions for details."
            )

        cls.dataset, cls.entities = Dataset.load_external('medacy_dataset_end')
Ejemplo n.º 9
0
def get_evaluation_dataset():
    """
    Leave the evaluation folder empty if no evaluation data is provided.

    :return: a medaCy Dataset object containing this Dataset's designated evaluation data.
    """
    evaluation_path = join('data', 'evaluation')

    # No folder, or an empty folder, means no evaluation data: return None.
    folder_exists = resource_isdir(package_name, evaluation_path)
    if not folder_exists or not resource_listdir(package_name, evaluation_path):
        return None

    return Dataset(resource_filename(package_name, evaluation_path))
Ejemplo n.º 10
0
    def setUpClass(cls):
        """
        Loads END dataset and writes files to temp directory
        :return:
        """
        cls.test_dir = tempfile.mkdtemp()  # temp working directory
        cls.dataset, cls.entities = Dataset.load_external('medacy_dataset_end')

        # Record the expected .ann file name for each dataset document.
        cls.ann_files = []
        for data_file in cls.dataset.get_data_files():
            # raw_path/ann_path were also read (unused) in the original;
            # keep the attribute accesses.
            file_name, _raw, _ann = (data_file.file_name, data_file.raw_path,
                                     data_file.ann_path)
            cls.ann_files.append(file_name + '.ann')

        # Drop an intentionally malformed annotation file into the directory.
        with open(join(cls.test_dir, "broken_ann_file.ann"), 'w') as f:
            f.write("This is clearly not a valid ann file")
Ejemplo n.º 11
0
def get_training_dataset():
    """
    :return: a medaCy Dataset object containing this Dataset's designated training data.
    """
    training_path = resource_filename(package_name, 'data/training')
    return Dataset(training_path)
Ejemplo n.º 12
0
# Trains a SystematicReviewPipeline model on a local SMM4H task-2 dataset,
# MetaMapping the documents first.
from medacy.data import Dataset
from medacy.pipelines import SystematicReviewPipeline
from medacy.model import Model
from medacy.pipeline_components import MetaMap
import logging, sys

# print logs
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG)  #set level=logging.DEBUG for more information

#entity types
entities = ['ADR', 'Indication', 'Drug']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset(
    '/home/mahendrand/VE/SMM4H/data_smmh4h/task2/training/dataset')
#path = '../data_smmh4h/task2/training/dataset_1'
#set metamap path
metamap = MetaMap(
    metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
    convert_ascii=True)
# Pre-compute MetaMap annotations for every document before training.
training_dataset.metamap(metamap)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = SystematicReviewPipeline(metamap=metamap, entities=entities)
model = Model(
    pipeline, n_jobs=1
)  #distribute documents between 30 processes during training and prediction
# NOTE(review): comment above says 30 processes but n_jobs=1 — stale comment.

model.fit(training_dataset)
# NOTE(review): this call is truncated in this snippet.
model.cross_validate(num_folds=5,
Ejemplo n.º 13
0
    # NOTE(review): this fragment runs inside an enclosing loop over folds;
    # `i`, `num_files`, the file lists, directory paths, `pipeline` and
    # `metamap` are all defined above this excerpt.
    create_directory(dirTrain)

    print("Fold : ",i)

    # Copy every annotation and text file from both source datasets into
    # the training directory.
    for item in ann_files_1:
        shutil.copy(dataset1 + '/' + item, dirTrain)
    for item in ann_files_2:
        shutil.copy(dataset2 + '/' + item, dirTrain)
    for item in txt_files_1:
        shutil.copy(dataset1 + '/' + item, dirTrain)
    for item in txt_files_2:
        shutil.copy(dataset2 + '/' + item, dirTrain)

    # Move this fold's slice of dataset1 files from train to test: copy the
    # slice into dirTest, then remove those files from dirTrain.
    for item in ann_files_1[i * num_files:(i + 1) * num_files]:
        shutil.copy(dataset1 + '/' + item, dirTest)
        os.remove(dirTrain + '/' + item)
    for item in txt_files_1[i * num_files:(i + 1) * num_files]:
        shutil.copy(dataset1 + '/' + item, dirTest)
        os.remove(dirTrain + '/' + item)

    # Train a fresh model on the fold's training split.
    training_dataset = Dataset(dirTrain)
    training_dataset.metamap(metamap)

    model = Model(pipeline, n_jobs=1)
    model.fit(training_dataset)

    # run on a separate testing dataset
    testing_dataset = Dataset(dirTest)

    # location to store the predictions
    model.predict(testing_dataset, prediction_directory = dirPrediction)
Ejemplo n.º 14
0
# This script demonstrates utilizing medaCy for a full model training/predictive/cross validation use-case.
# > python training_predicting.py model_name
# Will build a model named model_name with the pipeline and parameters defined below. This script places the model in
# its own directory along with the model's build log and model/pipeline parameters to keep results easily referenceable during run time.
# Once a sufficient model is produced, consider wrapping it up into a medaCy compatible model as defined in the example guide.

from medacy.model import Model
from medacy.pipelines import SystematicReviewPipeline
from medacy.data import Dataset
from medacy.pipeline_components import MetaMap
import logging, datetime, time, os, sys

train_dataset, evaluation_dataset, entities = Dataset.load_external(
    'medacy_dataset_tac_2018')

# BUG FIX: sys.argv[1] is never None — when no argument was supplied the old
# `sys.argv[1] is None` check raised IndexError instead of exiting cleanly.
if len(sys.argv) < 2:
    exit(0)

#For rapid model prototyping, will train and predict by simply running the script with a model name as a parameter.
model_name = sys.argv[1]  #name for the model, use underscores
model_notes = "notes about the current model"  #notes about current model to be stored in a model information file by this script.

model_directory = "/home/username/named_entity_recognition/challenges/challenge_n/models/%s" % model_name.replace(
    " ", '_')

# BUG FIX: `model_name is ""` compared object identity against a literal
# (implementation-dependent, and a SyntaxWarning on modern Python); string
# equality requires ==.
if model_name == "" or os.path.isdir(model_directory):
    print("Model directory already exists, aborting")
    exit(0)
else:
    os.mkdir(model_directory)
Ejemplo n.º 15
0
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
import logging, sys

# Send DEBUG-level logs to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Entity types to recognize.
entities = ['Symptom', 'Drug']

# Local N2C2 symptom dataset.
training_dataset = Dataset('/home/mahendrand/VE/Data/N2C2/symptom')

# ClinicalPipeline without MetaMap features, single worker process.
pipeline = ClinicalPipeline(metamap=None, entities=entities)
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)

# 5-fold cross validation, writing predictions and ground truth to the
# default sub-directories.
model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)
Ejemplo n.º 16
0
    def cross_validate(self,
                       num_folds=10,
                       training_dataset=None,
                       prediction_directory=None):
        """
        Performs k-fold stratified cross-validation using our model and pipeline.

        If the training dataset and prediction_directory are passed, intermediate predictions during cross validation
        are written to the directory `write_predictions`. This allows one to construct a confusion matrix or to compute
        the prediction ambiguity with the methods present in the Dataset class to support pipeline development without
        a designated evaluation set.

        :param num_folds: number of folds to split training data into for cross validation
        :param training_dataset: Dataset that is being cross validated (optional)
        :param prediction_directory: directory to write predictions of cross validation to or `True` for default predictions sub-directory.
        :return: Prints out performance metrics, if prediction_directory
        """

        # Cross validation is meaningless with fewer than two folds.
        if num_folds <= 1:
            raise ValueError(
                "Number of folds for cross validation must be greater than 1")

        # Predictions can only be written out if we know which dataset (and
        # hence which source documents) the sequences came from.
        if prediction_directory is not None and training_dataset is None:
            raise ValueError(
                "Cannot generated predictions during cross validation if training dataset is not given."
                " Please pass the training dataset in the 'training_dataset' parameter."
            )

        assert self.model is not None, "Cannot cross validate a un-fit model"
        assert self.X_data is not None and self.y_data is not None, \
            "Must have features and labels extracted for cross validation"

        X_data = self.X_data
        Y_data = self.y_data

        medacy_pipeline = self.pipeline

        cv = SequenceStratifiedKFold(folds=num_folds)

        named_entities = medacy_pipeline.entities

        # Per-fold metric tables, keyed by 1-based fold number.
        evaluation_statistics = {}
        fold = 1
        for train_indices, test_indices in cv(X_data, Y_data):
            fold_statistics = {}
            # A fresh, untrained learner is created for every fold.
            learner_name, learner = medacy_pipeline.get_learner()

            X_train = [X_data[index] for index in train_indices]
            y_train = [Y_data[index] for index in train_indices]

            X_test = [X_data[index] for index in test_indices]
            y_test = [Y_data[index] for index in test_indices]

            logging.info("Training Fold %i", fold)
            # Each X element is a tuple: index 0 holds the feature sequence,
            # index 1 the token spans, index 2 the source file name (see the
            # indexing below).
            train_data = [x[0] for x in X_train]
            test_data = [x[0] for x in X_test]
            learner.fit(train_data, y_train)
            y_pred = learner.predict(test_data)

            if prediction_directory is not None:
                # Dict for storing mapping of sequences to their corresponding file
                # NOTE(review): this dict is re-created on every fold, so after
                # the loop it only contains the final fold's predictions; the
                # write-out below therefore covers just that fold — confirm
                # whether that is intended.
                preds_by_document = {
                    filename: []
                    for filename in list(set([x[2] for x in X_data]))
                }

                # Flattening nested structures into 2d lists
                document_indices = []
                span_indices = []
                for sequence in X_test:
                    # Repeat the file name once per token in the sequence so
                    # document_indices lines up with the flattened predictions.
                    document_indices += [
                        sequence[2] for x in range(len(sequence[0]))
                    ]
                    span_indices += [element for element in sequence[1]]
                predictions = [
                    element for sentence in y_pred for element in sentence
                ]

                # Map the predicted sequences to their corresponding documents
                i = 0
                while i < len(predictions):
                    # 'O' marks a token outside any entity; skip it.
                    if predictions[i] == 'O':
                        i += 1
                        continue
                    entity = predictions[i]
                    document = document_indices[i]
                    first_start, first_end = span_indices[i]
                    # Ensure that consecutive tokens with the same label are merged
                    while i < len(predictions) - 1 and predictions[
                            i +
                            1] == entity:  # If inside entity, keep incrementing
                        i += 1
                    last_start, last_end = span_indices[i]

                    # Record the merged span (label, start of first token,
                    # end of last token) against its source document.
                    preds_by_document[document].append(
                        (entity, first_start, last_end))
                    i += 1

            # Write the metrics for this fold.
            for label in named_entities:
                fold_statistics[label] = {}
                recall = metrics.flat_recall_score(y_test,
                                                   y_pred,
                                                   average='weighted',
                                                   labels=[label])
                precision = metrics.flat_precision_score(y_test,
                                                         y_pred,
                                                         average='weighted',
                                                         labels=[label])
                f1 = metrics.flat_f1_score(y_test,
                                           y_pred,
                                           average='weighted',
                                           labels=[label])
                fold_statistics[label]['precision'] = precision
                fold_statistics[label]['recall'] = recall
                fold_statistics[label]['f1'] = f1

            # add averages
            fold_statistics['system'] = {}
            recall = metrics.flat_recall_score(y_test,
                                               y_pred,
                                               average='weighted',
                                               labels=named_entities)
            precision = metrics.flat_precision_score(y_test,
                                                     y_pred,
                                                     average='weighted',
                                                     labels=named_entities)
            f1 = metrics.flat_f1_score(y_test,
                                       y_pred,
                                       average='weighted',
                                       labels=named_entities)
            fold_statistics['system']['precision'] = precision
            fold_statistics['system']['recall'] = recall
            fold_statistics['system']['f1'] = f1

            # Log a per-fold table of precision/recall/F1 per entity.
            table_data = [[
                label,
                format(fold_statistics[label]['precision'], ".3f"),
                format(fold_statistics[label]['recall'], ".3f"),
                format(fold_statistics[label]['f1'], ".3f")
            ] for label in named_entities + ['system']]

            logging.info(
                tabulate(table_data,
                         headers=['Entity', 'Precision', 'Recall', 'F1'],
                         tablefmt='orgtbl'))

            evaluation_statistics[fold] = fold_statistics
            fold += 1

        # Aggregate mean/max/min of each metric across all folds.
        statistics_all_folds = {}

        for label in named_entities + ['system']:
            statistics_all_folds[label] = {}
            statistics_all_folds[label]['precision_average'] = mean([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['precision_max'] = max([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['precision_min'] = min([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])

            statistics_all_folds[label]['recall_average'] = mean([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['recall_max'] = max([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['recall_min'] = min([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])

            statistics_all_folds[label]['f1_average'] = mean([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['f1_max'] = max([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['f1_min'] = min([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])

        # Summary table of averaged metrics across folds.
        table_data = [[
            label,
            format(statistics_all_folds[label]['precision_average'], ".3f"),
            format(statistics_all_folds[label]['recall_average'], ".3f"),
            format(statistics_all_folds[label]['f1_average'], ".3f"),
            format(statistics_all_folds[label]['f1_min'], ".3f"),
            format(statistics_all_folds[label]['f1_max'], ".3f")
        ] for label in named_entities + ['system']]

        logging.info("\n" + tabulate(table_data,
                                     headers=[
                                         'Entity', 'Precision', 'Recall', 'F1',
                                         'F1_Min', 'F1_Max'
                                     ],
                                     tablefmt='orgtbl'))

        if prediction_directory:
            # Write annotations generated from cross-validation
            if isinstance(prediction_directory, str):
                prediction_directory = prediction_directory
            else:
                # prediction_directory=True selects the default sub-directory
                # under the training dataset.
                prediction_directory = training_dataset.data_directory + "/predictions/"
            if os.path.isdir(prediction_directory):
                logging.warning("Overwritting existing predictions")
            else:
                os.makedirs(prediction_directory)
            # Convert the collected (entity, start, end) tuples back into
            # .ann files, one per source document.
            for data_file in training_dataset.get_data_files():
                logging.info("Predicting file: %s", data_file.file_name)
                with open(data_file.raw_path, 'r') as raw_text:
                    doc = medacy_pipeline.spacy_pipeline.make_doc(
                        raw_text.read())
                    preds = preds_by_document[data_file.file_name]
                    annotations = construct_annotations_from_tuples(doc, preds)
                    annotations.to_ann(write_location=os.path.join(
                        prediction_directory, data_file.file_name + ".ann"))
            return Dataset(data_directory=prediction_directory)
Ejemplo n.º 17
0
from medacy.data import Dataset
import logging,sys

from pprint import pprint

# Compare a training dataset against predictions previously written over it.

training_dataset = Dataset('/home/mahendrand/VE/TAC/data_TAC')
prediction_dataset = Dataset('/home/mahendrand/VE/TAC/data_TAC/predictions')

# Ambiguity between gold annotations and predictions (computed but not
# printed here).
ambiguity_dict = training_dataset.compute_ambiguity(prediction_dataset)

# Confusion matrix between gold and predicted entities, leniency 1.
entities, confusion_matrix = training_dataset.compute_confusion_matrix(prediction_dataset, leniency=1)

pprint(training_dataset.compute_counts())

print(entities)
pprint(confusion_matrix)
Ejemplo n.º 18
0
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap

import logging, sys

# print logs
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG)  #set level=logging.DEBUG for more information

#entity types
entities = ['Symptom', 'Drug']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/Data/N2C2_END/symptom')

#set metamap path
metamap = MetaMap(
    metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
    convert_ascii=True)
training_dataset.metamap(metamap)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = ClinicalPipeline(metamap=metamap, entities=entities)
model = Model(
    pipeline, n_jobs=1
)  #distribute documents between 30 processes during training and prediction

model.fit(training_dataset)
model.cross_validate(num_folds=5,
Ejemplo n.º 19
0
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap

import logging, sys

# print logs
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG)  #set level=logging.DEBUG for more information

#entity types
entities = ['treatment', 'problem', 'test']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/samantha/Desktop/Research/Data/i2b2/data')
#path = '../data_smmh4h/task2/training/dataset_1'
#set metamap path
# metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap", convert_ascii=True)
# training_dataset.metamap(metamap)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = ClinicalPipeline(metamap=None, entities=entities)
model = Model(
    pipeline, n_jobs=1
)  #distribute documents between 30 processes during training and prediction

model.fit(training_dataset)
model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
Ejemplo n.º 20
0
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap

import logging,sys

# Send DEBUG-level logs to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Entity types to recognize in the i2b2 data.
entities = ['treatment', 'problem', 'test']

training_dataset = Dataset('/home/mahendrand/VE/Data/i2b2/data')

# MetaMap every document before feature extraction.
metamap = MetaMap(
    metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
    convert_ascii=True)
training_dataset.metamap(metamap)

pipeline = ClinicalPipeline(metamap=metamap, entities=entities)
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)
model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)

# Location to store the clinical model.
model.dump('/home/mahendrand/VE/SMM4H/medaCy/medacy/clinical_model.pickle')
Ejemplo n.º 21
0
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap

import logging, sys

# print logs
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG)  #set level=logging.DEBUG for more information

#entity types
entities = ['ADE', 'Drug', 'Dose']

training_dataset = Dataset('/home/mahendrand/VE/Data/MADE/training')

#set metamap path
metamap = MetaMap(
    metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
    convert_ascii=True)
# Pre-annotate every document with MetaMap before feature extraction.
training_dataset.metamap(metamap)

pipeline = ClinicalPipeline(metamap=metamap, entities=entities)
model = Model(
    pipeline, n_jobs=1
)  #distribute documents between 30 processes during training and prediction
# NOTE(review): comment above says 30 processes but n_jobs=1 — stale comment.

model.fit(training_dataset)

#cross validation
Ejemplo n.º 22
0
from medacy.data import Dataset
from medacy.ner.pipelines import SystematicReviewPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap
import logging,sys



# print logs
# logging.basicConfig(stream=sys.stdout,level=logging.DEBUG) #set level=logging.DEBUG for more information

#entity types
entities = ['CellLine','Dose','DoseDuration', 'DoseDurationUnits', 'DoseFrequency', 'DoseRoute', 'DoseUnits', 'Endpoint','EndpointUnitOfMeasure', 'GroupName', 'GroupSize', 'SampleSize', 'Sex', 'Species', 'Strain', 'TestArticle', 'TestArticlePurity', 'TestArticleVerification', 'TimeAtDose', 'TimeAtFirstDose', 'TimeAtLastDose', 'TimeEndpointAssessed', 'TimeUnits', 'Vehicle' ]

training_dataset = Dataset('/home/mahendrand/VE/TAC/data_TAC')
#set metamap path
metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap", convert_ascii=True)
training_dataset.metamap(metamap)

pipeline = SystematicReviewPipeline(metamap=metamap, entities=entities)
model = Model(pipeline, n_jobs=1) #distribute documents between 30 processes during training and prediction

model.fit(training_dataset)
# BUG FIX: cross_validate has no 'dataset' or 'write_predictions' keywords —
# its parameters are 'training_dataset' and 'prediction_directory' (see
# Model.cross_validate); the previous call raised TypeError.
model.cross_validate(num_folds=5, training_dataset=training_dataset, prediction_directory=True)

#location to store the clinical model
model.dump('/home/mahendrand/VE/SMM4H/medaCy/medacy/clinical_model.pickle')

#location to store the predictions
Ejemplo n.º 23
0
from medacy.data import Dataset
from medacy.ner.pipelines import SystematicReviewPipeline
from medacy.ner.model import Model
import logging,sys

# Send DEBUG-level logs to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Entity types to recognize in the TAC data.
entities = [
    'CellLine', 'Dose', 'DoseDuration', 'DoseDurationUnits', 'DoseFrequency',
    'DoseRoute', 'DoseUnits', 'Endpoint', 'EndpointUnitOfMeasure',
    'GroupName', 'GroupSize', 'SampleSize', 'Sex', 'Species', 'Strain',
    'TestArticle', 'TestArticlePurity', 'TestArticleVerification',
    'TimeAtDose', 'TimeAtFirstDose', 'TimeAtLastDose',
    'TimeEndpointAssessed', 'TimeUnits', 'Vehicle'
]

training_dataset = Dataset('/home/mahendrand/VE/TAC/sample')

# SystematicReviewPipeline without MetaMap features, single worker process.
pipeline = SystematicReviewPipeline(metamap=None, entities=entities)
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)

model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)

# Location to store the clinical model.
model.dump('/home/mahendrand/VE/SMM4H/medaCy/medacy/clinical_model.pickle')
Ejemplo n.º 24
0
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap

import logging,sys

# Send DEBUG-level logs to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Entity types to recognize in the CADEC data.
entities = ['ADR', 'Drug', 'Symptom']

training_dataset = Dataset('/home/mahendrand/VE/Data/CADEC/converted')

# MetaMap every document before feature extraction.
metamap = MetaMap(
    metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
    convert_ascii=True)
training_dataset.metamap(metamap)

pipeline = ClinicalPipeline(metamap=metamap, entities=entities)
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)

# 5-fold cross validation, writing predictions and ground truth to the
# default sub-directories.
model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)
Ejemplo n.º 25
0
# Manual per-fold evaluation on the CADEC_END splits: each fold trains its
# own model and predicts on the matching held-out test split.
# NOTE(review): `Dataset`, `Model` and `pipeline` are assumed to be defined
# earlier in this script (outside this excerpt) — confirm before running.

# fold 1 (currently disabled)
# training_dataset_1 = Dataset('/home/mahendrand/VE/Data/CADEC_END/1/train')
# # training_dataset_1.metamap(metamap)
#
# model_1 = Model(pipeline, n_jobs=1)
# model_1.fit(training_dataset_1)

# #run on a separate testing dataset
# testing_dataset_1 = Dataset('/home/mahendrand/VE/Data/CADEC_END/1/test')
# # location to store the predictions
# model_1.predict(testing_dataset_1, prediction_directory='/home/mahendrand/VE/Data/preds/5 fold/CADEC_END')
#
#
# #fold 2
training_dataset_2 = Dataset('/home/mahendrand/VE/Data/CADEC_END/2/train')
# training_dataset_2.metamap(metamap)
#
model_2 = Model(pipeline, n_jobs=1)
model_2.fit(training_dataset_2)

# Run the fold-2 model on its held-out test split.
testing_dataset_2 = Dataset('/home/mahendrand/VE/Data/CADEC_END/2/test')
# BUG FIX: the original called `model.predict(...)`, but no `model` is
# defined in this fold — predictions must come from the model trained on
# fold 2 (`model_2`).
model_2.predict(
    testing_dataset_2,
    prediction_directory='/home/mahendrand/VE/Data/preds/5 fold/CADEC_END')
#
#
# #fold 3
# training_dataset_3 = Dataset('/home/mahendrand/VE/Data/CADEC_END/3/train')
Ejemplo n.º 26
0
import logging
import sys

from medacy.data import Dataset
from medacy.ner.model import Model
from medacy.ner.pipelines import ClinicalPipeline
from medacy.pipeline_components import MetaMap

# Debug logging to stdout so training progress can be followed.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Entity types annotated in the END drug corpus.
entities = ['ADE', 'Drug', 'Reason']

training_dataset = Dataset('/home/mahendrand/VE/Data/END/drug')

# MetaMap the corpus before training so the pipeline can use its output.
metamap = MetaMap(
    metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
    convert_ascii=True)
training_dataset.metamap(metamap)

# Clinical pipeline with MetaMap features; one worker process.
pipeline = ClinicalPipeline(metamap=metamap, entities=entities)
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)

#cross validation
Ejemplo n.º 27
0
# BUG FIX: `Dataset` is used below but was never imported in this script
# (sibling examples import it from medacy.data) — without it this script
# raises NameError at the Dataset(...) call.
from medacy.data import Dataset
from medacy.ner.pipelines import SystematicReviewPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap

import logging, sys

# print logs
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG)  #set level=logging.DEBUG for more information

#entity types annotated in the N2C2 corpus
entities = ['Reason', 'ADE', 'Drug']
# entities = ['Symptom', 'Form', 'Route', 'Frequency', 'Duration', 'Dosage', 'Strength', 'Drug']
# dirPred = '/home/mahendrand/VE/Predictions/CV/N2C2'
training_dataset = Dataset('/home/mahendrand/VE/Data/N2C2/data')

#set metamap path and pre-process the corpus with MetaMap
metamap = MetaMap(
    metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
    convert_ascii=True)
training_dataset.metamap(metamap)

# pipeline = ClinicalPipeline(metamap=metamap, entities=entities)
pipeline = SystematicReviewPipeline(metamap=metamap, entities=entities)
model = Model(
    pipeline, n_jobs=1
)  #distribute documents between 30 processes during training and prediction

model.fit(training_dataset)
Ejemplo n.º 28
0
# Stream debug-level logs to stdout to follow training progress.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# entities = ['CellLine','Dose','DoseDuration', 'DoseDurationUnits', 'DoseFrequency', 'DoseRoute', 'DoseUnits', 'EndpointUnitOfMeasure', 'GroupName', 'GroupSize', 'SampleSize', 'Sex', 'Species', 'Strain', 'TestArticle', 'TestArticlePurity', 'TestArticleVerification', 'TimeAtDose', 'TimeAtFirstDose', 'TimeAtLastDose', 'TimeEndpointAssessed', 'TimeUnits', 'Vehicle' ]
# Entity types annotated in the TAC corpus.
entities = [
    'CellLine', 'Dose', 'DoseDuration', 'DoseDurationUnits', 'DoseFrequency',
    'DoseRoute', 'DoseUnits', 'Endpoint', 'EndpointUnitOfMeasure', 'GroupName',
    'GroupSize', 'SampleSize', 'Sex', 'Species', 'Strain', 'TestArticle',
    'TestArticlePurity', 'TestArticleVerification', 'TimeAtDose',
    'TimeAtFirstDose', 'TimeAtLastDose', 'TimeEndpointAssessed', 'TimeUnits',
    'Vehicle',
]

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/TAC/data_TAC')

# Clinical pipeline without MetaMap; one worker process.
pipeline = ClinicalPipeline(metamap=None, entities=entities)
model = Model(pipeline, n_jobs=1)

# Train on the full corpus, then run 5-fold cross validation.
model.fit(training_dataset)

model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)