def cfg():


    description = 'covid_xray'

    source = '/home/lybarger/clinical_extractors/analyses_pulmonary/step005_text_import/covid_xray/corpus.pkl'
    include = None
    exclude = None
    as_stem = True

    model_dir = '/home/lybarger/clinical_extractors/analyses_pulmonary/step320_pulmonary_modeling/fit/sent1+concat0+run1/'


    fast_run = True

    output_dir = paths_pulmonary.predict
    if fast_run:
        destination = os.path.join(output_dir, description + '_FAST_RUN')
    else:
        destination = os.path.join(output_dir, description)

    # Scratch directory
    make_and_clear(destination)

    device = 0

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
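
These cfg() functions follow the Sacred experiment pattern; a minimal sketch of the wiring the excerpts assume (the Experiment object ex is never defined in the excerpts themselves, so the name below is illustrative):

# Minimal Sacred wiring sketch (assumed context, not part of the excerpts):
from sacred import Experiment
from sacred.observers import FileStorageObserver

ex = Experiment('pulmonary')        # illustrative name; ex is assumed, not shown

@ex.config
def cfg():
    description = 'covid_xray'      # local variables become config entries

@ex.automain
def main(description):
    print(description)              # entries are injected into main by name
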
def cfg():

    description = 'basic'

    source_dir = paths_pulmonary.brat_import
    source = constants_pulmonary.COVID_XRAY
    file = constants.CORPUS_FILE
    source = os.path.join(source_dir, source, file)

    labels = [INFILTRATES, EXTRAPARENCHYMAL]

    doc_map = constants_pulmonary.DOC_MAP

    doc_label_order = [INFILTRATES, EXTRAPARENCHYMAL]
    assertion_label_order = [NONE, PRESENT, UNILATERAL, BILATERAL]
    '''
    Paths
    '''
    destination = os.path.join(paths_pulmonary.stats, description)

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def cfg():

    #mode = CV
    mode = FIT
    #mode = PREDICT
    #mode = SCORE

    #file_doc_scores = ["scores_doc_labels.csv", "scores_doc_labels_summary.csv", "scores_sent_labels_summary.csv"]
    file_doc_scores = "scores_doc_labels.csv"  # also available: "scores_entities.csv", "scores_relations.csv"
    file_sent_scores = "scores_sent_labels_summary.csv"

    source_dirs = [os.path.join(paths_pulmonary.modeling, mode)]
    discrete_dir = '/home/lybarger/clinical_extractors/analyses_pulmonary/step322_pulmonary_discrete/ngrams/'

    if mode == FIT:
        source_dirs.append(discrete_dir)

    metric = F1
    destination = os.path.join(paths_pulmonary.summary, mode)

    suffix_pat = r'\+run\d'  # raw string; matches run suffixes like '+run1'

    # Destination file for corpus

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
Example #4
def text_to_disk(corpus, destination, sub_dir="text"):
    '''
    Save corpus to disk as txt files
    '''
    dir = os.path.join(destination, sub_dir)
    make_and_clear(dir, recursive=True)
    for doc in corpus.docs():
        doc.write_text(dir)
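
make_and_clear is called throughout these examples but never defined in them; a plausible sketch, assuming it resets the target to an empty directory:

# Hypothetical make_and_clear (an assumption based on its call sites, not the
# source's definition):
import os
import shutil

def make_and_clear(path, recursive=False):
    if os.path.exists(path):
        shutil.rmtree(path)     # drop any previous contents
    if recursive:
        os.makedirs(path)       # create intermediate directories as needed
    else:
        os.mkdir(path)
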
Example #5
    def fit_cv(self,
               X,
               y,
               device=None,
               path=None,
               n_splits=3,
               shuffle=True,
               seed=1):

        if shuffle:
            z = list(zip(X, y))
            random.Random(seed).shuffle(z)
            X, y = zip(*z)
            if not isinstance(X, list):
                X = list(X)
            if not isinstance(y, list):
                y = list(y)

        kf = KFold(n_splits=n_splits)

        dfs = OrderedDict()
        for j, (train_index, test_index) in enumerate(kf.split(X)):

            self.reset_parameters()

            X_train = [X[i] for i in train_index]
            y_train = [y[i] for i in train_index]

            X_test = [X[i] for i in test_index]
            y_test = [y[i] for i in test_index]

            dir = os.path.join(path, f'cross_val_{j}')
            make_and_clear(dir)

            self.fit(X_train, y_train, device=device, path=dir)
            y_pred, scores = self.score(X_test,
                                        y_test,
                                        device=device,
                                        path=dir)

            for name, df in scores.items():
                if name not in dfs:
                    dfs[name] = []
                dfs[name].append(df)

        dfs = self.scorer.combine_cv(dfs, path=path)

        return dfs
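
Note that fit_cv shuffles once up front with a fixed seed instead of passing shuffle=True to KFold, so fold assignment is reproducible across runs. A usage sketch (the model class and data names are illustrative, not from the source):

# model = ModelXray(...)                                  # illustrative
# dfs = model.fit_cv(X, y, device=0, path='/tmp/cv', n_splits=3)
# dfs maps each score-table name to its cross-validation aggregate.
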
Example #6
    def tokenization_examples(self, path, n, **kwargs):

        make_and_clear(path, recursive=True)

        for doc in self.docs(**kwargs)[:n]:

            # Output file name
            fn = os.path.join(path, '{}_original.{}'.format(doc.id, 'txt'))

            # Directory, including path in id
            dir_ = os.path.dirname(fn)
            if not os.path.exists(dir_):
                os.makedirs(dir_)

            with open(fn, 'w') as f:
                f.write(doc.text())

            fn = os.path.join(path, '{}_tokenized.{}'.format(doc.id, 'txt'))
            with open(fn, 'w') as f:
                f.write('\n====\n====\n'.join(doc.sents()))
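
A usage sketch (path and n are illustrative): for each of the first n documents this writes <id>_original.txt and <id>_tokenized.txt, the latter with sentences separated by '====' markers for side-by-side inspection.

# corpus.tokenization_examples(path='/tmp/tok_check', n=5)   # illustrative
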
def cfg():

    use_binary = False
    description = f'binary_{int(use_binary)}'

    source = constants_pulmonary.XRAY_IMAGES

    source_corpus_text = '/home/lybarger/clinical_extractors/analyses_pulmonary/step005_text_import/covid_xray/corpus.pkl'

    source_corpus_anno = '/home/lybarger/clinical_extractors/analyses_pulmonary/step010_brat_import/covid_xray/corpus.pkl'

    if use_binary:
        source_model = '/home/lybarger/clinical_extractors/analyses_pulmonary/step320_pulmonary_modeling/fit/sent1+concat0+run0+bd1/'
        doc_map = constants_pulmonary.DOC_MAP_BINARY
    else:
        source_model = '/home/lybarger/clinical_extractors/analyses_pulmonary/step320_pulmonary_modeling/fit/sent1+concat0+run0/'
        doc_map = constants_pulmonary.DOC_MAP

    source_image = paths_pulmonary.xray_quadrant_interp

    load_predictions = False

    #file = constants.CORPUS_FILE
    #source = os.path.join(source_dir, source, file)

    device = 0
    '''
    Paths
    '''
    destination = os.path.join(paths_pulmonary.image_anno_comp, description)

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def get_predictions(source_model,
                    device,
                    text_dict,
                    target_ids,
                    path,
                    load_predictions=False):

    logging.info(f"=" * 72)
    logging.info(f"Predictions")
    logging.info(f"=" * 72)

    logging.info(f"Document count, all: {len(text_dict)}")

    text = []
    for study_id, accession in target_ids:

        k = (study_id, accession)

        text.append(text_dict[k])

    logging.info(f"Document count, target: {len(text)}")

    dir = os.path.join(path, 'predictions')
    f = os.path.join(dir, PREDICTIONS_FILE)

    if load_predictions:
        y = joblib.load(f)
    else:
        model = load_pretrained(ModelXray, source_model)
        y = model.predict(X=text, device=device)

        make_and_clear(dir)
        joblib.dump(y, f)

    labels = [y_[constants.DOC_LABELS] for y_ in y]
    labels = nest_list(labels)

    return labels
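
get_predictions caches model output with joblib so reruns with load_predictions=True skip inference. The same pattern in isolation, as a minimal sketch (names are illustrative):

# Minimal sketch of the joblib caching pattern used in get_predictions:
import os
import joblib

def predict_cached(predict_fn, cache_file, load=False):
    if load:
        return joblib.load(cache_file)    # reuse saved predictions
    y = predict_fn()                      # run (expensive) inference
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    joblib.dump(y, cache_file)            # persist for future runs
    return y
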
Example #9
def cfg():

    # Annotation source
    #source = constants.SYMPTOMS
    #source = constants.SDOH
    source = constants_pulmonary.COVID_XRAY
    #source = constants.SDOH_DEID
    #source = constants.SDOH_PARTIAL

    fast_run = False
    fast_count = 50 if fast_run else None

    source_dir = None

    skip = None
    source_tags = None
    source_original = None
    write_brat = False
    write_text = False
    map_ids = False
    corpus_object = CorpusBrat
    update_lb = False
    id2tags = None
    rm_extra_lb = False
    snap_textbounds = False
    linebreak_bound = True

    if source == constants.SDOH:
        source_dir = paths_deid.sdoh_brat
        source_tags = paths_deid.sdoh_doc_tags
        source_original = paths_deid.sdoh_original
        dir = paths_deid.brat_import
        write_brat = True
        write_text = True
        update_lb = True

    elif source == constants.SDOH_DEID:
        source_dir = paths_deid.sdoh_brat_deid
        dir = paths_deid.brat_import
        corpus_object = CorpusBratDeid

    elif source == constants.SDOH_PARTIAL:
        source_dir = paths.sdoh_brat_partial
        source_tags = paths.sdoh_doc_tags
        source_original = paths.sdoh_original
        dir = paths.brat_import
        write_brat = True
        write_text = True
        fast_run = True

    elif source == constants.SYMPTOMS:
        source_dir = paths_symptoms.symptoms_brat
        source_tags = paths_symptoms.symptoms_doc_tags
        #source_original = paths_symptoms.symptoms_original
        dir = paths_symptoms.brat_import
        corpus_object = CorpusBratSymptoms
        write_brat = False
        write_text = False

    elif source == constants_pulmonary.COVID_XRAY:
        source_dir = paths_pulmonary.brat_xray
        source_tags = paths_pulmonary.pulmonary_doc_tags
        dir = paths_pulmonary.brat_import
        corpus_object = CorpusBratXray
        id2tags = corpus_brat_xray.id2tags
        rm_extra_lb = True
        snap_textbounds = True
        skip = []
    else:
        raise ValueError("invalid source: {}".format(source))
    '''
    Paths
    '''
    if fast_run:
        destination = os.path.join(dir, source + '_FAST_RUN')
    else:
        destination = os.path.join(dir, source)

    # Destination file for corpus

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def cfg():

    description = 'bert'
    #description = 'baseline+crf'

    source = constants_pulmonary.COVID_XRAY
    dir = None


    mode = CV
    #mode = FIT
    #mode = PREDICT
    #mode = SCORE

    model_dir = '/home/lybarger/clinical_extractors/analyses_pulmonary/step320_pulmonary_modeling/fit/baseline/'

    fast_run = False
    fast_count = 20 if fast_run else None

    source_dir = None

    n_splits = 3

    doc_map = constants_pulmonary.DOC_MAP


    if source == constants_pulmonary.COVID_XRAY:
        source_dir = paths_pulmonary.brat_import
        source = constants_pulmonary.COVID_XRAY
        source = os.path.join(source_dir, source, constants.CORPUS_FILE)
        output_dir = paths_pulmonary.modeling
    else:
        raise ValueError("invalid source: {}".format(source))


    side_swap = False
    if side_swap:
        entity_definition = ENTITY_DEFINITION_SWAP
        relation_definition = RELATION_DEFINITION_SWAP
    else:
        entity_definition = ENTITY_DEFINITION
        relation_definition = RELATION_DEFINITION

    sent_definition = SENT_DEFINITION


    '''
    Paths
    '''
    if fast_run:
        destination = os.path.join(output_dir, mode, description + '_FAST_RUN')
    else:
        destination = os.path.join(output_dir, mode, description)

    # Destination file for corpus

    # Scratch directory
    make_and_clear(destination)

    device = 0
    use_sent_objective = True
    concat_sent_scores = True
    span_embed_dim = 50
    batch_size = 4

    num_workers = 0


    max_sent_count = 35
    keep_ws = False
    linebreak_bound = True

    dropout_sent = 0.0
    dropout_doc = 0.0

    lr = 1e-5
    lr_ratio = 1.0

    pretrained = "emilyalsentzer/Bio_ClinicalBERT" # 'bert-base-uncased'
    doc_definition = DOC_DEFINITION
    sent_definition = SENT_DEFINITION
    grad_max_norm = 1.0
    loss_reduction = "sum"


    project_sent = True
    project_size = 200

    attention_query_dim = 100

    max_length = 60

    num_workers = 6
    num_epochs = 10

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
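
The pretrained value above is a Hugging Face model id; a minimal load sketch, assuming the transformers package (the source's actual model wrapper is not shown in these excerpts):

from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
encoder = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
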
Example #11
    def write_brat(self, path, **kwargs):

        make_and_clear(path, recursive=True)
        for doc in self.docs(**kwargs):
            doc.write_brat(path)
def cfg():

    description = 'baseline'

    source = constants.SYMPTOMS

    dir = None

    fast_run = True

    source_dir = None

    include_train = [constants.TRAIN]

    if source == constants.SYMPTOMS:
        source_dir = paths_symptoms.brat_import
        if fast_run:
            source += '_FAST_RUN'
        source = os.path.join(source_dir, source, constants.CORPUS_FILE)
        output_dir = paths_symptoms.modeling
    else:
        raise ValueError("invalid source: {}".format(source))
    '''
    Paths
    '''
    if fast_run:
        destination = os.path.join(output_dir, description + '_FAST_RUN')
    else:
        destination = os.path.join(output_dir, description)

    # Destination file for corpus

    # Scratch directory
    make_and_clear(destination)

    device = 1
    use_rnn = True
    num_workers = 0
    xfmr_dim = 768
    lstm_size = 200
    h_size = lstm_size * 2 if use_rnn else xfmr_dim
    loss_reduction = "sum"

    hyperparams = {}
    hyperparams['use_rnn'] = use_rnn

    rnn = {}
    rnn['input_size'] = xfmr_dim
    rnn['output_size'] = lstm_size
    rnn['type_'] = 'lstm'
    rnn['num_layers'] = 1
    rnn['dropout_output'] = 0.0
    hyperparams['rnn'] = rnn

    relation_extractor = {}
    relation_extractor["entity_definition"] = constants_symptoms.ENTITY_DEFINITION
    relation_extractor["input_dim"] = h_size
    relation_extractor["span_scorer_type"] = "span"
    relation_extractor["span_embed_project"] = True
    relation_extractor["span_embed_dim"] = 100
    relation_extractor["span_embed_dropout"] = 0.0
    relation_extractor["span_scorer_hidden_dim"] = 100
    relation_extractor["span_scorer_dropout"] = 0.0
    relation_extractor["span_class_weights"] = None

    relation_extractor["spans_per_word"] = 2
    relation_extractor["relation_definition"] = constants_symptoms.RELATION_DEFINITION
    relation_extractor["role_hidden_dim"] = 100
    relation_extractor["role_output_dim"] = 2
    relation_extractor["role_dropout"] = 0.0
    relation_extractor["loss_reduction"] = loss_reduction

    hyperparams["relation_extractor"] = relation_extractor
    hyperparams['grad_max_norm'] = 1.0
    hyperparams["loss_reduction"] = loss_reduction

    dataset_params = {}
    dataset_params["pretrained"] = "emilyalsentzer/Bio_ClinicalBERT"
    dataset_params["max_length"] = 30
    dataset_params["max_wp_length"] = 60
    dataset_params["linebreak_bound"] = True
    dataset_params["keep"] = 'mean'
    dataset_params["max_span_width"] = 6
    dataset_params["entity_definition"] = constants_symptoms.ENTITY_DEFINITION
    dataset_params["relation_definition"] = constants_symptoms.RELATION_DEFINITION

    dataloader_params = {}
    dataloader_params['batch_size'] = 100
    dataloader_params['num_workers'] = num_workers

    optimizer_params = {}
    optimizer_params['lr'] = 0.001

    num_workers = 6
    num_epochs = 100

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
Example #13
def main(source, destination, corpus_fn, num_examples, source_params, sampling,
         exclude, annotators, id2filename):

    # Load and create corpus
    logging.info("Corpus loading...")
    if source == constants_pulmonary.COVID_XRAY:
        corpus = xray.load_corpus(**source_params)

        logging.info('-' * 72)
        logging.info('Only day 0')
        logging.info('-' * 72)
        corpus.summary(path=destination, day_range=(0, 0), rep_range=None)

        logging.info('-' * 72)
        logging.info('Only day in [0-7]')
        logging.info('-' * 72)
        corpus.summary(path=destination, day_range=(0, 7), rep_range=None)

        logging.info('-' * 72)
        logging.info('All notes')
        logging.info('-' * 72)
        corpus.summary(path=destination, day_range=None, rep_range=None)

    elif source == constants_pulmonary.EMERGE_XRAY:
        corpus = xray.load_emerge_corpus(**source_params)

    else:
        raise ValueError("Incorrect corpus:\t{}".format(source))
    logging.info("Corpus loaded")

    # Save examples for review
    example_dir = os.path.join(destination, "Examples")
    make_and_clear(example_dir, recursive=True)
    corpus.write_examples(example_dir, num_examples=num_examples)

    corpus.summary(path=destination)
    corpus.write_ids(destination)

    exclude = [tuple(e) for e in exclude]

    if sampling is not None:
        logging.info('Sampling')

        # Sampling
        brat_dir = os.path.join(destination, "brat")
        make_and_clear(brat_dir, recursive=True)

        sampled = []
        for params in sampling:
            dir_ = os.path.join(brat_dir,
                                'round{:0>2d}'.format(params['round']))
            os.mkdir(dir_)

            logging.info('')
            logging.info('Round:\t{}'.format(params['round']))

            docs = corpus.random_sample(size=params['size'],
                                        exclude=exclude,
                                        seed=params['seed'],
                                        path=dir_,
                                        brat=True,
                                        footer=params['footer'],
                                        annotators=annotators,
                                        anno_type=params['anno_type'],
                                        **params['kwargs'])

            # Exclude this round's ids so later rounds never resample them.
            exclude.extend(list(docs.keys()))

            for id in docs:
                if id2filename is not None:
                    id = id2filename(id)
                sampled.append((params['round'], id))

        sample_check(brat_dir, destination)

        fn = os.path.join(destination, 'sampled_documents.csv')
        df = pd.DataFrame(sampled, columns=['round', 'id'])
        df.to_csv(fn)

    # Save corpus
    logging.info("Saving to disk...")
    joblib.dump(corpus, corpus_fn)
    logging.info("Saving complete")

    return True
Example #14
def cfg():

    description = 'baseline'
    #description = 'baseline+crf'

    source = constants_pulmonary.COVID_XRAY
    dir = None


    #mode = CV
    #mode = FIT
    #mode = PREDICT
    #mode = SCORE
    mode = PROB
    model_dir = '/home/lybarger/clinical_extractors/analyses_pulmonary/step320_pulmonary_modeling/fit/sent1+concat0+run0+bd1/'

    fast_run = False
    fast_count = 20 if fast_run else None

    source_dir = None

    n_splits = 3

    binary_doc_map = False
    if binary_doc_map:
        doc_map = constants_pulmonary.DOC_MAP_BINARY
        doc_definition = DOC_DEFINITION_BINARY
    else:
        doc_map = constants_pulmonary.DOC_MAP
        doc_definition = DOC_DEFINITION


    if source == constants_pulmonary.COVID_XRAY:
        source_dir = paths_pulmonary.brat_import
        source = constants_pulmonary.COVID_XRAY
        source = os.path.join(source_dir, source, constants.CORPUS_FILE)
        output_dir = paths_pulmonary.modeling
    else:
        raise ValueError("invalid source: {}".format(source))


    side_swap = False
    if side_swap:
        entity_definition = ENTITY_DEFINITION_SWAP
        relation_definition = RELATION_DEFINITION_SWAP
    else:
        entity_definition = ENTITY_DEFINITION
        relation_definition = RELATION_DEFINITION

    sent_definition = SENT_DEFINITION


    '''
    Paths
    '''
    if fast_run:
        destination = os.path.join(output_dir, mode, description + '_FAST_RUN')
    else:
        destination = os.path.join(output_dir, mode, description)

    # Destination file for corpus

    # Scratch directory
    make_and_clear(destination)

    device = 0
    use_rnn = True
    use_doc_classifier = True
    use_span_classifier = False
    use_doc_features = False
    use_sent_objective = True
    concat_sent_scores = True


    projection_dim = 100

    if concat_sent_scores:
        assert use_sent_objective, "concat_sent_scores requires use_sent_objective"

    span_embed_dim = 50

    dropout_rnn = 0.0
    dropout_sent_classifier = 0.0
    dropout_doc_classifier = 0.0

    batch_size = 30

    linebreak_bound = True
    max_sent_count = 35
    keep_ws = True

    num_workers = 0
    xfmr_dim = 768
    rnn_size = 100
    h_size = rnn_size*2 if use_rnn else xfmr_dim
    loss_reduction = "sum"

    lr = 0.002

    hyperparams = {}
    hyperparams['use_rnn'] = use_rnn
    hyperparams['use_doc_classifier'] = use_doc_classifier
    hyperparams['use_span_classifier'] = use_span_classifier
    hyperparams['use_doc_features'] = use_doc_features


    rnn = {}
    rnn['input_size'] = xfmr_dim
    rnn['output_size'] = rnn_size
    rnn['type_'] = 'lstm'
    rnn['num_layers'] = 1
    rnn['dropout_output'] = dropout_rnn
    rnn['layer_norm'] = True
    hyperparams['rnn'] = rnn


    span_class_weights = OrderedDict()
    w = 100.0
    span_class_weights[REGION] =    [1.0, w, w]  # [NONE, PARENCHYMAL, EXTRAPARENCHYMAL]
    span_class_weights[SIDE] =      [1.0, w, w] # [NONE, UNILATERAL, BILATERAL]
    span_class_weights[SIZE] =      [1.0, w, w, w] # [NONE, SMALL, MODERATE, LARGE]
    span_class_weights[NEGATION] =  [1.0, w] # [NONE, NEGATION]
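    # NOTE: these weights are kept for reference but disabled below, where
    # relation_extractor["span_class_weights"] is set to None.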

    relation_extractor = {}
    relation_extractor["entity_definition"] = entity_definition
    relation_extractor["input_dim"] = h_size
    relation_extractor["span_scorer_type"] = "span"
    relation_extractor["span_embed_project"] = True
    relation_extractor["span_embed_dim"] = span_embed_dim
    relation_extractor["span_embed_dropout"] = 0
    relation_extractor["span_scorer_hidden_dim"] = 50
    relation_extractor["span_scorer_dropout"] = 0
    relation_extractor["span_class_weights"] = None  # span_class_weights

    relation_extractor["spans_per_word"] = 2
    relation_extractor["relation_definition"] = relation_definition
    relation_extractor["role_hidden_dim"] = 50
    relation_extractor["role_output_dim"] = 2
    relation_extractor["role_dropout"] = 0
    relation_extractor["create_doc_vector"] = use_doc_features
    relation_extractor["doc_attention_dropout"] = 0
    relation_extractor["loss_reduction"] = loss_reduction
    hyperparams["relation_extractor"] = relation_extractor


    doc_classifier = {}
    doc_classifier["doc_definition"] = doc_definition
    doc_classifier["input_dim"] = h_size
    doc_classifier["query_dim"] = 100
    doc_classifier["use_ffnn"] = True
    doc_classifier["dropout_sent_classifier"] = dropout_sent_classifier
    doc_classifier["dropout_doc_classifier"] = dropout_doc_classifier
    doc_classifier["activation"] = 'tanh'
    doc_classifier["loss_reduction"] = loss_reduction
    doc_classifier["use_sent_objective"] = use_sent_objective
    doc_classifier["concat_sent_scores"] = concat_sent_scores
    doc_classifier["sent_definition"] = sent_definition
    doc_classifier["projection_dim"] = projection_dim
    hyperparams['doc_classifier'] = doc_classifier



    hyperparams['grad_max_norm'] = 1.0
    hyperparams["loss_reduction"] = loss_reduction

    dataset_params = {}
    dataset_params["pretrained"] = "emilyalsentzer/Bio_ClinicalBERT"
    dataset_params["max_length"] = 30
    dataset_params["max_wp_length"] = 50
    dataset_params["max_sent_count"] = max_sent_count
    dataset_params["linebreak_bound"] = linebreak_bound
    dataset_params["keep"] = 'mean'
    dataset_params["max_span_width"] = 6
    dataset_params["document_definition"] = doc_definition
    dataset_params["sent_definition"] = sent_definition
    dataset_params["entity_definition"] = entity_definition
    dataset_params["relation_definition"] = relation_definition
    dataset_params["pad_start"] = True
    dataset_params["pad_end"] = True
    dataset_params["keep_ws"] = False

    dataloader_params = {}
    dataloader_params['batch_size'] = batch_size
    dataloader_params['num_workers'] = num_workers

    optimizer_params = {}
    optimizer_params['lr'] = lr


    tokenization_params = {}
    tokenization_params['max_length'] = dataset_params["max_length"]
    tokenization_params['max_sent_count'] = dataset_params["max_sent_count"]
    tokenization_params['linebreak_bound'] = linebreak_bound
    tokenization_params['pad_start'] = dataset_params["pad_start"]
    tokenization_params['pad_end'] = dataset_params["pad_end"]


    num_workers = 6
    num_epochs = 300

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
Example #15
def cfg():

    # Annotation source
    #source = constants.SYMPTOMS
    #source = constants.SDOH
    source = constants_pulmonary.COVID_XRAY

    description = 'all'

    fast_run = False

    source_dir = None

    index_round = 0
    index_annotator = 1
    index_note = 2

    round = None
    annotators = None

    scorer = Scorer

    label_spec = {}

    if source == constants.SDOH:
        pass
    elif source == constants.SDOH_PARTIAL:
        pass

    elif source == constants.SYMPTOMS:
        pass

    elif source == constants_pulmonary.COVID_XRAY:

        source_dir = paths_pulmonary.brat_import
        dir = paths_pulmonary.agreement
        if description == 'round01':
            target_rounds_aggree = ["round01"]
        elif description == 'round04':
            target_rounds_aggree = ["round04"]
        elif description == 'all':
            target_rounds_aggree = ["round01", "round04"]
        target_rounds_dist = ["round02"]
        #annotator_pairs = [('Mark', 'Linzee'),
        #                   ('Mark', 'Matthew'),
        #                   ('Linzee', 'Matthew')]
        annotator_pairs = [('Linzee', 'Matthew')]
        scorer = ScorerXray
        doc_map = constants_pulmonary.DOC_MAP

        label_spec = {'doc_map': doc_map}

    else:
        raise ValueError("invalid source: {}".format(source))

    source_corpus = os.path.join(source_dir, source, constants.CORPUS_FILE)
    '''
    Paths
    '''

    destination = os.path.join(dir, source, description)

    # Destination file for corpus

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def cfg():

    #prediction_type = 'oracle'
    run = 0
    prediction_type = 'ngrams'

    exclude_size = False
    exclude = None
    description = f"{prediction_type}+run{run}"

    fast_run = False

    n_splits = 3

    doc_map = constants_pulmonary.DOC_MAP

    source_dir = paths_pulmonary.brat_import
    source = constants_pulmonary.COVID_XRAY
    source = os.path.join(source_dir, source, constants.CORPUS_FILE)

    output_dir = paths_pulmonary.discrete

    labels = [INFILTRATES, EXTRAPARENCHYMAL]

    if prediction_type == 'oracle':

        model_type = 'random_forest'

        hyperparams = OrderedDict()
        hyperparams['n_estimators'] = 200
        hyperparams['max_depth'] = None
        hyperparams['min_samples_split'] = 2
        hyperparams['min_samples_leaf'] = 1
        hyperparams['min_weight_fraction_leaf'] = 0.0
        hyperparams['max_features'] = 'auto'
        hyperparams['max_leaf_nodes'] = None
        hyperparams['random_state'] = None
        hyperparams['ccp_alpha'] = 0.0
        hyperparams['n_jobs'] = 1
        #hyperparams['class_weight'] = {0:1, 1:1}

        tuned_parameters = OrderedDict()
        tuned_parameters['max_depth'] = [5, 10, 30] if fast_run else [
            2, 4, 6, 8, 10, 15, 20, 25, 30, 35
        ]
        tuned_parameters['min_samples_split'] = [2, 4] if fast_run else [
            2, 3, 4, 6, 8, 10, 12
        ]
        tuned_parameters['n_estimators'] = [100] if fast_run else [
            50, 100, 200, 500
        ]

        exclude_size = True
        if exclude_size:
            description = 'exclude_size'
            exclude = [SIZE]

    elif prediction_type == 'ngrams':

        model_type = 'svm'

        hyperparams = OrderedDict()
        hyperparams['C'] = 1.0
        #hyperparams['kernel'] = 'rbf'
        #hyperparams['degree'] = 3
        #hyperparams['gamma'] = 'scale'
        #hyperparams['coef0'] = 0.0
        #hyperparams['shrinking'] = True
        #hyperparams['probability'] = True
        #hyperparams['tol'] = 0.001
        #hyperparams['cache_size'] = 200
        #hyperparams['class_weight'] = None
        #hyperparams['verbose'] = False
        #hyperparams['max_iter'] = - 1,
        #hyperparams['decision_function_shape'] = 'ovr'
        #hyperparams['break_ties'] = False
        #hyperparams['random_state'] = None

        tuned_parameters = OrderedDict()
        tuned_parameters['C'] = [0.0001, 1.0, 1000.0] if fast_run else [
            0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0
        ]

    else:
        raise ValueError(f"invalid prediction type: {prediction_type}")
    '''
    Paths
    '''
    if fast_run:
        destination = os.path.join(output_dir, prediction_type,
                                   description + '_FAST_RUN')
    else:
        destination = os.path.join(output_dir, prediction_type, description)

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
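
The fitting code is not included in these excerpts; a minimal sketch of how hyperparams and tuned_parameters might feed scikit-learn's grid search for the 'ngrams' SVM branch (an assumption about usage, not the source's pipeline):

# Hypothetical sketch; X_ngrams and y are illustrative placeholders.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

clf = GridSearchCV(SVC(**hyperparams),           # hyperparams = {'C': 1.0}
                   param_grid=dict(tuned_parameters),
                   cv=n_splits,                   # n_splits = 3 above
                   scoring='f1')
# clf.fit(X_ngrams, y)
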