Ejemplo n.º 1
0
def get_bioc_collection(df):
    """Build a BioC collection from the ``"Report Impression"`` column of *df*.

    Every entry of the column is turned into a document with
    ``text2bioc.text2document`` (its zero-based position, as a string, is
    passed as the first argument), run through ``NegBioSSplitter.split_doc``
    and added to a fresh ``bioc.BioCCollection``.

    Args:
        df: Object indexable by ``"Report Impression"`` that yields the
            report texts when iterated (presumably a pandas DataFrame --
            confirm against the caller).

    Returns:
        The populated ``bioc.BioCCollection``.
    """
    bioc_collection = bioc.BioCCollection()
    sentence_splitter = NegBioSSplitter()
    for position, impression in enumerate(df["Report Impression"]):
        # Convert the raw text first, then split the resulting document.
        split_document = sentence_splitter.split_doc(
            text2bioc.text2document(str(position), impression))
        bioc_collection.add_document(split_document)
    return bioc_collection
Ejemplo n.º 2
0
def get_negbio_preds(df):
    """Run the NegBio pipeline over the reports in *df* and tabulate the labels.

    The reports are first converted to a BioC collection with
    ``get_bioc_collection``; the collection is then passed through
    ``pipeline`` together with the loader, sentence splitter, extractor,
    parser, dependency converter, negation detector and aggregator built
    here.  Finally each document's ``infons`` mapping becomes one row of the
    returned dataframe.

    Args:
        df: Input accepted by ``get_bioc_collection``.

    Returns:
        A ``pandas.DataFrame`` with one row per document.  Column names are
        the ``infons`` keys with their first nine characters removed.  The
        values ``"Positive"`` become ``True``, ``"Negative"`` and
        ``"Uncertain"`` become ``False``, and missing values are filled
        with ``False``.
    """
    collection = get_bioc_collection(df)
    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    ssplitter = NegBioSSplitter(newline=True)
    parser = NegBioParser(model_dir=PARSING_MODEL_DIR)
    loader = NegBioLoader()
    extractor = NegBioExtractor(Path(MENTION_PATH), Path(UNMENTION_PATH))
    neg_detector = ModifiedDetector(PRE_NEG_PATH, NEG_PATH, POST_NEG_PATH)
    aggregator = NegBioAggregator(CATEGORIES)
    collection = pipeline(collection,
                          loader,
                          ssplitter,
                          extractor,
                          parser,
                          ptb2dep,
                          neg_detector,
                          aggregator,
                          verbose=True)

    # Convert the BioC collection to a dataframe for reporting.  The rows are
    # gathered in a list and the frame is built once: ``DataFrame.append``
    # was removed in pandas 2.0 and copied the whole frame on every call.
    # ``key[9:]`` drops the first nine characters of every infon key
    # (presumably a fixed label prefix -- confirm against the aggregator).
    rows = [
        {key[9:]: val for key, val in doc.infons.items()}
        for doc in collection.documents
    ]
    negbio_pred = pd.DataFrame(rows)
    negbio_pred = negbio_pred.replace("Positive", True).replace(
        "Negative", False).replace("Uncertain", False).fillna(False)
    return negbio_pred
Ejemplo n.º 3
0
def main():
    """Command-line entry point for the MetaMap-based NegBio pipeline.

    Parses the command line, builds the pipeline components, loads a
    collection either from text sources (``text`` command) or from a BioC
    file (``bioc`` command), runs ``pipeline`` on it and dumps the collection
    to the path given by ``--output``.

    Raises:
        KeyError: If neither ``text`` nor ``bioc`` is set in the parsed
            arguments.
    """
    argv = parse_args(__doc__, version='version 2')
    print(argv)

    # Components that only depend on the raw command-line options.
    ptb2dep = NegBioPtb2DepConverter(Lemmatizer(), universal=True)
    splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=argv['--bllip-model'])

    # Pass each pattern option through get_absolute_path together with its
    # package-relative default location.
    pattern_defaults = (
        ('--neg-patterns', 'negbio/patterns/neg_patterns.txt'),
        ('--uncertainty-patterns', 'negbio/patterns/uncertainty_patterns.txt'),
    )
    for option, default_path in pattern_defaults:
        argv = get_absolute_path(argv, option, default_path)

    mm = pymetamap.MetaMap.get_instance(argv['--metamap'])
    neg_detector = negdetect.Detector(argv['--neg-patterns'],
                                      argv['--uncertainty-patterns'])

    # The literal string 'None' means that no CUI list is used.
    cuis_option = argv['--cuis']
    cuis = None if cuis_option == 'None' else read_cuis(cuis_option)

    if argv['text']:
        collection = text2bioc.text2collection(argv['SOURCES'])
    elif not argv['bioc']:
        raise KeyError
    else:
        with open(argv['SOURCE']) as source_file:
            collection = bioc.load(source_file)

    pipeline(collection, mm, splitter, parser, ptb2dep, neg_detector, cuis)

    output_path = os.path.expanduser(argv['--output'])
    with open(output_path, 'w') as output_file:
        bioc.dump(collection, output_file)
Ejemplo n.º 4
0
def main():
    """Command-line entry point for the CheXpert-style NegBio pipeline.

    Parses the command line, builds the loader, sentence splitter, extractor,
    parser, dependency converter, negation detector and aggregator, loads a
    collection either from text sources (``text`` command) or from a BioC
    file (``bioc`` command), runs ``pipeline`` on it and dumps the collection
    to the path given by ``--output``.

    Raises:
        KeyError: If neither ``text`` nor ``bioc`` is set in the parsed
            arguments.
    """
    argv = parse_args(__doc__, version='version 2')
    print(argv)

    # Components that only depend on the raw command-line options.
    ptb2dep = NegBioPtb2DepConverter(Lemmatizer(), universal=True)
    sentence_splitter = NegBioSSplitter(
        newline=argv['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=argv['--bllip-model'])

    # Pass each resource option through get_absolute_path together with its
    # package-relative default location, in the same order as before.
    resource_defaults = (
        ('--mention_phrases_dir',
         'negbio/chexpert/phrases/mention'),
        ('--unmention_phrases_dir',
         'negbio/chexpert/phrases/unmention'),
        ('--pre-negation-uncertainty-patterns',
         'negbio/chexpert/patterns/pre_negation_uncertainty.txt'),
        ('--post-negation-uncertainty-patterns',
         'negbio/chexpert/patterns/post_negation_uncertainty.txt'),
        ('--neg-patterns',
         'negbio/chexpert/patterns/negation.txt'),
    )
    for option, default_path in resource_defaults:
        argv = get_absolute_path(argv, option, default_path)

    # chexpert
    is_verbose = argv['--verbose']
    loader = NegBioLoader()
    mention_dir = Path(argv['--mention_phrases_dir'])
    unmention_dir = Path(argv['--unmention_phrases_dir'])
    extractor = NegBioExtractor(mention_dir, unmention_dir, verbose=is_verbose)
    neg_detector = ModifiedDetector(
        argv['--pre-negation-uncertainty-patterns'],
        argv['--neg-patterns'],
        argv['--post-negation-uncertainty-patterns'])
    aggregator = NegBioAggregator(CATEGORIES, verbose=is_verbose)

    if argv['text']:
        collection = text2bioc.text2collection(argv['SOURCES'])
    elif not argv['bioc']:
        raise KeyError
    else:
        with open(argv['SOURCE']) as source_file:
            collection = bioc.load(source_file)

    stages = (loader, sentence_splitter, extractor, parser, ptb2dep,
              neg_detector, aggregator)
    pipeline(collection, *stages, verbose=is_verbose)

    output_path = os.path.expanduser(argv['--output'])
    with open(output_path, 'w') as output_file:
        bioc.dump(collection, output_file)
Ejemplo n.º 5
0
"""
Split text into sentences

Usage:
    negbio_pipeline ssplit [options] --output=<directory> <file> ...

Options:
    --newline_is_sentence_break     Whether to treat newlines as sentence breaks. True means that a newline is always a
                                    sentence break. False means to ignore newlines for the purpose of sentence
                                    splitting. This is appropriate for continuous text, when just the non-whitespace
                                    characters should be used to determine sentence breaks. [default=False]
    --suffix=<suffix>               Append an additional SUFFIX to file names. [default: .ssplit.xml]
    --output=<directory>            Specify the output directory.
    --verbose                       Print more information about progress.
"""
from negbio.pipeline.scan import scan_document
from negbio.pipeline.ssplit import NegBioSSplitter
from negbio.cli_utils import parse_args

if __name__ == '__main__':
    # Parse the command line against the usage text in the module docstring.
    argv = parse_args(__doc__)

    # The flag decides whether the splitter treats a newline as a sentence
    # break.
    newline_breaks = argv['--newline_is_sentence_break']
    splitter = NegBioSSplitter(newline=newline_breaks)

    # Apply the splitter to every input file; results go to the output
    # directory with the configured suffix.
    scan_document(
        source=argv['<file>'],
        directory=argv['--output'],
        suffix=argv['--suffix'],
        fn=splitter.split_doc,
        non_sequences=[],
    )