def get_bioc_collection(df):
    """Build a sentence-split BioC collection from the reports in ``df``.

    Every entry of the ``"Report Impression"`` column becomes one BioC
    document whose id is the entry's positional index (as a string). Each
    document is run through a default-configured ``NegBioSSplitter`` before
    being added to the collection.

    Args:
        df: Object indexable by ``"Report Impression"`` that yields the
            report texts when iterated (presumably a pandas DataFrame --
            confirm against callers).

    Returns:
        The populated ``bioc.BioCCollection``.
    """
    bioc_collection = bioc.BioCCollection()
    sentence_splitter = NegBioSSplitter()

    for position, impression in enumerate(df["Report Impression"]):
        # The positional index, not the dataframe's own index, is the doc id.
        bioc_collection.add_document(
            sentence_splitter.split_doc(
                text2bioc.text2document(str(position), impression)
            )
        )

    return bioc_collection
def get_negbio_preds(df):
    """Run the NegBio/CheXpert labelling pipeline on ``df``.

    The reports are converted to a BioC collection with
    ``get_bioc_collection``, pushed through ``pipeline`` together with the
    loader, sentence splitter, extractor, parser, dependency converter,
    negation detector and aggregator, and the resulting per-document
    ``infons`` are flattened into a dataframe.

    Args:
        df: Input accepted by ``get_bioc_collection`` (it must expose a
            ``"Report Impression"`` column).

    Returns:
        pandas.DataFrame: One row per document in collection order. Column
        names are the ``infons`` keys with their first nine characters
        removed. ``"Positive"`` is mapped to ``True``; ``"Negative"``,
        ``"Uncertain"`` and missing values are mapped to ``False``.
    """
    collection = get_bioc_collection(df)

    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    ssplitter = NegBioSSplitter(newline=True)
    parser = NegBioParser(model_dir=PARSING_MODEL_DIR)
    loader = NegBioLoader()
    extractor = NegBioExtractor(Path(MENTION_PATH), Path(UNMENTION_PATH))
    neg_detector = ModifiedDetector(PRE_NEG_PATH, NEG_PATH, POST_NEG_PATH)
    aggregator = NegBioAggregator(CATEGORIES)

    collection = pipeline(collection, loader, ssplitter, extractor, parser,
                          ptb2dep, neg_detector, aggregator, verbose=True)

    # Convert the BioC collection to a dataframe for reporting.
    # The rows are gathered first and the frame is built once:
    # ``DataFrame.append`` no longer exists in pandas >= 2.0 and copied the
    # whole frame on every call when it did.
    # NOTE(review): ``key[9:]`` drops a fixed 9-character prefix from every
    # infon key -- presumably a "CheXpert/"-style namespace; confirm against
    # the aggregator before changing it.
    records = [
        {key[9:]: val for key, val in doc.infons.items()}
        for doc in collection.documents
    ]
    negbio_pred = pd.DataFrame(records)

    # Collapse the three-way label into a boolean; only "Positive" is True.
    negbio_pred = (
        negbio_pred
        .replace("Positive", True)
        .replace("Negative", False)
        .replace("Uncertain", False)
        .fillna(False)
    )
    return negbio_pred
def main():
    """Command-line entry point for the MetaMap-based NegBio pipeline.

    Parses the command line from the module docstring, builds the pipeline
    components (sentence splitter, parser, dependency converter, MetaMap
    instance, negation detector), loads the input either from raw text
    sources or from a BioC file, runs ``pipeline`` on it and dumps the
    collection to the ``--output`` path.

    Raises:
        KeyError: If neither the ``text`` nor the ``bioc`` command is set.
    """
    args = parse_args(__doc__, version='version 2')
    print(args)

    ptb2dep = NegBioPtb2DepConverter(Lemmatizer(), universal=True)
    splitter = NegBioSSplitter(newline=args['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=args['--bllip-model'])

    # Each pattern option goes through get_absolute_path with its
    # package-relative default location.
    pattern_defaults = (
        ('--neg-patterns', 'negbio/patterns/neg_patterns.txt'),
        ('--uncertainty-patterns', 'negbio/patterns/uncertainty_patterns.txt'),
    )
    for option, default_location in pattern_defaults:
        args = get_absolute_path(args, option, default_location)

    mm = pymetamap.MetaMap.get_instance(args['--metamap'])
    neg_detector = negdetect.Detector(args['--neg-patterns'],
                                      args['--uncertainty-patterns'])

    # The literal string 'None' on the command line means "no CUI filter".
    cuis = None if args['--cuis'] == 'None' else read_cuis(args['--cuis'])

    if args['text']:
        collection = text2bioc.text2collection(args['SOURCES'])
    elif args['bioc']:
        with open(args['SOURCE']) as source_file:
            collection = bioc.load(source_file)
    else:
        raise KeyError

    pipeline(collection, mm, splitter, parser, ptb2dep, neg_detector, cuis)

    output_path = os.path.expanduser(args['--output'])
    with open(output_path, 'w') as output_file:
        bioc.dump(collection, output_file)
def main():
    """Command-line entry point for the CheXpert-flavoured NegBio pipeline.

    Parses the command line from the module docstring, builds the pipeline
    components (loader, sentence splitter, mention extractor, parser,
    dependency converter, negation/uncertainty detector, aggregator), loads
    the input either from raw text sources or from a BioC file, runs
    ``pipeline`` on it and dumps the collection to the ``--output`` path.

    Raises:
        KeyError: If neither the ``text`` nor the ``bioc`` command is set.
    """
    args = parse_args(__doc__, version='version 2')
    print(args)

    ptb2dep = NegBioPtb2DepConverter(Lemmatizer(), universal=True)
    ssplitter = NegBioSSplitter(newline=args['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=args['--bllip-model'])

    # Each path-like option goes through get_absolute_path with its
    # package-relative default location, in this order.
    path_defaults = (
        ('--mention_phrases_dir', 'negbio/chexpert/phrases/mention'),
        ('--unmention_phrases_dir', 'negbio/chexpert/phrases/unmention'),
        ('--pre-negation-uncertainty-patterns',
         'negbio/chexpert/patterns/pre_negation_uncertainty.txt'),
        ('--post-negation-uncertainty-patterns',
         'negbio/chexpert/patterns/post_negation_uncertainty.txt'),
        ('--neg-patterns', 'negbio/chexpert/patterns/negation.txt'),
    )
    for option, default_location in path_defaults:
        args = get_absolute_path(args, option, default_location)

    # chexpert
    be_verbose = args['--verbose']
    loader = NegBioLoader()
    extractor = NegBioExtractor(
        Path(args['--mention_phrases_dir']),
        Path(args['--unmention_phrases_dir']),
        verbose=be_verbose,
    )
    neg_detector = ModifiedDetector(
        args['--pre-negation-uncertainty-patterns'],
        args['--neg-patterns'],
        args['--post-negation-uncertainty-patterns'],
    )
    aggregator = NegBioAggregator(CATEGORIES, verbose=be_verbose)

    if args['text']:
        collection = text2bioc.text2collection(args['SOURCES'])
    elif args['bioc']:
        with open(args['SOURCE']) as source_file:
            collection = bioc.load(source_file)
    else:
        raise KeyError

    pipeline(collection, loader, ssplitter, extractor, parser, ptb2dep,
             neg_detector, aggregator, verbose=args['--verbose'])

    output_path = os.path.expanduser(args['--output'])
    with open(output_path, 'w') as output_file:
        bioc.dump(collection, output_file)
""" Split text into sentences Usage: negbio_pipeline ssplit [options] --output=<directory> <file> ... Options: --newline_is_sentence_break Whether to treat newlines as sentence breaks. True means that a newline is always a sentence break. False means to ignore newlines for the purpose of sentence splitting. This is appropriate for continuous text, when just the non-whitespace characters should be used to determine sentence breaks. [default=False] --suffix=<suffix> Append an additional SUFFIX to file names. [default: .ssplit.xml] --output=<directory> Specify the output directory. --verbose Print more information about progress. """ from negbio.pipeline.scan import scan_document from negbio.pipeline.ssplit import NegBioSSplitter from negbio.cli_utils import parse_args if __name__ == '__main__': argv = parse_args(__doc__) splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break']) scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'], fn=splitter.split_doc, non_sequences=[])