Esempio n. 1
0
def processOneFile_fda(fileName, annotation_dir, nlp_tool, isTraining, types,
                       type_filter, isFDA2018, isNorm):
    """Load one FDA annotation file and build one Document per section.

    Args:
        fileName: name of the annotation file inside ``annotation_dir``. The
            part before the first ``.`` prefixes every document name.
        annotation_dir: directory that contains ``fileName``.
        nlp_tool: object forwarded to ``get_sentences_and_tokens_from_nltk``.
        isTraining: when truthy, the section's entities are handed to the
            sentence splitter; otherwise ``None`` is passed in their place.
        types: when truthy, mentions whose type is not in ``type_filter``
            are dropped.
        type_filter: collection of accepted mention types.
        isFDA2018: see ``isNorm``.
        isNorm: when ``isFDA2018 == False and isNorm == True`` the entities
            are built from the file-level reactions (the same list for every
            section); otherwise they are the mentions of the section.

    Returns:
        A ``(documents, annotation_file)`` tuple, where ``documents`` holds
        one Document per section in file order.

    Raises:
        RuntimeError: if ``opt.nlp_tool`` is not ``"nltk"`` when a section
            that has text is processed.
    """
    annotation_file = get_fda_file(join(annotation_dir, fileName))

    # str.partition keeps the whole name when it has no '.', whereas slicing
    # with the -1 returned by str.find would silently drop the last character.
    base_name = fileName.partition('.')[0]

    documents = []
    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = base_name + "_" + section.id
        if section.text is None:
            # section without text: keep an empty placeholder document
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []
        # Explicit comparisons are kept on purpose: with non-bool flags
        # (e.g. None) they do not behave like `not isFDA2018 and isNorm`.
        if isFDA2018 == False and isNorm == True:
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    entity.norm_ids.append(
                        normalization.meddra_pt_id)  # can be None
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)
        else:
            for mention in annotation_file.mentions:
                if mention.section != section.id:
                    continue
                if types and (mention.type not in type_filter):
                    continue
                entities.append(mention)

        document.entities = entities

        if opt.nlp_tool != "nltk":
            raise RuntimeError("invalid nlp tool")

        # entities are only handed to the sentence splitter in training mode
        gold_entities = document.entities if isTraining else None
        document.sentences = get_sentences_and_tokens_from_nltk(
            section.text, nlp_tool, gold_entities,
            annotation_file.ignore_regions, section.id)

        documents.append(document)

    return documents, annotation_file
Esempio n. 2
0
def parse_one_gold_file(annotation_dir, corpus_dir, fileName):
    """Read one gold BioC annotation file and its text into a Document.

    Only annotations of the first passage of the first BioC document are
    read. An annotation is kept when its type is in ``type_we_care``, its
    text is non-empty and it carries a SNOMED or MedDRA code/term pair
    (SNOMED is preferred when both are present).

    Args:
        annotation_dir: directory that contains the BioC file ``fileName``.
        corpus_dir: directory that contains the raw text file, whose name is
            ``fileName`` up to the first ``.bioc``.
        fileName: name of the BioC annotation file.

    Returns:
        The Document, with ``name``, ``entities`` and ``text`` filled in.
    """
    document = Document()
    # str.partition keeps the whole name when it has no '.', whereas slicing
    # with the -1 returned by str.find would silently drop the last character.
    document.name = fileName.partition('.')[0]

    annotation_file = get_bioc_file(os.path.join(annotation_dir, fileName))
    bioc_passage = annotation_file[0].passages[0]

    # (code key, term key) pairs, tried in order of preference
    vocabularies = (('SNOMED code', 'SNOMED term'),
                    ('MedDRA code', 'MedDRA term'))

    entities = []
    for annotation in bioc_passage.annotations:
        infons = annotation.infons
        if infons['type'] not in type_we_care:
            continue

        entity_ = Entity()
        entity_.id = annotation.id
        # replaces the two-character sequence backslash + 'n', not a newline
        processed_name = annotation.text.replace('\\n', ' ')
        if len(processed_name) == 0:
            logging.debug("{}: entity {} name is empty".format(
                fileName, annotation.id))
            continue
        entity_.name = processed_name
        entity_.type = infons['type']
        entity_.spans.append(
            [annotation.locations[0].offset, annotation.locations[0].end])

        for code_key, term_key in vocabularies:
            # a pair counts only if both keys exist and neither is 'N/A'
            if (infons.get(code_key, 'N/A') != 'N/A'
                    and infons.get(term_key, 'N/A') != 'N/A'):
                entity_.norm_ids.append(infons[code_key])
                entity_.norm_names.append(infons[term_key])
                break
        else:
            # some entities may have no norm id; they are skipped
            logging.debug("{}: no norm id in entity {}".format(
                fileName, annotation.id))
            continue

        entities.append(entity_)

    document.entities = entities

    document.text = get_text_file(
        os.path.join(corpus_dir, fileName.split('.bioc')[0]))

    return document
Esempio n. 3
0
 def convert_to_document(self,text_data,title=None):
     """Build a Document from raw text or from ready-made sentence strings.

     text_data: either one string, which is stripped and split with
         self.sent_detector, or an iterable of strings used as they are
     title: optional string; when truthy it is converted with
         self.sent_converter and stored as the document title
     """
     pieces = text_data
     if isinstance(pieces, str):
         pieces = self.sent_detector.tokenize(pieces.strip())

     # convert every piece in order and keep only the truthy results
     converted = map(self.sent_converter.convert_to_sentence, pieces)
     doc = Document([sent for sent in converted if sent])

     if not title:
         return doc
     doc.title = self.sent_converter.convert_to_sentence(title)
     return doc
Esempio n. 4
0
def processOneFile_fda(fileName, annotation_dir, types, type_filter, isFDA2018,
                       isNorm):
    """Load one FDA annotation file and build one Document per section.

    No sentence splitting is done here: ``sentences`` is always an empty
    list on the returned documents.

    Args:
        fileName: name of the annotation file inside ``annotation_dir``. The
            part before the first ``.`` prefixes every document name.
        annotation_dir: directory that contains ``fileName``.
        types: when truthy, mentions whose type is not in ``type_filter``
            are dropped.
        type_filter: collection of accepted mention types.
        isFDA2018: see ``isNorm``.
        isNorm: when ``isFDA2018 == False and isNorm == True`` the entities
            are built from the file-level reactions (the same list for every
            section); otherwise they are the mentions of the section.

    Returns:
        A ``(documents, annotation_file)`` tuple, where ``documents`` holds
        one Document per section in file order.
    """
    annotation_file = get_fda_file(os.path.join(annotation_dir, fileName))

    # str.partition keeps the whole name when it has no '.', whereas slicing
    # with the -1 returned by str.find would silently drop the last character.
    base_name = fileName.partition('.')[0]

    documents = []
    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = base_name + "_" + section.id
        document.sentences = []

        if section.text is None:
            # section without text: keep an empty placeholder document
            document.text = ""
            document.entities = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []
        # Explicit comparisons are kept on purpose: with non-bool flags
        # (e.g. None) they do not behave like `not isFDA2018 and isNorm`.
        if isFDA2018 == False and isNorm == True:
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    entity.norm_ids.append(
                        normalization.meddra_pt_id)  # can be None
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)
        else:
            for mention in annotation_file.mentions:
                if mention.section != section.id:
                    continue
                if types and (mention.type not in type_filter):
                    continue
                entities.append(mention)

        document.entities = entities
        documents.append(document)

    return documents, annotation_file
Esempio n. 5
0
def processOneFile(fileName, annotation_dir, corpus_dir, nlp_tool, isTraining,
                   types, type_filter):
    """Build a Document from one corpus text file and its BioC annotations.

    Args:
        fileName: name of the BioC annotation file. The raw text file is
            looked up in ``corpus_dir`` under ``fileName`` up to the first
            ``.bioc``.
        annotation_dir: directory of the BioC file. When falsy no
            annotations are read and ``document.entities`` is left as
            ``Document()`` created it.
        corpus_dir: directory that contains the raw text file.
        nlp_tool: object forwarded to the sentence/token helper selected by
            ``opt.nlp_tool``.
        isTraining: when truthy, ``document.entities`` is handed to the
            sentence splitter; otherwise ``None`` is passed in its place.
        types: when truthy, annotations whose type is not in
            ``type_filter`` are dropped.
        type_filter: collection of accepted annotation types.

    Returns:
        ``(document, ct_snomed, ct_meddra, ct_unnormed)``: the document and
        the number of annotations normalized to SNOMED, normalized to MedDRA,
        and skipped for lack of a normalization.

    Raises:
        RuntimeError: if ``opt.nlp_tool`` is not one of ``"spacy"``,
            ``"nltk"`` or ``"stanford"``.
    """
    document = Document()
    # str.partition keeps the whole name when it has no '.', whereas slicing
    # with the -1 returned by str.find would silently drop the last character.
    document.name = fileName.partition('.')[0]

    ct_snomed = 0
    ct_meddra = 0
    ct_unnormed = 0

    if annotation_dir:
        annotation_file = get_bioc_file(join(annotation_dir, fileName))
        # only the first passage of the first BioC document is read
        bioc_passage = annotation_file[0].passages[0]
        entities = []

        for annotation in bioc_passage.annotations:
            infons = annotation.infons
            if types and (infons['type'] not in type_filter):
                continue
            entity_ = Entity()
            entity_.id = annotation.id
            # replaces the two-character sequence backslash + 'n'
            processed_name = annotation.text.replace('\\n', ' ')
            if len(processed_name) == 0:
                logging.debug("{}: entity {} name is empty".format(
                    fileName, annotation.id))
                continue
            entity_.name = processed_name

            entity_.type = infons['type']
            entity_.spans.append(
                [annotation.locations[0].offset, annotation.locations[0].end])

            # a code/term pair counts only if both keys exist and neither is
            # 'N/A'; SNOMED takes precedence over MedDRA
            if (infons.get('SNOMED code', 'N/A') != 'N/A'
                    and infons.get('SNOMED term', 'N/A') != 'N/A'):
                entity_.norm_ids.append(infons['SNOMED code'])
                entity_.norm_names.append(infons['SNOMED term'])
                ct_snomed += 1
            elif (infons.get('MedDRA code', 'N/A') != 'N/A'
                    and infons.get('MedDRA term', 'N/A') != 'N/A'):
                entity_.norm_ids.append(infons['MedDRA code'])
                entity_.norm_names.append(infons['MedDRA term'])
                ct_meddra += 1
            else:
                logging.debug("{}: no norm id in entity {}".format(
                    fileName, annotation.id))
                ct_unnormed += 1
                continue

            entities.append(entity_)

        document.entities = entities

    corpus_file = get_text_file(join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    # entities are only handed to the sentence splitter in training mode
    gold_entities = document.entities if isTraining else None

    if opt.nlp_tool == "spacy":
        sentences = get_sentences_and_tokens_from_spacy(
            corpus_file, nlp_tool, gold_entities)
    elif opt.nlp_tool == "nltk":
        sentences = get_sentences_and_tokens_from_nltk(
            corpus_file, nlp_tool, gold_entities, None, None)
    elif opt.nlp_tool == "stanford":
        sentences = get_sentences_and_tokens_from_stanford(
            corpus_file, nlp_tool, gold_entities)
    else:
        raise RuntimeError("invalid nlp tool")

    document.sentences = sentences

    return document, ct_snomed, ct_meddra, ct_unnormed
Esempio n. 6
0
def _metamap_entity(position, trigger, umls_id, source_name):
    """Build one Entity from a ``start/length`` position and its trigger.

    Returns ``None`` (after a debug log) when the mention is skipped: the
    position is a non-continuous (comma-separated) form, the trigger text is
    not a quoted string, or the trigger does not split into six ``-`` parts.
    """
    if position.find(u',') != -1:
        logging.debug(
            "ignore non-continuous form of Positional_Information: {} in {}"
            .format(trigger, source_name))
        return None

    start, length = position.split(u"/")[:2]
    entity = Entity()
    entity.spans.append([int(start), int(start) + int(length)])
    entity.norm_ids.append(str(umls_id))

    # "B cell lymphoma"-tx-5-"B cell lymphoma"-noun-0
    parts = trigger.split(u"-")

    # The length guard avoids an IndexError on triggers with fewer than four
    # parts; those fall through to the parse-error branch below.
    if len(parts) > 3 and parts[3].find('"') == -1:
        logging.debug("ignore non-string entity: {} in {}".format(
            parts[3], source_name))
        return None

    if len(parts) != 6:
        logging.debug(
            "parsing trigger error, ignore entity: {} in {}".format(
                trigger, source_name))
        return None

    entity.name = parts[3][1:-1]  # remove ""
    return entity


def load_metamap_result_from_file(file_path):
    """Parse a MetaMap MMI fielded output file into a Document of entities.

    Each line is split on ``|``; lines whose second field is not ``MMI``
    (including blank lines) are ignored. The fields used are:

        4  the CUI of the identified UMLS concept (stored as the norm id)
        6  trigger information, a bracketed comma-separated list
        8  positional information, a semicolon-separated list of
           ``StartPos/Length`` items, optionally wrapped in ``[...]``

    One Entity is created per position, with span ``[start, start + length]``
    and the quoted text of the fourth ``-`` part of the trigger as its name.

    Args:
        file_path: path of the MetaMap output file, read as UTF-8.

    Returns:
        A Document whose ``entities`` attribute holds the parsed entities.

    Raises:
        RuntimeError: if a line has a different number of triggers and
            positional items.
    """
    re_brackets = re.compile(r'\[[0-9|/]+\]')
    # file name without its directory part, used in messages only
    source_name = file_path[file_path.rfind('/') + 1:]

    document = Document()
    entities = []
    with codecs.open(file_path, 'r', 'UTF-8') as fp:
        for line in fp:
            fields = line.strip().split(u"|")

            # the length test keeps blank or separator-free lines from
            # raising IndexError on fields[1]
            if len(fields) < 2 or fields[1] != u'MMI':
                continue

            umls_id = fields[4]
            trigger_information = fields[6]
            positional_information = fields[8]

            triggers = trigger_information[1:-1].split(u",\"")
            spans = positional_information.split(u";")
            if len(triggers) != len(spans):
                raise RuntimeError(
                    "the number of triggers is not equal to that of spans: {} in {}"
                    .format(umls_id, source_name))

            for trigger, span in zip(triggers, spans):
                bracket_spans = re_brackets.findall(span)
                if bracket_spans:
                    # bracketed form: one position per [start/length] group
                    positions = [group[1:-1] for group in bracket_spans]
                else:
                    # simple form: the item itself is start/length
                    positions = [span]

                for position in positions:
                    entity = _metamap_entity(position, trigger, umls_id,
                                             source_name)
                    if entity is not None:
                        entities.append(entity)

    document.entities = entities
    return document
Esempio n. 7
0
def _pubtator_norm_ids(raw_ids):
    """Split a PubTator identifier column into a list of normalized ids.

    Several ids may be separated by ``|`` or, failing that, by ``+``. In
    every id, everything up to and including the first ``OMIM:`` is removed.

    Raises:
        RuntimeError: if a multi-id column contains the id ``-1`` (a lone
            ``-1`` is returned unchanged).
    """
    if raw_ids.find('|') != -1:
        candidates = raw_ids.split('|')
    elif raw_ids.find('+') != -1:
        candidates = raw_ids.split('+')
    else:
        return [raw_ids.split("OMIM:", 1)[-1]]

    if '-1' in candidates:
        raise RuntimeError("id == -1")
    return [candidate.split("OMIM:", 1)[-1] for candidate in candidates]


def load_data_pubtator(file_path):
    """Load a PubTator-format corpus file into a list of Documents.

    The file is read line by line: a ``name|t|title`` line starts a document,
    a ``name|a|abstract`` line completes its text and triggers sentence
    splitting (module-level ``nlp_tool.span_tokenize``) and tokenization
    (``FoxTokenizer.tokenize``), tab-separated lines are annotations, and a
    blank line ends the document. ``CID`` relation lines and annotations whose
    type contains ``Chemical`` are ignored. Annotations that fit in no
    sentence are skipped with a debug message.

    Args:
        file_path: path of the PubTator file, read as UTF-8.

    Returns:
        The list of Documents in file order.

    Raises:
        RuntimeError: if a multi-id annotation contains the id ``-1``, or if
            an annotation span does not start and end on token boundaries of
            its sentence.
    """
    ct_entity = 0

    documents = []
    with codecs.open(file_path, 'r', 'UTF-8') as fp:

        document = None

        for line in fp:
            line = line.strip()

            if line == '':
                # a blank line closes the current document, if any
                if document is not None:
                    documents.append(document)
                    document = None

            elif line.find('|t|') != -1:
                # a new document
                document = Document()
                columns = line.split('|t|')
                document.name = columns[0]
                document.text = columns[1] + " "  # offset need + 1

            elif line.find('|a|') != -1:
                columns = line.split('|a|')
                document.text += columns[1]

                for sent_span in nlp_tool.span_tokenize(document.text):
                    document.all_sents_inds.append(sent_span)

                for sent_start, sent_end in document.all_sents_inds:
                    tmp_tokens = FoxTokenizer.tokenize(
                        sent_start, document.text[sent_start:sent_end], False)
                    # each token is indexed as (text, start, end)
                    sentence_tokens = [{
                        'start': token[1],
                        'end': token[2],
                        'text': token[0],
                    } for token in tmp_tokens]
                    document.sentences.append(sentence_tokens)

            else:
                columns = line.split('\t')

                if columns[1] == 'CID':  # for cdr corpus, we ignore relation
                    continue

                if columns[4].find("Chemical") != -1:
                    # for cdr corpus, we ignore chemical
                    continue

                entity = Entity()
                entity.spans.append([int(columns[1]), int(columns[2])])
                entity.name = columns[3]
                entity.type = columns[4]
                entity.norm_ids.extend(_pubtator_norm_ids(columns[5]))

                # columns[6], cdr may have individual mentions, not used yet

                # we assume entity has only one span
                span_start, span_end = entity.spans[0]

                for sent_idx, (sent_start,
                               sent_end) in enumerate(document.all_sents_inds):
                    if span_start >= sent_start and span_end <= sent_end:
                        entity.sent_idx = sent_idx
                        break
                # relies on a new Entity starting with sent_idx == -1
                if entity.sent_idx == -1:
                    logging.debug("can't find entity.sent_idx: {} ".format(
                        entity.name))
                    continue

                # map the character span onto token indices of the sentence
                tkStart = -1
                tkEnd = -1
                for tkidx, token_dict in enumerate(
                        document.sentences[entity.sent_idx]):
                    if token_dict['start'] == span_start:
                        tkStart = tkidx

                    if token_dict['end'] == span_end:
                        tkEnd = tkidx

                    if tkStart != -1 and tkEnd != -1:
                        break

                if tkStart == -1 or tkEnd == -1:
                    raise RuntimeError('tkStart == -1 or tkEnd == -1')

                entity.tkSpans.append([tkStart, tkEnd])

                document.entities.append(entity)
                ct_entity += 1

        # A file that does not end with a blank line still has a pending
        # document here; without this it would be silently dropped.
        if document is not None:
            documents.append(document)

    logging.info("document number {}, entity number {}".format(
        len(documents), ct_entity))

    return documents