Esempio n. 1
0
def processOneFile_fda(fileName, annotation_dir, nlp_tool, isTraining, types,
                       type_filter, isFDA2018, isNorm):
    """Convert one FDA annotation file into one Document per section.

    Args:
        fileName: Annotation file name, resolved relative to
            ``annotation_dir``. The text before its first '.' becomes the
            prefix of every document name.
        annotation_dir: Directory that contains ``fileName``.
        nlp_tool: Tool object forwarded to
            ``get_sentences_and_tokens_from_nltk``.
        isTraining: When true, the document's entities are passed to the
            sentence splitter; otherwise ``None`` is passed in their place.
        types: When truthy, mentions whose type is not in ``type_filter``
            are dropped.
        type_filter: Container of mention types to keep.
        isFDA2018: See ``isNorm``.
        isNorm: Only when ``isFDA2018 == False`` and ``isNorm == True`` are
            the entities built from the file-level reactions (name plus
            MedDRA PT ids/names); in every other case they are the mentions
            whose ``section`` equals the current section id.

    Returns:
        A tuple ``(documents, annotation_file)``: the list of Documents (one
        per section, in file order) and the parsed annotation file.

    Raises:
        RuntimeError: If a section has text and ``opt.nlp_tool`` is not
            "nltk".

    NOTE(review): ``opt`` is not a parameter; presumably a module-level
    options object -- confirm against the rest of the file.
    """
    documents = []
    annotation_file = get_fda_file(join(annotation_dir, fileName))

    # split() keeps a dot-less file name intact; the previous
    # fileName[:fileName.find('.')] chopped the last character in that case
    # because find() returns -1.
    name_prefix = fileName.split('.', 1)[0]

    # Explicit comparisons are kept on purpose so that non-bool flag values
    # (e.g. None) select the same branch as before.
    use_reactions = (isFDA2018 == False and isNorm == True)  # noqa: E712

    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = name_prefix + "_" + section.id

        if section.text is None:
            # Section without text: emit an empty placeholder document.
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []
        if use_reactions:
            # Reactions are file-level, so every section gets a fresh copy.
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    # the id can be None
                    entity.norm_ids.append(normalization.meddra_pt_id)
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)
        else:
            for mention in annotation_file.mentions:
                if mention.section != section.id:
                    continue
                if types and (mention.type not in type_filter):
                    continue
                entities.append(mention)

        document.entities = entities

        if opt.nlp_tool != "nltk":
            raise RuntimeError("invalid nlp tool")

        # Entities are only handed to the splitter in training mode.
        gold_entities = document.entities if isTraining else None
        document.sentences = get_sentences_and_tokens_from_nltk(
            section.text, nlp_tool, gold_entities,
            annotation_file.ignore_regions, section.id)

        documents.append(document)

    return documents, annotation_file
Esempio n. 2
0
def processOneFile_fda(fileName, annotation_dir, types, type_filter, isFDA2018,
                       isNorm):
    """Convert one FDA annotation file into one Document per section.

    No sentence splitting is done here: every returned document has an
    empty ``sentences`` list.

    Args:
        fileName: Annotation file name, resolved relative to
            ``annotation_dir``. The text before its first '.' becomes the
            prefix of every document name.
        annotation_dir: Directory that contains ``fileName``.
        types: When truthy, mentions whose type is not in ``type_filter``
            are dropped.
        type_filter: Container of mention types to keep.
        isFDA2018: See ``isNorm``.
        isNorm: Only when ``isFDA2018 == False`` and ``isNorm == True`` are
            the entities built from the file-level reactions (name plus
            MedDRA PT ids/names); in every other case they are the mentions
            whose ``section`` equals the current section id.

    Returns:
        A tuple ``(documents, annotation_file)``: the list of Documents (one
        per section, in file order) and the parsed annotation file.
    """
    documents = []
    annotation_file = get_fda_file(os.path.join(annotation_dir, fileName))

    # split() keeps a dot-less file name intact; the previous
    # fileName[:fileName.find('.')] chopped the last character in that case
    # because find() returns -1.
    name_prefix = fileName.split('.', 1)[0]

    # Explicit comparisons are kept on purpose so that non-bool flag values
    # (e.g. None) select the same branch as before.
    use_reactions = (isFDA2018 == False and isNorm == True)  # noqa: E712

    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = name_prefix + "_" + section.id

        if section.text is None:
            # Section without text: emit an empty placeholder document.
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []
        if use_reactions:
            # Reactions are file-level, so every section gets a fresh copy.
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    # the id can be None
                    entity.norm_ids.append(normalization.meddra_pt_id)
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)
        else:
            for mention in annotation_file.mentions:
                if mention.section != section.id:
                    continue
                if types and (mention.type not in type_filter):
                    continue
                entities.append(mention)

        document.entities = entities
        document.sentences = []

        documents.append(document)

    return documents, annotation_file
Esempio n. 3
0
def parse_one_gold_file(annotation_dir, corpus_dir, fileName):
    """Build a Document holding the normalized gold entities of one BioC file.

    Only the annotations of the first passage of the first item returned by
    ``get_bioc_file`` are read. An annotation is kept when its type is in
    ``type_we_care``, its text is non-empty, and it carries either a SNOMED
    code+term pair or a MedDRA code+term pair (SNOMED is preferred) whose
    values are not 'N/A'.

    Args:
        annotation_dir: Directory that contains the annotation file.
        corpus_dir: Directory that contains the raw text file; its name is
            ``fileName`` up to the first '.bioc'.
        fileName: Annotation file name. The text before its first '.' is
            used as the document name.

    Returns:
        The populated Document (``name``, ``entities``, ``text``).

    NOTE(review): ``type_we_care`` is not defined in this function;
    presumably a module-level collection -- confirm.
    """
    document = Document()
    # split() keeps a dot-less file name intact; the previous
    # fileName[:fileName.find('.')] chopped the last character in that case
    # because find() returns -1.
    document.name = fileName.split('.', 1)[0]

    annotation_file = get_bioc_file(os.path.join(annotation_dir, fileName))
    bioc_passage = annotation_file[0].passages[0]
    entities = []

    for annotation in bioc_passage.annotations:
        infons = annotation.infons
        if infons['type'] not in type_we_care:
            continue

        entity_ = Entity()
        entity_.id = annotation.id
        # Replaces the two-character sequence backslash + 'n' with a space.
        processed_name = annotation.text.replace('\\n', ' ')
        if len(processed_name) == 0:
            logging.debug("{}: entity {} name is empty".format(
                fileName, annotation.id))
            continue
        entity_.name = processed_name
        entity_.type = infons['type']
        # Only the first location of the annotation is used.
        first_location = annotation.locations[0]
        entity_.spans.append([first_location.offset, first_location.end])

        if ('SNOMED code' in infons and infons['SNOMED code'] != 'N/A'
                and 'SNOMED term' in infons
                and infons['SNOMED term'] != 'N/A'):
            entity_.norm_ids.append(infons['SNOMED code'])
            entity_.norm_names.append(infons['SNOMED term'])
        elif ('MedDRA code' in infons and infons['MedDRA code'] != 'N/A'
                and 'MedDRA term' in infons
                and infons['MedDRA term'] != 'N/A'):
            entity_.norm_ids.append(infons['MedDRA code'])
            entity_.norm_names.append(infons['MedDRA term'])
        else:
            # some entities may have no norm id; they are skipped
            logging.debug("{}: no norm id in entity {}".format(
                fileName, annotation.id))
            continue

        entities.append(entity_)

    document.entities = entities

    corpus_file = get_text_file(
        os.path.join(corpus_dir,
                     fileName.split('.bioc')[0]))
    document.text = corpus_file

    return document
Esempio n. 4
0
def processOneFile(fileName, annotation_dir, corpus_dir, nlp_tool, isTraining,
                   types, type_filter):
    """Load one corpus file (and optionally its BioC annotations) as a Document.

    Args:
        fileName: Annotation file name. The text before its first '.' is the
            document name; the text before its first '.bioc' is the name of
            the raw text file inside ``corpus_dir``.
        annotation_dir: Directory with the BioC annotation file. When falsy,
            no annotations are read and ``document.entities`` is not
            assigned here.
        corpus_dir: Directory that contains the raw text file.
        nlp_tool: Tool object forwarded to the sentence/token splitter.
        isTraining: When true, the document's entities are passed to the
            splitter; otherwise ``None`` is passed in their place.
        types: When truthy, annotations whose type is not in ``type_filter``
            are dropped.
        type_filter: Container of annotation types to keep.

    Returns:
        A tuple ``(document, ct_snomed, ct_meddra, ct_unnormed)``: the
        Document and the number of annotations normalized to SNOMED, to
        MedDRA, and skipped for lacking both.

    Raises:
        RuntimeError: If ``opt.nlp_tool`` is not "spacy", "nltk" or
            "stanford".

    NOTE(review): ``opt`` is not a parameter; presumably a module-level
    options object -- confirm against the rest of the file.
    """
    document = Document()
    # split() keeps a dot-less file name intact; the previous
    # fileName[:fileName.find('.')] chopped the last character in that case
    # because find() returns -1.
    document.name = fileName.split('.', 1)[0]

    ct_snomed = 0
    ct_meddra = 0
    ct_unnormed = 0

    if annotation_dir:
        annotation_file = get_bioc_file(join(annotation_dir, fileName))
        bioc_passage = annotation_file[0].passages[0]
        entities = []

        for annotation in bioc_passage.annotations:
            infons = annotation.infons
            if types and (infons['type'] not in type_filter):
                continue

            entity_ = Entity()
            entity_.id = annotation.id
            # Replaces the two-character sequence backslash + 'n' with a
            # space.
            processed_name = annotation.text.replace('\\n', ' ')
            if len(processed_name) == 0:
                logging.debug("{}: entity {} name is empty".format(
                    fileName, annotation.id))
                continue
            entity_.name = processed_name

            entity_.type = infons['type']
            # Only the first location of the annotation is used.
            first_location = annotation.locations[0]
            entity_.spans.append([first_location.offset, first_location.end])

            # SNOMED is preferred; MedDRA is only consulted as a fallback.
            if ('SNOMED code' in infons and infons['SNOMED code'] != 'N/A'
                    and 'SNOMED term' in infons
                    and infons['SNOMED term'] != 'N/A'):
                entity_.norm_ids.append(infons['SNOMED code'])
                entity_.norm_names.append(infons['SNOMED term'])
                ct_snomed += 1
            elif ('MedDRA code' in infons and infons['MedDRA code'] != 'N/A'
                    and 'MedDRA term' in infons
                    and infons['MedDRA term'] != 'N/A'):
                entity_.norm_ids.append(infons['MedDRA code'])
                entity_.norm_names.append(infons['MedDRA term'])
                ct_meddra += 1
            else:
                logging.debug("{}: no norm id in entity {}".format(
                    fileName, annotation.id))
                ct_unnormed += 1
                continue

            entities.append(entity_)

        document.entities = entities

    corpus_file = get_text_file(join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    tool_name = opt.nlp_tool
    # Validate first so an unknown tool fails before anything else is read.
    if tool_name not in ("spacy", "nltk", "stanford"):
        raise RuntimeError("invalid nlp tool")

    # Entities are only handed to the splitter in training mode.
    gold_entities = document.entities if isTraining else None

    if tool_name == "spacy":
        sentences = get_sentences_and_tokens_from_spacy(
            corpus_file, nlp_tool, gold_entities)
    elif tool_name == "nltk":
        sentences = get_sentences_and_tokens_from_nltk(
            corpus_file, nlp_tool, gold_entities, None, None)
    else:  # "stanford"
        sentences = get_sentences_and_tokens_from_stanford(
            corpus_file, nlp_tool, gold_entities)

    document.sentences = sentences

    return document, ct_snomed, ct_meddra, ct_unnormed
Esempio n. 5
0
def load_data_pubtator(file_path):
    """Parse a PubTator-style file into a list of Documents.

    Line handling, as implemented below:
      * a line containing '|t|' starts a new document (name + title text);
      * a line containing '|a|' appends the abstract to the text, then
        sentence-splits and tokenizes the whole text;
      * a blank line closes the current document;
      * any other line is split on tabs and read as an entity mention
        (start, end, name, type, ids). Lines whose second column is 'CID'
        and mentions whose type contains "Chemical" are ignored.

    Args:
        file_path: Path of the UTF-8 encoded input file.

    Returns:
        The list of parsed Documents, in file order.

    Raises:
        RuntimeError: If an id inside a '|'- or '+'-separated id list is
            '-1', or if an entity's span does not line up with token
            boundaries of its sentence.

    NOTE(review): ``nlp_tool`` is not a parameter; presumably a module-level
    sentence tokenizer exposing ``span_tokenize`` -- confirm.
    """
    # stat
    ct_doc = 0
    ct_entity = 0
    omim_prefix = "OMIM:"

    documents = []
    with codecs.open(file_path, 'r', 'UTF-8') as fp:

        document = None

        for line in fp:
            line = line.strip()

            if line == '':
                if document is not None:
                    # save the document
                    documents.append(document)
                    document = None
                    ct_doc += 1
                continue

            if '|t|' in line:
                # a new document; maxsplit=1 keeps a title that itself
                # contains '|t|' whole instead of truncating it
                doc_name, title = line.split('|t|', 1)
                document = Document()
                document.name = doc_name
                document.text = title + " "  # offset need + 1
                continue

            if '|a|' in line:
                document.text += line.split('|a|', 1)[1]

                # Sentence spans over title + abstract.
                document.all_sents_inds.extend(
                    nlp_tool.span_tokenize(document.text))

                for t_start, t_end in document.all_sents_inds:
                    tmp_tokens = FoxTokenizer.tokenize(
                        t_start, document.text[t_start:t_end], False)
                    # Each token is read as (text, start, end).
                    document.sentences.append([{
                        'start': token[1],
                        'end': token[2],
                        'text': token[0],
                    } for token in tmp_tokens])
                continue

            # Anything else is a tab-separated annotation line.
            columns = line.split('\t')

            if columns[1] == 'CID':  # for cdr corpus, we ignore relation
                continue

            if "Chemical" in columns[4]:  # for cdr corpus, we ignore chemical
                continue

            entity = Entity()
            entity.spans.append([int(columns[1]), int(columns[2])])
            entity.name = columns[3]
            entity.type = columns[4]

            raw_ids = columns[5]
            if '|' in raw_ids:
                id_parts, is_id_list = raw_ids.split('|'), True
            elif '+' in raw_ids:
                id_parts, is_id_list = raw_ids.split('+'), True
            else:
                id_parts, is_id_list = [raw_ids], False

            for norm_id in id_parts:
                # '-1' is rejected only inside a multi-id list; a lone '-1'
                # is stored as-is, as before.
                if is_id_list and norm_id == '-1':
                    raise RuntimeError("id == -1")
                prefix_pos = norm_id.find(omim_prefix)
                if prefix_pos != -1:
                    # keep only what follows the "OMIM:" prefix
                    norm_id = norm_id[prefix_pos + len(omim_prefix):]
                entity.norm_ids.append(norm_id)

            # columns[6], cdr may has Individual mentions, we don't use it yet

            # we assume entity has only one span
            span_start, span_end = entity.spans[0]
            for sent_idx, (sent_start,
                           sent_end) in enumerate(document.all_sents_inds):
                if span_start >= sent_start and span_end <= sent_end:
                    entity.sent_idx = sent_idx
                    break
            if entity.sent_idx == -1:
                # No sentence contains the span: skip the mention.
                logging.debug("can't find entity.sent_idx: {} ".format(
                    entity.name))
                continue

            tkStart = -1
            tkEnd = -1
            for tkidx, token_dict in enumerate(
                    document.sentences[entity.sent_idx]):
                if token_dict['start'] == span_start:
                    tkStart = tkidx

                if token_dict['end'] == span_end:
                    tkEnd = tkidx

                if tkStart != -1 and tkEnd != -1:
                    break

            if tkStart == -1 or tkEnd == -1:
                raise RuntimeError('tkStart == -1 or tkEnd == -1')

            entity.tkSpans.append([tkStart, tkEnd])

            document.entities.append(entity)
            ct_entity += 1

        # Files that do not end with a blank line used to lose their last
        # document; flush it here.
        if document is not None:
            documents.append(document)
            ct_doc += 1

    logging.info("document number {}, entity number {}".format(
        ct_doc, ct_entity))

    return documents