Example #1
import argparse

from lxml import etree

import utils

# Collect sentence texts, entity character spans (filtered by type) and
# document ids from an XML corpus, then write them out in CoNLL format.
parser = argparse.ArgumentParser()
parser.add_argument("input")
parser.add_argument("types")
parser.add_argument("output")
args = parser.parse_args()

sentences = []
entities = []
document_ids = []
types = args.types.split(',')

with open(args.input) as f_in:
    tree = etree.parse(f_in)
    for document in tree.xpath('.//document'):
        sentence_elems = document.xpath('.//sentence')
        for sentence in sentence_elems:
            sentences += [sentence.get('text')]
            document_ids += [document.get('id')]
            entities += [[]]

            # Keep only entities of the requested types; charOffset holds the
            # entity's "start-end" character positions within the sentence.
            for entity in sentence.xpath(".//entity"):
                if entity.get('type') not in types:
                    continue
                char_offset = entity.get('charOffset').split('-')
                entities[-1] += [(int(char_offset[0]), int(char_offset[1]))]


with open(args.output, 'w') as f_out:
    utils.write_to_conll(sentences, entities, document_ids, f_out)
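
The `utils` module is not shown in any of these examples. Going only by how `write_to_conll` is called here (parallel lists of sentences, (start, end) character spans and document ids, plus an optional tokenizer), a minimal sketch of such a writer might look as follows; the body, the BIO tagging and the comment-style document header are assumptions, not the project's actual implementation:

def write_to_conll(sentences, entities, document_ids, f_out,
                   tokenizer=lambda s: s.split(' ')):
    # Hypothetical sketch: one token per line with a BIO tag derived from
    # the (start, end) character spans collected for each sentence.
    for sentence, spans, doc_id in zip(sentences, entities, document_ids):
        f_out.write('# document_id = {}\n'.format(doc_id))
        offset = 0
        for token in tokenizer(sentence):
            start = sentence.index(token, offset)
            end = start + len(token)
            offset = end
            tag = 'O'
            for span_start, span_end in spans:
                if start >= span_start and end <= span_end:
                    tag = 'B' if start == span_start else 'I'
                    break
            f_out.write('{}\t{}\n'.format(token, tag))
        f_out.write('\n')  # blank line separates sentences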
Example #2
            # Rebuild the sentence from its word-id span ("start..end") and
            # remember, for every word id, which sentence (j) it lands in and
            # the character offset it will have after ' '.join().
            tmp_sentence = []
            akt_pos = 0
            start = word_to_id[sentence[1].split('..')[0]]
            end = word_to_id[sentence[1].split('..')[1]]
            for i in range(start, end + 1):
                tmp_sentence += [words[i]]
                word_pos[i] = (j, akt_pos)
                akt_pos += len(words[i]) + 1  # +1 for the joining space
            tmp_sentences += [tmp_sentence]
        tmp_entities = [[] for _ in tmp_sentences]
        # Convert each protein annotation's word-id span into character
        # offsets within the rebuilt sentence it belongs to.
        for protein in protein_tree:
            try:
                start = word_to_id[protein.get('span').split('..')[0]]
                end = word_to_id[protein.get('span').split('..')[-1]]
                tmp_entities[word_pos[start - 1][0]] += [
                    (word_pos[start - 1][1],
                     word_pos[end - 1][1] + len(words[end - 1]))
                ]
            except Exception:
                # Multipart spans do not map onto a single start/end word.
                print('Skipped multipart entity')
        sentences += tmp_sentences
        entities += tmp_entities
        document_ids += [document_id] * len(tmp_sentences)

with open(args.output, 'w') as f_out:
    utils.write_to_conll([' '.join(x) for x in sentences],
                         entities,
                         document_ids,
                         f_out,
                         tokenizer=lambda x: x.split(' '))
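
The `word_pos` bookkeeping above records, for every word id, which sentence it ends up in and its character offset after ' '.join(); a tiny standalone illustration of that arithmetic (token values made up for the example):

words = ['IL-2', 'binds', 'IL-2R']   # tokens of one sentence
offsets, pos = [], 0
for w in words:
    offsets.append(pos)
    pos += len(w) + 1                # +1 for the joining space

sentence = ' '.join(words)
span = (offsets[2], offsets[2] + len(words[2]))   # entity covering 'IL-2R'
print(sentence[span[0]:span[1]])                  # -> IL-2R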
Example #3
        sentences[sentence_id] = sentence_text

        # Walk over the sentence's tokens and record the character span of
        # every contiguous run of tokens that belongs to an entity.
        for entity_token_ids in all_entity_token_ids:
            entity_start = None
            for token_idx, (token_id, token_offset) in enumerate(
                    zip(token_ids, token_offsets)):
                if token_id in entity_token_ids:
                    if entity_start is None:
                        entity_start = token_offset
                else:
                    if entity_start is not None:
                        entities_per_sentence[sentence_id].append(
                            (entity_start, token_offset - 1))
                        entity_start = None

with open(args.output, 'w') as f_out:
    sentences = [sentences[s_id] for s_id in sentence_ids_in_order]
    entities = [
        utils.merge_overlapping_entities(entities_per_sentence[s_id])
        for s_id in sentence_ids_in_order
    ]

    def tokenizer(sentence):
        return sentence.split()

    utils.write_to_conll(
        sentences,
        entities,
        [str(x) for x in range(len(sentence_ids_in_order))],
        f_out,
        tokenizer=tokenizer)
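
Example #3 also relies on `utils.merge_overlapping_entities`. A minimal sketch of such a helper, assuming it does nothing more than collapse overlapping (start, end) spans (again a guess at the project's internals, not its actual code):

def merge_overlapping_entities(spans):
    merged = []
    for start, end in sorted(spans):
        if merged and start <= merged[-1][1]:
            # Overlaps (or touches) the previous span, so extend it.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged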
Example #4
# Read a BioC-style XML file: each passage text is split into sentences at
# newlines and document-level annotation offsets are rebased onto them.
sentences = []
entities = []

with open(args.input, 'r') as f_in:
    tree = etree.parse(f_in)
    for document in tree.xpath('.//document'):
        assert len(document.xpath('passage/text')) == 1
        text = document.xpath('passage/text')[0].text
        tmp_sentences = text.split('\n')
        tmp_entities = [[] for _ in tmp_sentences]
        for annotation in document.xpath('.//annotation'):
            # Keep only annotations whose 'type' infon value starts with
            # 'pm' (these are treated as the protein mentions here).
            prot = False
            for infon in annotation.xpath('.//infon'):
                prot |= ((infon.get('key') == 'type') &
                         (infon.text[0:2] == 'pm'))
            if not prot:
                continue
            offset = int(annotation.xpath('.//location')[0].get('offset'))
            length = int(annotation.xpath('.//location')[0].get('length'))
            # Rebase the document-level offset onto the sentence it falls
            # into; the +1 accounts for the newline removed by split('\n').
            i = 0
            while offset > len(tmp_sentences[i]):
                offset -= len(tmp_sentences[i]) + 1
                i += 1
            tmp_entities[i] += [(offset, offset + length)]
        sentences += tmp_sentences
        entities += tmp_entities

with open(args.output, 'w') as f_out:
    utils.write_to_conll(sentences, entities, f_out)
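
The while loop above rebases a document-level character offset onto the newline-separated sentences of the passage; the same idea on concrete values:

text = 'ABC DEF\nGHI JKL'            # passage text with two sentences
sentences = text.split('\n')
offset, length = 12, 3               # document-level span covering 'JKL'

i = 0
while offset > len(sentences[i]):
    offset -= len(sentences[i]) + 1  # +1 skips the removed newline
    i += 1

print(i, sentences[i][offset:offset + length])   # -> 1 JKL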
Example #5
            soup = BeautifulSoup(f_in, 'lxml')
            for paragraph in soup.find_all('snippet'):
                if len(paragraph.text.strip()) == 0:
                    continue
                # each (new-line-separated) sentence seems to be valid html
                # so we extract the source, split at newline and parse again
                source = "".join(str(elem) for elem in paragraph.children)
                sentences = source.split('\n')
                for sentence in sentences:
                    document_id = os.path.basename(os.path.dirname(document))
                    sentence_bs = BeautifulSoup(sentence, 'lxml')
                    sentence_text, sentence_entities = extract_sentence_info(sentence_bs)
                    tmp_sentences = sentence_splitter.split(sentence_text)
                    tmp_entities = [[] for _ in tmp_sentences]
                    for entity in sentence_entities:
                        org_start = entity[0]
                        org_end = entity[1]
                        id, start, end = sentence_splitter.map_offsets(org_start, org_end)
                        # If the entity crosses a boundary introduced by the
                        # sentence splitter, merge sentences until it fits.
                        while len(tmp_sentences[id]) < end:
                            tmp_sentences = sentence_splitter.merge_sentences(id)
                            tmp_entities[id] += tmp_entities[id + 1]
                            del tmp_entities[id + 1]
                            id, start, end = sentence_splitter.map_offsets(org_start, org_end)
                        tmp_entities[id] += [(start, end)]
                    all_sentences += tmp_sentences
                    all_entities += tmp_entities
                    document_ids += [document_id] * len(tmp_sentences)

    with open(args.output, 'w') as f_out:
        utils.write_to_conll(all_sentences, all_entities, document_ids, f_out)