Ejemplo n.º 1
0
    def test_ner(self):
        """Run the pretrained ELMo NER predictor on one sentence and check its output.

        Verifies both the tokenization (``"words"``) and the per-token tags
        (``"tags"``) returned by ``predict_json``.
        """
        predictor = pretrained.named_entity_recognition_with_elmo_peters_2018()
        prediction = predictor.predict_json(
            {"sentence": "Michael Jordan is a professor at Berkeley."}
        )

        expected_words = [
            "Michael", "Jordan", "is", "a", "professor", "at", "Berkeley", ".",
        ]
        expected_tags = ["B-PER", "L-PER", "O", "O", "O", "O", "U-LOC", "O"]

        assert prediction["words"] == expected_words
        assert prediction["tags"] == expected_tags
Ejemplo n.º 2
0
    def __init__(self, tagger_model: str) -> None:
        """Create the indexer and load the requested pretrained NER tagger.

        Args:
            tagger_model: Either ``'elmo_peters_2018'`` or
                ``'fine_grained_elmo_peters_2018'``; selects which pretrained
                tagger is stored on ``self.ner_tagger``.

        Raises:
            ValueError: If ``tagger_model`` is not one of the two supported names.
        """
        super(DropNERTokenIndexer, self).__init__()

        # Reject unknown names up front so the happy path below stays flat.
        supported_models = ('elmo_peters_2018', 'fine_grained_elmo_peters_2018')
        if tagger_model not in supported_models:
            raise ValueError(
                'Unsupported NER Tagger module. '
                'Please use \"elmo_peters_2018\" or \"fine_grained_elmo_peters_2018\"'
            )

        if tagger_model == 'elmo_peters_2018':
            tagger = named_entity_recognition_with_elmo_peters_2018()
        else:
            tagger = fine_grained_named_entity_recognition_with_elmo_peters_2018()
        self.ner_tagger = tagger
def main(args):
    """Extract affiliations for every selected paper and write them out.

    Loads the JSON metadata list, builds the pretrained NER predictor, and for
    each metadata entry that passes the ``KEEP`` / ``IGNORE`` filters calls
    ``extract_affiliations`` on the paper's ``.txt`` file. Results are passed
    to ``write_output`` periodically and once more at the end.

    Args:
        args: Options object. This function reads ``args.metadata`` (path to
            the JSON metadata file), ``args.data`` (directory containing the
            ``.txt`` files) and ``args.output`` (forwarded to ``write_output``).
    """
    with open(args.metadata, 'r') as metadata_file:
        entries = json.load(metadata_file)

    print('Building model...')
    predictor = pretrained.named_entity_recognition_with_elmo_peters_2018()

    results = []
    text_dir = args.data

    for index, entry in enumerate(entries):
        _, pdf_name = os.path.split(entry['pdf'])
        # The paper id is whatever precedes the first '-' in the file name.
        paper_id = pdf_name.split('-', 1)[0]

        # A non-empty KEEP restricts processing to the ids it contains;
        # filtered-out papers are skipped without being printed.
        if len(KEEP) > 0 and paper_id not in KEEP:
            continue
        print(paper_id)
        if paper_id in IGNORE:
            print('Ignored')
            continue

        txt_path = os.path.join(text_dir, pdf_name.replace('.pdf', '.txt'))
        affiliations = extract_affiliations(txt_path, entry, predictor)

        results.append({
            'name': pdf_name.replace('.pdf', ''),
            'code': entry['code'],
            'affiliations': affiliations,
        })

        # Write periodically as a failsafe.
        # NOTE(review): this is keyed on the metadata index, not on the number
        # of processed papers, so an entry skipped above at a multiple of 100
        # does not trigger a checkpoint.
        if index % 100 == 0:
            write_output(args.output, results)

    write_output(args.output, results)