def append_ner(input_file=None, output_file=None, batch_size=None):
    """

     Parameters
     ----------

     Returns
     -------

    """
    input = open(input_file,
                 encoding='utf-8') if input_file is not None else sys.stdin
    csv_reader = csv.reader(input)

    column_mapper = CsvColumnMapper(next(csv_reader), ['ner'],
                                    source_required=['sentence', 'words'])

    mnofc = ManageNewOutputFileCreation(output_file, batch_size)
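    # mnofc hands back a freshly opened output file whenever a new one is needed
    # (e.g. at batch boundaries); see get_new_file_if_necessary() in the loop below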

    spacy_pipeline = en_core_web_sm.load()

    for count, entry in enumerate(csv_reader, start=0):

        new_file = mnofc.get_new_file_if_necessary()
        if new_file:
            csv_writer = csv.writer(new_file)
            csv_writer.writerow(column_mapper.get_new_headers())

        # now that we've finished creating a new file as necessary, we can proceed with the business
        # at hand:

        words = column_mapper.get_field_value_from_source(entry, 'words', True)
        if words is None:
            csv_writer.writerow(column_mapper.get_new_row_values(
                entry, [None]))
            continue

        sentence = column_mapper.get_field_value_from_source(entry, 'sentence')
        spacy_doc = spacy_pipeline(sentence)
        spacy_tokens = [token.text for token in spacy_doc]

        ner_lookup_spacy_tokenization = {}
        for index, token in enumerate(spacy_doc, start=1):
            if token.ent_type != 0:  # the token is part of a named entity
                ner_lookup_spacy_tokenization[index] = token.ent_type_

        ner_lookup = ner_lookup_spacy_tokenization
        tokens = [token for _, token in words]
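        # 'tokens' follows the tokenization stored in the 'words' column, which may differ
        # from spaCy's; when it does, the NER tags are remapped onto those token indices below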

        if tokens != spacy_tokens and len(ner_lookup_spacy_tokenization) > 0:
            ner_lookup = SyncTags.b_lookup_to_a_lookup(
                tokens, spacy_tokens, ner_lookup_spacy_tokenization)

        csv_writer.writerow(
            column_mapper.get_new_row_values(entry, [ner_lookup]))
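

# A minimal usage sketch (hypothetical file names): append a 'ner' column to an existing
# CSV of parsed sentences, splitting the output into files of 1000 rows each.
#
#     append_ner(input_file='sentences.csv',
#                output_file='sentences_ner.csv',
#                batch_size=1000)
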
def identify_ucca_paths(input, output):
    csv_reader = csv.reader(input)
    csv_writer = csv.writer(output)

    column_mapper = CsvColumnMapper(
        next(csv_reader),
        target_columns=['path_id', 'path', 'comment'],
        source_required=[
            'id', 'sentence', 'ent1_start', 'ent1_end', 'ent2_start',
            'ent2_end', 'ucca_parse', 'trigger_idx'
        ])

    csv_writer.writerow(column_mapper.get_new_headers())

    for counter, entry in enumerate(csv_reader, start=1):

        print('Processing sentence #',
              column_mapper.get_field_value_from_source(entry, 'id'))

        ucca_parse_serialization = column_mapper.get_field_value_from_source(
            entry, 'ucca_parse')
        if ucca_parse_serialization is None:
            csv_writer.writerow(
                column_mapper.get_new_row_values(
                    entry, [None, None, 'ucca_parse missing']))
            continue

        ucca_parse = UccaParsedPassage.from_serialization(
            ucca_parse_serialization)
        links = ucca_parse.get_links()

        trigger_token_id = column_mapper.get_field_value_from_source(
            entry, 'trigger_idx', as_int=True)
        ent1_start_token_id = column_mapper.get_field_value_from_source(
            entry, 'ent1_start', as_int=True)
        ent2_start_token_id = column_mapper.get_field_value_from_source(
            entry, 'ent2_start', as_int=True)

        if trigger_token_id is None or ent1_start_token_id is None or ent2_start_token_id is None:
            csv_writer.writerow(
                column_mapper.get_new_row_values(
                    entry, [None, None, 'indices missing']))
            continue

        trigger_node_id = ucca_parse.get_node_id_by_token_id(trigger_token_id)
        trigger_parent_node_id = Link.get_parents(links, trigger_node_id)[0]

        ent1_start_node_id = ucca_parse.get_node_id_by_token_id(
            ent1_start_token_id)
        ent1_parent_node_ids = Link.get_parents(links, ent1_start_node_id)
        if len(ent1_parent_node_ids) == 0:
            csv_writer.writerow(
                column_mapper.get_new_row_values(
                    entry, [None, None, 'Could not find parent of ent1']))
            continue
        ent1_parent_node_id = ent1_parent_node_ids[0]

        ent2_start_node_id = ucca_parse.get_node_id_by_token_id(
            ent2_start_token_id)
        ent2_parent_node_ids = Link.get_parents(links, ent2_start_node_id)
        if len(ent2_parent_node_ids) == 0:
            csv_writer.writerow(
                column_mapper.get_new_row_values(
                    entry, [None, None, 'Could not find parent of ent2']))
            continue
        ent2_parent_node_id = ent2_parent_node_ids[0]

        graph = DepGraph(links)
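        # build a graph over the UCCA links so we can walk the ent1 -> trigger -> ent2 paths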

        ent1_to_trigger_steps = graph.get_steps(ent1_parent_node_id,
                                                trigger_parent_node_id)
        ent1_to_trigger_strings = ucca_parse.get_path_representations(
            ent1_to_trigger_steps)

        trigger_to_ent2_steps = graph.get_steps(trigger_parent_node_id,
                                                ent2_parent_node_id)
        trigger_to_ent2_strings = ucca_parse.get_path_representations(
            trigger_to_ent2_steps)

        sentence_id = column_mapper.get_field_value_from_source(entry,
                                                                'id',
                                                                as_int=True)

        for count, (segment1,
                    segment2) in enumerate(product(ent1_to_trigger_strings,
                                                   trigger_to_ent2_strings),
                                           start=1):
            path_id = '{0}_{1}'.format(sentence_id, count)
            path = '{0} >< {1}'.format(segment1, segment2)
            comment = None

            csv_writer.writerow(
                column_mapper.get_new_row_values(entry,
                                                 [path_id, path, comment]))
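

# A minimal usage sketch (hypothetical file names): identify_ucca_paths expects already
# opened file objects rather than paths.
#
#     with open('relations_ucca.csv', encoding='utf-8') as inp, \
#             open('ucca_paths.csv', 'w', encoding='utf-8', newline='') as out:
#         identify_ucca_paths(inp, out)
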
def parse_pss(port,
              model_path,
              input_file=None,
              output_file=None,
              batch_size=None):
    """

     Parameters
     ----------

     Returns
     -------

    """

    input = open(input_file,
                 encoding='utf-8') if input_file is not None else sys.stdin
    csv_reader = csv.reader(input)

    column_mapper = CsvColumnMapper(
        next(csv_reader), ['pss'],
        source_required=['sentence', 'ud_parse', 'words'])

    batch = 0
    output = None
    output_file = output_file[:-len('.csv')] if output_file is not None and output_file.endswith('.csv') \
        else output_file

    print('BEGIN-INIT-PSS')

    from models.supersenses.lstm_mlp_supersenses_model import LstmMlpSupersensesModel
    from models.supersenses.preprocessing import preprocess_sentence
    from models.supersenses.preprocessing.corenlp import CoreNLPServer

    corenlp = CoreNLPServer()
    corenlp.start(port)

    model = LstmMlpSupersensesModel.load(model_path)

    print('END-INIT-PSS')

    for count, entry in enumerate(csv_reader, start=0):

        # the next few lines of code deal with opening and closing files (depending on the batching argument, etc)
        new_file = False

        # first option: standard output ...
        if count == 0 and output_file is None:
            output = sys.stdout
            new_file = True

        # second option: we've just started, we're writing to a real file, but no batching
        if count == 0 and output_file is not None and batch_size is None:
            output_file_actual = '{0}.csv'.format(output_file)

            output = open(output_file_actual,
                          'w',
                          encoding='utf-8',
                          newline='')
            new_file = True

        # third option: we've finished a batch (and we are batching)
        if output_file is not None and batch_size is not None and count % batch_size == 0:
            output_file_actual = '{0}-{1}.csv'.format(output_file, batch)

            if output is not None:
                output.close()

            output = open(output_file_actual,
                          'w',
                          encoding='utf-8',
                          newline='')
            batch += 1
            new_file = True

        # if we did create a new file, let's ensure that the first row consists of column titles
        if new_file:
            csv_writer = csv.writer(output)
            csv_writer.writerow(column_mapper.get_new_headers())

        # now that we've finished creating a new file as necessary, we can proceed with the business
        # at hand:

        words = column_mapper.get_field_value_from_source(entry, 'words', True)
        if words is None:
            csv_writer.writerow(column_mapper.get_new_row_values(
                entry, [None]))
            continue

        sentence = column_mapper.get_field_value_from_source(entry, 'sentence')

        proper_tokens = word_tokenize(sentence)
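        # re-tokenize the sentence with NLTK; the PSS predictions below are indexed by these tokens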

        print('BEGIN-PROCESS-PSS')
        preprocessed = preprocess_sentence(' '.join(proper_tokens))
        pss_pred = model.predict(
            preprocessed.xs, [x.identified_for_pss for x in preprocessed.xs])
        print('END-PROCESS-PSS')

        pss_lookup_nltk_tokens = {}
        for index in range(len(preprocessed.xs)):
            if pss_pred[index].supersense_role:
                pss_lookup_nltk_tokens[index + 1] = (pss_pred[index].supersense_role,
                                                     pss_pred[index].supersense_func)

        ud_tokens = [token for _, token in words]
        pss_lookup = pss_lookup_nltk_tokens

        if ud_tokens != proper_tokens and len(pss_lookup_nltk_tokens) > 0:
            pss_lookup = SyncTags.b_lookup_to_a_lookup(ud_tokens,
                                                       proper_tokens,
                                                       pss_lookup_nltk_tokens)

        csv_writer.writerow(
            column_mapper.get_new_row_values(entry, [pss_lookup]))

    corenlp.stop()
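

# A minimal usage sketch (hypothetical port, model path and file names): tag each sentence
# with preposition supersenses, writing batches of 1000 rows per output file.
#
#     parse_pss(9000, 'models/pss/model',
#               input_file='parsed_ud.csv',
#               output_file='parsed_ud_pss.csv',
#               batch_size=1000)
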
def parse_ucca(model_prefix, input_file=None, output_file=None, batch_size=None):
    """

     Parameters
     ----------

     Returns
     -------

    """
    input = open(input_file, encoding='utf-8') if input_file is not None else sys.stdin
    csv_reader = csv.reader(input)

    column_mapper = CsvColumnMapper(
        source_first_row=next(csv_reader),
        target_columns=[
            'id', 'sentence', 'ent1', 'ent2', 'ent1_start', 'ent1_end',
            'ent2_start', 'ent2_end', 'ucca_parse', 'words', 'lemmas', 'comment'
        ],
        source_required=[
            'tac_tokens', 'subj_start', 'subj_end', 'obj_start', 'obj_end'
        ],
        filter_source_from_result=[
            'subj_start', 'subj_end', 'obj_start', 'obj_end'
        ])

    detokenizer = Detokenizer()
    mnofc = ManageNewOutputFileCreation(output_file, batch_size)

    print('BEGIN-INIT-TUPA')
    parser = TupaParser(model_prefix)
    print('END-INIT-TUPA')

    for count, entry in enumerate(csv_reader, start=0):

        new_file = mnofc.get_new_file_if_necessary()
        if new_file:
            csv_writer = csv.writer(new_file)
            csv_writer.writerow(column_mapper.get_new_headers())

        tac_tokens = eval(column_mapper.get_field_value_from_source(entry, 'tac_tokens'))
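        # tac_tokens is stored in the CSV as a Python list literal, hence the eval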
        sentence = detokenizer.detokenize(tac_tokens)

        print('BEGIN-PROCESS-TUPA')
        parsed_sentence = parser.parse_sentence(sentence)
        print('END-PROCESS-TUPA')

        tokens = []
        tokens_with_indices = []
        lemmas_with_indices = []

        for ucca_terminal in parsed_sentence.terminals:
            tokens.append(ucca_terminal.text)
            tokens_with_indices.append((ucca_terminal.token_id, ucca_terminal.text))
            lemmas_with_indices.append((ucca_terminal.token_id, ucca_terminal.lemma))

        tac_tokens_lookup = {}
        tac_tokens_lookup['subj_start'] = int(column_mapper.get_field_value_from_source(entry, 'subj_start'))
        tac_tokens_lookup['subj_end'] = int(column_mapper.get_field_value_from_source(entry, 'subj_end'))
        tac_tokens_lookup['obj_start'] = int(column_mapper.get_field_value_from_source(entry, 'obj_start'))
        tac_tokens_lookup['obj_end'] = int(column_mapper.get_field_value_from_source(entry, 'obj_end'))

        token_lookup = SyncTacTags.b_lookup_to_a_lookup(tokens, tac_tokens, tac_tokens_lookup)
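        # map the TAC token offsets onto TUPA's tokenization; a partial lookup means the
        # two tokenizations could not be reconciled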

        if len(token_lookup) != len(tac_tokens_lookup):
            csv_writer.writerow(
                column_mapper.get_new_row_values(entry, [
                    count, sentence, None, None, None, None, None, None, None,
                    None, None,
                    'was not able to reconcile TAC and Tupa\'s Spacy based indexing'
                ]))
            continue

        ent1_start = token_lookup['subj_start']
        ent1_end = token_lookup['subj_end']
        ent1 = ' '.join(tokens[ent1_start:ent1_end + 1])

        ent2_start = token_lookup['obj_start']
        ent2_end = token_lookup['obj_end']
        ent2 = ' '.join(tokens[ent2_start:ent2_end + 1])

        csv_writer.writerow(
            column_mapper.get_new_row_values(entry, [
                count, sentence, ent1, ent2, ent1_start + 1, ent1_end + 1,
                ent2_start + 1, ent2_end + 1, parsed_sentence.serialize(),
                tokens_with_indices, lemmas_with_indices, None
            ]))
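

# A minimal usage sketch (hypothetical model prefix and file names): parse TAC-style rows
# with TUPA and emit UCCA parses alongside entity offsets, 1000 rows per output file.
#
#     parse_ucca('models/ucca-bilstm',
#                input_file='tac_sample.csv',
#                output_file='parsed_ucca.csv',
#                batch_size=1000)
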
def extract_relations(output, ud_input, ud_paths, ucca_input, ucca_paths,
                      triggers):
    def get_output_entry_list(id,
                              sentence,
                              ud_words='',
                              ud_lemmas='',
                              ud_parse='',
                              ucca_words='',
                              ucca_lemmas='',
                              ucca_parse='',
                              ud_trigger='',
                              ud_path='',
                              ucca_trigger='',
                              ucca_path='',
                              extraction_comment=''):

        return [
            id, sentence, ud_words, ud_lemmas, ud_parse, ucca_words,
            ucca_lemmas, ucca_parse, ud_trigger, ud_path, ucca_trigger,
            ucca_path, extraction_comment
        ]

    csv_writer = csv.writer(output)

    csv_writer.writerow([
        'id', 'sentence', 'ud_words', 'ud_lemmas', 'ud_parse', 'ucca_words',
        'ucca_lemmas', 'ucca_parse', 'ud_trigger', 'ud_path', 'ucca_trigger',
        'ucca_path', 'extraction_comment'
    ])

    ud_reader = csv.reader(ud_input)
    ucca_reader = csv.reader(ucca_input)

    ud_column_mapper = CsvColumnMapper(
        source_first_row=next(ud_reader),
        target_columns=['trigger', 'trigger_idx', 'matched-lemma', 'path'],
        source_required=[
            'sentence', 'ud_parse', 'lemmas', 'ent1_start', 'ent1_end',
            'ent2_start', 'ent2_end'
        ])

    ucca_column_mapper = CsvColumnMapper(
        source_first_row=next(ucca_reader),
        target_columns=['trigger', 'trigger_idx', 'matched-lemma', 'path'],
        source_required=[
            'sentence', 'ucca_parse', 'lemmas', 'ent1_start', 'ent1_end',
            'ent2_start', 'ent2_end'
        ])

    ucca_entry_lookup = {}
    for ucca_row in ucca_reader:
        id = ucca_column_mapper.get_field_value_from_source(ucca_row,
                                                            'id',
                                                            as_int=True)
        ucca_entry_lookup[id] = ucca_row

    for ud_row in ud_reader:
        id = ud_column_mapper.get_field_value_from_source(ud_row,
                                                          'id',
                                                          as_int=True)
        sentence = ud_column_mapper.get_field_value_from_source(
            ud_row, 'sentence')

        ucca_row = ucca_entry_lookup.get(id)
        if ucca_row is None:
            csv_writer.writerow(
                get_output_entry_list(
                    id, sentence, extraction_comment='No matching UCCA row'))
            continue

        ud_words = ud_column_mapper.get_field_value_from_source(
            ud_row, 'words')
        ud_lemmas = ud_column_mapper.get_field_value_from_source(
            ud_row, 'lemmas')
        ud_parse = ud_column_mapper.get_field_value_from_source(
            ud_row, 'ud_parse')
        ud_match = __extract_relation_ud(ud_row, ud_paths, ud_column_mapper,
                                         triggers)
        ud_trigger = ud_match.trigger if ud_match is not None else None
        ud_path = ud_match.path if ud_match is not None else None

        ucca_words = ucca_column_mapper.get_field_value_from_source(
            ucca_row, 'words')
        ucca_lemmas = ucca_column_mapper.get_field_value_from_source(
            ucca_row, 'lemmas')
        ucca_parse = ucca_column_mapper.get_field_value_from_source(
            ucca_row, 'ucca_parse')
        ucca_match = __extract_relation_ucca(ucca_row, ucca_paths,
                                             ucca_column_mapper, triggers)
        ucca_trigger = ucca_match.trigger if ucca_match is not None else None
        ucca_path = ucca_match.path if ucca_match is not None else None

        csv_writer.writerow(
            get_output_entry_list(id,
                                  sentence,
                                  ud_words=ud_words,
                                  ud_lemmas=ud_lemmas,
                                  ud_parse=ud_parse,
                                  ucca_words=ucca_words,
                                  ucca_lemmas=ucca_lemmas,
                                  ucca_parse=ucca_parse,
                                  ud_trigger=ud_trigger,
                                  ud_path=ud_path,
                                  ucca_trigger=ucca_trigger,
                                  ucca_path=ucca_path))
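

# A minimal usage sketch (hypothetical inputs): ud_paths, ucca_paths and triggers are passed
# through to the __extract_relation_ud / __extract_relation_ucca helpers.
#
#     triggers = {'founded', 'established'}    # hypothetical trigger words
#     ud_paths = {'...'}                       # accepted UD paths (placeholder)
#     ucca_paths = {'...'}                     # accepted UCCA paths (placeholder)
#     with open('relations.csv', 'w', encoding='utf-8', newline='') as out, \
#             open('parsed_ud_paths.csv', encoding='utf-8') as ud_in, \
#             open('parsed_ucca_paths.csv', encoding='utf-8') as ucca_in:
#         extract_relations(out, ud_in, ud_paths, ucca_in, ucca_paths, triggers)
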
def filter_relations(input, output, entity_types=None):
    csv_reader = csv.reader(input)
    csv_writer = csv.writer(output)

    required_columns = [
        'id', 'sentence', 'words', 'lemmas', 'ent1_start', 'ent1_end',
        'ent2_start', 'ent2_end', 'path'
    ]

    if entity_types is not None:
        required_columns.append('ner')

    column_mapper = CsvColumnMapper(source_first_row=next(csv_reader),
                                    target_columns=[],
                                    source_required=required_columns)

    csv_writer.writerow(column_mapper.get_new_headers())

    for counter, entry in enumerate(csv_reader, start=1):

        path = column_mapper.get_field_value_from_source(entry, 'path')
        if path is None or path == '':
            continue

        ent1_start = column_mapper.get_field_value_from_source(entry,
                                                               'ent1_start',
                                                               as_int=True)
        ent1_end = column_mapper.get_field_value_from_source(entry,
                                                             'ent1_end',
                                                             as_int=True)
        ent1_indexes = [idx for idx in range(ent1_start, ent1_end + 1)]

        ent2_start = column_mapper.get_field_value_from_source(entry,
                                                               'ent2_start',
                                                               as_int=True)
        ent2_end = column_mapper.get_field_value_from_source(entry,
                                                             'ent2_end',
                                                             as_int=True)
        ent2_indexes = [idx for idx in range(ent2_start, ent2_end + 1)]

        filtered = False

        if entity_types is not None:

            entity1_type = entity_types[0]
            entity2_type = entity_types[1]

            ner_tags = column_mapper.get_field_value_from_source(entry,
                                                                 'ner',
                                                                 evaluate=True)

            # let's see if any of entity 1's tokens match entity1_type
            entity1_type_match = False
            for ent1_index in ent1_indexes:
                if ent1_index in ner_tags and ner_tags[
                        ent1_index] == entity1_type:
                    entity1_type_match = True
                    break

            entity2_type_match = False
            for ent2_index in ent2_indexes:
                if ent2_index in ner_tags and ner_tags[
                        ent2_index] == entity2_type:
                    entity2_type_match = True
                    break

            filtered = not entity1_type_match or not entity2_type_match

        if not filtered:
            csv_writer.writerow(column_mapper.get_new_row_values(entry, []))

    output.close()
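

# A minimal usage sketch (hypothetical file names): keep only rows whose two entities carry
# the requested NER types.
#
#     with open('relations.csv', encoding='utf-8') as inp, \
#             open('relations_filtered.csv', 'w', encoding='utf-8', newline='') as out:
#         filter_relations(inp, out, entity_types=['PERSON', 'ORG'])
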
def parse_ud(input_file=None, output_file=None, batch_size=None):
    """

     Parameters
     ----------

     Returns
     -------

    """
    input = open(input_file,
                 encoding='utf-8') if input_file is not None else sys.stdin
    csv_reader = csv.reader(input)

    column_mapper = CsvColumnMapper(source_first_row=next(csv_reader),
                                    target_columns=[
                                        'id', 'sentence', 'ent1', 'ent2',
                                        'ent1_start', 'ent1_end', 'ent2_start',
                                        'ent2_end', 'ud_parse', 'words',
                                        'lemmas', 'comment'
                                    ],
                                    source_required=[
                                        'tac_tokens', 'subj_start', 'subj_end',
                                        'obj_start', 'obj_end'
                                    ],
                                    filter_source_from_result=[
                                        'subj_start', 'subj_end', 'obj_start',
                                        'obj_end'
                                    ])
    detokenizer = Detokenizer()

    ## The prints before and after Pipeline initialization are used by the calling script
    ## as markers to indicate output that should be filtered out - see the bin/parse_ud
    ## implementation
    print('BEGIN-INIT-NLP')
    nlp = stanfordnlp.Pipeline()
    print('END-INIT-NLP')

    batch = 0
    output = None
    output_file = output_file[:-len('.csv')] if output_file is not None and output_file.endswith('.csv') \
        else output_file

    for count, entry in enumerate(csv_reader, start=0):

        # the next few lines of code deal with opening and closing files (depending on the batching argument, etc)
        new_file = False

        # first option: standard output ...
        if count == 0 and output_file is None:
            output = sys.stdout
            new_file = True

        # second option: we've just started, we're writing to a real file, but no batching
        if count == 0 and output_file is not None and batch_size is None:
            output_file_actual = '{0}.csv'.format(output_file)

            output = open(output_file_actual,
                          'w',
                          encoding='utf-8',
                          newline='')
            new_file = True

        # third option: we've finished a batch (and we are batching..)
        if output_file is not None and batch_size is not None and count % batch_size == 0:
            output_file_actual = '{0}-{1}.csv'.format(output_file, batch)

            if output is not None:
                output.close()

            output = open(output_file_actual,
                          'w',
                          encoding='utf-8',
                          newline='')
            batch += 1
            new_file = True

        # if we did create a new file, let's ensure that the first row consists of column titles
        if new_file:
            csv_writer = csv.writer(output)
            csv_writer.writerow(column_mapper.get_new_headers())

        # now that we've finished creating a new file as necessary, we can proceed with the business
        # at hand:

        tac_tokens = eval(
            column_mapper.get_field_value_from_source(entry, 'tac_tokens'))
        sentence = detokenizer.detokenize(tac_tokens)

        parsed_sentence = nlp(sentence)
        # ignore entries that parse into more than one sentence, so as to avoid confusion
        if len(parsed_sentence.sentences) > 1:
            csv_writer.writerow(
                column_mapper.get_new_row_values(entry, [
                    count, sentence, None, None, None, None, None, None, None,
                    None, None,
                    'python stanfordnlp parse produced more than one sentence'
                ]))

            continue

        ud_parse = []
        for governor, dep, word in parsed_sentence.sentences[0].dependencies:
            ud_parse.append(
                (word.index, word.text, dep, governor.index, governor.text))

        tokens = []
        tokens_with_indices = []
        lemmas_with_indices = []
        for token in parsed_sentence.sentences[0].tokens:
            for word in token.words:
                tokens.append(word.text)
                tokens_with_indices.append((word.index, word.text))
                lemmas_with_indices.append((word.index, word.lemma))

        ud_parse.sort(key=lambda x: int(x[0]))

        tac_tokens_lookup = {}
        tac_tokens_lookup['subj_start'] = int(
            column_mapper.get_field_value_from_source(entry, 'subj_start'))
        tac_tokens_lookup['subj_end'] = int(
            column_mapper.get_field_value_from_source(entry, 'subj_end'))
        tac_tokens_lookup['obj_start'] = int(
            column_mapper.get_field_value_from_source(entry, 'obj_start'))
        tac_tokens_lookup['obj_end'] = int(
            column_mapper.get_field_value_from_source(entry, 'obj_end'))

        token_lookup = SyncTacTags.b_lookup_to_a_lookup(
            tokens, tac_tokens, tac_tokens_lookup)

        if len(token_lookup) != len(tac_tokens_lookup):
            csv_writer.writerow(
                column_mapper.get_new_row_values(entry, [
                    count, sentence, None, None, None, None, None, None, None,
                    None, None,
                    'was not able to reconcile TAC and python stanfordnlp parse indexing'
                ]))
            continue

        ent1_start = token_lookup['subj_start']
        ent1_end = token_lookup['subj_end']
        ent1 = ' '.join(tokens[ent1_start:ent1_end + 1])

        ent2_start = token_lookup['obj_start']
        ent2_end = token_lookup['obj_end']
        ent2 = ' '.join(tokens[ent2_start:ent2_end + 1])

        csv_writer.writerow(
            column_mapper.get_new_row_values(entry, [
                count, sentence, ent1, ent2, ent1_start + 1, ent1_end + 1,
                ent2_start + 1, ent2_end + 1, ud_parse, tokens_with_indices,
                lemmas_with_indices, None
            ]))
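

# A minimal usage sketch (hypothetical file names): produce UD parses for TAC-style rows,
# one numbered output file per 1000 rows.
#
#     parse_ud(input_file='tac_sample.csv',
#              output_file='parsed_ud.csv',
#              batch_size=1000)
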
def extract_relations_ud(input, output, triggers, paths, include_miss=False):
    csv_reader = csv.reader(input)
    csv_writer = csv.writer(output)

    required_columns = [
        'sentence', 'ud_parse', 'lemmas', 'ent1_start', 'ent1_end',
        'ent2_start', 'ent2_end'
    ]

    column_mapper = CsvColumnMapper(
        next(csv_reader),
        ['trigger', 'trigger_idx', 'path', 'extraction_comment'],
        source_required=required_columns)

    csv_writer.writerow(column_mapper.get_new_headers())

    for counter, entry in enumerate(csv_reader, start=1):

        ud_parse = column_mapper.get_field_value_from_source(entry,
                                                             'ud_parse',
                                                             evaluate=True)
        if ud_parse is None:
            if include_miss:
                csv_writer.writerow(
                    column_mapper.get_new_row_values(
                        entry, [None, None, None, 'ud_parse missing']))
            continue

        links = UdRepresentationPlaceholder.get_links_from_ud_dep(ud_parse)

        lemma_indices = column_mapper.get_field_value_from_source(
            entry, 'lemmas', evaluate=True)
        lemmas = [lemma for _, lemma in lemma_indices]

        word_indices = column_mapper.get_field_value_from_source(entry,
                                                                 'words',
                                                                 evaluate=True)
        words = [word for _, word in word_indices]

        ent1_start = column_mapper.get_field_value_from_source(entry,
                                                               'ent1_start',
                                                               as_int=True)
        ent1_end = column_mapper.get_field_value_from_source(entry,
                                                             'ent1_end',
                                                             as_int=True)
        if ent1_start is None or ent1_end is None:
            if include_miss:
                csv_writer.writerow(
                    column_mapper.get_new_row_values(
                        entry, [None, None, None, 'could not identify ent1']))
            continue
        ent1_indexes = [idx for idx in range(ent1_start, ent1_end + 1)]
        ent1_head = Link.get_head(links, ent1_indexes)

        ent2_start = column_mapper.get_field_value_from_source(entry,
                                                               'ent2_start',
                                                               as_int=True)
        ent2_end = column_mapper.get_field_value_from_source(entry,
                                                             'ent2_end',
                                                             as_int=True)
        if ent2_start is None or ent2_end is None:
            if include_miss:
                csv_writer.writerow(
                    column_mapper.get_new_row_values(
                        entry, [None, None, None, 'could not identify ent2']))
            continue

        ent2_indexes = [idx for idx in range(ent2_start, ent2_end + 1)]
        ent2_head = Link.get_head(links, ent2_indexes)

        graph = DepGraph(links)
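        # for each candidate trigger, build the ent1 -> trigger and trigger -> ent2 path
        # strings and accept the row if their combination matches one of the known paths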

        found_relation = False
        trigger_word_matches = []

        for trigger_index, (word, lemma) in enumerate(zip(words, lemmas),
                                                      start=1):

            if word in triggers or lemma in triggers:

                trigger_word_matches.append(word)

                trigger_to_ent2 = Step.get_default_representation(
                    graph.get_steps(trigger_index, ent2_head))
                ent1_to_trigger = Step.get_default_representation(
                    graph.get_steps(ent1_head, trigger_index))
                ent1_to_ent2_via_trigger = '{0} >< {1}'.format(
                    ent1_to_trigger, trigger_to_ent2)

                if ent1_to_ent2_via_trigger in paths:
                    found_relation = True
                    trigger = word if word in triggers else lemma

                    csv_writer.writerow(
                        column_mapper.get_new_row_values(
                            entry, [
                                trigger, trigger_index,
                                ent1_to_ent2_via_trigger, None
                            ]))
                    break

        if not found_relation:
            if include_miss:
                comment = 'relation not found - considered the following matching triggers: {}' \
                    .format(' '.join(trigger_word_matches))

                csv_writer.writerow(
                    column_mapper.get_new_row_values(
                        entry, [None, None, None, comment]))

    output.close()
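

# A minimal usage sketch (hypothetical inputs): triggers and paths are the trigger words and
# the accepted 'ent1 >< ent2' path strings to match against.
#
#     triggers = {'founded', 'established'}    # hypothetical trigger words
#     paths = {'...'}                          # accepted path strings (placeholder)
#     with open('parsed_ud.csv', encoding='utf-8') as inp, \
#             open('relations_ud.csv', 'w', encoding='utf-8', newline='') as out:
#         extract_relations_ud(inp, out, triggers, paths, include_miss=True)
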
def parse_ucca(tupa_dir,
               model_prefix,
               tupa_batch_size,
               input_file=None,
               output_file=None,
               batch_size=None):
    """

     Parameters
     ----------

     Returns
     -------

    """
    input = open(input_file,
                 encoding='utf-8') if input_file is not None else sys.stdin
    csv_reader = csv.reader(input)

    column_mapper = CsvColumnMapper(source_first_row=next(csv_reader),
                                    target_columns=[
                                        'id', 'sentence', 'ent1', 'ent2',
                                        'ent1_start', 'ent1_end', 'ent2_start',
                                        'ent2_end', 'ucca_parse', 'words',
                                        'lemmas', 'comment'
                                    ],
                                    source_required=[
                                        'tac_tokens', 'subj_start', 'subj_end',
                                        'obj_start', 'obj_end'
                                    ],
                                    filter_source_from_result=[
                                        'subj_start', 'subj_end', 'obj_start',
                                        'obj_end'
                                    ])

    detokenizer = Detokenizer()
    mnofc = ManageNewOutputFileCreation(output_file, batch_size)

    parser = TupaParser2(tupa_dir, model_prefix)
    nlp = spacy.load('en_core_web_md')

    count = -1
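    # zip_longest over tupa_batch_size references to the same csv_reader yields the rows
    # in chunks of tupa_batch_size, with the final chunk padded with None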
    for next_batch in zip_longest(*([csv_reader] * tupa_batch_size)):

        entries = []
        sentences = []
        for entry in next_batch:
            if entry is None:
                # we've reached the end of the batch
                break

            entries.append(entry)

            tac_tokens = eval(
                column_mapper.get_field_value_from_source(entry, 'tac_tokens'))
            sentence = detokenizer.detokenize(tac_tokens)
            sentences.append(sentence)

        # send multiple sentences for parsing (up to tupa_batch_size at a time)
        parsed_sentences = parser.parse_sentences(sentences)

        # If the number of parsed sentences differs from the number of input sentences,
        # all bets are off - there is no point in trying to consolidate. (In practice the
        # lengths differ when the 'python -m tupa' command fails, in which case
        # parsed_sentences is empty.)
        if len(parsed_sentences) != len(sentences):
            parsed_sentences = [None] * len(sentences)

        for sentence, parsed_sentence, entry in zip(sentences,
                                                    parsed_sentences, entries):

            count += 1

            new_file = mnofc.get_new_file_if_necessary()
            if new_file:
                csv_writer = csv.writer(new_file)
                csv_writer.writerow(column_mapper.get_new_headers())

            if parsed_sentence is None:
                csv_writer.writerow(
                    column_mapper.get_new_row_values(entry, [
                        count, sentence, None, None, None, None, None, None,
                        None, None, None, 'TUPA did not produce a UCCA parse'
                    ]))
                continue

            tokens = []
            tokens_with_indices = []
            lemmas_with_indices = []

            for ucca_terminal in parsed_sentence.terminals:
                tokens.append(ucca_terminal.text)
                tokens_with_indices.append(
                    (ucca_terminal.token_id, ucca_terminal.text))

            # use spacy to get lemmas
            spacied = nlp(sentence)
            for token_id, word in enumerate(spacied, start=1):
                lemmas_with_indices.append((token_id, word.lemma_))

            tac_tokens = eval(
                column_mapper.get_field_value_from_source(entry, 'tac_tokens'))

            tac_tokens_lookup = {}
            tac_tokens_lookup['subj_start'] = int(
                column_mapper.get_field_value_from_source(entry, 'subj_start'))
            tac_tokens_lookup['subj_end'] = int(
                column_mapper.get_field_value_from_source(entry, 'subj_end'))
            tac_tokens_lookup['obj_start'] = int(
                column_mapper.get_field_value_from_source(entry, 'obj_start'))
            tac_tokens_lookup['obj_end'] = int(
                column_mapper.get_field_value_from_source(entry, 'obj_end'))

            token_lookup = SyncTacTags.b_lookup_to_a_lookup(
                tokens, tac_tokens, tac_tokens_lookup)

            if len(token_lookup) != len(tac_tokens_lookup):
                csv_writer.writerow(
                    column_mapper.get_new_row_values(entry, [
                        count, sentence, None, None, None, None, None, None,
                        None, None, None,
                        'was not able to reconcile TAC and Tupa\'s Spacy based indexing'
                    ]))
                continue

            ent1_start = token_lookup['subj_start']
            ent1_end = token_lookup['subj_end']
            ent1 = ' '.join(tokens[ent1_start:ent1_end + 1])

            ent2_start = token_lookup['obj_start']
            ent2_end = token_lookup['obj_end']
            ent2 = ' '.join(tokens[ent2_start:ent2_end + 1])

            csv_writer.writerow(
                column_mapper.get_new_row_values(entry, [
                    count, sentence, ent1, ent2, ent1_start + 1, ent1_end + 1,
                    ent2_start + 1, ent2_end + 1,
                    parsed_sentence.serialize(), tokens_with_indices,
                    lemmas_with_indices, None
                ]))
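
# A minimal usage sketch (hypothetical paths): this variant shells out to a TUPA
# installation and parses tupa_batch_size sentences per call.
#
#     parse_ucca('/opt/tupa', 'models/ucca-bilstm', 8,
#                input_file='tac_sample.csv',
#                output_file='parsed_ucca.csv',
#                batch_size=1000)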