Example #1
def handle_line(line, document_state, ner_type):
    begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(
            begin_document_match.group(1), begin_document_match.group(2))
        return None
    elif line.startswith("#end document"):
        document_state.assert_finalizable()
        return document_state.finalize()
    else:
        row = line.split()
        if len(row) == 0:
            document_state.sentences.append(tuple(document_state.text))
            del document_state.text[:]
            document_state.speakers.append(tuple(document_state.text_speakers))
            del document_state.text_speakers[:]
            return None
        assert len(row) >= 12

        word = normalize_word(row[3])                # token text (column 3)
        coref = row[-1]                              # coreference chain (last column)
        doc_key = conll.get_doc_key(row[0], row[1])  # document id + part number
        speaker = row[9]                             # speaker (column 9)
        person = row[10]                             # NER column, scanned for ner_type

        word_index = len(document_state.text) + sum(
            len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)

        if coref != "-":
            for segment in coref.split("|"):
                if segment[0] == "(":
                    if segment[-1] == ")":
                        cluster_id = int(segment[1:-1])
                        document_state.clusters[cluster_id].append(
                            (word_index, word_index))
                    else:
                        cluster_id = int(segment[1:])
                        document_state.stacks[cluster_id].append(word_index)
                else:
                    cluster_id = int(segment[:-1])
                    start = document_state.stacks[cluster_id].pop()
                    document_state.clusters[cluster_id].append(
                        (start, word_index))
        # Track spans of the requested NER type: "(PERSON)" is a
        # single-token span, "(PERSON*" opens one and "*)" closes it.
        if ner_type in person:
            if ')' in person:
                document_state.people.append((word_index, word_index))
            else:
                document_state.person_start = word_index
        elif ')' in person and document_state.person_start is not None:
            document_state.people.append(
                (document_state.person_start, word_index))
            document_state.person_start = None
        return None
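
The coreference column parsed above is bracket-encoded: "(3" opens a mention for cluster 3, "3)" closes the most recent open mention of that cluster, and "(3)" marks a single-token mention. Every variant below repeats the same stack-based reader; the following is a minimal self-contained sketch of that idea (the function name and example input are illustrative, not taken from these snippets).

import collections

def parse_coref_column(coref_cells):
    """Turn per-token cells like ["(3", "-", "3)"] into {cluster_id: spans}."""
    clusters = collections.defaultdict(list)
    stacks = collections.defaultdict(list)  # open mention starts per cluster id
    for index, cell in enumerate(coref_cells):
        if cell == "-":
            continue
        for segment in cell.split("|"):
            if segment.startswith("(") and segment.endswith(")"):
                clusters[int(segment[1:-1])].append((index, index))
            elif segment.startswith("("):
                stacks[int(segment[1:])].append(index)
            else:
                cluster_id = int(segment[:-1])
                clusters[cluster_id].append((stacks[cluster_id].pop(), index))
    return dict(clusters)

# parse_coref_column(["(3", "-", "3)", "(3)"]) -> {3: [(0, 2), (3, 3)]}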
Example #2
def handle_line(line, document_state, language, labels, stats):
    begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(
            begin_document_match.group(1), begin_document_match.group(2))
        return None
    elif line.startswith("#end document"):
        document_state.assert_finalizable()
        finalized_state = document_state.finalize()
        stats["num_clusters"] += len(finalized_state["clusters"])
        stats["num_mentions"] += sum(
            len(c) for c in finalized_state["clusters"])
        return finalized_state
    else:
        row = line.split()
        if len(row) == 0:
            stats["max_sent_len_{}".format(language)] = max(
                len(document_state.text),
                stats["max_sent_len_{}".format(language)])
            stats["num_sents_{}".format(language)] += 1
            document_state.sentences.append(tuple(document_state.text))
            del document_state.text[:]
            document_state.speakers.append(tuple(document_state.text_speakers))
            del document_state.text_speakers[:]
            return None
        assert len(row) >= 12

        doc_key = conll.get_doc_key(row[0], row[1])
        word = normalize_word(row[3], language)
        parse = row[5]
        speaker = row[9]
        ner = row[10]
        coref = row[-1]

        word_index = len(document_state.text) + sum(
            len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)

        if coref != "-":
            for segment in coref.split("|"):
                if segment[0] == "(":
                    if segment[-1] == ")":
                        cluster_id = int(segment[1:-1])
                        document_state.clusters[cluster_id].append(
                            (word_index, word_index))
                    else:
                        cluster_id = int(segment[1:])
                        document_state.coref_stacks[cluster_id].append(
                            word_index)
                else:
                    cluster_id = int(segment[:-1])
                    start = document_state.coref_stacks[cluster_id].pop()
                    document_state.clusters[cluster_id].append(
                        (start, word_index))
        return None
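
This variant increments stats counters (and reads max_sent_len_{language}) without initializing them, so the caller presumably passes something like collections.defaultdict(int). A sketch of a compatible driver loop follows; DocumentState and the input path are assumptions, not shown in these snippets.

import collections

stats = collections.defaultdict(int)   # missing counters default to 0
labels = collections.defaultdict(set)
document_state = DocumentState()       # assumed to come from the same module

with open("train.english.v4_gold_conll") as conll_file:  # illustrative path
    for line in conll_file:
        finalized = handle_line(line, document_state, "english", labels, stats)
        if finalized is not None:      # a full document was closed out
            print(stats["num_clusters"], stats["num_mentions"])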
Example #3
def minimize_partition(name, language, extension, labels, stats, tokenizer,
                       seg_len, input_dir, output_dir):
    input_path = "{}/{}.{}.{}".format(input_dir, name, language, extension)
    output_path = "{}/{}.{}.{}.jsonlines".format(output_dir, name, language,
                                                 seg_len)
    count = 0
    print("Minimizing {}".format(input_path))
    documents = []
    with open(input_path, "r") as input_file:
        for line in input_file.readlines():
            begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
            if begin_document_match:
                doc_key = conll.get_doc_key(begin_document_match.group(1),
                                            begin_document_match.group(2))
                documents.append((doc_key, []))
            elif line.startswith("#end document"):
                continue
            else:
                documents[-1][1].append(line)
    with open(output_path, "w") as output_file:
        for document_lines in documents:
            if skip(document_lines[0]):
                continue
            document = get_document(document_lines, tokenizer, language,
                                    seg_len)
            output_file.write(json.dumps(document))
            output_file.write("\n")
            count += 1
    print("Wrote {} documents to {}".format(count, output_path))
Example #4
def minimize_partition(partition, extension, args, tokenizer):
    input_path = os.path.join(args.input_dir,
                              f'{partition}.{args.language}.{extension}')
    output_path = os.path.join(
        args.output_dir,
        f'{partition}.{args.language}.{args.seg_len}.jsonlines')
    doc_count = 0
    logger.info(f'Minimizing {input_path}...')

    # Read documents
    documents = []  # [(doc_key, lines)]
    with open(input_path, 'r') as input_file:
        for line in input_file.readlines():
            begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
            if begin_document_match:
                doc_key = conll.get_doc_key(begin_document_match.group(1),
                                            begin_document_match.group(2))
                documents.append((doc_key, []))
            elif line.startswith('#end document'):
                continue
            else:
                documents[-1][1].append(line)

    # Write documents
    with open(output_path, 'w') as output_file:
        for doc_key, doc_lines in documents:
            if skip_doc(doc_key):
                continue
            document = get_document(doc_key, doc_lines, args.language,
                                    args.seg_len, tokenizer)
            output_file.write(json.dumps(document))
            output_file.write('\n')
            doc_count += 1
    logger.info(f'Processed {doc_count} documents to {output_path}')
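
minimize_partition (here and in Example #3) is written to run once per data split. A hedged sketch of such a driver, assuming conventional CoNLL-2012 split names, a BERT-style tokenizer, and an args object already carrying input_dir/output_dir/language/seg_len; none of these specifics are taken from the snippets.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')  # assumed tokenizer
for partition in ['train', 'dev', 'test']:  # conventional CoNLL-2012 splits
    minimize_partition(partition, 'v4_gold_conll', args, tokenizer)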
Example #5
def handle_line(line, document_state):
  begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
  if begin_document_match:
    document_state.assert_empty()
    document_state.doc_key = conll.get_doc_key(begin_document_match.group(1), begin_document_match.group(2))
    return None
  elif line.startswith("#end document"):
    document_state.assert_finalizable()
    return document_state.finalize()
  else:
    row = line.split()
    if len(row) == 0:
      document_state.sentences.append(tuple(document_state.text))
      del document_state.text[:]
      document_state.speakers.append(tuple(document_state.text_speakers))
      del document_state.text_speakers[:]
      return None
    assert len(row) >= 12

    word = normalize_word(row[3])
    coref = row[-1]
    doc_key = conll.get_doc_key(row[0], row[1])
    speaker = row[9]

    word_index = len(document_state.text) + sum(len(s) for s in document_state.sentences)
    document_state.text.append(word)
    document_state.text_speakers.append(speaker)

    if coref == "-":
      return None

    for segment in coref.split("|"):
      if segment[0] == "(":
        if segment[-1] == ")":
          cluster_id = int(segment[1:-1])
          document_state.clusters[cluster_id].append((word_index, word_index))
        else:
          cluster_id = int(segment[1:])
          document_state.stacks[cluster_id].append(word_index)
      else:
        cluster_id = int(segment[:-1])
        start = document_state.stacks[cluster_id].pop()
        document_state.clusters[cluster_id].append((start, word_index))
    return None
Example #6
def read_conll_file(conll_file_path: str) -> List[Tuple]:
    documents = []
    with open(conll_file_path) as fi:
        for line in fi:
            begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
            if begin_document_match:
                doc_key = conll.get_doc_key(begin_document_match.group(1),
                                            begin_document_match.group(2))
                documents.append((doc_key, []))
            elif line.startswith("#end document"):
                continue
            else:
                documents[-1][1].append(line.strip())
    return documents
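
read_conll_file assumes `from typing import List, Tuple` (plus the re and conll imports shared by the other snippets) is in scope. A minimal usage sketch with an illustrative path:

documents = read_conll_file("dev.english.v4_gold_conll")  # illustrative path
for doc_key, lines in documents:
    print(doc_key, len(lines))  # one (doc_key, lines) pair per document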
Example #7
def handle_line(line, document_state):
    begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(
            begin_document_match.group(1), begin_document_match.group(2))
        return None
    elif line.startswith("#end document"):
        document_state.assert_finalizable()
        return document_state.finalize()
    else:
        row = line.split()
        if len(row) == 0:
            document_state.sentences.append(tuple(document_state.text))

            # Close out the per-sentence POS and NER tag buffers alongside
            # the token and speaker buffers.
            document_state.pos_tags.append(tuple(document_state.text_pos_tags))
            del document_state.text_pos_tags[:]

            document_state.ner_tags.append(tuple(document_state.text_ner_tags))
            del document_state.text_ner_tags[:]

            del document_state.text[:]
            document_state.speakers.append(tuple(document_state.text_speakers))
            del document_state.text_speakers[:]

            return None

        assert len(row) >= 12

        word = normalize_word(row[3])
        coref = row[-1]
        doc_key = conll.get_doc_key(row[0], row[1])
        speaker = row[9]

        # Buffer the POS tag (column 4) for this token.
        pos_tag = row[4]
        document_state.text_pos_tags.append(pos_tag)

        # -------------------------------------------
        # NER stuff
        ner_tag = row[10]
        ner = ''

        if ner_tag[0] == "(":  # a tag opens at this token
            if ner_tag[-1] == ")":  # single-token tag, e.g. "(PERSON)"
                ner = "B-" + ner_tag[1:-1]
            else:
                ner = "B-" + ner_tag[1:-1]
                document_state.ner_stack.append(ner_tag[1:-1])
                document_state.ner_stack.append(ner_tag[1:-1])
        else:
            if len(document_state.ner_stack) > 0:
                ner = "I-" + document_state.ner_stack[-1]
                if ner_tag[-1] == ")":
                    document_state.ner_stack.pop()
            else:
                ner = "O"

        document_state.text_ner_tags.append(ner)
        # -------------------------------------------

        word_index = len(document_state.text) + sum(
            len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)

        if coref == "-":
            return None

        for segment in coref.split("|"):
            if segment[0] == "(":
                if segment[-1] == ")":
                    cluster_id = int(segment[1:-1])
                    document_state.clusters[cluster_id].append(
                        (word_index, word_index))
                else:
                    cluster_id = int(segment[1:])
                    document_state.stacks[cluster_id].append(word_index)
            else:
                cluster_id = int(segment[:-1])
                start = document_state.stacks[cluster_id].pop()
                document_state.clusters[cluster_id].append((start, word_index))
        return None
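
The NER block above converts the bracketed tag column to BIO tags token by token. The same conversion as a self-contained function, assuming non-nested tags as in OntoNotes (the function name is illustrative):

def brackets_to_bio(ner_cells):
    """Convert cells like ["(PERSON*", "*", "*)"] to BIO tags."""
    tags, stack = [], []
    for cell in ner_cells:
        if cell.startswith("("):
            label = cell.strip("()*")
            tags.append("B-" + label)
            if not cell.endswith(")"):  # multi-token tag stays open
                stack.append(label)
        elif stack:
            tags.append("I-" + stack[-1])
            if cell.endswith(")"):      # tag closes at this token
                stack.pop()
        else:
            tags.append("O")
    return tags

# brackets_to_bio(["(PERSON*", "*", "*)", "*"])
# -> ["B-PERSON", "I-PERSON", "I-PERSON", "O"]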
Example #8
def handle_line(line, document_state, language, labels, stats):
    begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(
            begin_document_match.group(1), begin_document_match.group(2))
        return None
    elif line.startswith("#end document"):
        document_state.assert_finalizable()
        finalized_state = document_state.finalize()
        stats["num_clusters"] += len(finalized_state["clusters"])
        stats["num_mentions"] += sum(
            len(c) for c in finalized_state["clusters"])
        #labels["{}_const_labels".format(language)].update(l for _, _, l in finalized_state["constituents"])
        labels["ner"].update(l for _, _, l in finalized_state["ner"])
        return finalized_state
    else:
        row = line.split()
        if len(row) == 0:
            stats["max_sent_len_{}".format(language)] = max(
                len(document_state.text),
                stats["max_sent_len_{}".format(language)])
            stats["num_sents_{}".format(language)] += 1
            document_state.sentences.append(tuple(document_state.text))
            del document_state.text[:]
            document_state.speakers.append(tuple(document_state.text_speakers))
            del document_state.text_speakers[:]

            document_state.start_times.append(
                tuple(document_state.start_times_item))
            document_state.end_times.append(
                tuple(document_state.end_times_item))
            document_state.video_npy_files.append(
                tuple(document_state.video_npy_files_item))
            document_state.genders.append(tuple(document_state.text_genders))
            document_state.fpronouns.append(
                tuple(document_state.text_fpronouns))

            del document_state.start_times_item[:]
            del document_state.end_times_item[:]
            del document_state.video_npy_files_item[:]
            del document_state.text_genders[:]
            del document_state.text_fpronouns[:]

            return None

        assert len(row) >= 12

        doc_key = conll.get_doc_key(row[0], row[1])
        word = normalize_word(row[3], language)
        parse = row[5]
        speaker = row[9]
        ner = row[10]
        st_time = -1 if (row[-4] == 'NOTIME') else int(row[-4])
        en_time = -1 if (row[-3] == 'NOTIME') else int(row[-3])
        video_npy_file = row[-2]
        coref = row[-1]

        word_index = len(document_state.text) + sum(
            len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)

        document_state.start_times_item.append(st_time)
        document_state.end_times_item.append(en_time)
        document_state.video_npy_files_item.append(video_npy_file)

        # Heuristic lexical gender feature: +1 for masculine cues,
        # -1 for feminine cues, 0 otherwise.
        if word.lower() in ['he', 'him', 'his', 'himself', 'boy', 'man']:
            gender = 1
        elif word.lower() in [
                'she', 'her', 'hers', 'herself', 'girl', 'woman', 'lady']:
            gender = -1
        else:
            gender = 0
        document_state.text_genders.append(gender)

        # Binary indicator for first-person pronouns.
        firstpronoun = 1 if word.lower() in [
            'i', 'my', 'me', 'mine', 'myself'] else 0
        document_state.text_fpronouns.append(firstpronoun)

        #handle_bit(word_index, parse, document_state.const_stack, document_state.constituents)
        handle_bit(word_index, ner, document_state.ner_stack,
                   document_state.ner)

        if coref != "-":
            for segment in coref.split("|"):
                if segment[0] == "(":
                    if segment[-1] == ")":
                        cluster_id = int(segment[1:-1])
                        document_state.clusters[cluster_id].append(
                            (word_index, word_index))
                    else:
                        cluster_id = int(segment[1:])
                        document_state.coref_stacks[cluster_id].append(
                            word_index)
                else:
                    cluster_id = int(segment[:-1])
                    start = document_state.coref_stacks[cluster_id].pop()
                    document_state.clusters[cluster_id].append(
                        (start, word_index))
        return None
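
handle_bit is not defined in any of these snippets. The following is a plausible sketch of what such a helper does for bracketed bit columns like "(NP(VP*" and "*))"; it is an assumption about the behavior, not the original implementation.

def handle_bit(word_index, bit, stack, spans):
    # Hypothetical: labels before the "*" open spans, ")" after it close them.
    star = bit.find("*")
    for label in bit[:star].split("(")[1:]:  # e.g. "(NP(VP*" opens NP, then VP
        stack.append((word_index, label))
    for _ in bit[star + 1:]:                 # each ")" closes the newest span
        start, label = stack.pop()
        spans.append((start, word_index, label))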
Example #9
def handle_line(line, document_state, language, labels, stats):
    begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(
            begin_document_match.group(1), begin_document_match.group(2))
        print(document_state.doc_key)
        return None
    elif line.startswith("#end document"):
        #document_state.assert_finalizable()
        finalized_state = document_state.finalize()
        stats["num_clusters"] += len(finalized_state["clusters"])
        stats["num_mentions"] += sum(
            len(c) for c in finalized_state["clusters"])
        labels["{}_const_labels".format(language)].update(
            l for _, _, l in finalized_state["constituents"])
        #labels["ner"].update(l for _, _, l in finalized_state["ner"])
        return finalized_state
    else:
        row = line.split()
        if len(row) == 0:
            stats["max_sent_len_{}".format(language)] = max(
                len(document_state.text),
                stats["max_sent_len_{}".format(language)])
            stats["num_sents_{}".format(language)] += 1
            document_state.sentences.append(tuple(document_state.text))
            del document_state.text[:]
            document_state.speakers.append(tuple(document_state.text_speakers))
            del document_state.text_speakers[:]
            return None
        assert len(row) >= 12

        doc_key = conll.get_doc_key(row[0], row[1])
        word = normalize_word(row[3], language)
        #POS = row[4]
        #head_POS = row[7]
        parse = row[5]
        speaker = row[9]
        ner = row[10]
        st_time = -1 if (row[-4] == 'NOTIME') else int(row[-4])
        en_time = -1 if (row[-3] == 'NOTIME') else int(row[-3])
        video_npy_file = row[-2]
        coref = row[-1]
        entity = row[-5]

        word_index = len(document_state.text) + sum(
            len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)
        #document_state.POS.append(pos)
        #document_state.head_POS.append(head_POS)
        if (len(document_state.start_times) == 0
                or (not (document_state.start_times[-1] == st_time
                         and document_state.end_times[-1] == en_time))):
            document_state.start_times.append(st_time)
            document_state.end_times.append(en_time)
            document_state.video_npy_files.append(video_npy_file)
        # Constituent/NER bit handling is disabled in this variant:
        #handle_bit(word_index, parse, document_state.const_stack, document_state.constituents)
        #handle_bit(word_index, ner, document_state.ner_stack, document_state.ner)
        if coref != "-":
            for segment in coref.split("|"):
                if segment[0] == "(":
                    if segment[-1] == ")":
                        cluster_id = int(segment[1:-1])
                        document_state.clusters[cluster_id].append(
                            (word_index, word_index))
                    else:
                        cluster_id = int(segment[1:])
                        document_state.coref_stacks[cluster_id].append(
                            word_index)
                else:
                    cluster_id = int(segment[:-1])
                    start = document_state.coref_stacks[cluster_id].pop()
                    document_state.clusters[cluster_id].append(
                        (start, word_index))
        if entity != "-":
            for segment in entity.split("|"):
                if segment[0] == "<":
                    if segment[-1] == ">":
                        entity_id = int(segment[1:-1])
                        document_state.entities.append(
                            (word_index, word_index, entity_id))
                    else:
                        entity_id = int(segment[1:])
                        document_state.entity_stacks[entity_id].append(
                            word_index)
                else:
                    entity_id = int(segment[:-1])
                    start = document_state.entity_stacks[entity_id].pop()
                    document_state.entities.append(
                        (start, word_index, entity_id))
        if ner != "*":
            for segment in ner.split("|"):
                if segment[0] == "[":
                    if segment[-1] == "]":
                        ner_id = int(segment[1:-1])
                        document_state.ners.append(
                            (word_index, word_index, ner_id))
                    else:
                        ner_id = int(segment[1:])
                        document_state.ner_stacks[ner_id].append(word_index)
                else:
                    ner_id = int(segment[:-1])
                    start = document_state.ner_stacks[ner_id].pop()
                    document_state.ners.append((start, word_index, ner_id))

        return None
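
This last variant repeats the same pop-on-close parsing three times with different bracket pairs: (…) for coreference, <…> for entities, […] for NER ids. The entity and NER blocks could share a single helper along these lines (the helper name and signature are mine, not from the source):

def read_span_column(cell, open_ch, close_ch, word_index, stacks, spans):
    # Parse one cell of a bracketed span column into (start, end, id) triples.
    for segment in cell.split("|"):
        if segment[0] == open_ch:
            if segment[-1] == close_ch:  # single-token span, e.g. "<3>"
                spans.append((word_index, word_index, int(segment[1:-1])))
            else:                        # span opens here, e.g. "<3"
                stacks[int(segment[1:])].append(word_index)
        else:                            # span closes here, e.g. "3>"
            span_id = int(segment[:-1])
            spans.append((stacks[span_id].pop(), word_index, span_id))

# e.g. the entity block above would reduce to:
#     if entity != "-":
#         read_span_column(entity, "<", ">", word_index,
#                          document_state.entity_stacks, document_state.entities)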