def main(args):
    """Print one TSV row (doc name, start, end, negation label) per CuiEntity.

    Prefers the completed (gold) annotator file when one exists; otherwise
    labels fall back to "Unlabeled".
    """
    if len(args) < 2:
        sys.stderr.write('Required argument(s): <unlabeled anafora directory> <labeled anafora directory>\n')
        sys.exit(-1)

    unlabeled_dir, labeled_dir = args[0], args[1]
    for sub_dir, text_name, xml_names in walk(unlabeled_dir, xml_name_regex='[.]dave\.inprogress\.xml$'):
        assert len(xml_names) == 1
        in_progress_path = join(unlabeled_dir, sub_dir, xml_names[0])
        completed_path = join(labeled_dir, sub_dir, xml_names[0].replace('inprogress', 'completed'))

        # prefer the completed (gold) file when it exists
        have_gold = exists(completed_path)
        source_path = completed_path if have_gold else in_progress_path
        anafora_data = AnaforaData.from_file(source_path)

        for event in anafora_data.annotations.select_type('CuiEntity'):
            negated = event.properties['negated'] if have_gold else "Unlabeled"

            spans = event.spans
            assert len(spans) == 1
            begin_end = spans[0]
            assert len(begin_end) == 2

            print('%s\t%d\t%d\t%s' % (text_name, begin_end[0], begin_end[1], str(negated)))
Example #2
0
 def from_texts(cls, text_dir, nlp, tokenizer):
     """Build an instance from a directory of raw text documents.

     Each document is sentence-split with *nlp*, tokenized with
     *tokenizer*, and converted into one TimexInputFeatures per sentence.

     :param text_dir: directory tree containing one text file per document
     :param nlp: sentence-splitting pipeline providing doc.sents
         (presumably spaCy -- TODO confirm)
     :param tokenizer: tokenizer returning tensors and offset mappings
         (presumably HuggingFace-style -- TODO confirm)
     :return: cls(doc_indices, features), where each doc_indices entry is
         (subdir, first_feature_index, end_feature_index) for one document
     """
     if not os.path.exists(text_dir):
         raise Exception("The %s directory does not exist." % text_dir)
     # match files with no 3-character extension, or explicit .txt files
     text_directory_files = anafora.walk(
         text_dir, xml_name_regex=".*((?<![.].{3})|[.]txt)$")
     features = []
     doc_indices = []
     for text_files in text_directory_files:
         # features for this document start at the current length
         doc_index = len(features)
         text_subdir_path, text_doc_name, text_file_names = text_files
         if len(text_file_names) != 1:
             raise Exception("Wrong number of text files in %s" %
                             text_subdir_path)
         text_file_path = os.path.join(text_dir, text_subdir_path,
                                       text_file_names[0])
         with open(text_file_path) as txt_file:
             text = txt_file.read()
         doc = nlp(text)
         input_raw = [sent.text_with_ws for sent in doc.sents]
         input_data = tokenizer(input_raw,
                                return_tensors="pt",
                                padding="max_length",
                                truncation="longest_first",
                                return_offsets_mapping=True)
         # running character offset of each sentence within the document
         sent_offset = 0
         for sent_idx, _ in enumerate(input_data["input_ids"]):
             features.append(
                 TimexInputFeatures.from_sentence(input_data, sent_idx,
                                                  sent_offset))
             sent_offset += len(input_raw[sent_idx])
         doc_indices.append((text_subdir_path, doc_index, len(features)))
     return cls(doc_indices, features)
Example #3
0
  def write(self, predictions):
    """Write predictions in anafora XML format"""

    # start from a clean output directory
    if os.path.isdir(self.out_dir):
      shutil.rmtree(self.out_dir)
    os.mkdir(self.out_dir)

    # predictions were produced in walk order, so consume them in order
    prediction_index = 0
    walker = anafora.walk(self.xml_dir, self.xml_regex)
    for sub_dir, text_name, file_names in walker:
      ref_path = os.path.join(self.xml_dir, sub_dir, file_names[0])
      reference = anafora.AnaforaData.from_file(ref_path)
      generated = anafora.AnaforaData()

      for event in reference.annotations.select_type('EVENT'):
        new_entity = anafora.AnaforaEntity()
        new_entity.id = event.id
        start, end = event.spans[0]
        new_entity.spans = event.spans
        new_entity.type = event.type
        new_entity.properties['DocTimeRel'] = int2label[predictions[prediction_index]]
        generated.annotations.append(new_entity)
        prediction_index += 1

      os.mkdir(os.path.join(self.out_dir, sub_dir))
      generated.indent()
      generated.to_file(os.path.join(self.out_dir, sub_dir, file_names[0]))
Example #4
0
def main(xml_dir, text_dir, xml_regex, context_size):
    """Main Driver

    For every EVENT annotation, print its DocTimeRel label together with
    the surrounding character context and a version with [ES]/[EE] markers
    around the event span.

    :param xml_dir: root of the Anafora XML directory tree
    :param text_dir: directory containing the raw text files
    :param xml_regex: regex used to select which XML files to walk
    :param context_size: number of characters of context on each side
    """

    for sub_dir, text_name, file_names in anafora.walk(xml_dir, xml_regex):

        xml_path = os.path.join(xml_dir, sub_dir, file_names[0])
        ref_data = anafora.AnaforaData.from_file(xml_path)

        text_path = os.path.join(text_dir, text_name)
        # context manager closes the handle (the original leaked the
        # file object returned by a bare open())
        with open(text_path) as text_file:
            text = text_file.read()

        for data in ref_data.annotations.select_type('EVENT'):

            # only the first span of the event is used
            start, end = data.spans[0]
            context = text[start - context_size:end + context_size].replace(
                '\n', '')
            event = text[start:end]
            dtr = data.properties['DocTimeRel']

            left = text[start - context_size:start]
            right = text[end:end + context_size]
            tagged = left + ' [ES] ' + event + ' [EE] ' + right
            print('{}|{}|{}'.format(dtr, event, context))
            print(tagged)
            print()
Example #5
0
    def notes_to_annotations(self):
        """Map note paths to relation, time, and event offsets.

        Populates self.note2times, self.note2events, and self.note2rels,
        keyed by the note's path under self.text_dir.
        """

        for sub_dir, text_name, file_names in anafora.walk(
                self.xml_dir, self.xml_regex):
            note_path = os.path.join(self.text_dir, text_name)
            # NOTE(review): only file_names[0] is read -- presumably each
            # sub-directory holds exactly one XML file; confirm.
            xml_path = os.path.join(self.xml_dir, sub_dir, file_names[0])
            ref_data = anafora.AnaforaData.from_file(xml_path)

            # collect (annot_start, annot_end, annot_id) tuples
            add_annotations(self.note2times[note_path], ref_data, 'TIMEX3')
            add_annotations(self.note2times[note_path], ref_data,
                            'SECTIONTIME')
            add_annotations(self.note2times[note_path], ref_data, 'DOCTIME')
            add_annotations(self.note2events[note_path], ref_data, 'EVENT')

            # collect (src spans, targ spans, src id, targ id) tuples
            for rel in ref_data.annotations.select_type('TLINK'):
                src = rel.properties['Source']
                targ = rel.properties['Target']
                label = rel.properties['Type']
                # only containment relations are kept
                if label == 'CONTAINS':
                    # only the first span of each argument is used
                    src_start, src_end = src.spans[0]
                    targ_start, targ_end = targ.spans[0]
                    self.note2rels[note_path].append(
                        (src_start, src_end, targ_start, targ_end, src.id,
                         targ.id))
def create_datasets(model, nlp, dataset_path, train=False, valid=False):
    """Build a TimeDataset from a directory of documents.

    When train or valid is True, each document's Anafora XML file is
    located and passed to the feature extractor; gold annotations are
    retained (keyed by document name) only when valid is True.  Otherwise
    features are extracted from the raw text alone.

    :param model: passed through to from_doc_to_features
    :param nlp: passed through to from_doc_to_features
    :param dataset_path: root directory of the dataset
    :param train: whether to read gold Anafora annotations
    :param valid: whether to also keep the gold annotations per document
    :return: TimeDataset(doc_indices, features, annotations)
    """
    text_directory_files = anafora.walk(
        dataset_path, xml_name_regex=".*((?<![.].{3})|[.]txt)$")
    features = []
    annotations = {}
    doc_indices = []
    # the original duplicated this whole loop for the labeled and
    # unlabeled cases; a single loop with one conditional is equivalent
    labeled = train or valid
    for text_files in text_directory_files:
        doc_index = len(features)
        text_subdir_path, text_doc_name, text_file_names = text_files
        if len(text_file_names) != 1:
            raise Exception("Wrong number of text files in %s" %
                            text_subdir_path)
        text_file_path = os.path.join(dataset_path, text_subdir_path,
                                      text_file_names[0])
        if labeled:
            anafora_file_path = _find_anafora_file(dataset_path,
                                                   text_subdir_path)
            doc_features, doc_annotations = from_doc_to_features(
                model, nlp, text_file_path, anafora_file_path, True)
            if valid:
                annotations[text_doc_name] = doc_annotations
        else:
            doc_features, _ = from_doc_to_features(model, nlp, text_file_path)
        features.extend(doc_features)
        doc_indices.append((text_subdir_path, doc_index, len(features)))
    return TimeDataset(doc_indices, features, annotations)


def _find_anafora_file(dataset_path, text_subdir_path):
    """Locate the single Anafora XML file for one document subdirectory.

    Raises Exception when the subdirectory does not contain exactly one
    Anafora directory with exactly one XML file.
    """
    anafora_path = os.path.join(dataset_path, text_subdir_path)
    anafora_directory_files = list(
        anafora.walk(anafora_path, xml_name_regex="[.]xml$"))
    if len(anafora_directory_files) != 1:
        raise Exception("Wrong structure in %s" % anafora_path)
    anafora_subdir_path, anafora_doc_name, anafora_file_names = \
        anafora_directory_files[0]
    if len(anafora_file_names) != 1:
        raise Exception("Wrong number of anafora files in %s" %
                        anafora_subdir_path)
    return os.path.join(anafora_path, anafora_subdir_path,
                        anafora_file_names[0])
Example #7
0
def main(args):
    """Export Problem and Question Type annotations to a CSV file."""

    TYPE = 'Annotation type'
    ROWID = 'Row Id'
    STATUS = 'Status'
    ANNOTATOR = 'Annotator'
    LABEL = 'Label'
    SPAN_TEXT = 'Span text'
    ANNOT_TEXT = 'Annotated text'

    if len(args) < 2:
        sys.stderr.write('2 required arguments: <input anafora dir> <output csv file>\n')
        sys.exit(-1)

    with open(args[1], 'w', newline='') as csvfile:
        csvout = csv.DictWriter(
            csvfile,
            [TYPE, ROWID, STATUS, ANNOTATOR, LABEL, SPAN_TEXT, ANNOT_TEXT],
            delimiter=',', quotechar='"')
        csvout.writeheader()

        for sub_dir, text_name, xml_names in anafora.walk(args[0]):
            if not xml_names:
                continue

            with open( join( join(args[0], sub_dir), text_name), 'r') as tf:
                text = tf.read()

            def emit(row_type, annot, prop_name, open_tag, close_tag):
                # write one CSV row for a single annotation (uses the
                # enclosing loop's text / status / annotator / sub_dir)
                begin, end = annot.spans[0][0], annot.spans[0][1]
                span_text = text[begin:end]
                csvout.writerow({
                    TYPE: row_type,
                    ROWID: sub_dir,
                    STATUS: status,
                    ANNOTATOR: annotator,
                    LABEL: annot.properties[prop_name],
                    SPAN_TEXT: span_text,
                    ANNOT_TEXT: text[:begin] + open_tag + span_text + close_tag + text[end:],
                })

            for xml_name in xml_names:
                xml_parts = xml_name.split('.')
                annotator = xml_parts[2]
                status = xml_parts[3]
                data = anafora.AnaforaData.from_file(
                    os.path.join(args[0], sub_dir, xml_name))

                for annot in data.annotations.select_type('Problem'):
                    emit('Problem', annot, 'Content', '<problem> ', ' </problem>')

                for annot in data.annotations.select_type('Question Type'):
                    emit('Type', annot, 'Type', '<type> ', ' </type>')
Example #8
0
def fix_thyme_errors(schema, input_dir, output_dir, xml_name_regex="[.]xml$"):
    """
    :param schema anafora.validate.Schema: the THYME schema
    :param input_dir str: the root of a set of THYME Anafora XML directories
    :param output_dir str: the directory where the cleaned versions of the THYME Anafora XML files should be written.
        The directory structure will mirror the input directory structure.
    :param xml_name_regex str: regular expression selecting which XML files to process.
        Only files that needed a fix are written to output_dir.
    """
    for sub_dir, text_name, xml_names in anafora.walk(input_dir, xml_name_regex):
        for xml_name in xml_names:
            xml_path = os.path.join(input_dir, sub_dir, xml_name)

            # load the data from the Anafora XML
            try:
                data = anafora.AnaforaData.from_file(xml_path)
            except anafora.ElementTree.ParseError as e:
                logging.warning("SKIPPING invalid XML: %s: %s", e, xml_path)
                continue

            # remove invalid TLINKs and ALINKs
            # (other annotation types with schema errors are left in place)
            changed = False
            to_remove = []
            for annotation in data.annotations:
                try:
                    schema.validate(annotation)
                except anafora.validate.SchemaValidationError as e:
                    if annotation.type in {"TLINK", "ALINK"}:
                        logging.warning("REMOVING %s: %s", e, annotation)
                        to_remove.append(annotation)
            for annotation in to_remove:
                data.annotations.remove(annotation)
                changed = True

            # remove TIMEX3s that are directly on top of SECTIONTIMEs and DOCTIMEs
            for span, annotations in anafora.validate.find_entities_with_identical_spans(data):
                try:
                    # sorts SECTIONTIME and DOCTIME before TIMEX3
                    special_time, timex = sorted(annotations, key=lambda a: a.type)
                except ValueError:
                    # not exactly two annotations share this span; skip it
                    pass
                else:
                    if special_time.type in {"SECTIONTIME", "DOCTIME"} and timex.type == "TIMEX3":
                        msg = "REPLACING multiple entities for span %s: %s WITH %s"
                        logging.warning(msg, span, timex, special_time)
                        # re-point any property that referenced the removed
                        # TIMEX3 at the surviving SECTIONTIME/DOCTIME
                        for annotation in data.annotations:
                            for name, value in annotation.properties.items():
                                if value is timex:
                                    annotation.properties[name] = special_time
                        data.annotations.remove(timex)
                        changed = True

            # if we found and fixed any errors, write out the new XML file
            if changed:
                output_sub_dir = os.path.join(output_dir, sub_dir)
                if not os.path.exists(output_sub_dir):
                    os.makedirs(output_sub_dir)
                output_path = os.path.join(output_sub_dir, xml_name)
                data.to_file(output_path)
Example #9
0
def xml_json_xml(input_dir, output_dir, set_to_super_interval):
    """Round-trip gold TimeNorm XML files through the JSON representation."""
    walker = anafora.walk(input_dir, xml_name_regex=r'TimeNorm\.gold\.completed')
    for sub_dir, text_file_name, xml_file_names in walker:
        dest_dir = os.path.join(output_dir, sub_dir)
        for name in xml_file_names:
            source = anafora.AnaforaData.from_file(
                os.path.join(input_dir, sub_dir, name))
            converted = parse_json(parse_element(source, set_to_super_interval))
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            converted.to_file(os.path.join(dest_dir, name))
Example #10
0
 def text_data_pairs():
     """Yield (document text, AnaforaData) pairs for every XML under train_dir."""
     for sub_dir, text_name, xml_names in anafora.walk(train_dir, xml_name_regex):
         # the raw text either lives in a dedicated text_dir or beside the XML
         if text_dir is None:
             text_path = os.path.join(train_dir, sub_dir, text_name)
         else:
             text_path = os.path.join(text_dir, text_name)
         if not os.path.exists(text_path):
             logging.warning("no text found at %s", text_path)
             continue
         with codecs.open(text_path, 'r', text_encoding) as text_file:
             document_text = text_file.read()
         for xml_name in xml_names:
             xml_path = os.path.join(train_dir, sub_dir, xml_name)
             yield document_text, anafora.AnaforaData.from_file(xml_path)
Example #11
0
    def write_xml(self, predicted_relations):
        """Write predictions in anafora XML format

        :param predicted_relations: iterable of (contained_id, container_id)
            pairs; ids are assumed to carry the note name as their third
            '@'-separated field -- TODO confirm against the id format.
        """

        # make a directory to write anafora xml
        if os.path.isdir(self.out_dir):
            shutil.rmtree(self.out_dir)
        os.mkdir(self.out_dir)

        # key: note, value: list of rel arg tuples
        note2rels = defaultdict(list)

        # map notes to relations in these notes
        # for container_id, contained_id in predicted_relations:
        for contained_id, container_id in predicted_relations:
            note_name = container_id.split('@')[2]
            note2rels[note_name].append((container_id, contained_id))

        # iterate over reference anafora xml files
        for sub_dir, text_name, file_names in anafora.walk(
                self.xml_dir, self.xml_regex):

            # NOTE(review): only file_names[0] is read -- presumably one
            # XML file per sub-directory; confirm.
            path = os.path.join(self.xml_dir, sub_dir, file_names[0])
            ref_data = anafora.AnaforaData.from_file(path)

            # make a new XML file
            generated_data = anafora.AnaforaData()

            # copy gold events and times
            copy_annotations(ref_data, generated_data, 'EVENT')
            copy_annotations(ref_data, generated_data, 'TIMEX3')
            copy_annotations(ref_data, generated_data, 'SECTIONTIME')
            copy_annotations(ref_data, generated_data, 'DOCTIME')

            # add generated relations
            note_name = file_names[0].split('.')[0]
            for container_id, contained_id in note2rels[note_name]:
                relation = anafora.AnaforaRelation()
                # pseudo-random id taken from the decimal digits of
                # random.random(); collisions are unlikely but possible
                relation.id = str(random.random())[2:]
                relation.type = 'TLINK'
                relation.parents_type = 'TemporalRelations'
                relation.properties['Source'] = container_id
                relation.properties['Type'] = 'CONTAINS'
                relation.properties['Target'] = contained_id
                generated_data.annotations.append(relation)

            # write xml to file
            generated_data.indent()
            os.mkdir(os.path.join(self.out_dir, sub_dir))
            out_path = os.path.join(self.out_dir, sub_dir, file_names[0])
            generated_data.to_file(out_path)
Example #12
0
def log_entities_with_identical_spans(anafora_dir, xml_name_regex):
    """Log entities that share an identical span within each XML file.

    :param string anafora_dir: root of the Anafora directories to scan
    :param string xml_name_regex: regex selecting which XML files to check
    """
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir, xml_name_regex):
        for xml_name in xml_names:
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            try:
                data = anafora.AnaforaData.from_file(xml_path)
            except anafora.ElementTree.ParseError:
                # unparseable files are deliberately skipped in silence
                pass
            else:
                for span, annotations in find_entities_with_identical_spans(data):
                    # logging.warn is deprecated; use logging.warning
                    logging.warning("%s: multiple entities for span %s:\n%s",
                                    xml_path, span, "\n".join(str(ann).rstrip() for ann in annotations))
def compare_through_golden_files(input_dir):
    """Compare label-studio-derived XML against cleaned golden XML files."""
    walker = anafora.walk(input_dir,
                         xml_name_regex=r'TimeNorm\.gold\.completed')
    for sub_dir, text_file_name, xml_file_names in walker:
        for xml_file_name in xml_file_names:
            source_path = os.path.join(input_dir, sub_dir, xml_file_name)
            parsed = parse_json(
                parse_element(anafora.AnaforaData.from_file(source_path)))
            parsed.to_file("output_from_labelstud.xml")
            remove_some_elements(source_path, "output_from_golden.xml")
            compare_two_files("output_from_labelstud.xml",
                              "output_from_golden.xml", xml_file_name)
Example #14
0
def log_schema_errors(schema, anafora_dir, xml_name_regex):
    """
    :param Schema schema: the schema to validate against
    :param string anafora_dir: the Anafora directory containing directories to validate
    :param string xml_name_regex: regular expression selecting which XML files to validate
    """
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir, xml_name_regex):
        for xml_name in xml_names:
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            try:
                data = anafora.AnaforaData.from_file(xml_path)
            except anafora.ElementTree.ParseError:
                # logging.warn is deprecated in favor of logging.warning
                logging.warning("%s: invalid XML", xml_path)
            else:
                for annotation, error in schema.errors(data):
                    logging.warning("%s: %s", xml_path, error)
Example #15
0
 def text_data_pairs():
     """Generate (raw text, AnaforaData) pairs from the training directory."""
     for sub_dir, text_name, xml_names in anafora.walk(
             train_dir, xml_name_regex):
         # prefer a dedicated text directory when one was supplied
         text_path = (os.path.join(text_dir, text_name)
                      if text_dir is not None
                      else os.path.join(train_dir, sub_dir, text_name))
         if not os.path.exists(text_path):
             logging.warning("no text found at %s", text_path)
             continue
         with codecs.open(text_path, 'r', text_encoding) as text_file:
             contents = text_file.read()
         for xml_name in xml_names:
             yield contents, anafora.AnaforaData.from_file(
                 os.path.join(train_dir, sub_dir, xml_name))
Example #16
0
def log_schema_errors(schema, anafora_dir, xml_name_regex):
    """
    :param Schema schema: the schema to validate against
    :param string anafora_dir: the Anafora directory containing directories to validate
    :param string xml_name_regex: regular expression selecting which XML files to validate
    """
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir,
                                                      xml_name_regex):
        for xml_name in xml_names:
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            try:
                data = anafora.AnaforaData.from_file(xml_path)
            except anafora.ElementTree.ParseError:
                # logging.warn is deprecated in favor of logging.warning
                logging.warning("%s: invalid XML", xml_path)
            else:
                for annotation, error in schema.errors(data):
                    logging.warning("%s: %s", xml_path, error)
def _main(input_dir,
          output_dir,
          xml_name_regex="[.]xml$",
          include=None,
          exclude=None):
    """Strip non-selected annotations and properties from Anafora XML files.

    :param input_dir: root of the Anafora XML directories to read
    :param output_dir: where to write the modified files; when falsy, files
        are rewritten under input_dir (a ".bak" copy of the original is
        saved whenever anything is removed)
    :param xml_name_regex: regex selecting which XML files to process
    :param include: passed to Select; selection semantics defined by Select
    :param exclude: passed to Select; selection semantics defined by Select
    """
    select = Select(include, exclude)

    for sub_dir, text_name, xml_names in anafora.walk(input_dir,
                                                      xml_name_regex):
        for xml_name in xml_names:

            # reads in the data from the input file
            xml_path = os.path.join(input_dir, sub_dir, xml_name)
            data = anafora.AnaforaData.from_file(xml_path)

            # find annotations and properties to remove
            # (collected first so we do not mutate while iterating)
            annotations_to_remove = []
            annotation_properties_to_remove = []
            for annotation in data.annotations:

                # remove the annotation if its type has not been selected
                if not select(annotation.type):
                    annotations_to_remove.append(annotation)
                else:
                    for name, value in annotation.properties.items():

                        # remove the property if its name or value has not been selected
                        if not select(annotation.type, name, value):
                            annotation_properties_to_remove.append(
                                (annotation, name))

            # if we're overwriting, save a backup of the original
            if annotations_to_remove or annotation_properties_to_remove:
                data.to_file(xml_path + ".bak")

            # do the actual removal of annotations here
            for annotation in annotations_to_remove:
                data.annotations.remove(annotation)
            for annotation, name in annotation_properties_to_remove:
                del annotation.properties[name]

            # writes out the modified data to the output file
            output_sub_dir = os.path.join(output_dir or input_dir, sub_dir)
            if not os.path.exists(output_sub_dir):
                os.makedirs(output_sub_dir)
            output_path = os.path.join(output_sub_dir, xml_name)
            data.to_file(output_path)
Example #18
0
    def write_xml(self, prediction_lookup):
        """Write predictions in anafora XML format

        :param prediction_lookup: mapping from 'sub_dir|start|end' keys to
            predicted DocTimeRel labels; events without a prediction fall
            back to the majority class 'OVERLAP'.
        """

        # make a directory to write anafora xml
        if os.path.isdir(self.xml_out_dir):
            shutil.rmtree(self.xml_out_dir)
        os.mkdir(self.xml_out_dir)

        # t5 occasionally fails to predict
        missing_predictions = []

        # iterate over reference xml files
        # look up the DTR prediction for each event
        # and write it in anafora format to specificed dir
        for sub_dir, text_name, file_names in \
                anafora.walk(self.xml_ref_dir, xml_regex):
            # NOTE(review): xml_regex is a free name (not self.xml_regex);
            # presumably a module-level constant -- confirm intended.

            path = os.path.join(self.xml_ref_dir, sub_dir, file_names[0])
            ref_data = anafora.AnaforaData.from_file(path)
            data = anafora.AnaforaData()

            for event in ref_data.annotations.select_type('EVENT'):

                # make a new entity and copy some ref info
                entity = anafora.AnaforaEntity()
                entity.id = event.id
                start, end = event.spans[0]
                entity.spans = event.spans
                entity.type = event.type

                # lookup the prediction
                key = '|'.join((sub_dir, str(start), str(end)))
                if key not in prediction_lookup:
                    # use majority class for now
                    entity.properties['DocTimeRel'] = 'OVERLAP'
                    missing_predictions.append(key)
                else:
                    entity.properties['DocTimeRel'] = prediction_lookup[key]

                data.annotations.append(entity)

            data.indent()
            os.mkdir(os.path.join(self.xml_out_dir, sub_dir))
            out_path = os.path.join(self.xml_out_dir, sub_dir, file_names[0])
            data.to_file(out_path)

        print('number of missing predictions:', len(missing_predictions))
def convert_dir(input_dir, output_dir, raw_dir=None):
    """Convert every completed TimeNorm gold XML file found under input_dir."""
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # walk yields (sub_dir, document_name, xml_names) triples
    for document_dir, document_name, xml_names in anafora.walk(input_dir):
        for xml_name in xml_names:
            if not xml_name.endswith(".TimeNorm.gold.completed.xml"):
                continue
            xml_path = os.path.join(input_dir, document_dir, xml_name)
            output_path = os.path.join(output_dir, document_name, xml_name)
            if raw_dir is None:
                raw_path = None
            else:
                raw_path = os.path.join(raw_dir, document_dir, document_name)
            convert_xml(xml_path, output_path, raw_path)
def sub_to_super(input_dir, output_dir):
    """Rewrite Sub-Interval links as Super-Interval links on the child entity."""
    walker = anafora.walk(input_dir,
                          xml_name_regex=r'TimeNorm\.gold\.completed')
    for sub_dir, text_file_name, xml_file_names in walker:
        dest_dir = os.path.join(output_dir, sub_dir)
        for name in xml_file_names:
            data = anafora.AnaforaData.from_file(
                os.path.join(input_dir, sub_dir, name))
            data = check_year_of_sub_interval(data)
            for entity in data.annotations:
                if 'Sub-Interval' not in entity.properties:
                    continue
                child = entity.properties['Sub-Interval']
                if child:
                    # point the child back at its parent instead
                    child.properties['Super-Interval'] = entity.id
                del entity.properties['Sub-Interval']
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            data.to_file(os.path.join(dest_dir, name))
Example #21
0
  def read(self):
    """Make x, y etc.

    Tokenizes a fixed-size character window around every EVENT annotation
    and returns (inputs, labels, masks): padded token-id sequences,
    integer DocTimeRel labels, and attention masks.
    """

    inputs = []
    labels = []

    tokenizer = BertTokenizer.from_pretrained(
      'bert-base-uncased',
      do_lower_case=True)

    for sub_dir, text_name, file_names in \
            anafora.walk(self.xml_dir, self.xml_regex):

      # NOTE(review): only file_names[0] is read -- presumably one XML
      # file per sub-directory; confirm.
      xml_path = os.path.join(self.xml_dir, sub_dir, file_names[0])
      ref_data = anafora.AnaforaData.from_file(xml_path)

      text_path = os.path.join(self.text_dir, text_name)
      # NOTE(review): the handle returned by open() is never closed
      text = open(text_path).read()

      for event in ref_data.annotations.select_type('EVENT'):
        label = event.properties['DocTimeRel']
        labels.append(label2int[label])

        start, end = event.spans[0]
        event = text[start:end] # should be end+1?
        left = text[start - self.context_chars : start]
        right = text[end : end + self.context_chars]

        # surround the event with ' es ' / ' ee ' delimiter tokens
        context = left + ' es ' + event + ' ee ' + right
        inputs.append(tokenizer.encode(context.replace('\n', '')))

    # pad/truncate every sequence to self.max_length token ids
    inputs = pad_sequences(
      inputs,
      maxlen=self.max_length,
      dtype='long',
      truncating='post',
      padding='post')

    masks = [] # attention masks
    for sequence in inputs:
      # 1.0 for real tokens (id > 0), 0.0 for padding
      mask = [float(value > 0) for value in sequence]
      masks.append(mask)

    return inputs, labels, masks
Example #22
0
    def write(self, predictions):
        """Write predictions in anafora XML format

        :param predictions: labels in the same order as self.offsets;
            events whose (sub_dir, start, end) key has no prediction are
            skipped (with a console message).
        """

        # predictions are in the same order in which they were read
        prediction_lookup = dict(zip(self.offsets, predictions))

        # make a directory to write anafora xml
        if os.path.isdir(self.xml_out_dir):
            shutil.rmtree(self.xml_out_dir)
        os.mkdir(self.xml_out_dir)

        # iterate over reference xml files
        # look up the DTR prediction for each event
        # and write it in anafora format to specificed dir
        for sub_dir, text_name, file_names in \
                anafora.walk(self.xml_ref_dir, xml_regex):
            # NOTE(review): xml_regex is a free name (not self.xml_regex);
            # presumably a module-level constant -- confirm intended.

            path = os.path.join(self.xml_ref_dir, sub_dir, file_names[0])
            ref_data = anafora.AnaforaData.from_file(path)
            data = anafora.AnaforaData()

            for event in ref_data.annotations.select_type('EVENT'):

                # make a new entity and copy some ref info
                entity = anafora.AnaforaEntity()
                entity.id = event.id
                start, end = event.spans[0]
                entity.spans = event.spans
                entity.type = event.type

                # lookup the prediction
                if (sub_dir, start, end) not in prediction_lookup:
                    print('missing key:', (sub_dir, start, end))
                    continue

                label = prediction_lookup[(sub_dir, start, end)]
                entity.properties['DocTimeRel'] = int2label[label]

                data.annotations.append(entity)

            data.indent()
            os.mkdir(os.path.join(self.xml_out_dir, sub_dir))
            out_path = os.path.join(self.xml_out_dir, sub_dir, file_names[0])
            data.to_file(out_path)
def main(input_dir, exclude, verbose):
    """Report duplicated entities in gold TimeNorm XML files.

    Prints a Counter of duplicated entity types and a Counter of the types
    whose properties reference a duplicated entity.

    :param input_dir: root of the Anafora XML directories
    :param exclude: currently unused -- the type filter below is commented out
    :param verbose: when True, print each duplicate and each parent found
    """
    duplicate_types = Counter()
    parent_types = Counter()
    paths = anafora.walk(input_dir,
                         xml_name_regex=r'TimeNorm\.gold\.completed')
    for sub_dir, text_file_name, xml_file_names in paths:
        for xml_file_name in xml_file_names:
            input_path = os.path.join(input_dir, sub_dir, xml_file_name)
            data = anafora.AnaforaData.from_file(input_path)

            # find entities that share the same type and span
            counts = Counter()
            for entity in data.annotations:
                #if entity.type not in exclude:
                counts[entity] += 1

            duplicates = {key for key, count in counts.items() if count > 1}

            # which types are most often duplicated
            if duplicates:
                if verbose:
                    print(f"{xml_file_name}")
                for entity in sorted(duplicates):
                    if verbose:
                        print(f"  {entity.spans} {entity.type}")
                    duplicate_types[entity.type] += 1

            # which types most often have duplicated entities as arguments
            for entity in data.annotations:
                for _, value in entity.properties.items():
                    if isinstance(
                            value,
                            anafora.AnaforaAnnotation) and value in duplicates:
                        if verbose:
                            print(
                                f"  parent: {entity.id} {entity.spans} ----> "
                                f"{value.id} {value.spans}")

                        # count each parent at most once per entity
                        parent_types[entity.type] += 1
                        break

    print(f'duplicate types: {duplicate_types}')
    print(f'parent types:    {parent_types}')
Example #24
0
def log_entities_with_identical_spans(anafora_dir, xml_name_regex):
    """Log every annotation span claimed by more than one entity.

    :param string anafora_dir: directory containing Anafora XML sub-directories
    :param string xml_name_regex: regular expression selecting the XML files
        to be checked
    """
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir,
                                                      xml_name_regex):
        for xml_name in xml_names:
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            try:
                data = anafora.AnaforaData.from_file(xml_path)
            except anafora.ElementTree.ParseError:
                # note the unparseable file instead of silently skipping it
                logging.warning("%s: skipping unparseable XML file", xml_path)
            else:
                for span, annotations in find_entities_with_identical_spans(
                        data):
                    # logging.warn is a deprecated alias of logging.warning
                    logging.warning(
                        "%s: multiple entities for span %s:\n%s", xml_path,
                        span,
                        "\n".join(str(ann).rstrip() for ann in annotations))
Example #25
0
def _main(input_dir, output_dir, xml_name_regex="[.]xml$", include=None, exclude=None):
    """Filter Anafora XML files, keeping only selected annotation types and
    properties, and write the filtered data to output_dir (or in place)."""
    select = Select(include, exclude)

    for sub_dir, text_name, xml_names in anafora.walk(input_dir, xml_name_regex):
        for xml_name in xml_names:

            # load the annotations for this document
            source_path = os.path.join(input_dir, sub_dir, xml_name)
            data = anafora.AnaforaData.from_file(source_path)

            # collect everything that fails the selection filter
            doomed_annotations = []
            doomed_properties = []
            for annotation in data.annotations:
                if select(annotation.type):
                    # annotation type is kept; check each property against
                    # the (type, name, value) selection
                    doomed_properties.extend(
                        (annotation, name)
                        for name, value in annotation.properties.items()
                        if not select(annotation.type, name, value))
                else:
                    # unselected type: drop the whole annotation
                    doomed_annotations.append(annotation)

            # keep a backup next to the source before modifying anything
            if doomed_annotations or doomed_properties:
                data.to_file(source_path + ".bak")

            # apply the removals
            for annotation in doomed_annotations:
                data.annotations.remove(annotation)
            for annotation, name in doomed_properties:
                del annotation.properties[name]

            # write the filtered data (in place when no output_dir was given)
            target_dir = os.path.join(output_dir or input_dir, sub_dir)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            data.to_file(os.path.join(target_dir, xml_name))
def _copy_text(text_name_to_path, get_text, get_dct, anafora_dir,
               xml_name_regex):
    """Copy raw text (and optional DCT) files into an Anafora directory tree.

    Exits with an error if a text file is missing from the mapping or if a
    destination file already exists.
    """
    walked = anafora.walk(anafora_dir, xml_name_regex=xml_name_regex)
    for sub_dir, text_file_name, _ in walked:
        if text_file_name not in text_name_to_path:
            sys.exit("No text file found for " + text_file_name)
        destination = os.path.join(anafora_dir, sub_dir, text_file_name)
        if os.path.exists(destination):
            sys.exit("Text file already exists: " + destination)
        source_path = text_name_to_path[text_file_name]
        with open(destination, 'w') as text_file:
            text_file.write(get_text(source_path))
        if get_dct is None:
            continue
        # document-creation-time file sits alongside the text file
        dct_destination = destination + ".dct"
        if os.path.exists(dct_destination):
            sys.exit("DCT file already exists: " + dct_destination)
        with open(dct_destination, 'w') as dct_file:
            dct_file.write(get_dct(source_path))
            dct_file.write("\n")
Example #27
0
def main(args):
    """Print one TSV row (text name, span start, span end, negated) for each
    CuiEntity annotation found in completed Anafora XML files.

    :param args: command-line arguments; args[0] is the Anafora directory
    """
    if len(args) < 1:
        sys.stderr.write('Required argument(s): <anafora directory>\n')
        sys.exit(-1)

    # raw string: '\.' in a non-raw literal is an invalid escape sequence
    # (DeprecationWarning, and a SyntaxWarning/SyntaxError in newer Pythons)
    for sub_dir, text_name, xml_names in walk(
            args[0], xml_name_regex=r'[.]dave\.completed\.xml$'):
        # exactly one completed annotation file is expected per document
        assert len(xml_names) == 1
        xml_file = join(args[0], sub_dir, xml_names[0])
        anafora_data = AnaforaData.from_file(xml_file)
        for event in anafora_data.annotations.select_type('CuiEntity'):
            negated = event.properties['negated']
            # each event is expected to have exactly one (start, end) span
            span = event.spans
            assert len(span) == 1
            span = span[0]
            assert len(span) == 2

            print('%s\t%d\t%d\t%s' %
                  (text_name, span[0], span[1], str(negated)))
Example #28
0
def main(input_dir, output_dir):
    """Collapse duplicate entities (same type and span) in Anafora XML files.

    For each document, the first-seen entity for a duplicated span survives;
    later duplicates are removed, and any entity whose property pointed at a
    removed duplicate is re-linked to the survivor via its 'Super-Interval'
    property.

    :param input_dir: directory of Anafora XML files to read
    :param output_dir: directory where the de-duplicated files are written
    """

    paths = anafora.walk(input_dir, xml_name_regex=r'TimeNorm\.gold\.completed')
    for sub_dir, text_file_name, xml_file_names in paths:
        for xml_file_name in xml_file_names:
            input_path = os.path.join(input_dir, sub_dir, xml_file_name)
            data = anafora.AnaforaData.from_file(input_path)

            # find entities that share the same type and span
            # (Counter keys on entity equality, which includes type and span)
            counts = Counter()
            for entity in data.annotations:
                counts[entity] += 1

            duplicates = {key for key, count in counts.items() if count > 1}

            identical_span_to_parent_entity = {}  # same spans with different id
            remove_id = []
            # re-link parents whose property values are duplicated entities:
            # the first duplicate seen for a span becomes the canonical id;
            # later ones are queued for removal and the parent is pointed at
            # the canonical id instead.
            # NOTE(review): this writes 'Super-Interval' while iterating
            # entity.properties.items(), relying on the immediate break to
            # avoid a mutated-during-iteration error — preserve that order.
            for entity in data.annotations:
                for _, value in entity.properties.items():
                    if isinstance(value, anafora.AnaforaAnnotation) and value in duplicates:
                        if value.spans not in identical_span_to_parent_entity:
                            identical_span_to_parent_entity[value.spans] = value.id
                        else:
                            remove_id.append(value.id)
                            entity.properties['Super-Interval'] = identical_span_to_parent_entity[value.spans]
                        break

            # iterate over a copy, since remove() mutates the collection
            for entity in list(data.annotations):
                if entity.id in remove_id:
                    data.annotations.remove(entity)

            output_parent = os.path.join(output_dir, sub_dir)
            if not os.path.exists(output_parent):
                os.makedirs(output_parent)
            data.to_file(os.path.join(output_parent, xml_file_name))
Example #29
0
                                             help="The <type> of the target annotations.")
    relations_to_closest_parser.add_argument("-r", "--relation", metavar="TYPE", dest="relation_type", required=True,
                                             help="The <type> of relation annotation to be created.")
    relations_to_closest_parser.add_argument("-rs", "--relation-source", metavar="NAME", required=True,
                                             dest="relation_source_property_name",
                                             help="The name of the property on the relation annotation that should " +
                                                  "point to the source annotation.")
    relations_to_closest_parser.add_argument("-rt", "--relation-target", metavar="NAME", required=True,
                                             dest="relation_target_property_name",
                                             help="The name of the property on the relation annotation that should " +
                                                  "point to the target annotation.")
    relations_to_closest_parser.add_argument("-ro", "--relation-other", metavar="NAME=VALUE", nargs='+', type=_pair,
                                             dest="relation_other_properties",
                                             help="Other properties that should be added to the relation annotation.")

    args = parser.parse_args()
    kwargs = vars(args)
    func = kwargs.pop("func")
    input_dir = kwargs.pop('input_dir')
    xml_name_regex = kwargs.pop('xml_name_regex')
    output_dir = kwargs.pop('output_dir')

    for sub_dir, _, xml_file_names in anafora.walk(input_dir, xml_name_regex):
        for xml_file_name in xml_file_names:
            input_data = anafora.AnaforaData.from_file(os.path.join(input_dir, sub_dir, xml_file_name))
            func(input_data, **kwargs)
            output_sub_dir = os.path.join(output_dir, sub_dir)
            if not os.path.exists(output_sub_dir):
                os.makedirs(output_sub_dir)
            input_data.to_file(os.path.join(output_dir, sub_dir, xml_file_name))
    def from_texts(cls, data_dir, nlp, tokenizer, config):
        """Build a dataset of timex input features from a directory of texts.

        Walks *data_dir* (one sub-directory per document, each expected to
        contain exactly one text file and exactly one Anafora XML annotation
        directory/file), segments each text into sentences with *nlp*,
        tokenizes the sentences with *tokenizer*, and records annotation
        spans so labels can be assigned per token.

        :param data_dir: root directory of document sub-directories
        :param nlp: spaCy-style pipeline used for sentence segmentation
            (provides doc.sents with text_with_ws)
        :param tokenizer: HuggingFace-style tokenizer supporting
            return_offsets_mapping and max_length padding
        :param config: configuration object providing label_pad_id
        :return: cls(doc_indices, features), where doc_indices holds
            (sub-directory, first-feature-index, end-feature-index) per doc
        """
        if not os.path.exists(data_dir):
            raise Exception("The %s directory does not exist." % data_dir)
        # matches files with no 3-character dotted extension, or .txt files
        text_directory_files = anafora.walk(
            data_dir, xml_name_regex=".*((?<![.].{3})|[.]txt)$")
        features = []
        doc_indices = []
        for text_files in text_directory_files:
            doc_index = len(features)
            text_subdir_path, text_doc_name, text_file_names = text_files
            if len(text_file_names) != 1:
                raise Exception("Wrong number of text files in %s" %
                                text_subdir_path)
            anafora_path = os.path.join(data_dir, text_subdir_path)
            anafora_directory_files = anafora.walk(anafora_path,
                                                   xml_name_regex="[.]xml$")
            anafora_directory_files = list(anafora_directory_files)
            if len(anafora_directory_files) != 1:
                raise Exception("Wrong structure in %s" % anafora_path)
            anafora_subdir_path, anafora_doc_name, anafora_file_names = anafora_directory_files[
                0]
            if len(anafora_file_names) != 1:
                raise Exception("Wrong number of anafora files in %s" %
                                anafora_subdir_path)
            text_file_path = os.path.join(data_dir, text_subdir_path,
                                          text_file_names[0])

            # Load the annotations
            anafora_file_path = os.path.join(anafora_path, anafora_subdir_path,
                                             anafora_file_names[0])
            data = anafora.AnaforaData.from_file(anafora_file_path)
            # map span start -> (span end, label); NOTE(review): if two
            # annotations share a start offset, the later one wins — confirm
            # that is intended
            annotations = dict()
            for annotation in data.annotations:
                label = annotation.type
                for span in annotation.spans:
                    start, end = span
                    annotations[start] = (end, label)

            # Read, segment and tokenize the raw text.
            with open(text_file_path) as txt_file:
                text = txt_file.read()
            doc = nlp(text)
            # text_with_ws preserves trailing whitespace so sentence offsets
            # accumulate correctly below
            input_raw = [sent.text_with_ws for sent in doc.sents]
            input_data = tokenizer(input_raw,
                                   return_tensors="pt",
                                   padding="max_length",
                                   truncation="longest_first",
                                   return_offsets_mapping=True)

            # Initialize label sequence with 0. Use ignore index for padding tokens
            # Inverting the byte attention mask maps 1->254 and 0->255; dividing
            # by 255 and truncating to long yields 0 for real tokens, 1 for pads.
            negative_attention_mask = (
                ~input_data["attention_mask"].byte()).true_divide(255).long()
            input_data["labels"] = negative_attention_mask.mul(
                config.label_pad_id)
            # Assign label_pad to </s> token
            # NOTE(review): argmax(dim=1) returns the first index holding
            # label_pad_id (the first padded position); this assumes
            # label_pad_id > 0 and that the sequence-final special token is
            # located there — confirm against the tokenizer's padding scheme.
            sent_indices = torch.arange(input_data["labels"].shape[0])
            last_non_padded = [
                sent_indices, input_data["labels"].argmax(dim=1)
            ]
            input_data["labels"][last_non_padded] = config.label_pad_id
            # Assign label_pad to <s> token
            input_data["labels"][:, 0] = config.label_pad_id

            # sent_offset tracks each sentence's character offset into the
            # document so annotation offsets can be matched per sentence
            sent_offset = 0
            for sent_idx, _ in enumerate(input_data["input_ids"]):
                features.append(
                    TimexInputFeatures.from_sentence(input_data, sent_idx,
                                                     sent_offset, annotations,
                                                     config))
                sent_offset += len(input_raw[sent_idx])

            doc_indices.append((text_subdir_path, doc_index, len(features)))
        return cls(doc_indices, features)
Example #31
0
from shutil import copyfile
import anafora
import os

# 80/20 split of annotated documents: every fifth document directory goes to
# the test set, with its XML annotations and plain-text inputs routed to
# separate label/input directories; everything else goes to the new train set.
split_files_path = 'path/to/train-all-data'
split_train_path = 'path/to/train-new-data'
split_test_input_path = 'path/to/test-input'
split_test_label_path = 'path/to/test-label'
walked = anafora.walk(split_files_path,
                      xml_name_regex=".*((?<![.].{3})|[.]xml)$")
for doc_index, (doc_subdir, doc_name, doc_file_names) in enumerate(walked):
    is_test_doc = doc_index % 5 == 4
    for file_name in doc_file_names:
        source_path = os.path.join(split_files_path, doc_subdir, file_name)
        if not is_test_doc:
            target_root = split_train_path
        elif file_name.endswith("xml"):
            target_root = split_test_label_path
        else:
            target_root = split_test_input_path
        target_dir = os.path.join(target_root, doc_subdir)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir, 0o0755)
        copyfile(source_path, os.path.join(target_dir, file_name))
        for l in f:
            predict_modality.append(int(l.strip()))

    labelidx = 0

    for dir_path, dir_names, file_names in os.walk(input_text_dir):

        pbar = ProgressBar(maxval=len(file_names)).start()

        for i, fn in enumerate(sorted(file_names)):

            time.sleep(0.01)
            pbar.update(i + 1)

            # this for to make consistence
            for sub_dir, text_name, xml_names in anafora.walk(
                    os.path.join(ann_dir, fn)):

                for xml_name in xml_names:

                    if "Temporal" not in xml_name:
                        continue

                    xml_path = os.path.join(ann_dir, text_name, xml_name)
                    data = anafora.AnaforaData.from_file(xml_path)

                    positive_span_label_map = {}

                    for annotation in data.annotations:
                        if annotation.type == 'EVENT':

                            startoffset = annotation.spans[0][0]
Example #33
0
def score_dirs(reference_dir,
               predicted_dir,
               xml_name_regex="[.]xml$",
               text_dir=None,
               include=None,
               exclude=None,
               scores_type=Scores,
               annotation_wrapper=None):
    """
    :param string reference_dir: directory containing reference ("gold standard") Anafora XML directories
    :param string predicted_dir: directory containing predicted (system-generated) Anafora XML directories
    :param xml_name_regex: regular expression matching the files to be compared
    :param string text_dir: directory containing the raw texts corresponding to the Anafora XML
        (if None, texts are assumed to be in the reference dir)
    :param set include: types of annotations to include (others will be excluded); may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param set exclude: types of annotations to exclude; may be type names, (type-name, property-name) tuples,
        (type-name, property-name, property-value) tuples
    :param type scores_type: type for calculating matches between predictions and reference
    :param type annotation_wrapper: wrapper object to apply to AnaforaAnnotations
    :return iter: an iterator of (file-name, name-to-scores) where name-to-scores is a mapping from
        (annotation type[, property name[, property value]]) to a Scores object
    """

    # walks through the reference Anafora XML directories, scoring each and adding those to the overall scores
    for sub_dir, text_name, reference_xml_names in anafora.walk(
            reference_dir, xml_name_regex):

        # load the reference data from its Anafora XML
        try:
            [reference_xml_name] = reference_xml_names
        except ValueError:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("expected one reference file for %s, found %s",
                            text_name, reference_xml_names)
            if not reference_xml_names:
                continue
            reference_xml_name = reference_xml_names[0]
        reference_xml_path = os.path.join(reference_dir, sub_dir,
                                          reference_xml_name)
        reference_data = _load(reference_xml_path)

        # check for self-references in the annotations, which cause equality and hashing to fail
        self_reference = reference_data.annotations.find_self_referential()
        if self_reference is not None:
            msg = "skipping reference file %s with self-referential annotation %s"
            logging.warning(msg, reference_xml_path, self_reference.id)
            continue

        # find and load the corresponding predicted data from its Anafora XML
        predicted_xml_glob = os.path.join(predicted_dir, sub_dir,
                                          text_name + "*.xml")
        predicted_xml_paths = [
            f for f in glob.glob(predicted_xml_glob)
            if re.search(xml_name_regex, f) is not None
        ]
        try:
            [predicted_xml_path] = predicted_xml_paths
            predicted_data = _load(predicted_xml_path)
        except ValueError:
            logging.warning("expected one predicted file at %s, found %s",
                            predicted_xml_glob, predicted_xml_paths)
            if not predicted_xml_paths:
                # no predictions: score against an empty annotation set
                predicted_xml_path = None
                predicted_data = anafora.AnaforaData()
            else:
                predicted_xml_path = predicted_xml_paths[0]
                predicted_data = _load(predicted_xml_path)

        # check for self-references in the annotations, which cause equality and hashing to fail
        self_reference = predicted_data.annotations.find_self_referential()
        if self_reference is not None:
            msg = "skipping predicted file %s with self-referential annotation %s"
            logging.warning(msg, predicted_xml_path, self_reference.id)
            predicted_data = anafora.AnaforaData()

        # determine the path for the raw text source file
        if text_dir is None:
            text_path = os.path.join(reference_dir, sub_dir, text_name)
        else:
            text_path = os.path.join(text_dir, text_name)

        # if no raw text was found, then asking for the text of an annotation is an error
        if not os.path.exists(text_path) or not os.path.isfile(text_path):

            def _span_text(_):
                raise RuntimeError(
                    "no text file found at {0}".format(text_path))

        # otherwise, the text of an annotation can be extracted based on its spans
        else:
            with open(text_path) as text_file:
                text = text_file.read()

            # spans may be arbitrarily nested tuples; yield the (start, end)
            # integer pairs at the leaves
            def _flatten(items):
                if isinstance(items, tuple) and isinstance(items[0], int):
                    yield items
                else:
                    for item in items:
                        for flattened_items in _flatten(item):
                            yield flattened_items

            def _span_text(spans):
                return "...".join(text[start:end]
                                  for start, end in _flatten(spans))

        # score this data and update the overall scores
        named_scores = score_data(reference_data,
                                  predicted_data,
                                  include,
                                  exclude,
                                  scores_type=scores_type,
                                  annotation_wrapper=annotation_wrapper)
        for name, scores in named_scores.items():

            # if there were some predictions, and if we're using scores that keep track of errors, log the errors
            if predicted_xml_paths:
                for annotation, message in getattr(scores, "errors", []):
                    # fixed mismatched quote in the log format string
                    logging.debug('%s: %s: "%s" %s', text_name, message,
                                  _span_text(annotation.spans), annotation)

        # generate the file name and the resulting scores
        yield text_name, named_scores
Example #34
0
def score_annotators(anafora_dir,
                     xml_name_regex,
                     include=None,
                     exclude=None,
                     scores_type=Scores,
                     annotation_wrapper=None):
    """
    :param anafora_dir: directory containing Anafora XML directories
    :param xml_name_regex: regular expression matching the annotator files to be compared
    :param include: types of annotations to include (others will be excluded); may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param set exclude: types of annotations to exclude; may be type names, (type-name, property-name) tuples,
        (type-name, property-name, property-value) tuples
    :param type scores_type: type for calculating matches between predictions and reference
    :param type annotation_wrapper: wrapper object to apply to AnaforaAnnotations
    :return iter: an iterator of (file-name, name-to-scores) where name-to-scores is a mapping from
        (annotation type[, property name[, property value]]) to a Scores object
    """

    # pattern for extracting the annotator name from the Anafora XML file name
    annotator_name_regex = "([^.]*)[.][^.]*[.]xml$"

    # function for getting a canonical prefix corresponding to a pair of annotators
    def make_prefix(annotators):
        return "{0}-vs-{1}".format(*sorted(annotators))

    # walks through the Anafora XML directories, scoring each and adding those to the overall scores
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir,
                                                      xml_name_regex):

        # load the data from each Anafora XML file
        annotator_data = []
        for xml_name in xml_names:

            # ignore in-progress annotations and automatic pre-annotations
            if '.inprogress.' in xml_name or '.preannotation.' in xml_name:
                continue

            # ignore empty files
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            if os.stat(xml_path).st_size == 0:
                continue

            # load the data and add it to the list
            data = _load(xml_path)
            annotator_name = re.search(annotator_name_regex, xml_name).group(1)
            annotator_data.append((annotator_name, data))

        # at least 2 annotators are needed for annotator agreement
        if len(annotator_data) < 2:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("%s: found fewer than 2 annotators: %s", text_name,
                            xml_names)
            continue

        # pair each annotator with each other annotator
        # (defaultdict can call the scores type directly; no lambda needed)
        annotator_named_scores = collections.defaultdict(scores_type)
        for i in range(len(annotator_data)):
            annotator1, data1 = annotator_data[i]
            for j in range(i + 1, len(annotator_data)):
                annotator2, data2 = annotator_data[j]

                # make a prefix for this specific pair of annotators
                prefix = make_prefix([annotator1, annotator2])

                # make a prefix where non-gold annotators are just called "annotator"
                general_prefix = make_prefix(a if a == "gold" else "annotator"
                                             for a in [annotator1, annotator2])

                # perform the comparison of the two annotation sets and update the overall scores
                named_scores = score_data(
                    data1,
                    data2,
                    include,
                    exclude,
                    scores_type=scores_type,
                    annotation_wrapper=annotation_wrapper)

                # add annotators as prefixes
                for name, scores in named_scores.items():
                    if not isinstance(name, tuple):
                        name = name,
                    annotator_named_scores[(prefix, ) + name].update(scores)
                    annotator_named_scores[(general_prefix, ) +
                                           name].update(scores)

        # generate the filename and the resulting scores
        yield text_name, annotator_named_scores
        input_text_dir = os.path.join(plain_dir, "test")

        ann_dir = os.path.join(base_dir, 'annotation/coloncancer/Test')
            
        for dir_path, dir_names, file_names in os.walk(input_text_dir):

            pbar = ProgressBar(maxval=len(file_names)).start()

            for i, fn in enumerate(sorted(file_names)):

                time.sleep(0.01)
                pbar.update(i + 1)

                # this for to make consistence
                for sub_dir, text_name, xml_names in anafora.walk(os.path.join(ann_dir, fn)):

                    for xml_name in xml_names:

                        if "Temporal" not in xml_name:
                            continue

                        xml_path = os.path.join(ann_dir, text_name, xml_name)
                        data = anafora.AnaforaData.from_file(xml_path)

                        positive_span_label_map={}

                        for annotation in data.annotations:
                            if annotation.type == 'EVENT':

                                startoffset = annotation.spans[0][0]
def preprocess_data_torch(input_text_dir, input_ann_dir, outDir, window_size, input_name, input_type, Shuffle):
    """Write a CSV of (label, character-window features) rows for EVENT spans.

    Walks the plain-text documents in input_text_dir, pairs each with its
    Temporal Anafora annotations from input_ann_dir, labels annotated EVENT
    spans with the requested attribute (type/polarity/degree/modality), and
    labels all remaining candidate spans with that attribute's negative class.

    This is a Python 3 port of Python 2-only code: the original used print
    statements and concatenated dict.keys() views with '+'.

    :param input_text_dir: directory of raw text files
    :param input_ann_dir: directory of Anafora annotation sub-directories
    :param outDir: directory where the output CSV is written
    :param window_size: character window size passed to feature_generation_1
    :param input_name: attribute to label ("type"/"polarity"/"degree"/"modality")
    :param input_type: dataset tag used in the output file name
    :param Shuffle: if True, shuffle the spans before writing
    """
    maxchar = 0
    num_doc = 0

    with open(os.path.join(outDir, input_name + "_" + input_type + ".csv"), 'w') as csvf:

        for dir_path, dir_names, file_names in os.walk(input_text_dir):

            pbar = ProgressBar(maxval=len(file_names)).start()

            for i, fn in enumerate(sorted(file_names)):

                time.sleep(0.01)
                pbar.update(i + 1)

                for sub_dir, text_name, xml_names in anafora.walk(os.path.join(input_ann_dir, fn)):

                    for xml_name in xml_names:

                        if "Temporal" not in xml_name:
                            continue

                        num_doc += 1

                        xml_path = os.path.join(input_ann_dir, text_name, xml_name)
                        data = anafora.AnaforaData.from_file(xml_path)

                        with open(os.path.join(input_text_dir, fn), 'r') as f:
                            content = f.read()

                        positive_span_label_map = {}

                        for annotation in data.annotations:
                            if annotation.type == 'EVENT':

                                # only the first span of each EVENT is used
                                startoffset = annotation.spans[0][0]
                                endoffset = annotation.spans[0][1]

                                # plain indexing instead of calling __getitem__
                                pros = {}
                                for pro_name in annotation.properties:
                                    pros[pro_name] = annotation.properties[pro_name]

                                if input_name == "type":
                                    label = Type[pros["Type"]]
                                elif input_name == "polarity":
                                    label = Polarity[pros["Polarity"]]
                                elif input_name == "degree":
                                    label = Degree[pros["Degree"]]
                                elif input_name == "modality":
                                    label = ContextualModality[pros["ContextualModality"]]

                                positive_span_label_map[(startoffset, endoffset)] = label

                        all_spans = content2span(content)

                        # unannotated candidate spans get the negative class
                        negative_span_label_map = {}
                        for span in all_spans:
                            if span not in positive_span_label_map:
                                if input_name == "type":
                                    negative_span_label_map[span] = "4"
                                elif input_name == "polarity":
                                    negative_span_label_map[span] = "3"
                                elif input_name == "degree":
                                    negative_span_label_map[span] = "4"
                                elif input_name == "modality":
                                    negative_span_label_map[span] = "5"

                        # Python 3: dict views don't support '+', so make lists
                        merged_spans = list(positive_span_label_map) + list(negative_span_label_map)

                        if Shuffle:
                            shuffle(merged_spans)

                        for span in merged_spans:

                            feats = feature_generation_1(content, span[0], span[1], window_size)

                            if maxchar < len(feats):
                                maxchar = len(feats)

                            if span in positive_span_label_map:
                                label = positive_span_label_map[span]

                            elif span in negative_span_label_map:
                                label = negative_span_label_map[span]

                            label = "\"" + label + "\""
                            feats = "\"" + feats + "\""

                            csvf.write(label + "," + feats + "\n")

            pbar.finish()

    print("max char is: " + str(maxchar))
    print("num_doc is: " + str(num_doc))
Example #37
0
        dest="relation_target_property_name",
        help="The name of the property on the relation annotation that should "
        + "point to the target annotation.")
    relations_to_closest_parser.add_argument(
        "-ro",
        "--relation-other",
        metavar="NAME=VALUE",
        nargs='+',
        type=_pair,
        dest="relation_other_properties",
        help="Other properties that should be added to the relation annotation."
    )

    args = parser.parse_args()
    kwargs = vars(args)
    func = kwargs.pop("func")
    input_dir = kwargs.pop('input_dir')
    xml_name_regex = kwargs.pop('xml_name_regex')
    output_dir = kwargs.pop('output_dir')

    for sub_dir, _, xml_file_names in anafora.walk(input_dir, xml_name_regex):
        for xml_file_name in xml_file_names:
            input_data = anafora.AnaforaData.from_file(
                os.path.join(input_dir, sub_dir, xml_file_name))
            func(input_data, **kwargs)
            output_sub_dir = os.path.join(output_dir, sub_dir)
            if not os.path.exists(output_sub_dir):
                os.makedirs(output_sub_dir)
            input_data.to_file(os.path.join(output_dir, sub_dir,
                                            xml_file_name))
Example #38
0
def main(args):
    """Run temporal annotation over a directory of raw text files.

    Segments each document into sentences (RuSH), tokenizes them, posts the
    token stream to a temporal REST service, and writes the returned EVENT and
    TIMEX3 annotations out as Anafora XML mirroring the input directory layout.

    :param args: [<input directory>, <rest host>, <output directory>]
    """
    if len(args) < 3:
        sys.stderr.write(
            "Required arguments: <input directory> <rest host> <output directory>\n"
        )
        sys.exit(-1)

    hostname = args[1]

    # initialize rest server
    init_url = 'http://%s:8000/temporal/initialize' % hostname
    process_url = 'http://%s:8000/temporal/process' % hostname

    # sentence segmenter
    rush = RuSH('conf/rush_rules.tsv')
    # tokenizer
    # tokenizer = TreebankWordTokenizer()

    r = requests.post(init_url)
    if r.status_code != 200:
        sys.stderr.write('Error: rest init call was not successful\n')
        sys.exit(-1)

    for sub_dir, text_name, xml_names in anafora.walk(args[0], xml_name_regex):
        print("Processing filename: %s" % (text_name))
        if len(xml_names) > 1:
            sys.stderr.write(
                'There were multiple valid xml files for file %s' %
                (text_name))
            sys.exit(-1)
        # guard the zero-match case explicitly; xml_names[0] would otherwise
        # raise an opaque IndexError
        if not xml_names:
            sys.stderr.write(
                'There were no valid xml files for file %s' % (text_name))
            sys.exit(-1)
        xml_name = xml_names[0]

        with open(os.path.join(args[0], sub_dir, text_name)) as f:
            text = f.read()

        sentences = rush.segToSentenceSpans(text)
        sent_tokens = []

        for sentence in sentences:
            sent_txt = text[sentence.begin:sentence.end]
            sent_tokens.append(tokenize(sent_txt))

        r = requests.post(process_url, json={'sent_tokens': sent_tokens})
        if r.status_code != 200:
            sys.stderr.write('Error: rest call was not successful\n')
            sys.exit(-1)

        # NOTE: renamed from `json` to avoid shadowing the json module name
        results = r.json()
        anafora_data = AnaforaData()
        cur_id = 0

        for sent_ind, sentence in enumerate(sentences):
            sent_txt = text[sentence.begin:sentence.end]
            sent_events = results['events'][sent_ind]
            sent_timexes = results['timexes'][sent_ind]
            try:
                # map token indices back to character offsets in the sentence
                token_spans = align_tokens(sent_tokens[sent_ind], sent_txt)
            except Exception as e:
                sys.stderr.write(
                    'In document %s, error \n%s\n processing sentence:\n*****\n%s\n******\n'
                    % (text_name, str(e), sent_txt))
                sys.exit(-1)

            for event in sent_events:
                begin_token_ind = event['begin']
                end_token_ind = event['end']
                dtr = event['dtr']
                # token offsets are sentence-relative; add sentence.begin to
                # convert to document-level character offsets
                event_start_offset = token_spans[begin_token_ind][
                    0] + sentence.begin
                event_end_offset = token_spans[end_token_ind][
                    1] + sentence.begin
                event_text = text[event_start_offset:event_end_offset]
                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((event_start_offset, event_end_offset), )
                annot.type = "EVENT"
                annot.properties['DocTimeRel'] = dtr
                anafora_data.annotations.append(annot)

                #print("Found event %s" % (event_text))

            for timex in sent_timexes:
                begin_token_ind = timex['begin']
                end_token_ind = timex['end']
                time_class = timex['timeClass']
                timex_start_offset = token_spans[begin_token_ind][
                    0] + sentence.begin
                timex_end_offset = token_spans[end_token_ind][
                    1] + sentence.begin
                timex_text = text[timex_start_offset:timex_end_offset]

                # create anafora entry
                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((timex_start_offset, timex_end_offset), )
                annot.type = "TIMEX3"
                annot.properties['Class'] = time_class
                anafora_data.annotations.append(annot)

                #print("Found timex %s" % (timex_text))

        #break
        anafora_data.indent()
        os.makedirs(os.path.join(args[2], sub_dir), exist_ok=True)
        anafora_data.to_file(os.path.join(args[2], sub_dir, xml_name))
Example #39
0
def score_dirs(reference_dir, predicted_dir, xml_name_regex="[.]xml$", text_dir=None,
               include=None, exclude=None, scores_type=Scores, spans_type=None):
    """
    :param string reference_dir: directory containing reference ("gold standard") Anafora XML directories
    :param string predicted_dir: directory containing predicted (system-generated) Anafora XML directories
    :param xml_name_regex: regular expression matching the files to be compared
    :param string text_dir: directory containing the raw texts corresponding to the Anafora XML
        (if None, texts are assumed to be in the reference dir)
    :param set include: types of annotations to include (others will be excluded); may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param set exclude: types of annotations to exclude; may be type names, (type-name, property-name) tuples,
        (type-name, property-name, property-value) tuples
    :param type scores_type: type for calculating matches between predictions and reference
    :param type spans_type: wrapper object to apply to annotation spans
    :return iter: an iterator of (file-name, name-to-scores) where name-to-scores is a mapping from
        (annotation type[, property name[, property value]]) to a Scores object
    """

    # walks through the reference Anafora XML directories, scoring each and adding those to the overall scores
    for sub_dir, text_name, reference_xml_names in anafora.walk(reference_dir, xml_name_regex):

        # load the reference data from its Anafora XML; exactly one file is expected
        try:
            [reference_xml_name] = reference_xml_names
        except ValueError:
            # logging.warn is a deprecated alias; use logging.warning
            logging.warning("expected one reference file for %s, found %s", text_name, reference_xml_names)
            if not reference_xml_names:
                continue
            reference_xml_name = reference_xml_names[0]
        reference_xml_path = os.path.join(reference_dir, sub_dir, reference_xml_name)
        reference_data = _load(reference_xml_path)

        # check for self-references in the annotations, which cause equality and hashing to fail
        self_reference = reference_data.annotations.find_self_referential()
        if self_reference is not None:
            msg = "skipping reference file %s with self-referential annotation %s"
            logging.warning(msg, reference_xml_path, self_reference.id)
            continue

        # find and load the corresponding predicted data from its Anafora XML
        predicted_xml_glob = os.path.join(predicted_dir, sub_dir, text_name + "*.xml")
        predicted_xml_paths = [f for f in glob.glob(predicted_xml_glob) if re.search(xml_name_regex, f) is not None]
        try:
            [predicted_xml_path] = predicted_xml_paths
            predicted_data = _load(predicted_xml_path)
        except ValueError:
            logging.warning("expected one predicted file at %s, found %s", predicted_xml_glob, predicted_xml_paths)
            if not predicted_xml_paths:
                # no prediction: score against an empty annotation set
                predicted_xml_path = None
                predicted_data = anafora.AnaforaData()
            else:
                predicted_xml_path = predicted_xml_paths[0]
                predicted_data = _load(predicted_xml_path)

        # check for self-references in the annotations, which cause equality and hashing to fail
        self_reference = predicted_data.annotations.find_self_referential()
        if self_reference is not None:
            msg = "skipping predicted file %s with self-referential annotation %s"
            logging.warning(msg, predicted_xml_path, self_reference.id)
            predicted_data = anafora.AnaforaData()

        # determine the path for the raw text source file
        if text_dir is None:
            text_path = os.path.join(reference_dir, sub_dir, text_name)
        else:
            text_path = os.path.join(text_dir, text_name)

        # if no raw text was found, then asking for the text of an annotation is an error
        if not os.path.exists(text_path) or not os.path.isfile(text_path):
            def _span_text(_):
                raise RuntimeError("no text file found at {0}".format(text_path))

        # otherwise, the text of an annotation can be extracted based on its spans
        else:
            with open(text_path) as text_file:
                text = text_file.read()

            # spans may be arbitrarily nested tuples; yield the (start, end) leaves
            def _flatten(items):
                if isinstance(items, tuple) and isinstance(items[0], int):
                    yield items
                else:
                    for item in items:
                        for flattened_items in _flatten(item):
                            yield flattened_items

            def _span_text(spans):
                return "...".join(text[start:end] for start, end in _flatten(spans))

        # score this data and update the overall scores
        named_scores = score_data(reference_data, predicted_data, include, exclude,
                                  scores_type=scores_type, spans_type=spans_type)
        for name, scores in named_scores.items():

            # if there were some predictions, and if we're using scores that keep track of errors, log the errors
            if predicted_xml_paths:
                for annotation, message in getattr(scores, "errors", []):
                    spans, _, _ = annotation
                    # fixed unbalanced quote in the format string (was ...%s")
                    logging.debug('%s: %s: "%s" %s', text_name, message, _span_text(spans), annotation)

        # generate the file name and the resulting scores
        yield text_name, named_scores
Example #40
0
def score_annotators(anafora_dir, xml_name_regex, include=None, exclude=None,
                     scores_type=Scores, spans_type=None):
    """
    :param anafora_dir: directory containing Anafora XML directories
    :param xml_name_regex: regular expression matching the annotator files to be compared
    :param include: types of annotations to include (others will be excluded); may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param set exclude: types of annotations to exclude; may be type names, (type-name, property-name) tuples,
        (type-name, property-name, property-value) tuples
    :param type scores_type: type for calculating matches between predictions and reference
    :param type spans_type: wrapper object to apply to annotation spans
    :return iter: an iterator of (file-name, name-to-scores) where name-to-scores is a mapping from
        (annotation type[, property name[, property value]]) to a Scores object
    """

    # pattern for extracting the annotator name from the Anafora XML file name
    annotator_name_regex = "([^.]*)[.][^.]*[.]xml$"

    # function for getting a canonical prefix corresponding to a pair of annotators
    # (sorted so that the prefix is order-independent)
    def make_prefix(annotators):
        return "{0}-vs-{1}".format(*sorted(annotators))

    # walks through the Anafora XML directories, scoring each and adding those to the overall scores
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir, xml_name_regex):

        # load the data from each Anafora XML file
        annotator_data = []
        for xml_name in xml_names:

            # ignore in-progress annotations and automatic pre-annotations
            if '.inprogress.' in xml_name or '.preannotation.' in xml_name:
                continue

            # ignore empty files
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            if os.stat(xml_path).st_size == 0:
                continue

            # load the data and add it to the list
            data = _load(xml_path)
            annotator_name = re.search(annotator_name_regex, xml_name).group(1)
            annotator_data.append((annotator_name, data))

        # at least 2 annotators are needed for annotator agreement
        if len(annotator_data) < 2:
            # logging.warn is a deprecated alias; use logging.warning
            logging.warning("%s: found fewer than 2 annotators: %s", text_name, xml_names)
            continue

        # pair each annotator with each other annotator
        # (defaultdict can take the factory directly; no lambda needed)
        annotator_named_scores = collections.defaultdict(scores_type)
        for i in range(len(annotator_data)):
            annotator1, data1 = annotator_data[i]
            for j in range(i + 1, len(annotator_data)):
                annotator2, data2 = annotator_data[j]

                # make a prefix for this specific pair of annotators
                prefix = make_prefix([annotator1, annotator2])

                # make a prefix where non-gold annotators are just called "annotator"
                general_prefix = make_prefix(
                    a if a == "gold" else "annotator" for a in [annotator1, annotator2])

                # perform the comparison of the two annotation sets and update the overall scores
                named_scores = score_data(data1, data2, include, exclude,
                                          scores_type=scores_type, spans_type=spans_type)

                # add annotators as prefixes
                for name, scores in named_scores.items():
                    if not isinstance(name, tuple):
                        name = name,
                    annotator_named_scores[(prefix,) + name].update(scores)
                    annotator_named_scores[(general_prefix,) + name].update(scores)

        # generate the filename and the resulting scores
        yield text_name, annotator_named_scores
def preprocess_data_lasagne(input_ann_dir, input_text_dir, outDir, window_size=3, num_feats=2, Shuffle = False):

    ext_positive = 0
    ext_negative=0

    with open(os.path.join(outDir, "feature.toks"), 'w') as g_feature,\
        open(os.path.join(outDir, "label.txt"), 'w') as g_label:

        g_feature.write(str(num_feats)+"\t"+str(window_size)+"\n")

        for dir_path, dir_names, file_names in os.walk(input_text_dir):

            pbar = ProgressBar(maxval=len(file_names)).start()

            for i, fn in enumerate(sorted(file_names)):

                time.sleep(0.01)
                pbar.update(i + 1)

                for sub_dir, text_name, xml_names in anafora.walk(os.path.join(input_ann_dir, fn)):

                    for xml_name in xml_names:

                        if "Temporal" not in xml_name:
                            continue

                        #print fn

                        xml_path = os.path.join(input_ann_dir, text_name, xml_name)
                        data = anafora.AnaforaData.from_file(xml_path)

                        positive_span_label_map={}

                        for annotation in data.annotations:
                            if annotation.type == 'EVENT':

                                startoffset = annotation.spans[0][0]
                                endoffset = annotation.spans[0][1]

                                properties = annotation.properties
                                pros = {}
                                for pro_name in properties:
                                    pro_val = properties.__getitem__(pro_name)
                                    pros[pro_name] = pro_val

                                positive_span_label_map[(startoffset,endoffset)] = "1"

                        with open(os.path.join(input_text_dir, fn), 'r') as f:
                            content = f.read()

                        all_spans = content2span(content)

                        negative_span_label_map={}
                        for span in all_spans:
                            if span not in positive_span_label_map:
                                negative_span_label_map[span] = "0"

                        merged_spans = positive_span_label_map.keys() + negative_span_label_map.keys()

                        if Shuffle:
                            shuffle(merged_spans)

                        for span in merged_spans:

                            if span not in positive_span_label_map:
                                ext_negative += 1
                                label = negative_span_label_map[span]
                            else:
                                ext_positive += 1
                                label = positive_span_label_map[span]

                            if num_feats == 2:
                                feat = feature_generation_2(content, span[0], span[1], window_size)
                            elif num_feats == 3:
                                feat = feature_generation_3(content, span[0], span[1], window_size)

                            seqlen = 2*window_size+1

                            toks_a = feat.rstrip('\n').split()
                            assert len(toks_a) == seqlen*num_feats, "wrong :"+a 

                            g_feature.write(feat+"\n")
                            g_label.write(label+"\n")

            pbar.finish()

    print "Extract positive events is %d"%ext_positive
    print "Extract negative events is %d"%ext_negative