Example #1
    def _read(self, file_path_list):
        source_copy = self.source_copy
        for lang, file_path in file_path_list:
            try:
                u_pos = self.convert_postags(lang)
            except Exception:
                # No POS-tag mapping available for this language.
                u_pos = None
            # if `file_path` is a URL, redirect to the cache
            file_path = cached_path(file_path)
            logger.info("Reading instances from lines in file at: %s %s %s",
                        lang, file_path, self.split)
            for amr in AMRIO.read(file_path,
                                  lang=lang,
                                  universal_postags=self.universal_postags,
                                  postag_map=u_pos):
                try:
                    yield self.text_to_instance(amr, lang, source_copy,
                                                self.split)
                except Exception:
                    # Skip malformed instances during training, but never
                    # silently drop test data.
                    if self.split == "test":
                        raise
        self.report_coverage()
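
The split-dependent error policy above generalizes; here is a minimal runnable sketch of the same idea, with `safe_instances` and `convert` as hypothetical names:

def safe_instances(items, convert, split):
    # Skip items that fail conversion during training, but never silently
    # drop test data (mirrors the policy in _read above).
    for item in items:
        try:
            yield convert(item)
        except Exception:
            if split == 'test':
                raise

print(list(safe_instances(['1', 'x', '3'], int, split='train')))  # [1, 3]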
Example #2
def restore_file(self, file_path):
    for amr in AMRIO.read(file_path):
        try:
            self.restore_instance(amr)
        except Exception:
            # Fall back to the unrestored AMR if restoration fails.
            pass
        yield amr
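
A hedged usage sketch: since restore_file is a generator, it composes directly with the dump pattern seen in Examples #4 and #7. The restorer object and file paths are assumptions:

with open('data/amr/test.txt.restored', 'w', encoding='utf-8') as f:
    for amr in restorer.restore_file('data/amr/test.txt'):
        AMRIO.dump([amr], f)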
Example #3
def recategorize_file(self, file_path):
    for i, amr in enumerate(AMRIO.read(file_path), 1):
        self.recategorize_graph(amr)
        yield amr
        if i % 1000 == 0:
            logger.info('Processed {} examples.'.format(i))
    logger.info('Done.\n')
Example #4
    def annotate_file(self, in_path, out):
        with open(out, 'w', encoding='utf-8') as f:
            for i, amr in enumerate(AMRIO.read(in_path)):
                if i % 1000 == 0:
                    logger.info('{} processed.'.format(i))

                sentence = amr.sentence
                if self.lang == "it":
                    # Strip the '[ ... ]' placeholder before running Tint.
                    annotation = self.tint_annotate(sentence.replace("[ ... ]", ""))
                else:
                    annotation = self.stanza_annotate(sentence)
                amr.tokens = annotation['tokens']
                amr.lemmas = annotation['lemmas']
                amr.pos_tags = annotation['pos_tags']
                amr.ner_tags = annotation['ner_tags']
                amr.abstract_map = {}
                AMRIO.dump([amr], f)
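
The annotation dict consumed above is assumed to hold four parallel lists. A hedged sketch of a stanza_annotate that satisfies that contract, using the real stanza API (the pipeline configuration is an assumption):

import stanza

nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner')

def stanza_annotate(sentence):
    doc = nlp(sentence)
    tokens, lemmas, pos_tags, ner_tags = [], [], [], []
    for sent in doc.sentences:
        for word in sent.words:
            tokens.append(word.text)
            lemmas.append(word.lemma)
            pos_tags.append(word.upos)
        # Note: with multi-word tokens, words and tokens can differ in length.
        for token in sent.tokens:
            ner_tags.append(token.ner)
    return {'tokens': tokens, 'lemmas': lemmas,
            'pos_tags': pos_tags, 'ner_tags': ner_tags}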
Example #5
def _update_counter_from_train_files(self, amr_train_files, base_freq=1):
    logger.info('Updating (lemma, frame) counter from AMR train files.')
    for file_path in amr_train_files:
        for amr in AMRIO.read(file_path):
            for node in amr.graph.get_nodes():
                for _, frame in node.get_frame_attributes():
                    # Count both directions: lemma -> frame and frame -> lemma.
                    frame_lemma = re.sub(WORDSENSE_RE, '', frame)
                    self._update_counter(self.lemma_frame_counter, frame_lemma, frame, base_freq)
                    self._update_counter(self.frame_lemma_counter, frame, frame_lemma, base_freq)
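
Once populated, the counters support lookups such as "most frequent frame for a lemma". A minimal sketch, assuming the layout is a dict mapping lemma to a Counter over frames:

from collections import Counter

lemma_frame_counter = {'run': Counter({'run-01': 12, 'run-02': 3})}

def most_frequent_frame(lemma):
    counter = lemma_frame_counter.get(lemma)
    return counter.most_common(1)[0][0] if counter else None

print(most_frequent_frame('run'))  # -> 'run-01'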
Example #6
def _get_senseless_node_counter(amr_train_files):
    logger.info('Building the senseless node counter.')
    senseless_nodes = []
    for amr_file in amr_train_files:
        for amr in AMRIO.read(amr_file):
            for node in amr.graph.get_nodes():
                for _, value in node.get_senseless_attributes():
                    senseless_nodes.append(value)
    return Counter(senseless_nodes)
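
The frame/senseless split in Examples #5 and #6 hinges on a word-sense suffix. A hedged sketch of the distinction (the exact WORDSENSE_RE pattern is an assumption):

import re

WORDSENSE_RE = re.compile(r'-\d\d$')  # assumed: matches '-01', '-02', ...

print(bool(WORDSENSE_RE.search('run-01')))  # True  -> frame attribute
print(bool(WORDSENSE_RE.search('boy')))     # False -> senseless attribute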
Example #7
    def read_file_gold_amr(self, lang_sentences):
        with open(self.dump_dir + '{}_{}.txt'.format(self.split, self.lang),
                  'w',
                  encoding='utf-8') as f:
            for i, amr in enumerate(AMRIO.read(self.in_path)):
                if i % 1000 == 0:
                    logger.info('{} processed.'.format(i))

                sentence = amr.sentence
                parallel_sentence = lang_sentences[i]

                # Swap in the parallel sentence, clear the English
                # annotations, and keep the original English sentence
                # in a '# ::tok-en' comment.
                amr.sentence = parallel_sentence
                amr.tokens = None
                amr.lemmas = None
                amr.pos_tags = None
                amr.ner_tags = None
                amr.misc = ['# ::tok-en {}'.format(sentence)]
                amr.abstract_map = {}
                AMRIO.dump([amr], f)
Example #8
def dump_spotlight_wiki(self, file_path):
    sent_map = {}
    for i, amr in tqdm(enumerate(AMRIO.read(file_path), 1)):
        if i % 20 == 0:
            print('+', end='')
        sent = amr.sentence
        wiki = self.spotlight_wiki_docker(sent, port=self.spotlight_port)
        sent_map[sent] = wiki
        # sleep(0.1)  # uncomment to throttle requests to the Spotlight service
    # Note: `args` is not a parameter here; it relies on a module-level
    # argparse namespace (see the script examples below).
    with open(os.path.join(self.util_dir, args.spotlight_wiki), 'w', encoding='utf-8') as f:
        json.dump(sent_map, f)
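
A hedged sketch of reading the dumped sentence-to-wiki map back; the directory and file name are assumptions mirroring the dump above:

import json
import os

util_dir = 'data/utils'                  # assumption
spotlight_wiki = 'spotlight_wiki.json'   # assumption

with open(os.path.join(util_dir, spotlight_wiki), encoding='utf-8') as f:
    sent_map = json.load(f)
print(len(sent_map), 'sentences wikified')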
Example #9
def expand_file(self, file_path):
    for amr in AMRIO.read(file_path, lang=self.lang,
                          universal_postags=self.u_pos,
                          postag_map=self.postag_map):
        self.expand_graph(amr)
        yield amr
    self.print_stats()
Example #10
def wikify_file(self, file_path, lang="en"):
    for amr in AMRIO.read(file_path, lang=lang):
        self.wikify_graph(amr)
        yield amr
Example #11
def read(self, file_path):
    # `self(amr)` invokes __call__, applying this processor to each AMR.
    for amr in AMRIO.read(file_path):
        yield self(amr)
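
A minimal self-contained sketch of the callable-processor pattern above, with a toy processor standing in for the real one:

class UppercaseProcessor:
    def __call__(self, text):
        # A real processor would transform an AMR here.
        return text.upper()

    def read(self, lines):
        for line in lines:
            yield self(line)

print(list(UppercaseProcessor().read(['a', 'b'])))  # ['A', 'B']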
Example #12
    parser.add_argument('files', nargs='+', help='files to annotate.')
    parser.add_argument('--compound_file', default='data/misc/joints.txt')
    parser.add_argument('--processed_sentences', default='')

    args = parser.parse_args()

    annotator = FeatureAnnotator('http://localhost:9000', args.compound_file)
    processed = set()
    if args.processed_sentences:
        # Resume support: skip sentences already annotated in a previous run.
        with open(args.processed_sentences, 'r') as infile:
            for line in infile:
                processed.add(line.rstrip())

    for file_path in args.files:
        logger.info('Processing {}'.format(file_path))
        suffix = '.features' if not processed else '.features.partial'
        with open(file_path + suffix, 'w', encoding='utf-8') as f:
            for i, amr in enumerate(AMRIO.read(file_path), 1):
                if i % 1000 == 0:
                    logger.info('{} processed.'.format(i))
                if amr.sentence in processed:
                    continue
                annotation = annotator(amr.sentence)
                amr.tokens = annotation['tokens']
                amr.lemmas = annotation['lemmas']
                amr.pos_tags = annotation['pos_tags']
                amr.ner_tags = annotation['ner_tags']
                AMRIO.dump([amr], f)
    logger.info('Done!')
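
A hedged sketch of producing the --processed_sentences file (one sentence per line) from the partial output of an earlier run; the paths are assumptions and the import mirrors Example #14:

from xlamr_stog.data.dataset_readers.amr_parsing.io import AMRIO

with open('processed.txt', 'w', encoding='utf-8') as out:
    for amr in AMRIO.read('data/amr/train.txt.features.partial'):
        out.write(amr.sentence + '\n')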
Example #13
def read_translations(self, lang_sentences):
    # Keep only the first AMR seen for each id.
    for amr in AMRIO.read(lang_sentences):
        if amr.id not in self.translations:
            self.translations[amr.id] = amr
Example #14
def replace_911(amr):
    # Rewrite each occurrence of the token '911' as the date tokens '09 11'.
    while True:
        for i, token in enumerate(amr.tokens):
            if token == '911':
                index = i
                break
        else:
            break  # no more occurrences
        amr.replace_span([index], ['09', '11'], ['CD', 'CD'], ['DATE', 'DATE'])


def replace_NT_dollar_abbr(amr):
    # Replace 'NT' in front of '$' with 'Taiwan'.
    for i, token in enumerate(amr.tokens):
        if token == 'NT' and len(amr.tokens) > i + 1 and amr.tokens[i + 1] in (
                '$', 'dollars', 'dollar'):
            amr.replace_span([i], ['Taiwan'], ['NNP'], ['COUNTRY'])
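

The call sites above assume replace_span rewrites aligned token, POS, and NER slices in place. A hedged sketch of those semantics (the real AMR.replace_span is not shown in these examples):

def replace_span_sketch(tokens, pos_tags, ner_tags, indexes, new_tokens, new_pos, new_ner):
    # Replace the contiguous span at `indexes` across all three aligned lists.
    start, end = indexes[0], indexes[-1] + 1
    tokens[start:end] = new_tokens
    pos_tags[start:end] = new_pos
    ner_tags[start:end] = new_ner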


if __name__ == '__main__':
    import argparse
    from xlamr_stog.data.dataset_readers.amr_parsing.io import AMRIO

    parser = argparse.ArgumentParser('input_cleaner.py')
    parser.add_argument('--amr_files', nargs='+', default=[])

    args = parser.parse_args()

    for file_path in args.amr_files:
        with open(file_path + '.input_clean', 'w', encoding='utf-8') as f:
            for amr in AMRIO.read(file_path):
                clean(amr)
                f.write(str(amr) + '\n\n')
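
For reuse outside the CLI, the same cleaning loop can be wrapped in a function; a minimal sketch grounded entirely in the script above:

def clean_file(file_path):
    out_path = file_path + '.input_clean'
    with open(out_path, 'w', encoding='utf-8') as f:
        for amr in AMRIO.read(file_path):
            clean(amr)
            f.write(str(amr) + '\n\n')
    return out_path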