def outputEdgeAlignment(tok_seq, amr, edge_to_toks, tok2rels):
    for edge_index in edge_to_toks:
        edge_label = amr.edges[edge_index].label
        for tok_idx in edge_to_toks[edge_index]:
            logger.writeln("Relation align: align %s to %s" %
                           (tok_seq[tok_idx], edge_label))
            tok2rels[tok_idx].add(edge_index)
Example #2
def parse_flags():
    try:
        argv = FLAGS(sys.argv)  # parse flags
    except gflags.FlagsError as e:
        logger.writeln('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)
    return argv
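For context, here is a minimal, self-contained sketch of how a parse_flags() helper like the one above is typically driven with python-gflags; the --output flag defined here is purely illustrative and not part of the decoder's real flag set.

import sys
import gflags

FLAGS = gflags.FLAGS
gflags.DEFINE_string('output', 'out.txt', 'where to write results')  # hypothetical flag

def parse_flags():
    try:
        argv = FLAGS(sys.argv)  # parse flags; returns the remaining positional arguments
    except gflags.FlagsError as e:
        print '%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS)
        sys.exit(1)
    return argv

if __name__ == '__main__':
    argv = parse_flags()
    print 'output flag: %s' % FLAGS.output
    print 'positional arguments: %s' % argv[1:]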
 def retrieveUnaligned():
     num_nodes = len(amr.nodes)
     for node_idx in range(num_nodes):
         if node_idx in aligned_nodes:
             continue
         freq = amr.getFreq(node_idx)
         curr_node = amr.nodes[node_idx]
         node_str = curr_node.node_str()
         if node_str in role_rel_concepts:
             continue
         if (freq and freq < 100) or amr.is_predicate(curr_node):
             for (idx, word) in enumerate(tok_seq):
                 if idx not in aligned_toks:
                     lem = lem_seq[idx]
                     if similar(node_str, word) or similar(node_str, lem):
                         logger.writeln(
                             "Retrieved concept map: %s, %s ; %s" %
                             (word, lem, node_str))
                         category = "TOKEN"
                         if isNumber(node_str) or isNumber(word):
                             category = "NUMBER"
                         all_alignments[node_idx].append(
                             (idx, idx + 1, node_str, category))
                         aligned_nodes.add(node_idx)
                         aligned_toks.add(idx)
Example #4
def single_worker_decode():
    jobs = get_jobs(FLAGS.do)
    njobs = len(jobs)
    fout = open(FLAGS.output, 'w')
    if FLAGS.output_kbest:
        fkbest = open(FLAGS.kbest_output, 'w')
    totaltime = 0
    joblogger = JobLogger()

    while jobs:
        # finished jobs need to be discarded because jobs save the hypergraph
        job = jobs.pop(0)
        job.run()
        totaltime += job.time
        joblogger.log(job.id)

    if logger.level >= 1 and FLAGS.show_time:
        logger.writeln('total time: %.2f seconds (%.2f seconds/sentence)' %
                       (totaltime, totaltime/njobs))
    joblogger.finish()
    if FLAGS.consensus_training:
        consensus_trainer = ConsensusTrainer(FLAGS.lm_order,
                                             decoding_features,
                                             FLAGS.run_dir,
                                             refs)
        consensus_trainer.optimize()
Example #5
    def dump(self):
        """Rules are sorted by the English side. Remember to call this before
        finishing."""
        if self.parallel:
            name = "%04d.%04d" % (self.parallel[0], self.n_dump)
        else:
            name = "%04d" % self.n_dump
        if logger.level >= 1:
            logger.writeln('dumping %s...' % name)
        self.dumped += len(self.gram)
        lines = []
        for r in self.iter_rules():
            lines.append("%s ||| %s\n" % (' '.join(str(s)
                                                   for s in r.e), str(r)))
        lines.sort()
        outfile = open(os.path.join(self.outputdir, "extract.%s" % name), "w")
        for line in lines:
            outfile.write(line)
        outfile.close()
        if logger.level >= 1:
            logger.writeln('dumped: %s' % self.dumped)

        if FLAGS.accumulate:
            self.gram = {}
        else:
            self.gram = []
        self.n_dump += 1
def align_semeval_sentence(tok_seq, lemma_seq, alignment_seq, amr, verb_list,
                           multi_map):
    node_alignment, _ = initializeAlignment(amr)
    entity_toks = set()
    aligned_toks = set()
    all_alignments = defaultdict(list)
    node_to_toks, temp_aligned = extractNodeMapping(alignment_seq, amr)
    unaligned_set = set(xrange(len(tok_seq))) - temp_aligned
    alignEntities(tok_seq, amr, alignment_seq, entity_toks, aligned_toks,
                  all_alignments, unaligned_set, node_alignment)

    #Verbalization list
    verb_map = defaultdict(set)
    alignVerbalization(tok_seq, lemma_seq, amr, verb_list, all_alignments,
                       verb_map, aligned_toks, node_alignment, multi_map)

    aligned_nodes = set([
        node_idx for (node_idx, aligned) in enumerate(node_alignment)
        if aligned
    ])

    alignOtherConcepts(tok_seq, lemma_seq, amr, aligned_toks, aligned_nodes,
                       node_to_toks, all_alignments, multi_map)

    ##Based on the alignment from node index to spans in the string
    unaligned_set = set(xrange(len(tok_seq))) - aligned_toks
    unaligned_idxs = sorted(list(unaligned_set))
    logger.writeln("Unaligned tokens: %s" %
                   (" ".join([tok_seq[i] for i in unaligned_idxs])))

    unaligned_nodes = amr.unaligned_nodes(aligned_nodes)
    logger.writeln("Unaligned vertices: %s" %
                   " ".join([node.node_str() for node in unaligned_nodes]))

    return all_alignments
Example #7
def timed(l):
    prev = time.time()
    for i, x in enumerate(l, 1):
        if i % FLAGS.time_interval == 0:
            logger.writeln('%s (%s/sec)' % (i, FLAGS.time_interval /
                                            (time.time() - prev)))
            prev = time.time()
        yield x
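A small standalone sketch of how the timed() generator above can be exercised; _StubFlags and _StubLogger are made-up stand-ins for the project's real FLAGS and logger objects.

import time

class _StubFlags(object):          # stand-in for the gflags FLAGS object
    time_interval = 3

class _StubLogger(object):         # stand-in for the project's logger module
    def writeln(self, s=''):
        print s

FLAGS = _StubFlags()
logger = _StubLogger()

def timed(l):                      # same generator as above
    prev = time.time()
    for i, x in enumerate(l, 1):
        if i % FLAGS.time_interval == 0:
            logger.writeln('%s (%s/sec)' % (i, FLAGS.time_interval /
                                            (time.time() - prev)))
            prev = time.time()
        yield x

def slow_items(n):
    for k in xrange(n):
        time.sleep(0.05)           # simulate per-item work
        yield k

for _ in timed(slow_items(10)):    # a rate line is printed every 3 items
    pass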
Example #8
 def test_inside_exp_outside_exp_log(self):
     self.hp.set_semiring(LOGPROB)
     self.hp.set_functions(lambda x: log(x.prob), lambda x: 1, None)
     self.hp.inside()
     self.hp.outside()
     self.hp.inside_exp()
     self.hp.outside_exp()
     logger.writeln(self.hp.dot())
Example #9
def dump_trees(samples, filename):
    logger.writeln('dump trees')
    treefile = TreeFile(filename)
    for s in timed(samples):
        # call this before dumping rules for each sample!
        LEXICAL_WEIGHTER.compute_lexical_weights(s.a)
        treefile.dump(s)
    treefile.close()
Example #10
 def test_inside_exp_outside_exp(self):
     self.hp.set_semiring(INSIDE)
     self.hp.set_functions(lambda x: x.prob, lambda x: 1, None)
     self.hp.inside()
     self.hp.outside()
     self.hp.inside_exp()
     self.hp.outside_exp()
     logger.writeln(self.hp.dot())
Example #11
 def log(self, jid):
     """read and write result of one job"""
     if logger.level >= 1:
         fname = '%s/%s_%s' % (FLAGS.run_dir, 'log', str(jid).rjust(5, '0'))
         f = open(fname)
         for line in f:
             logger.write(line)
         logger.writeln()
         f.close()
 def serve(self):
     if logger.level >= 1:
         logger.writeln('start server')
     self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     self.sock.bind((self.host, FLAGS.port))
     self.sock.listen(5)
     while self.nfinished != self.njobs or \
           self.nterminated != len(self.nodes):
         conn, addr = self.sock.accept()
         self.handle(conn)
Example #13
 def scan(self, i, j):
     if logger.level >= 3:
         logger.writeln('Scan: [%s, %s]' % (i, j))
     for dotitem in self.bins[i][j - 1]:
         word = self.chart.fwords[j - 1]
         next_node = dotitem.node.get(word)
         if next_node is not None:
             new_dotitem = DotItem(next_node, i, j, dotitem.ants)
             if logger.level >= 4:
                 logger.writeln(new_dotitem)
             self.add(new_dotitem)
Example #14
 def complete(self, i, k, j):
     if logger.level >= 3:
         logger.writeln('Complete: %s %s %s' % (i, k, j))
     for dotitem in self.bins[i][k]:
         for var, bin in self.chart.iter_items_by_nts(k, j):
             next_node = dotitem.node.get(var)
             if next_node is not None:
                 new_dotitem = DotItem(next_node, i, j,
                                       dotitem.ants + (bin, ))
                 if logger.level >= 4:
                     logger.writeln('new dotitem: %s' % new_dotitem)
                 self.add(new_dotitem)
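Both scan() and complete() advance a dot item by looking up the next symbol, a terminal word or a completed nonterminal, in a trie of rule source sides. Below is a toy illustration of that lookup using made-up classes, not the parser's real DotItem/DotChart.

class TrieNode(object):
    """One node of a rule trie: maps the next symbol to a child node."""
    def __init__(self):
        self.children = {}
        self.rules = []                      # rules whose source side ends here

    def get(self, symbol):
        return self.children.get(symbol)

def add_rule(root, src_side, rule_name):
    node = root
    for sym in src_side:
        node = node.children.setdefault(sym, TrieNode())
    node.rules.append(rule_name)

root = TrieNode()
add_rule(root, ('the', 'X', 'house'), 'r1')  # source side with a gap nonterminal X

node = root.get('the')                       # scan-like step over a terminal
if node is not None:
    node = node.get('X')                     # complete-like step over a finished X item
if node is not None:
    node = node.get('house')                 # scan the final terminal
print node.rules if node is not None else 'no match'   # -> ['r1']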
 def start_slave(self, sid, host):
     if logger.level >= 1:
         logger.writeln('start slave %s on %s' % (sid, host))
     cmd = ' '.join(sys.argv)
     # Slaves inherit the master's options, but it's important to override
     # _Parallel and _Slave so that they run in slave mode. They write their
     # detailed translation reports to the same log file, while their stdout
     # and stderr remain conveniently connected to the master terminal.
     options = "--noparallel --slave --slave_id=%s \
                --log=slaves.log \
                --server=%s" % (sid, socket.gethostname())
     system(r'ssh %s "cd %s; nohup %s %s" &' %
            (host, os.getcwd(), cmd, options))
Example #16
 def test_best_paths(self):
     self.hp.set_semiring(INSIDE)
     self.hp.set_functions(lambda x: x.prob, lambda x: 1, None)
     self.hp.assert_done('topo_sort')
     logger.writeln(self.hp.root.best_paths()[0].tree_str())
     logger.writeln(self.hp.root.best_paths()[0].weight)
     logger.writeln(self.hp.root.best_paths()[1].tree_str())
     logger.writeln(self.hp.root.best_paths()[1].weight)
Example #17
 def binary_expand(self, i, j):
     if logger.level >= 4:
         logger.writeln('span %s %s' % (i, j))
     new_items = Cube()
     for k in range(i + 1, j):
         for lvar, lbin in self.chart.iter_items_by_nts(i, k):
             for rvar, rbin in self.chart.iter_items_by_nts(k, j):
                 for grammar in self.grammars:
                     rulebin = grammar.itg.get_sorted_rules((lvar, rvar))
                     if rulebin:
                         new_items.add_cube((rulebin, lbin, rbin),
                                            self.get_cube_op(i, j))
     for new_item in new_items.iter_top(FLAGS.bin_size):
         if logger.level >= 4:
             logger.writeln(new_item)
         self.chart.add(new_item)
Example #18
 def load_features(self, features_weights):
     features = []
     weights = []
     for s in features_weights:
         feature_name, weight_str = s.split(':')
         weight = float(weight_str)
         feature_class = getattr(feature_lib, feature_name, None)
         if feature_class is None:
             logger.writeln('unknown feature: %s' % feature_name)
         else:
             if feature_name.endswith('LM'):
                 feature = feature_class(FLAGS.lm_order, FLAGS.lm)
             else:
                 feature = feature_class()
             features.append(feature)
             weights.append(weight)
     return features, weights
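A short sketch of the 'Name:weight' strings a loader like load_features() consumes; the feature names and the _StubFeatureLib module are hypothetical, and only the split(':') / float() / getattr() pattern mirrors the method above.

class _StubFeatureLib(object):              # stand-in for the real feature_lib module
    class WordPenalty(object):
        pass

feature_lib = _StubFeatureLib()
features_weights = ['WordPenalty:-1.0', 'UnknownFeature:0.5']   # hypothetical names

for s in features_weights:
    feature_name, weight_str = s.split(':')
    weight = float(weight_str)
    feature_class = getattr(feature_lib, feature_name, None)
    if feature_class is None:
        print 'unknown feature: %s' % feature_name
    else:
        print 'loaded %s with weight %.2f' % (feature_name, weight)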
Example #19
def init_split(samples, split=True):
    global SAMPLER
    logger.writeln('initialization. split=%s' % split)
    SAMPLER = init_sampler()
    for sample in timed(samples):
        if split:
            for node in sample.hg.nodes:
                node.pnt = node.nt
                node.nt = random.choice(child_symbols(node.pnt))
        for n, rule in sample.composed_rules_under(sample.hg.root):
            SAMPLER.count(rule)
            if FLAGS.type:
                # mapping from rules to nodes, and from nodes to rules
                CUT_INDEX.add(rule, sample, n)
                n.composed_rule = rule
        if FLAGS.check_index:
            CUT_INDEX.check(sample)
Example #20
 def em_step(self, iteration):
     ffile = open(self.ffilename)
     efile = open(self.efilename)
     afile = open(self.afilename)
     alignments = Alignment.reader_pharaoh(ffile, efile, afile)
     dirname = os.path.join(self.outputdir,
                            'iter_%s' % str(iteration + 1).rjust(3, '0'))
     os.mkdir(dirname)
     if logger.level >= 1:
         logger.writeln('\niteration %s' % (iteration + 1))
     likelihood = 0
     starttime = time.time()
     for i, alignment in enumerate(alignments, 1):
         if i % FLAGS.emtrain_log_interval == 0:
             logger.writeln('%s sentences at %s secs/sent' %
                            (i, (time.time() - starttime) / i))
             starttime = time.time()
         extractor = Extractor(
             maxabslen=100000,
             maxlen=10000,
             minhole=1,
             maxvars=100000,
             lexical_weighter=self.lexical_weighter,
             forbid_adjacent=self.forbid_adjacent,
             maximize_derivation=self.maximize_derivation,
             require_aligned_terminal=self.require_aligned_terminal)
         hg = extractor.extract_hypergraph(alignment)
         if hg is None:
             continue
         # compute expected counts
         self.compute_expected_counts(hg)
         likelihood += hg.root.inside
         treefilename = os.path.join(dirname,
                                     'tree_%s' % str(i).rjust(8, '0'))
         self.write_viterbi_tree(hg, treefilename)
         #for edge in hg.edges():
         #    logger.writeln('%s %s' % (self.counter.get_prob(edge.rule),
         #                              edge.rule))
     if logger.level >= 1:
         logger.writeln('likelihood: %s' % likelihood)
     if logger.level >= 1:
         logger.writeln('normalizing...')
     self.counter.normalize_vbdp(self.alpha, self.threshold)
     if logger.level >= 1:
         logger.writeln('prob table size: %s' % len(self.counter.prob))
    def compute_gradient(self):
        self.collect_expected_feature_counts()
        self.collect_expected_products()
        result = []
        for i in range(self.feature_n):
            # print('feature %s' % i)
            gradient = 0

            tmp = 0
            for n in range(self.max_n):
                # print('clip count %s-gram gradient' % (n+1))
                if self.expected_clipped_counts[n] == 0:
                    continue
                clipped_count_grad = \
                        self.ep_clipped_count_feature[n][i] - \
                        self.expected_clipped_counts[n] * \
                        self.expected_feature_counts[i]
                # print(clipped_count_grad)
                tmp += clipped_count_grad / self.expected_clipped_counts[n]
            gradient += tmp / self.max_n

            tmp = 0
            for n in range(self.max_n):
                # print('count %s-gram gradient' % (n+1))
                if self.expected_counts[n] == 0:
                    continue
                count_grad = self.ep_count_feature[n][i] - \
                        self.expected_counts[n]*self.expected_feature_counts[i]
                # print(count_grad)
                tmp += count_grad / self.expected_counts[n]
            gradient -= tmp / self.max_n

            # brevity penalty
            if self.expected_counts[0] < self.ref_length:
                gradient += (self.ep_count_feature[0][i] -
                             self.expected_counts[0]* \
                             self.expected_feature_counts[i]) * \
                             self.ref_length / \
                        self.expected_counts[0] ** 2  # ** (power), not ^ (bitwise xor)

            result.append(gradient)
        if logger.level >= 1:
            logger.writeln('gradient: %s' % result)
        return result
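The clipped_count_grad and count_grad terms above appear to follow the standard log-linear identity dE[g]/dtheta_i = E[g*f_i] - E[g]*E[f_i]. Here is a small numeric check of that identity on a toy distribution, unrelated to the decoder's own data structures.

from math import exp

# toy hypothesis space: each entry is (feature vector, statistic g)
hyps = [([1.0, 0.0], 2.0), ([0.0, 1.0], 5.0), ([1.0, 1.0], 3.0)]
theta = [0.3, -0.2]

def expectations(theta):
    scores = [exp(sum(t * f for t, f in zip(theta, fv))) for fv, _ in hyps]
    z = sum(scores)
    probs = [s / z for s in scores]
    e_g = sum(p * g for p, (_, g) in zip(probs, hyps))
    e_f = [sum(p * fv[i] for p, (fv, _) in zip(probs, hyps)) for i in range(2)]
    e_gf = [sum(p * g * fv[i] for p, (fv, g) in zip(probs, hyps)) for i in range(2)]
    return e_g, e_f, e_gf

e_g, e_f, e_gf = expectations(theta)
analytic = [e_gf[i] - e_g * e_f[i] for i in range(2)]

eps = 1e-6
numeric = []
for i in range(2):
    shifted = list(theta)
    shifted[i] += eps
    numeric.append((expectations(shifted)[0] - e_g) / eps)

print 'analytic:', analytic   # gradient of E[g] w.r.t. theta from the identity
print 'numeric: ', numeric    # finite-difference check; should agree closely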
 def handle(self, conn):
     msg = conn.recv(1024).decode()
     sid, status = msg.split()
     if status == 'ready':
         if self.i <= self.njobs:
             conn.send(('%s\n' % self.i).encode())
             self.sid2job[sid] = (self.i, time())
             self.i += 1
         else:
             conn.send('0'.encode())
             self.nterminated += 1
             if logger.level >= 1:
                 logger.writeln()
                 logger.writeln('slave %s told to terminate' % sid)
     elif status == 'ok':
         self.nfinished += 1
         jid, start_time = self.sid2job[sid]
         self.total_time += time() - start_time
         self.joblogger.log(jid)
Example #23
    def __init__(self, m, lmfile):
        Feature.__init__(self)
        self.stateless = False
        self.m = m
        self.lmfile = lmfile
        self.ngram_enum = NgramEnumerator(self.m)

        if FLAGS.use_python_lm:
            from python_lm import LanguageModel
        else:
            from swig_lm import LanguageModel

        logger.writeln('reading LM: %s' % self.lmfile)
        if FLAGS.use_python_lm:
            self.lm = LanguageModel(self.lmfile)
            self.getcost = self.lm.get
        else:
            self.lm = LanguageModel(self.m, self.lmfile)
            self.getcost = self.lm
Example #24
 def pop(self):
     "Return None if agenda is empty"
     while True:
         try:
             h, item = heappop(self.items)
         except IndexError:  # empty heap, return None
             break
         if item.dead:  # item pruned in chart
             if logger.level >= 5:
                 logger.writeln('pop dead item: %s' % item)
                 logger.writeln(item)
                 logger.writeln(item.incoming[0].rule)
             self.deadpop += 1
         else:
             if logger.level >= 4:
                 logger.writeln('pop: %s' % item)
                 logger.writeln(item.incoming[0].rule)
             self.popped += 1
             return item
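pop() above relies on lazy deletion: pruned items stay on the heap, get marked dead, and are simply skipped when they surface. A minimal standalone sketch of that pattern with a toy Item class, not the decoder's real classes.

from heapq import heappush, heappop

class Item(object):
    def __init__(self, name, cost):
        self.name = name
        self.cost = cost
        self.dead = False                # set to True when the item is pruned elsewhere

heap = []
for name, cost in [('a', 3.0), ('b', 1.0), ('c', 2.0)]:
    heappush(heap, (cost, name, Item(name, cost)))   # name breaks ties, so Items never compare

heap[0][2].dead = True                   # prune the currently cheapest item, 'b'

def pop(heap):
    """Return the best live item, or None if the heap is exhausted."""
    while heap:
        _, _, item = heappop(heap)
        if item.dead:
            continue                     # lazily discard items pruned in the chart
        return item
    return None

print pop(heap).name                     # 'c': the dead item 'b' was skipped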
Example #25
 def probe(self, cpu_reserve, mem_reserve):
     """return number of decoder instances that can run on this node."""
     result = 0
     if self.node:
         status = system('ssh %s :' % self.node)
         if status != 0:
             logger.writeln('%s down' % self.node)
         else:
             cpu = self.cpu_usage()
             mem = self.mem_free()
             if logger.level >= 1:
                 logger.writeln('cpu usage: %.1f%%' % cpu)
                 logger.writeln('mem free: %s kB' % mem)
             result = int(min((100-cpu)/cpu_reserve, mem/mem_reserve))
     if logger.level >= 1:
         logger.writeln('%s decoder instances will start on %s' %
                        (result, self.node))
         logger.writeln() 
     system('rm -f %s' % self.tmp_stat)
     system('rm -f %s' % self.tmp_meminfo)
     return result
Example #26
 def parse_agenda(self):
     while len(self.agenda) > 0:
         item = self.agenda.pop()
         if logger.level >= 4:
             logger.writeln('pop: %s' % item)
         for item1, item2, inverted in self.neighboring_pairs(item):
             # avoid duplicated edges. note that in ABC grammar,
             # if the boxes of item1 and item2 are given, the nt of the
             # new item is fixed
             if logger.level >= 4:
                 logger.writeln('neighbors: %s %s' % (item1, item2))
             key = (item1.nt, item1.fi, item1.fj, item1.ei, item1.ej,
                    item2.nt, item2.fi, item2.fj, item2.ei, item2.ej)
             if key not in self.edge_index:
                 self.edge_index.add(key)  
                 new_item = self.make_item(item1, item2, inverted)
                 if self.chart_add(new_item):
                     self.agenda.append(new_item)
                     self.neighbor_index.add(new_item)
                     self.glue_nodes.append(new_item)
                     if logger.level >= 4:
                         logger.writeln('push: %s' % new_item)
     # self.stats()
     root = self.final_glue()
     self.hg = Hypergraph(root)
     self.hg.topo_sort()
     self.stats()
     return self.hg
Example #27
 def load(self, filename):
     if logger.level >= 1:
         logger.writeln('loading rules from %s...' % filename)
     percent_counter = PercentCounter(input=filename, file=logger.file)
     f = open(filename)
     for i, line in enumerate(f):
         if logger.level >= 1:
             percent_counter.print_percent(i)
         try:
             rule = Rule()
             rule.fromstr(line)
         except AssertionError:
             logger.write('bad rule: %s %s: %s\n' % (filename, i, line))
             self.nbadrules += 1
             continue
         rule.grammar = self  # used in computing features scores
         self.features.score_rule(rule)
         if rule.arity == 0:
             self.lexgrammar.add(rule)
         else:
             self.itg.add(rule)
     f.close()
     if logger.level >= 1:
         logger.writeln()
         logger.writeln(self.stats())
def alignVerbalization(tok_seq, lemma_seq, amr, verb_list, all_alignments,
                       verb_map, aligned_toks, node_alignment, multi_map):

    matched_tuples = set()
    for (idx, curr_tok) in enumerate(tok_seq):
        if idx in aligned_toks:
            continue
        if curr_tok not in verb_list:
            curr_tok = lemma_seq[idx]
        if curr_tok in verb_list:
            for subgraph in verb_list[curr_tok]:
                matched_frags = amr.matchSubgraph(subgraph)
                if matched_frags:
                    subgraph_repr = subgraph_str(subgraph)
                    if len(matched_frags) > 1:
                        logger.writeln(
                            "Verbalize %s to more than 1 occurrences!" %
                            curr_tok)
                    for frag_tuples in matched_frags:
                        valid = True
                        for (head, rel, tail) in frag_tuples:
                            if (head, rel, tail) in matched_tuples:
                                valid = False
                                break
                            matched_tuples.add((head, rel, tail))
                        if valid:
                            logger.writeln(
                                "Verbalize %d-%d, %s to %s!" %
                                (idx, idx + 1, curr_tok, subgraph_repr))
                            aligned_toks.add(idx)
                            for (head, rel, tail) in frag_tuples:
                                verb_map[head].add((head, rel, tail))
                                node_alignment[head] = 1
                                node_alignment[tail] = 1
                            all_alignments[head].append(
                                (idx, idx + 1, subgraph_repr, "MULT"))
                            head = frag_tuples[0][0]
                            # head_concept = amr.nodes[head].node_str()
                            multi_map[subgraph_repr] += 1
                            break
Example #29
 def add(self, item):
     added = False
     bin_idx = self.key(item)
     if bin_idx:  # discard items with None key
         bin = self.bins.setdefault(bin_idx,
                                    self.binclass(FLAGS.bin_size, self))
         # preprune
         if not FLAGS.use_simple_bin and item.rank_cost() > bin.cutoff:
             if logger.level >= 4:
                 logger.writeln('prepruned: %s' % item)
             self.prepruned += 1
         # TODO: a hack: ban unary negative deduction,
         # only for ghkm rules
         elif item.incoming[0].rule.arity == 1 and len(item.incoming[0].rule.f) == 1 and \
              item.incoming[0].cost <= 0 and \
              item.incoming[0].rule.grammar is not None and \
              'ghkm' in item.incoming[0].rule.grammar.name:
             if logger.level >= 4:
                 logger.write(
                     'negative unary deduction for ghkm banned: %s' % item)
             self.neg_unary_pruned += 1
         # ban negative deduction
         elif FLAGS.ban_negative_deduction and item.incoming[0].cost <= 0:
             if logger.level >= 4:
                 logger.writeln('negative deduction banned: %s' %
                                item.incoming[0])
             self.negcost_pruned += 1
         # unary cycle banned
         elif item.unary_cycle():
             if logger.level >= 4:
                 logger.writeln('unary cycle broken: %s' % item)
             self.unary_cycle_broken += 1
         # merging needed
         elif (not FLAGS.use_simple_bin) and item in self.index:
             oldcost, olditem = self.index[item]
             item_merged = item.merge(olditem)
             if item_merged:  # old item better
                 if logger.level >= 4:
                     logger.writeln('merged: %s' % item)
             else:  # new item better
                 bin.add(item)
                 if not FLAGS.use_simple_bin:
                     bin.ndead += 1
                 added = True
             self.merged += 1
         # no need to merge
         else:
             bin.add(item)
             added = True
     return added
Example #30
 def push(self, item):
     if logger.level >= 4:
         logger.writeln('push:')
         logger.writeln(item)
         logger.writeln(item.incoming[0])
     h = self.heuristic(item)
     heappush(self.items, (h, item))
     self.pushed += 1
Example #31
def linearize_amr(args):
    logger.file = open(os.path.join(args.run_dir, 'logger'), 'w')

    amr_file = os.path.join(args.data_dir, 'aligned_amr_nosharp')
    alignment_file = os.path.join(args.data_dir, 'alignment')
    sent_file = os.path.join(args.data_dir, 'sentence')
    tok_file = os.path.join(args.data_dir, 'token')
    #lemma_file = os.path.join(args.data_dir, 'lemma')
    pos_file = os.path.join(args.data_dir, 'pos')

    amr_graphs = load_amr_graphs(amr_file)
    alignments = [line.strip().split() for line in open(alignment_file, 'r')]
    sents = [line.strip().split() for line in open(sent_file, 'r')]
    toks = [line.strip().split() for line in open(tok_file, 'r')]
    #lemmas = [line.strip().split() for line in open(lemma_file, 'r')]
    poss = [line.strip().split() for line in open(pos_file, 'r')]

    assert len(amr_graphs) == len(alignments) and len(amr_graphs) == len(sents) and len(amr_graphs) == len(toks) and len(amr_graphs) == len(poss), '%d %d %d %d %d' % (len(amr_graphs), len(alignments), len(sents), len(toks), len(poss))
    #assert len(amr_graphs) == len(alignments) and len(amr_graphs) == len(sents) and len(amr_graphs) == len(toks) and len(amr_graphs) == len(lemmas) and len(amr_graphs) == len(poss), '%d %d %d %d %d %d' % (len(amr_graphs), len(alignments), len(sents), len(toks), len(lemmas), len(poss))

    #lemma_map = initialize_lemma(args.lemma)
    num_self_cycle = 0
    used_sents = 0

    amr_statistics = AMR_stats()

    for (sent_index, (sent_seq, tok_seq, pos_seq, alignment_seq, amr_graph)) in enumerate(zip(sents, toks, poss, alignments, amr_graphs)):

        logger.writeln('Sentence #%d' % (sent_index+1))
        logger.writeln(str(amr_graph))

        #if sent_index > 100:
        #    break

        edge_alignment = bitarray(len(amr_graph.edges))
        if edge_alignment.count() != 0:
            edge_alignment ^= edge_alignment
        assert edge_alignment.count() == 0

        has_cycle = False
        if amr_graph.check_self_cycle():
            num_self_cycle += 1
            has_cycle = True
            #logger.writeln('self cycle detected')

        amr_graph.set_sentence(tok_seq)  # per-sentence tokens; toks is the whole corpus
        #amr_graph.set_lemmas(lemma_seq)
        amr_graph.set_poss(pos_seq)

        aligned_fragments = []
        reentrancies = {}  #Map multiple spans to reentrancies, keeping one as the original and the others as connections

        has_multiple = False
        no_alignment = False

        aligned_set = set()

        #all_frags = []

        #(opt_toks, role_toks, aligned_fragments) = extract_fragments(alignment_seq, amr_graph)
        ##logger.writeln(str(opt_toks))
        ##logger.writeln(str(role_toks))

        #if not aligned_fragments:
        #    logger.writeln('wrong alignments')
        #    continue

        #temp_aligned = set(aligned_fragments.keys())
        #aligned_fragments = sorted(aligned_fragments.items(), key=lambda frag: frag[0])

        #temp_unaligned = set(xrange(len(pos_seq))) - temp_aligned

        (entity_frags, root2entityfrag, root2entitynames) = amr_graph.extract_all_entities()

        new_graph = AMRGraph.collapsed_graph(amr_graph, root2entityfrag, root2entitynames)
        logger.writeln(str(new_graph))
        #logger.writeln(amr_graph.collapsed_form(root2entityfrag, root2entitynames))
        (relation_nums, entity_nums, predicate_nums, variable_nums, const_nums, reentrancy_nums) = amr_graph.statistics(root2entityfrag, root2entitynames)

        amr_statistics.update(reentrancy_nums, predicate_nums, variable_nums, const_nums, entity_nums, relation_nums)
Example #32
def linearize_amr(args):
    logger.file = open(os.path.join(args.run_dir, 'logger'), 'w')

    amr_file = os.path.join(args.data_dir, 'amr')
    alignment_file = os.path.join(args.data_dir, 'alignment')
    if args.use_lemma:
        tok_file = os.path.join(args.data_dir, 'lemmatized_token')
    else:
        tok_file = os.path.join(args.data_dir, 'token')
    pos_file = os.path.join(args.data_dir, 'pos')

    amr_graphs = load_amr_graphs(amr_file)
    alignments = [line.strip().split() for line in open(alignment_file, 'r')]
    toks = [line.strip().split() for line in open(tok_file, 'r')]
    poss = [line.strip().split() for line in open(pos_file, 'r')]

    assert len(amr_graphs) == len(alignments) and len(amr_graphs) == len(toks) and len(amr_graphs) == len(poss), '%d %d %d %d' % (len(amr_graphs), len(alignments), len(toks), len(poss))

    num_self_cycle = 0
    used_sents = 0

    amr_statistics = AMR_stats()

    if args.use_stats:
        amr_statistics.loadFromDir(args.stats_dir)
        #print amr_statistics
    else:
        os.system('mkdir -p %s' % args.stats_dir)
        amr_statistics.collect_stats(amr_graphs)
        amr_statistics.dump2dir(args.stats_dir)

    if args.parallel:
        singleton_num = 0.0
        multiple_num = 0.0
        total_num = 0.0
        empty_num = 0.0

        amr_seq_file = os.path.join(args.run_dir, 'amrseq')
        tok_seq_file = os.path.join(args.run_dir, 'tokseq')
        map_seq_file = os.path.join(args.run_dir, 'train_map')

        amrseq_wf = open(amr_seq_file, 'w')
        tokseq_wf = open(tok_seq_file, 'w')
        mapseq_wf = open(map_seq_file, 'w')

        for (sent_index, (tok_seq, pos_seq, alignment_seq, amr)) in enumerate(zip(toks, poss, alignments, amr_graphs)):

            logger.writeln('Sentence #%d' % (sent_index+1))
            logger.writeln(' '.join(tok_seq))

            amr.setStats(amr_statistics)

            edge_alignment = bitarray(len(amr.edges))
            if edge_alignment.count() != 0:
                edge_alignment ^= edge_alignment
            assert edge_alignment.count() == 0

            has_cycle = False
            if amr.check_self_cycle():
                num_self_cycle += 1
                has_cycle = True

            amr.set_sentence(tok_seq)
            amr.set_poss(pos_seq)

            aligned_fragments = []
            reentrancies = {}  #Map multiple spans to reentrancies, keeping one as the original and the others as connections

            has_multiple = False
            no_alignment = False

            aligned_set = set()

            (opt_toks, role_toks, node_to_span, edge_to_span, temp_aligned) = extractNodeMapping(alignment_seq, amr)

            temp_unaligned = set(xrange(len(pos_seq))) - temp_aligned

            all_frags = []
            all_alignments = defaultdict(list)

            ####Extract named entities#####
            for (frag, wiki_label) in amr.extract_entities():
                if len(opt_toks) == 0:
                    logger.writeln("No alignment for the entity found")

                (aligned_indexes, entity_spans) = all_aligned_spans(frag, opt_toks, role_toks, temp_unaligned)
                root_node = amr.nodes[frag.root]

                entity_mention_toks = root_node.namedEntityMention()

                total_num += 1.0
                if entity_spans:
                    entity_spans = removeRedundant(tok_seq, entity_spans, entity_mention_toks)
                    if len(entity_spans) == 1:
                        singleton_num += 1.0
                        logger.writeln('Single fragment')
                        for (frag_start, frag_end) in entity_spans:
                            logger.writeln(' '.join(tok_seq[frag_start:frag_end]))
                            all_alignments[frag.root].append((frag_start, frag_end, wiki_label))
                            temp_aligned |= set(xrange(frag_start, frag_end))
                    else:
                        multiple_num += 1.0
                        logger.writeln('Multiple fragment')
                        logger.writeln(aligned_indexes)
                        logger.writeln(' '.join([tok_seq[index] for index in aligned_indexes]))

                        for (frag_start, frag_end) in entity_spans:
                            logger.writeln(' '.join(tok_seq[frag_start:frag_end]))
                            all_alignments[frag.root].append((frag_start, frag_end, wiki_label))
                            temp_aligned |= set(xrange(frag_start, frag_end))
                else:
                    empty_num += 1.0

            ####Process date entities
            date_entity_frags = amr.extract_all_dates()
            for frag in date_entity_frags:
                all_date_indices, index_to_attr = getDateAttr(frag)
                covered_toks, non_covered, index_to_toks = getSpanSide(tok_seq, alignment_seq, frag, temp_unaligned)

                covered_set = set(covered_toks)

                all_spans = getContinuousSpans(covered_toks, temp_unaligned, covered_set)
                if all_spans:
                    temp_spans = []
                    for span_start, span_end in all_spans:
                        if span_start > 0 and (span_start-1) in temp_unaligned:
                            if tok_seq[span_start-1] in str(frag) and tok_seq[span_start-1][0] in '0123456789':
                                temp_spans.append((span_start-1, span_end))
                            else:
                                temp_spans.append((span_start, span_end))
                        else:
                            temp_spans.append((span_start, span_end))
                    all_spans = temp_spans
                    all_spans = removeDateRedundant(all_spans)
                    for span_start, span_end in all_spans:
                        all_alignments[frag.root].append((span_start, span_end, None))
                        temp_aligned |= set(xrange(span_start, span_end))
                        if len(non_covered) == 0:
                            print 'Dates: %s' % ' '.join(tok_seq[span_start:span_end])
                else:
                    for index in temp_unaligned:
                        curr_tok = tok_seq[index]
                        found = False
                        for un_tok in non_covered:
                            if curr_tok[0] in '0123456789' and curr_tok in un_tok:
                                print 'recovered: %s' % curr_tok
                                found = True
                                break
                        if found:
                            all_alignments[frag.root].append((index, index+1, None))
                            temp_aligned.add(index)
                            print 'Date: %s' % tok_seq[index]

            #Verbalization list
            verb_map = {}
            for (index, curr_tok) in enumerate(tok_seq):
                if curr_tok in VERB_LIST:

                    for subgraph in VERB_LIST[curr_tok]:

                        matched_frags = amr.matchSubgraph(subgraph)
                        if matched_frags:
                            temp_aligned.add(index)

                        for (node_index, ex_rels) in matched_frags:
                            all_alignments[node_index].append((index, index+1, None))
                            verb_map[node_index] = subgraph

            #####Fall back to node-to-span alignments for nodes not yet aligned#####
            for node_index in node_to_span:
                if node_index in all_alignments:
                    continue

                all_alignments[node_index] = node_to_span[node_index]

            ##Based on the alignment from node index to spans in the string
            temp_unaligned = set(xrange(len(pos_seq))) - temp_aligned

            assert len(tok_seq) == len(pos_seq)

            amr_seq, cate_tok_seq, map_seq = categorizeParallelSequences(amr, tok_seq, all_alignments, temp_unaligned, verb_map, args.min_prd_freq, args.min_var_freq)
            print >> amrseq_wf, ' '.join(amr_seq)
            print >> tokseq_wf, ' '.join(cate_tok_seq)
            print >> mapseq_wf, '##'.join(map_seq)  #'##' as separator, since map entries may contain spaces

        amrseq_wf.close()
        tokseq_wf.close()
        mapseq_wf.close()

        #print "one to one alignment: %lf" % (singleton_num/total_num)
        #print "one to multiple alignment: %lf" % (multiple_num/total_num)
        #print "one to empty alignment: %lf" % (empty_num/total_num)
    else: #Only build the linearized token sequence

        mle_map = loadMap(args.map_file)
        if args.use_lemma:
            tok_file = os.path.join(args.data_dir, 'lemmatized_token')
        else:
            tok_file = os.path.join(args.data_dir, 'token')

        ner_file = os.path.join(args.data_dir, 'ner')
        date_file = os.path.join(args.data_dir, 'date')

        all_entities = identify_entities(tok_file, ner_file, mle_map)
        all_dates = dateMap(date_file)

        tokseq_result = os.path.join(args.data_dir, 'linearized_tokseq')
        dev_map_file = os.path.join(args.data_dir, 'cate_map')
        tokseq_wf = open(tokseq_result, 'w')
        dev_map_wf = open(dev_map_file, 'w')

        for (sent_index, (tok_seq, pos_seq, entities_in_sent)) in enumerate(zip(toks, poss, all_entities)):
            print 'snt: %d' % sent_index
            n_toks = len(tok_seq)
            aligned_set = set()

            all_spans = []
            date_spans = all_dates[sent_index]
            date_set = set()

            #Align dates
            for (start, end) in date_spans:
                if end - start > 1:
                    new_aligned = set(xrange(start, end))
                    aligned_set |= new_aligned
                    entity_name = ' '.join(tok_seq[start:end])
                    if entity_name in mle_map:
                        entity_typ = mle_map[entity_name]
                    else:
                        entity_typ = ('DATE', "date-entity", "NONE")
                    all_spans.append((start, end, entity_typ))
                    print 'Date:', start, end
                else:
                    date_set.add(start)

            #First align multi tokens
            for (start, end, entity_typ) in entities_in_sent:
                if end - start > 1:
                    new_aligned = set(xrange(start, end))
                    if len(aligned_set & new_aligned) != 0:
                        continue
                    aligned_set |= new_aligned
                    entity_name = ' '.join(tok_seq[start:end])
                    if entity_name in mle_map:
                        entity_typ = mle_map[entity_name]
                    else:
                        entity_typ = ('NE_person', "person", '-')
                    all_spans.append((start, end, entity_typ))

            #Single token
            for (index, curr_tok) in enumerate(tok_seq):
                if index in aligned_set:
                    continue

                curr_pos = pos_seq[index]
                aligned_set.add(index)

                if curr_tok in mle_map:
                    (category, node_repr, wiki_label) = mle_map[curr_tok]
                    if category.lower() == 'none':
                        all_spans.append((index, index+1, (curr_tok, "NONE", "NONE")))
                    else:
                        all_spans.append((index, index+1, mle_map[curr_tok]))
                else:

                    if curr_tok[0] in '\"\'.':
                        print 'weird token: %s, %s' % (curr_tok, curr_pos)
                        continue
                    if index in date_set:
                        entity_typ = ('DATE', "date-entity", "NONE")
                        all_spans.append((index, index+1, entity_typ))
                    elif curr_tok in VERB_LIST:
                        node_repr = VERB_LIST[curr_tok][0].keys()[0]
                        entity_typ = ('VERBAL', node_repr, "NONE")
                        all_spans.append((index, index+1, entity_typ))

                    elif curr_pos[0] == 'V':
                        node_repr = '%s-01' % curr_tok
                        all_spans.append((index, index+1, ('-VERB-', node_repr, "NONE")))
                    else:
                        node_repr = curr_tok
                        all_spans.append((index, index+1, ('-SURF-', curr_tok, "NONE")))

            all_spans = sorted(all_spans, key=lambda span: (span[0], span[1]))
            print all_spans
            linearized_tokseq, map_repr_seq = getIndexedForm(all_spans)

            print >> tokseq_wf, ' '.join(linearized_tokseq)
            print >> dev_map_wf, '##'.join(map_repr_seq)

        tokseq_wf.close()
        dev_map_wf.close()