def add_JAMR_align(instances, aligned_amr_file):
    comments, amr_strings = readAMR(aligned_amr_file)
    for i in range(len(instances)):
        amr = AMR.parse_string(amr_strings[i])
        alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr, alignment, instances[i].tokens)
        ggraph.pre_merge_netag(instances[i])
        #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
        instances[i].addAMR(amr)
        instances[i].addGoldGraph(ggraph)
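These snippets are excerpts from an AMR-parser preprocessing module; names such as readAMR, AMR, Aligner, SpanGraph, StanfordCoreNLP and the log handle are module-level imports there and are not shown. A minimal driver for add_JAMR_align might look like the sketch below; the file paths are placeholders and the instance objects are whatever proc.parse returns.

# Hedged usage sketch: run CoreNLP preprocessing first, then attach gold
# AMR graphs from a JAMR-aligned file. Paths are illustrative only.
proc = StanfordCoreNLP()
proc.setup()                                            # start CoreNLP first
instances = proc.parse('data/train.sent')               # POS, NER, lemmas
add_JAMR_align(instances, 'data/train.amr.aligned')     # attach AMR + gold SpanGraph
print len(instances), 'instances carry gold graphs'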
Example #3
def _init_instances(sent_file, amr_strings, comments):
    print >> log, "Preprocess 1:pos, ner and dependency using stanford parser..."
    proc = StanfordCoreNLP()
    instances = proc.parse(sent_file)

    print >> log, "Preprocess 2:adding amr and generating gold graph"
    assert len(instances) == len(amr_strings)
    for i in range(len(instances)):
        amr = AMR.parse_string(amr_strings[i])
        instances[i].addAMR(amr)
        alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr, alignment, comments[i]['snt'])
        ggraph.pre_merge_netag(instances[i])
        instances[i].addGoldGraph(ggraph)

    return instances
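_init_instances relies on readAMR returning two parallel lists, one comment dict and one AMR string per sentence, and asserts that invariant above. A sketch of that contract as inferred from usage here (the exact keys and the alignment format are assumptions, not a documented API):

# Assumed shape of readAMR's return value, inferred from the code above.
comments, amr_strings = readAMR('data/dev.amr')
assert len(comments) == len(amr_strings)
c = comments[0]
print c['snt']          # raw sentence text
print c['alignments']   # JAMR alignment string, e.g. "0-1|0 2-3|0.0"
print amr_strings[0]    # PENMAN-style AMR string for the same sentence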
Example #6
def preprocess(amr_file, START_SNLP=True):
    '''nasty function'''
    aligned_amr_file = amr_file + '.aligned'
    if os.path.exists(aligned_amr_file):
        comments, amr_strings = readAMR(aligned_amr_file)
    else:
        comments, amr_strings = readAMR(amr_file)
    #comments,amr_strings = readAMR(aligned_amr_file)
    sentences = [c['snt'] for c in comments]
    tmp_sentence_file = amr_file + '.sent'
    if not os.path.exists(tmp_sentence_file):
        _write_sentences(tmp_sentence_file, sentences)

    print >> log, "pos, ner and dependency..."
    proc = StanfordCoreNLP()
    if START_SNLP: proc.setup()
    instances = proc.parse(tmp_sentence_file)

    tok_amr_filename = amr_file + '.tok'
    if not os.path.exists(tok_amr_filename):
        _write_tok_amr(tok_amr_filename, amr_file, instances)

    SpanGraph.graphID = 0
    for i in range(len(instances)):

        amr = AMR.parse_string(amr_strings[i])

        alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr, alignment, instances[i].tokens)
        #ggraph.pre_merge_netag(instances[i])
        #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
        instances[i].addAMR(amr)
        instances[i].addGoldGraph(ggraph)

    #print >> log, "adding amr"
    #_add_amr(instances,amr_strings)
    #if writeToFile:
    #    output_file = amr_file.rsplit('.',1)[0]+'_dataInst.p'
    #    pickle.dump(instances,open(output_file,'wb'),pickle.HIGHEST_PROTOCOL)

    return instances
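On a fresh corpus this preprocess variant caches its intermediate products next to the input file and reuses them on later runs. An illustrative invocation (paths and the printed attributes are assumptions based on the code above):

# Illustrative first run; suffix conventions match the code above.
instances = preprocess('data/train.amr', START_SNLP=True)
# Files created beside the input on the first run:
#   data/train.amr.sent - one raw sentence per line
#   data/train.amr.tok  - the AMR file rewritten with tokenized sentences
for inst in instances[:3]:
    print inst.tokens, inst.gold_graph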
Example #7
def init_state(instance, verbose=0):
    # Note: in the source this appears to be a static method on GraphState;
    # it builds the initial parser configuration from one preprocessed instance.
    depGraph = SpanGraph.init_dep_graph(instance, instance.tokens)
    #depGraph.pre_merge_netag(instance)
    seq = []
    #if instance.sentID == 104:
    #    import pdb
    #    pdb.set_trace()
    for r in sorted(depGraph.multi_roots, reverse=True):
        seq += depGraph.postorder(root=r)
    #seq = uniqify(seq)
    seq.append(-1)
    sigma = Buffer(seq)

    GraphState.text = instance.text
    GraphState.sent = instance.tokens
    GraphState.gold_graph = instance.gold_graph
    GraphState.deptree = depGraph
    GraphState.sentID = instance.sentID
    GraphState.verbose = verbose

    if verbose > 1:
        print >> sys.stderr, "Sentence ID:%s, initial sigma:%s" % (GraphState.sentID, sigma)

    return GraphState(sigma, copy.deepcopy(depGraph))
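init_state wires one preprocessed instance into the transition system: dependency nodes are pushed post-order (per root, roots visited in reverse order) into the sigma buffer with -1 as the end marker, and sentence-level context is stored on the GraphState class itself. A minimal sketch of seeding the parser, assuming instances produced by the preprocess examples in this file:

# Sketch: build the initial parser state for the first instance.
state = GraphState.init_state(instances[0], verbose=1)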
Example #8
def preprocess(input_file, START_SNLP=True, INPUT_AMR='amr'):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR == 'amr':  # the input file is amr annotation

        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments, amr_strings = readAMR(aligned_amr_file)
        else:
            comments, amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments]  # raw sentence text lives under the 'snt' key

        # write sentences(separate per line)
        tmp_sent_filename = amr_file + '.sent'
        if not os.path.exists(tmp_sent_filename):  # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP

        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP..."
            proc1.setup()

        print >> log, 'Read token, lemma, named entity file %s...' % (
            tmp_prp_filename)
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename):  # write tokenized amr file
            _write_tok_amr(tok_amr_filename, amr_file, instances)

        SpanGraph.graphID = 0
        for i in xrange(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(
                    amr, comments[i]['alignments'])
                # use verbalization list to fix the unaligned tokens
                if constants.FLAG_VERB:
                    Aligner.postProcessVerbList(amr, comments[i]['tok'],
                                                alignment)
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment,
                                                      s2c_alignment,
                                                      instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    elif INPUT_AMR == 'amreval':
        eval_file = input_file
        comments = readAMREval(eval_file)
        sentences = [c['snt'] for c in comments]

        # write sentences(separate per line)
        tmp_sent_filename = eval_file + '.sent'
        if not os.path.exists(tmp_sent_filename):  # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):  # found cache file
            print >> log, 'Read token, lemma, named entity file %s...' % (
                tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception(
                'No cache file %s has been found. set START_SNLP=True to start corenlp.'
                % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        for i in xrange(len(instances)):
            instances[i].addComment(comments[i])

    else:  # input file is sentence
        tmp_sent_filename = input_file
        tmp_prp_filename = tmp_sent_filename + '.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):  # found cache file
            print >> log, 'Read token, lemma, named entity file %s...' % (
                tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception(
                'No cache file %s has been found. set START_SNLP=True to start corenlp.'
                % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = codecs.open(dep_filename, 'r',
                                     encoding='utf-8').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = codecs.open(dep_filename, 'w', encoding='utf-8')
            output_dep.write(dep_result)
            output_dep.close()

        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = codecs.open(dep_filename, 'r',
                                     encoding='utf-8').read()
        else:
            raise IOError('Converted dependency file %s not found' %
                          (dep_filename))

        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        if constants.FLAG_ONTO == 'onto':
            dep_filename = tok_sent_filename + '.charniak.onto.parse.dep'
        elif constants.FLAG_ONTO == 'onto+bolt':
            dep_filename = tok_sent_filename + '.charniak.onto+bolt.parse.dep'
        else:
            dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        raise Exception('Unknown dependency parse type %s' %
                        (constants.FLAG_DEPPARSER))

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        if constants.FLAG_ONTO != 'onto+bolt':
            prop_filename = tok_sent_filename + '.prop'
        else:
            prop_filename = tok_sent_filename + '.onto+bolt.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,
                          prop_filename,
                          dep_filename,
                          FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)

        else:
            raise IOError('Semantic role labeling file %s not found!' %
                          (prop_filename))

    if constants.FLAG_RNE:
        print >> log, "Using rich name entity instead..."
        rne_filename = tok_sent_filename + '.rne'
        if os.path.exists(rne_filename):
            _substitute_rne(instances, rne_filename)
        else:
            raise IOError('Rich named entity file %s not found!' %
                          (rne_filename))

    return instances
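Every derived file above follows the same read-the-cache-or-build pattern keyed off a tok_sent_filename suffix. A tiny helper in that spirit, purely illustrative and not part of the original module (it covers only the simple branches; "stanford", "stanfordConvert" and "stdconv+charniak" have extra behavior):

import os

def cached_or_build(path, build):
    # Return the contents of path, calling build() to create it first if absent.
    if not os.path.exists(path):
        with open(path, 'w') as f:
            f.write(build())
    return open(path, 'r').read()

# e.g. dep_result = cached_or_build(tok_sent_filename + '.turbo.dep',
#                                   lambda: TurboDepParser().parse(tok_sent_filename))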
Example #9
def preprocess(input_file,START_SNLP=True,INPUT_AMR=True):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None
    
    if INPUT_AMR: # the input file is amr annotation
        
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments,amr_strings = readAMR(aligned_amr_file)
        else:
            comments,amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments] # raw sentence text lives under the 'snt' key
        tmp_sent_filename = amr_file+'.sent'
        if not os.path.exists(tmp_sent_filename): # write sentences into file
            _write_sentences(tmp_sent_filename,sentences)


        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename): # write tokenized amr file
            _write_tok_amr(tok_amr_filename,amr_file,instances)
            
        SpanGraph.graphID = 0
        for i in range(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    else:
        # input file is sentence
        tmp_sent_filename = input_file 

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
        
    # preprocess 2: dependency parsing 
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename+'.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename,'w')            
            output_dep.write(dep_result)
            output_dep.close()
            
        _add_dependency(instances,dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename+'.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = open(dep_filename,'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))

        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename+'.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename,'r').read()
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
            
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename+'.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename+'.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename+'.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
    else:
        pass # unknown FLAG_DEPPARSER value: skip dependency parsing
    
    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True)
            else:
                _add_prop(instances,prop_filename,dep_filename)
            
        else:
            raise IOError('Semantic role labeling file %s not found!'%(prop_filename))

        
    return instances
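The if/elif chain over constants.FLAG_DEPPARSER repeats the same read-cache-or-parse logic for each backend. One way to compress the regular cases, shown only as a sketch (parser classes as in the snippets above; the "stanford", "stanfordConvert" and "stdconv+charniak" special cases would keep their own branches):

import os

# Table-driven version of the regular FLAG_DEPPARSER branches above.
DEP_BACKENDS = {
    'clear': ('.clear.dep', ClearDepParser),
    'turbo': ('.turbo.dep', TurboDepParser),
    'mate':  ('.mate.dep',  MateDepParser),
}

def run_dep_parser(flag, tok_sent_filename):
    suffix, parser_cls = DEP_BACKENDS[flag]
    dep_filename = tok_sent_filename + suffix
    if os.path.exists(dep_filename):
        return open(dep_filename, 'r').read()
    return parser_cls().parse(tok_sent_filename)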
Example #11
def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr',PRP_FORMAT='plain'):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None
    
    if INPUT_AMR == 'amr': # the input file is amr annotation
        
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments,amr_strings = readAMR(aligned_amr_file)
        else:
            comments,amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments] # raw sentence text lives under the 'snt' key

        # write sentences(separate per line)
        tmp_sent_filename = amr_file+'.sent'
        if not os.path.exists(tmp_sent_filename): # no cache found
            _write_sentences(tmp_sent_filename,sentences)

        tmp_prp_filename = None
        instances = None
        if PRP_FORMAT == 'plain':
            tmp_prp_filename = tmp_sent_filename+'.prp'
            
            
            proc1 = StanfordCoreNLP()

            # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP

            if START_SNLP and not os.path.exists(tmp_prp_filename):
                print >> log, "Start Stanford CoreNLP..."
                proc1.setup()

            print >> log, 'Read token, lemma, named entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)

        elif PRP_FORMAT == 'xml': # use pre-computed CoreNLP XML output instead of the plain format; the CoreNLP wrapper is bypassed
            tmp_prp_filename = tmp_sent_filename+'.prp.xml'
            if not os.path.exists(tmp_prp_filename):
                raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
            print >> log, 'Read token, lemma, named entity file %s...' % (tmp_prp_filename)
            instances = load_xml_instances(tmp_prp_filename)
        else:
            raise Exception('Unknown preprocessed file format %s' % PRP_FORMAT)
            
        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename): # write tokenized amr file
            _write_tok_amr(tok_amr_filename,amr_file,instances)
            
        SpanGraph.graphID = 0
        for i in xrange(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
                # use verbalization list to fix the unaligned tokens
                if constants.FLAG_VERB: Aligner.postProcessVerbList(amr, comments[i]['tok'], alignment)
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    elif INPUT_AMR == 'amreval':
        eval_file = input_file
        comments = readAMREval(eval_file)
        sentences = [c['snt'] for c in comments] 

        # write sentences(separate per line)
        tmp_sent_filename = eval_file+'.sent'
        if not os.path.exists(tmp_sent_filename): # no cache found
            _write_sentences(tmp_sent_filename,sentences)

        tmp_prp_filename = tmp_sent_filename+'.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename): # found cache file
            print >> log, 'Read token, lemma, named entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))
            
        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
            
        for i in xrange(len(instances)):
            instances[i].addComment(comments[i])
        
    else:        # input file is sentence
        tmp_sent_filename = input_file

        tmp_prp_filename = None
        instances = None
        if PRP_FORMAT == 'plain':
            tmp_prp_filename = tmp_sent_filename+'.prp'

            proc1 = StanfordCoreNLP()

            # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP

            if START_SNLP and not os.path.exists(tmp_prp_filename):
                print >> log, "Start Stanford CoreNLP..."
                proc1.setup()

            print >> log, 'Read token, lemma, named entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)

        elif PRP_FORMAT == 'xml': # use pre-computed CoreNLP XML output instead of the plain format; the CoreNLP wrapper is bypassed
            tmp_prp_filename = tmp_sent_filename+'.xml'
            if not os.path.exists(tmp_prp_filename):
                raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
            print >> log, 'Read token, lemma, named entity file %s...' % (tmp_prp_filename)
            instances = load_xml_instances(tmp_prp_filename)
        else:
            raise Exception('Unknown preprocessed file format %s' % PRP_FORMAT)

        
        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
        
    # preprocess 2: dependency parsing 
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename+'.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = codecs.open(dep_filename,'r',encoding='utf-8').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = codecs.open(dep_filename,'w',encoding='utf-8')            
            output_dep.write(dep_result)
            output_dep.close()
            
        _add_dependency(instances,dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename+'.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = codecs.open(dep_filename,'r',encoding='utf-8').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))

        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        if constants.FLAG_ONTO == 'onto':
            dep_filename = tok_sent_filename+'.charniak.onto.parse.dep'
        elif constants.FLAG_ONTO == 'onto+bolt':
            dep_filename = tok_sent_filename+'.charniak.onto+bolt.parse.dep'
        else:
            dep_filename = tok_sent_filename+'.charniak.parse.dep'            
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = codecs.open(dep_filename,'r',encoding='utf-8').read()
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
            
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename+'.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename+'.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename+'.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
    else:
        raise Exception('Unknown dependency parse type %s' % (constants.FLAG_DEPPARSER))
    
    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        if constants.FLAG_ONTO != 'onto+bolt':
            prop_filename = tok_sent_filename + '.prop'
        else:
            prop_filename = tok_sent_filename + '.onto+bolt.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True)
            else:
                _add_prop(instances,prop_filename,dep_filename)
            
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

    if constants.FLAG_RNE:
        print >> log, "Using rich name entity instead..."
        rne_filename = tok_sent_filename + '.rne'
        if os.path.exists(rne_filename):
            _substitute_rne(instances, rne_filename)
        else:
            raise IOError('Rich named entity file %s not found!' % (rne_filename))

        
    return instances
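The PRP_FORMAT='xml' path expects a pre-computed CoreNLP XML annotation next to the sentence file and raises if it is missing, so the XML must be produced offline beforehand. An illustrative cache-only run (paths are placeholders; the suffixes follow the code above):

# Sketch: reuse a pre-built CoreNLP XML annotation instead of the wrapper.
# 'data/test.amr.sent.prp.xml' must already exist or preprocess raises.
instances = preprocess('data/test.amr', START_SNLP=False,
                       INPUT_AMR='amr', PRP_FORMAT='xml')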
Example #12
def preprocess(input_file,START_SNLP=True,INPUT_AMR=True, align=True, use_amr_tokens=False):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None
    
    if INPUT_AMR: # the input file is amr annotation

        amr_file = input_file
        if amr_file.endswith('.amr'):
            aligned_amr_file = amr_file + '.tok.aligned'
            amr_tok_file = amr_file + '.tok'
        else:
            aligned_amr_file = amr_file + '.amr.tok.aligned'
            amr_tok_file = amr_file + '.amr.tok'

        tmp_sent_filename = amr_file+'.sent'
        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file

        comments,amr_strings = readAMR(amr_file)
        if os.path.exists(aligned_amr_file):
            print "Reading aligned AMR ..."
            # read aligned amr and transfer alignment comments
            comments_with_alignment,_ = readAMR(aligned_amr_file)
            for comment,comment_with_alignment in zip(comments,comments_with_alignment):
                comment['alignments'] = comment_with_alignment['alignments']

        tokenized_sentences = None
        if use_amr_tokens:
            tokenized_sentences = [c['tok'] for c in comments] # tokenized sentence lives under the 'tok' key
            if not os.path.exists(tok_sent_filename):
                with open(tok_sent_filename,'w') as f:
                    for sentence in tokenized_sentences:
                        print >> f, sentence
            if tokenized_sentences:
                print >> log, "AMR has tokens, will use them"

        sentences = [c['snt'] for c in comments] # raw sentence text lives under the 'snt' key
        if not os.path.exists(tmp_sent_filename): # write sentences into file
            _write_sentences(tmp_sent_filename,sentences)

        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP(tokenize=not tokenized_sentences)

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP: proc1.setup()

        instances = proc1.parse(tmp_sent_filename if proc1.tokenize else tok_sent_filename)

        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)

        if len(instances) == 0:
            print 'Error: no instances!'
            sys.exit(1)

        if not os.path.exists(amr_tok_file): # write tokenized amr file
            _write_tok_amr(amr_tok_file,amr_file,instances)
            
        if not os.path.exists(aligned_amr_file) and align:
            # align
            print "Call JAMR to generate alignment ..."
            subprocess.call('./scripts/jamr_align.sh '+amr_tok_file,shell=True)
            print "Reading aligned AMR ..."
            # read aligned amr and transfer alignment comments
            comments_with_alignment,_ = readAMR(aligned_amr_file)
            for comment,comment_with_alignment in zip(comments,comments_with_alignment):
                comment['alignments'] = comment_with_alignment['alignments']

        from progress import Progress
        p = Progress(len(instances), estimate=True, values=True)
        print 'Parsing AMR:'
        SpanGraph.graphID = 0
        for i in range(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)
            instances[i].addComment(comments[i])
            p += 1
        p.complete()

    else:
        # input file is sentence
        tmp_sent_filename = input_file 

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
        
    # preprocess 2: dependency parsing 
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename+'.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename,'w')            
            output_dep.write(dep_result)
            output_dep.close()
            
        _add_dependency(instances,dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename+'.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = open(dep_filename,'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))

        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename+'.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename,'r').read()
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
            
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename+'.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename+'.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename+'.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
    else:
        pass # unknown FLAG_DEPPARSER value: skip dependency parsing
    
    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True)
            else:
                _add_prop(instances,prop_filename,dep_filename)
            
        else:
            raise IOError('Semantic role labeling file %s not found!'%(prop_filename))

        
    return instances
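This last variant can bootstrap its own alignments: with align=True and no *.amr.tok.aligned file on disk it shells out to ./scripts/jamr_align.sh and then reads the alignments back. An illustrative cold-start run (the script path comes from the snippet above; the corpus path is a placeholder):

# Sketch: cold start on an unaligned corpus. JAMR is invoked to produce
# data/train.amr.tok.aligned before the gold graphs are built.
instances = preprocess('data/train.amr', START_SNLP=True,
                       INPUT_AMR=True, align=True, use_amr_tokens=False)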