def get_new_edges(data_type, construction):
    """Return, per sentence, the dependency edges that the TAG-based
    transformation adds on top of the parser's predicted dependencies.

    Args:
        data_type: dataset split identifier passed through to the readers.
        construction: name of the unbounded-dependency construction.

    Returns:
        A list (one entry per sentence in the unbounded-dependency file) of
        lists of new edges; self-loops (head == dependent) are dropped.
    """
    prop_file = 'd6.treeproperties'
    props = get_t2props_dict(prop_file)
    topsubs = get_t2topsub_dict(prop_file)
    # Load parser predictions and gold unbounded dependencies.
    deps_all = read_data(construction, data_type)
    unbounded = read_unbounded(construction, data_type)
    sents = read_stags(construction, data_type, 'sents')
    stags_all = read_stags(construction, data_type)
    pos_all = read_stags(construction, data_type, 'predicted_pos')
    edges_per_sent = []
    # Iterate only over sentences that have unbounded-dependency annotations.
    for idx in range(len(unbounded)):
        deps = deps_all[idx]
        transformed = transform(props, topsubs, sents[idx], deps,
                                stags_all[idx], pos_all[idx])
        # Edges produced by the transformation that were not predicted.
        added = set(transformed) - set(deps)
        edges_per_sent.append([e for e in added if e[0] != e[1]])
    return edges_per_sent
def output_conllu(filename, sents, pos, stags, arcs, rels, dependencies,
                  new_edges, output_dir, result_file):
    """Write Graphviz (.gv) visualizations of dependency analyses.

    For each unbounded dependency of each (selected) sentence, builds a
    CoNLL-U-like string, turns it into a DependencyGraph, converts that to
    dot, colors the unbounded dependency red and transformation-added edges
    blue, and writes the result to ``output_dir``.

    NOTE(review): ``filename`` is never used in this body — confirm whether
    it can be dropped or was meant to name the output.

    NOTE(review): the sentence loop and the start/end window are hard-coded
    debug values (sentence 21, tokens 25..33) — the commented-out lines show
    the general-purpose versions.

    Args:
        sents, pos, stags, arcs, rels: per-sentence token-level annotations.
        dependencies: per-sentence unbounded dependencies (head, dep, label).
        new_edges: per-sentence edges added by the transformation
            (as produced by get_new_edges).
        output_dir: directory for the .gv files (created if missing).
        result_file: whitespace-separated "sent_idx dep_idx success" lines.
    """
    # Map (sentence index, dependency index) -> 1/0 correctness score.
    scores = {}
    with open(result_file) as fin:
        for line in fin:
            line = line.split()
            scores[(int(line[0]), int(line[1]))] = int(line[2])
    tree_prop_file = 'd6.treeproperties'
    t2props_dict = get_t2props_dict(tree_prop_file)
    t2topsub_dict = get_t2topsub_dict(tree_prop_file)
    #for sent_idx in range(len(sents)):
    for sent_idx in [21]:  # debug: single hard-coded sentence
        deps_sent = dependencies[sent_idx]
        for dep_idx, dep in enumerate(deps_sent):
            # Keep a handle on the unbounded dep; ``dep`` is reused below.
            unbounded_dep = dep
            #start = min(int(dep[0]), int(dep[1]))-1
            start = 25  # debug: hard-coded token window start
            #end = max(int(dep[0]), int(dep[1]))+1
            end = 33  # debug: hard-coded token window end
            conllu = ''
            sent = sents[sent_idx]
            pos_sent = pos[sent_idx]
            stags_sent = stags[sent_idx]
            arcs_sent = arcs[sent_idx]
            rels_sent = rels[sent_idx]
            # First row: the unbounded dependency itself (1-based token id).
            token_idx = int(dep[1])
            output_list = [
                str(token_idx),
                sent[token_idx - 1] + '_' + stags_sent[token_idx - 1], '_',
                stags_sent[token_idx - 1], pos_sent[token_idx - 1], '_',
                str(dep[0]), dep[2], '_', '_'
            ]
            conllu += '\t'.join(output_list)
            conllu += '\n'
            # One row per token inside the [start, end] window, with its
            # predicted head arc and relation.
            for token_idx in range(len(sent)):
                if token_idx >= start and token_idx <= end:
                    #if  arcs_sent[token_idx] >= start and arcs_sent[token_idx] <= end:
                    output_list = [
                        str(token_idx + 1),
                        sent[token_idx] + '_' + stags_sent[token_idx], '_',
                        stags_sent[token_idx], pos_sent[token_idx], '_',
                        str(arcs_sent[token_idx]), rels_sent[token_idx], '_',
                        '_'
                    ]
                    conllu += '\t'.join(output_list)
                    conllu += '\n'
            # Rows for transformation-added edges that fall in the window.
            # NOTE(review): this rebinds ``dep``; only ``unbounded_dep`` still
            # refers to the original dependency after this loop.
            for new_idx, dep in enumerate(new_edges[sent_idx]):
                if dep[0] >= start and dep[0] <= end:
                    #if  dep[1] >= start and dep[1] <= end:
                    token_idx = int(dep[0])
                    output_list = [
                        str(token_idx),
                        sent[token_idx - 1] + '_' + stags_sent[token_idx - 1],
                        '_', stags_sent[token_idx - 1],
                        pos_sent[token_idx - 1], '_',
                        str(dep[1]), dep[2], '_', '_'
                    ]
                    conllu += '\t'.join(output_list)
                    conllu += '\n'
            graph = DependencyGraph(conllu)
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            output_file = os.path.join(
                output_dir,
                'sent{}_dep{}_correct{}.gv'.format(sent_idx, dep_idx,
                                                   scores[(sent_idx,
                                                           dep_idx)]))
            dot_string = graph.to_dot()
            ## add colors
            new_dot_string = ''
            # dot lines that correspond to transformation-added edges.
            new_lines = [
                '{} -> {} [label="{}"]'.format(dep[1], dep[0], dep[2])
                for dep in new_edges[sent_idx]
            ]
            for line in dot_string.split('\n'):
                line = line.strip()
                # Color the unbounded dependency red (note the endpoints are
                # also swapped here) and added edges blue.
                if line == '{} -> {} [label="{}"]'.format(
                        unbounded_dep[0], unbounded_dep[1], unbounded_dep[2]):
                    line = '{} -> {} [label="{}", color="red"]'.format(
                        unbounded_dep[1], unbounded_dep[0], unbounded_dep[2])
                elif line in new_lines:
                    line = line[:-1] + ', color="blue"]'
                new_dot_string += line
                new_dot_string += '\n'
            with open(output_file, 'wt') as fout:
                fout.write(new_dot_string)
# ---- Example #3 (score: 0) ----
def evaluate(corpus_data_type, debug=False, input_data_type=None):
    """Evaluate transformed TAG dependency analyses against gold unbounded
    dependencies, per construction and overall.

    For every gold unbounded dependency, the gold relation label is mapped to
    the TAG-side label expected after transformation (e.g. nsubj -> '0',
    dobj -> '1', modifiers -> 'ADJ'), and the dependency counts as correct if
    the resulting (head, dep, label) triple appears in the transformed parse.
    Per-dependency 1/0 results are also written to
    ``<construction>/results/test/results.txt``.

    Args:
        corpus_data_type: split used to read the gold unbounded dependencies.
        debug: if True, evaluate a single hard-coded construction/sentence.
        input_data_type: split used for the predicted analyses; defaults to
            ``corpus_data_type``.
    """
    if input_data_type is None:
        input_data_type = corpus_data_type
    tree_prop_file = 'd6.treeproperties'
    t2props_dict = get_t2props_dict(tree_prop_file)
    t2topsub_dict = get_t2topsub_dict(tree_prop_file)
    if debug:
        #constructions = ['sbj_embedded']
        #constructions = ['obj_qus']
        #constructions = ['obj_extract_red_rel']
        constructions = ['right_node_raising']
    else:
        constructions = ['obj_extract_rel_clause', 'obj_extract_red_rel', 'sbj_extract_rel_clause', 'obj_free_rels', 'obj_qus', 'right_node_raising', 'sbj_embedded']
    #constructions = ['obj_qus']
    all_total = 0      # dependencies across all constructions
    all_correct = 0    # correct dependencies across all constructions
    nb_constructions = 0
    total_scores = 0   # sum of per-construction accuracies (for macro avg)
    for construction in constructions:
        ## get predicted_dependencies and apply transformations
        result_dir = os.path.join(construction, 'results', 'test')
        if not os.path.isdir(result_dir):
            os.makedirs(result_dir)
        predicted_dependencies = read_data(construction, input_data_type)
        unbounded_dependencies = read_unbounded(construction, corpus_data_type)
        sents = read_stags(construction, input_data_type, 'sents')
        predicted_stags = read_stags(construction, input_data_type)
        predicted_pos = read_stags(construction, input_data_type, 'predicted_pos')
        #assert(len(predicted_dependencies) == len(unbounded_dependencies))
        total = 0
        correct = 0
        if debug:
            sent_idxes = [70] 
        else:
            sent_idxes = range(len(unbounded_dependencies))
        with open(os.path.join(result_dir, 'results.txt'), 'wt') as fout:
            for sent_idx in sent_idxes:
            #for sent_idx in [73]:
                sent = sents[sent_idx]
                ## TAG analysis
                predicted_dependencies_sent = predicted_dependencies[sent_idx]
                predicted_stags_sent = predicted_stags[sent_idx]
                predicted_pos_sent = predicted_pos[sent_idx]
                transformed_sent = transform(t2props_dict, t2topsub_dict, sent, predicted_dependencies_sent, predicted_stags_sent, predicted_pos_sent)
                #transformed_sent = predicted_dependencies_sent
                #print(transformed_sent)
                assert(len(sent) == len(predicted_stags_sent))
                unbounded_dependencies_sent = unbounded_dependencies[sent_idx]
                for dep_idx, dep in enumerate(unbounded_dependencies_sent):
                    total += 1
                    all_total += 1
                    # Map the gold relation label to the expected TAG label.
                    if 'nsubj' == dep[2]:
                        new_dep = (dep[0], dep[1], '0')
                        if construction == 'sbj_embedded':
                            if (sent_idx, dep_idx) in [(77, 0), (42, 0)]:
                                new_dep = tuple([dep[0], dep[1], '1']) ## causative-inchoative
                    elif 'dobj' == dep[2]:
                        new_dep = tuple([dep[0], dep[1], '1'])
                        if construction == 'obj_qus':
                            # 'where' questions are adjunct extractions.
                            if sent[0].lower() in ['where']:
                                new_dep = tuple([dep[0], dep[1], 'ADJ'])
                    elif 'pobj' == dep[2]:
                        new_dep = tuple([dep[0], dep[1], '1'])
                    elif 'nsubjpass' in dep[2]:
                        new_dep = (dep[0], dep[1], '1')
                    elif 'advmod' in dep[2]:
                        # NOTE(review): both branches produce the same label;
                        # the 'out' special case looks like leftover debugging.
                        if sent[dep[0]-1] == 'out':
                            new_dep = (dep[0], dep[1], 'ADJ')
                        else:
                            new_dep = (dep[0], dep[1], 'ADJ')
                    elif 'prep' in dep[2]:
                        new_dep = (dep[0], dep[1], 'ADJ')
                    elif 'infmod' in dep[2]:
                        new_dep = (dep[0], dep[1], 'ADJ')
                    elif 'obj2' in dep[2]:
                        new_dep = (dep[0], dep[1], '1')
                    elif 'cop' in dep[2]:
                        new_dep = (dep[0], dep[1], '0')
                    else:
                        new_dep = (dep[0], dep[1], 'ADJ')
                    if new_dep in transformed_sent:
                        correct += 1
                        all_correct += 1
                        success = 1
                    else:
                        success = 0
                    # Record per-dependency result: "sent_idx dep_idx success".
                    fout.write(' '.join([str(sent_idx), str(dep_idx), str(success)]))
                    fout.write('\n')
        print('Construction: {}'.format(construction))
        print('# total: {}'.format(total))
        print('# correct: {}'.format(correct))
        print('Accuracy: {}'.format(float(correct)/total))
        total_scores += float(correct)/total
        nb_constructions += 1
        #print(predicted_dependencies[0])
        #print(unbounded_dependencies[0])
        #for predicted_dependencies_sent in predicted_dependencies:
        #    predicted_dependencies_sent = transform(predicted_dependencies_sent)
    print('All constructions')
    print('# total: {}'.format(all_total))
    print('# correct: {}'.format(all_correct))
    # NOTE(review): the label "Macro Accuracy" here is micro accuracy
    # (pooled counts); "Overall Accuracy" is the macro average.
    print('Macro Accuracy: {}'.format(float(all_correct)/all_total))
    print('Overall Accuracy: {}'.format(float(total_scores)/nb_constructions))
# ---- Example #4 (score: 0) ----
def evaluate(data_type):
    """Evaluate transformed TAG dependency analyses against gold unbounded
    dependencies for a fixed set of constructions.

    For every gold unbounded dependency, the gold relation label is mapped to
    the TAG-side label expected after transformation (nsubj -> '0',
    dobj/pobj/nsubjpass -> '1', advmod -> '-unk-', everything else -> 'ADJ'),
    and the dependency counts as correct if the resulting
    (head, dep, label) triple appears in the transformed parse.

    Args:
        data_type: dataset split identifier passed to the readers.
    """
    tree_prop_file = 'd6.treeproperties'
    t2props_dict = get_t2props_dict(tree_prop_file)
    t2topsub_dict = get_t2topsub_dict(tree_prop_file)
    constructions = ['obj_extract_rel_clause', 'obj_extract_red_rel', 'sbj_extract_rel_clause', 'obj_free_rels', 'obj_qus', 'right_node_raising', 'sbj_embedded']
    #constructions = ['obj_qus']
    all_total = 0      # dependencies across all constructions
    all_correct = 0    # correct dependencies across all constructions
    nb_constructions = 0
    total_scores = 0   # sum of per-construction accuracies (for macro avg)
    for construction in constructions:
        ## get predicted_dependencies and apply transformations
        predicted_dependencies = read_data(construction, data_type)
        unbounded_dependencies = read_unbounded(construction, data_type)
        sents = read_stags(construction, data_type, 'sents')
        predicted_stags = read_stags(construction, data_type)
        predicted_pos = read_stags(construction, data_type, 'predicted_pos')
        #assert(len(predicted_dependencies) == len(unbounded_dependencies))
        total = 0
        correct = 0
        # FIX: was xrange (Python 2 only); the rest of this file uses range.
        for sent_idx in range(len(unbounded_dependencies)):
            sent = sents[sent_idx]
            ## TAG analysis
            predicted_dependencies_sent = predicted_dependencies[sent_idx]
            predicted_stags_sent = predicted_stags[sent_idx]
            predicted_pos_sent = predicted_pos[sent_idx]
            transformed_sent = transform(t2props_dict, t2topsub_dict, sent, predicted_dependencies_sent, predicted_stags_sent, predicted_pos_sent)
            #transformed_sent = predicted_dependencies_sent
            #print(transformed_sent)
            assert(len(sent) == len(predicted_stags_sent))
            unbounded_dependencies_sent = unbounded_dependencies[sent_idx]
            for dep in unbounded_dependencies_sent:
                total += 1
                all_total += 1
                # Map the gold relation label to the expected TAG label.
                if 'nsubj' == dep[2]:
                    new_dep = (dep[0], dep[1], '0')
                elif 'dobj' == dep[2]:
                    new_dep = (dep[0], dep[1], '1')
                elif 'pobj' == dep[2]:
                    new_dep = (dep[0], dep[1], '1')
                elif 'nsubjpass' in dep[2]:
                    new_dep = (dep[0], dep[1], '1')
                elif 'advmod' in dep[2]:
                    new_dep = (dep[0], dep[1], '-unk-')
                elif 'prep' in dep[2]:
                    new_dep = (dep[0], dep[1], 'ADJ')
                elif '' in dep[2]:
                    # Catch-all: '' is a substring of every label, so this
                    # branch matches anything not handled above.
                    new_dep = (dep[0], dep[1], 'ADJ')
                else:
                    # FIX: unreachable given the catch-all above, but if it
                    # ever ran, new_dep would be stale/unbound; skip safely.
                    print(dep[2])
                    continue
                if new_dep in transformed_sent:
                    correct += 1
                    all_correct += 1
        print('Construction: {}'.format(construction))
        print('# total: {}'.format(total))
        print('# correct: {}'.format(correct))
        print('Accuracy: {}'.format(float(correct)/total))
        total_scores += float(correct)/total
        nb_constructions += 1
        #print(predicted_dependencies[0])
        #print(unbounded_dependencies[0])
        #for predicted_dependencies_sent in predicted_dependencies:
        #    predicted_dependencies_sent = transform(predicted_dependencies_sent)
    print('All constructions')
    print('# total: {}'.format(all_total))
    print('# correct: {}'.format(all_correct))
    print('Macro Accuracy: {}'.format(float(all_correct)/all_total))
    print('Overall Accuracy: {}'.format(float(total_scores)/nb_constructions))
# ---- Example #5 (score: 0) ----
            # if word2 is a verb, lemmatize it
            if word2_pos.startswith('V'):
                word2 = lemmatize(word2, pos[id2])
        lex_parse.append((word1, word2, dep))
    return (lex_parse)


def _triples2par_child_dict(parse_t, sent_t):
    from collections import defaultdict
    par_child_dict = defaultdict(lambda: defaultdict(list))

    for id1, id2, dep in parse_t:
        par_child_dict[id1]['parents_with_dep'].append((id2, dep))
        par_child_dict[id2]['children_with_dep'].append((id1, dep))

    return (par_child_dict)


if __name__ == '__main__':
    # Smoke-test: lemmatize a few words as verbs (non-verbs should pass
    # through; 'stayed' should reduce to its lemma).
    for word in ('stayed', 'which', 'what', 'None', 'lots'):
        print(lemmatize(word, 'V'))
    from get_treeprops import get_t2props_dict, get_t2topsub_dict
    tree_prop_file = 'd6.treeproperties'
    t2props_dict = get_t2props_dict(tree_prop_file)
    t2topsub_dict = get_t2topsub_dict(tree_prop_file)
    # Spot-check one supertag's tree properties.
    stag = 3339
    print('S#s#1' in t2props_dict[stag]['rfronts'])