コード例 #1
0
        assert len(x)>0
        print_sent(x,[u"#---sentence---splitter---JOIN-TO-PREVIOUS-SENTENCE---"])

# Script entry point: parse the command line and, when --reverse is given,
# merge previously split chunks back into whole sentences.
# NOTE(review): only the beginning of the --reverse branch is visible in this
# part of the file; the forward (splitting) branch is not shown here.
if __name__=="__main__":
    parser = argparse.ArgumentParser(description='Split/merge long sentences. Use --reverse to merge.')
    parser.add_argument('--reverse', default=False, action="store_true", help='Reverse the splitting.')
    parser.add_argument('-N', '--max-len', type=int, default=120, help='Pass sentences shorter or equal to this number of tokens through, split the rest. This will also be the absolute maximum chunk size ever fed into the parser. Default %(default)d.')
    parser.add_argument('-C', '--chunk-size', type=int, default=80, help='Split into chunks of approximately this size. Default %(default)d.')
    parser.add_argument('input', nargs='?', help='Input. Nothing or "-" for stdin.')
    args = parser.parse_args()
    # Derived setting: a third of the chunk size (integer division). Its consumer is not visible here.
    args.leeway=args.chunk_size//3 #TODO - better value maybe?

    if args.reverse:
        # Merge mode: a chunk whose single comment is the JOIN marker continues
        # the sentence printed before it; any other chunk starts a new sentence.
        last_len=None #running token count, grown by len(sent) for every joined chunk
        last_root=None #0-based position of the root of the most recently printed chunk
        for sent,comments in read_conll(args.input,0):
            if len(comments)==1 and comments[0]==u"#---sentence---splitter---JOIN-TO-PREVIOUS-SENTENCE---":
                part_root=get_root(sent) #root of this chunk
                # presumably shifts the token numbering of this chunk by last_len -- confirm in renumber()
                renumber(sent,last_len)
                # Attach this chunk's root to the previous chunk's root with a "dep" relation;
                # last_root is 0-based, hence the +1 for the head columns.
                sent[part_root][HEAD]=unicode(last_root+1) #...dep to the previous one
                sent[part_root][PHEAD]=unicode(last_root+1)
                sent[part_root][DEPREL]=u"dep"
                sent[part_root][PDEPREL]=u"dep"
                last_root=int(sent[part_root][ID])-1 #...and remember the renumbered root for the possible next one
                # Printed with an empty comment list, so the JOIN marker is dropped.
                # NOTE(review): the meaning of the third argument (False) is defined in print_sent, not visible here.
                print_sent(sent,[],False)
                last_len+=len(sent)
            else:
                # Start of a new sentence: write an empty line to out8 first, but
                # only once last_len has been set (it is assigned outside the visible code).
                if last_len is not None:
                    print >> out8
                last_root=get_root(sent) 
                print_sent(sent,comments,False)
コード例 #2
0
def visualize_clauses(args):
    """Render every sentence and its labelled sub-trees into the brat HTML template.

    Sentences are read with read_conll(args.input,args.max_sent). For each
    sentence the complete tree is emitted under its running number, followed
    by one tree per distinct label found in the last column of its rows
    (index 9 of a 10-column CoNLL-U row, index 12 of a CoNLL-09 row; u"_"
    means "no label"). A labelled sub-tree gets an artificial root token at
    position 1 and its own tokens are renumbered from 2 upwards. A sub-tree
    that is identical to the whole sentence is not emitted a second time.

    The concatenated trees replace the first CONTENTGOESHERE placeholder of
    templates/simple_brat_viz.html and the page is printed to stdout as UTF-8.

    args -- namespace providing the attributes `input` and `max_sent`.
    """
    chunks=[] #rendered trees, joined once at the end (avoids quadratic string +=)
    count=1 #running 1-based sentence number, also the key of the whole-sentence tree
    for sent,comments in read_conll(args.input,args.max_sent):
        d=defaultdict(list) #count -> all rows of the sentence, label -> rows carrying that label
        for line in sent:
            if len(line)==10: #conll-u
                line[5]=sort_feat(line[5])
                l=line
            else: #conll-09
                line[6]=sort_feat(line[6])
                # Keep id,token,lemma,pos,ppos,feat,head,deprel (columns 6 and 7 of the
                # result are read as head and deprel below), then add a DEPS placeholder
                # and the label column as MISC to obtain a 10-column CoNLL-U row.
                l=[line[i] for i in (0,1,2,4,5,6,8,10)]+[u"_",line[12]]
            idx=l[9]
            d[count].append(l)
            if idx!=u"_":
                d[idx].append(l)

        # NOTE: the rows of a labelled sub-tree are the same list objects as in
        # d[count] and are renumbered in place below. This relies on the integer
        # key sorting before the unicode labels (Python 2 mixed-type ordering),
        # so that the whole sentence is rendered before any row is modified.
        for idx,tree in sorted(d.iteritems()):
            is_subtree=idx!=count
            root=None
            root_deprel=u"ROOT"
            root_token=u"ROOT"
            if is_subtree:
                # New numbering starts at 2; position 1 is reserved for the artificial root.
                indexes={}
                for row in tree:
                    indexes[int(row[0])]=len(indexes)+2
                for line in tree:
                    line[0]=unicode(indexes[int(line[0])])
                    head=int(line[6])
                    if head in indexes:
                        line[6]=unicode(indexes[head])
                    else: # head lies outside the sub-tree: this is root
                        line[6]=u"1"
                        root=line[0]
                        root_deprel=line[7]
                        if head!=0:
                            # show the token of the original head on the artificial root
                            root_token=d[count][head-1][1]
            # tree to text
            parts=[header,u"# sentence-label\t%s\n"%(unicode(idx))]
            if root is not None:
                parts.append(u"# visual-style\t%s\tbgColor:red\n"%(u"1"))
                parts.append(u"# visual-style %s %s %s\tcolor:red\n"%(u"1",root,root_deprel))
            if comments:
                parts.append(u"\n".join(comments)+u"\n")
            if is_subtree:
                root_token=u"**%s**"%(root_token)
                parts.append(u"\t".join([u"1",root_token,u"_",u"_",u"_",u"_",u"0",root_deprel,u"_",u"_"])+u"\n")
            for line in tree:
                parts.append(u"\t".join(line[:10])+u"\n")
            parts.append(u"\n") #conll-u expects an empty line at the end of every tree
            parts.append(footer)
            # skip a sub-tree that merely repeats the whole sentence
            if not is_subtree or tree!=d[count]:
                chunks.append(u"".join(parts))
        count+=1
    data_to_print=u"".join(chunks)
    with codecs.open(os.path.join(SCRIPTDIR,u"templates","simple_brat_viz.html"),u"r",u"utf-8") as template:
        data=template.read().replace(u"CONTENTGOESHERE",data_to_print,1)
        print >> sys.stdout, data.encode(u"utf-8")
コード例 #3
0
from visualize import read_conll
import codecs
import json
try:
    import argparse
except ImportError:
    import compat.argparse as argparse

def print_sent(sent):
    """Print a sentence: columns joined by tabs, one row per line, UTF-8 encoded."""
    rows=[u"\t".join(cols) for cols in sent]
    # Python 2 print statement: the value printed is the encoded byte string
    print (u"\n".join(rows)).encode(u"utf-8")

# Command line: -d is mandatory and names the file the comments are dumped to.
parser = argparse.ArgumentParser(description='Options')
parser.add_argument('-d', required=True, help='Where to save the comments?')
args = parser.parse_args()

# Echo every sentence without its comments and collect the comments, keyed by
# the 1-based position of the sentence in the input.
# NOTE(review): read_conll(None,0) presumably reads all of stdin -- confirm in visualize.read_conll.
comms={}
sent_count=0
for sent,comments in read_conll(None,0):
    if sent_count>0:
        print #empty line between two consecutive sentences
    sent_count+=1
    if comments:
        comms[sent_count]=comments
    print_sent(sent)

# Store the collected comments as JSON in the file given with -d.
with codecs.open(args.d,u"w") as out:
    json.dump(comms,out)
コード例 #4
0
from visualize import read_conll
import codecs
import json
try:
    import argparse
except ImportError:
    import compat.argparse as argparse


def print_sent(sent):
    """Write one sentence to stdout as UTF-8: tab-separated columns, one row per line."""
    lines = [u"\t".join(cols) for cols in sent]
    text = u"\n".join(lines)
    # Python 2 print statement: the printed value is `(text).encode(...)`
    print(text).encode(u"utf-8")


# The only option, -d, is required: the path of the JSON file for the comments.
parser = argparse.ArgumentParser(description='Options')
parser.add_argument('-d', required=True, help='Where to save the comments?')
args = parser.parse_args()

# Print the sentences stripped of their comments, separated by empty lines,
# and remember each non-empty comment list under the sentence's 1-based number.
# NOTE(review): read_conll(None, 0) presumably consumes stdin without a
# sentence limit -- confirm in visualize.read_conll.
comms = {}
sent_count = 0
for sent, comments in read_conll(None, 0):
    if sent_count:
        print  # separator line before every sentence except the first
    sent_count += 1
    if comments:
        comms[sent_count] = comments
    print_sent(sent)

# Dump the collected comments as JSON into the file named by -d.
with codecs.open(args.d, u"w") as out_file:
    json.dump(comms, out_file)
コード例 #5
0
def visualize_clauses(args):
    """Render every sentence and its labelled sub-trees into the brat HTML template.

    Sentences are read with read_conll(args.input, args.max_sent). For each
    sentence the complete tree is emitted under its running number, followed
    by one tree per distinct label found in the last column of its rows
    (index 9 of a 10-column CoNLL-U row, index 12 of a CoNLL-09 row; u"_"
    means "no label"). A labelled sub-tree gets an artificial root token at
    position 1 and its own tokens are renumbered from 2 upwards. A sub-tree
    that is identical to the whole sentence is not emitted a second time.

    The concatenated trees replace the first CONTENTGOESHERE placeholder of
    templates/simple_brat_viz.html and the page is printed to stdout as UTF-8.

    args -- namespace providing the attributes `input` and `max_sent`.
    """
    # Rendered trees; joined once at the end to avoid quadratic string +=.
    chunks = []
    count = 1  # running 1-based sentence number, key of the whole-sentence tree
    for sent, comments in read_conll(args.input, args.max_sent):
        # count -> all rows of the sentence, label -> rows carrying that label
        d = defaultdict(list)
        for line in sent:
            if len(line) == 10:  #conll-u
                line[5] = sort_feat(line[5])
                l = line
            else:  #conll-09
                line[6] = sort_feat(line[6])
                # Keep id,token,lemma,pos,ppos,feat,head,deprel (columns 6
                # and 7 of the result are read as head and deprel below),
                # then add a DEPS placeholder and the label column as MISC
                # to obtain a 10-column CoNLL-U row.
                l = [line[i] for i in (0, 1, 2, 4, 5, 6, 8, 10)]
                l += [u"_", line[12]]
            idx = l[9]
            d[count].append(l)
            if idx != u"_":
                d[idx].append(l)

        # NOTE: the rows of a labelled sub-tree are the same list objects as
        # in d[count] and are renumbered in place below. This relies on the
        # integer key sorting before the unicode labels (Python 2 mixed-type
        # ordering), so the whole sentence is rendered before any row changes.
        for idx, tree in sorted(d.iteritems()):
            is_subtree = idx != count
            root = None
            root_deprel = u"ROOT"
            root_token = u"ROOT"
            if is_subtree:
                # New numbering starts at 2; position 1 is reserved for the
                # artificial root.
                indexes = {}
                for row in tree:
                    indexes[int(row[0])] = len(indexes) + 2
                for line in tree:
                    line[0] = unicode(indexes[int(line[0])])
                    head = int(line[6])
                    if head in indexes:
                        line[6] = unicode(indexes[head])
                    else:  # head lies outside the sub-tree: this is root
                        line[6] = u"1"
                        root = line[0]
                        root_deprel = line[7]
                        if head != 0:
                            # show the original head's token on the
                            # artificial root
                            root_token = d[count][head - 1][1]
            # tree to text
            parts = [header, u"# sentence-label\t%s\n" % (unicode(idx))]
            if root is not None:
                parts.append(u"# visual-style\t%s\tbgColor:red\n" % (u"1"))
                parts.append(u"# visual-style %s %s %s\tcolor:red\n" %
                             (u"1", root, root_deprel))
            if comments:
                parts.append(u"\n".join(comments) + u"\n")
            if is_subtree:
                root_token = u"**%s**" % (root_token)
                parts.append(u"\t".join([
                    u"1", root_token, u"_", u"_", u"_", u"_", u"0",
                    root_deprel, u"_", u"_"
                ]) + u"\n")
            for line in tree:
                parts.append(u"\t".join(line[:10]) + u"\n")

            #conll-u expects an empty line at the end of every tree
            parts.append(u"\n")
            parts.append(footer)
            # skip a sub-tree that merely repeats the whole sentence
            if not is_subtree or tree != d[count]:
                chunks.append(u"".join(parts))
        count += 1
    data_to_print = u"".join(chunks)
    with codecs.open(
            os.path.join(SCRIPTDIR, u"templates", "simple_brat_viz.html"),
            u"r", u"utf-8") as template:
        data = template.read().replace(u"CONTENTGOESHERE", data_to_print, 1)
        print >> sys.stdout, data.encode(u"utf-8")
コード例 #6
0
        '-C',
        '--chunk-size',
        type=int,
        default=80,
        help=
        'Split into chunks of approximately this size. Default %(default)d.')
    parser.add_argument('input',
                        nargs='?',
                        help='Input. Nothing or "-" for stdin.')
    args = parser.parse_args()
    args.leeway = args.chunk_size // 3  #TODO - better value maybe?

    if args.reverse:
        last_len = None
        last_root = None
        for sent, comments in read_conll(args.input, 0):
            if len(comments) == 1 and comments[
                    0] == u"#---sentence---splitter---JOIN-TO-PREVIOUS-SENTENCE---":
                part_root = get_root(sent)  #root of this chunk
                renumber(sent, last_len)
                sent[part_root][HEAD] = unicode(last_root +
                                                1)  #...dep to the previous one
                sent[part_root][PHEAD] = unicode(last_root + 1)
                sent[part_root][DEPREL] = u"dep"
                sent[part_root][PDEPREL] = u"dep"
                last_root = int(
                    sent[part_root][ID]
                ) - 1  #...and remember the renumbered root for the possible next one
                print_sent(sent, [], False)
                last_len += len(sent)
            else: