예제 #1
0
def validate(infile,instream,outstream, options):
    """Validate every sentence read from instream and report problems.

    Lines are counted from 1.  A line that contains nothing but
    whitespace closes the current sentence, which is handed to
    check_sentence().  Any other line is treated as a token: it is
    stripped, split into columns with the regular expression
    options.input_sep, and handed to check_line() when the number of
    columns lies between minNumCols and maxNumCols.  A wrong column
    count is printed to stderr, marks the sentence as erroneous and
    sets the global exit_status to 1.

    infile  -- name used as a prefix in the messages
    instream -- iterable yielding the input lines
    outstream -- accepted but not used by this function
    options -- option object (input_sep and encoding are read here)
    """
    global exit_status

    # per-sentence state; index 0 of token_list is a placeholder so
    # that token IDs (which start at 1) can be used as list indices
    idmax = headmax = pheadmax = 0
    token_list = ['dummy']
    rootLines = []
    error_flag = 0
    sent_start = 1

    line_number = 0
    for line in instream:
        line_number += 1

        # empty line ends sentence
        if re.search(u'^\s*$', line):
            check_sentence(infile, options, sent_start,
                           rootLines, token_list, error_flag,
                           idmax, headmax, pheadmax)
            # start over for the next sentence
            idmax = headmax = pheadmax = 0
            token_list = ['dummy']
            rootLines = []
            error_flag = 0
            sent_start = line_number+1 # line where next sentence starts
            continue

        # non-empty line, i.e. token
        if options.input_sep == ' +':
            # separator is spaces: drop leading and trailing whitespace
            line = line.strip()
        else:
            # keep leading whitespace, drop trailing one (e.g. \r, \n)
            line = line.rstrip()
        fields = re.split(options.input_sep, line)

        # a wrong number of columns is reported here; only lines of
        # acceptable width are looked at in detail by check_line()
        if len(fields) < minNumCols:
            msg = "%s: Error: line %d: Too few columns (%d<%d):\n\t%s" % \
                  (infile,line_number,len(fields),minNumCols,line)
        elif len(fields) > maxNumCols:
            msg = "%s: Error: line %d: Too many columns (%d>%d):\n\t%s" % \
                  (infile,line_number,len(fields),maxNumCols,line)
        else:
            msg = None

        if msg is None:
            (terminal, error_flag, idmax,
             headmax, pheadmax) = check_line(infile,line_number,line,
                                             options, rootLines,
                                             fields,token_list,
                                             error_flag,
                                             idmax,headmax,pheadmax)
        else:
            print >>sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            exit_status = 1
            terminal = 'dummy' # keeps the token positions aligned

        token_list.append(terminal)

    if len(token_list) <= 1: # just dummy: nothing left to process
        return

    # some unprocessed sentence is left
    msg = "line %d: No empty line after last sentence" % \
          (line_number)
    handleProblem(infile, 'whitespace', msg, options)

    check_sentence(infile, options, sent_start,
                   rootLines, token_list, error_flag,
                   idmax, headmax, pheadmax)
예제 #2
0
def check_line(infile,line_number,line,
               options, rootLines,
               fields,token_list,
               error_flag,
               idmax,headmax,pheadmax):
    """Check the columns of one token line and build its Terminal.

    infile, line_number, line -- only used to build the messages
        (line is also appended to rootLines when HEAD is 0)
    options -- option object; datatype, encoding and rootDeprel are read
    rootLines -- list collecting the lines whose HEAD is 0 (mutated)
    fields -- the column values of the line
    token_list -- tokens read so far in this sentence, including the
        placeholder at index 0; its length is the expected ID
    error_flag -- 1 if an error was already seen in this sentence
    idmax, headmax, pheadmax -- highest ID/HEAD/PHEAD seen so far

    Returns (terminal, error_flag, idmax, headmax, pheadmax) with the
    updated values.  Errors are printed to stderr; the global
    exit_status is set to 1 for ID errors, and for HEAD/PHEAD errors
    when datatype is 'train'.  Raises ValueError for an unknown
    options.datatype.
    """
    global exit_status

    datatype = options.datatype
    if datatype == 'train':
        (token_id, form, lemma, cpostag, postag,
         feats, head, deprel, phead, pdeprel) = fields
    elif datatype == 'test_blind':
        (token_id, form, lemma, cpostag, postag, feats) = fields
        head = u'0'
        deprel = phead = pdeprel = emptyProjColumnString
    elif datatype == 'system':
        (token_id, form, lemma, cpostag, postag,
         feats, head, deprel) = fields[0:8]
        # PHEAD and PDEPREL are optional.  Default both first, so that
        # a line with 9 columns (PHEAD but no PDEPREL) does not leave
        # pdeprel unbound.
        phead = pdeprel = emptyProjColumnString
        if len(fields) > 8:
            phead = fields[8]
        if len(fields) > 9:
            pdeprel = fields[9]
    else:
        # without this, the first use of an unassigned column below
        # would fail with a confusing UnboundLocalError
        raise ValueError("unknown datatype: %r" % (datatype,))

    is_number = re.compile(u'^[0-9]+$').search

    # check that ID is integer > 0
    if token_id != u'0' and is_number(token_id):
        token_id = int(token_id)
        if token_id > idmax:
            idmax = token_id
        # check that ID is consecutive
        if token_id != len(token_list):
            msg = "%s: Error: line %d: Non-consecutive value for ID column (%d!=%d):\n\t%s" % \
                  (infile,line_number,token_id,len(token_list),line)
            print >>sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            exit_status = 1
    else:
        msg = "%s: Error: line %d: Illegal value for ID column:\n\t%s" % \
              (infile,line_number,line)
        print >>sys.stderr, msg.encode(options.encoding)
        error_flag = 1
        exit_status = 1

    if datatype == 'train' or datatype == 'system':
        # check that PHEAD is emptyProjColumnString or integer >= 0
        if phead != emptyProjColumnString:
            if is_number(phead):
                phead = int(phead)
                if phead > pheadmax:
                    pheadmax = phead
            else:
                msg = "%s: Error: line %d: Illegal value for PHEAD column:\n\t%s" % \
                      (infile,line_number,line)
                print >>sys.stderr, msg.encode(options.encoding)
                error_flag = 1
                if datatype == 'train':
                    exit_status = 1
                # else: system submissions: print but accept

        # check that HEAD is integer >= 0
        if is_number(head):
            head = int(head)
            if head > headmax:
                headmax = head
            if options.rootDeprel != '' and deprel == options.rootDeprel and head != 0:
                # check that HEAD is 0 if DEPREL is options.rootDeprel
                msg = ("line %d: HEAD is not 0:\n\t%s" % \
                       (line_number,line))
                handleProblem(infile, 'root', msg, options)
            if head == 0: # root
                # check that DEPREL is options.rootDeprel if HEAD is 0
                if options.rootDeprel != '' and deprel != options.rootDeprel:
                    msg = "line %d: DEPREL is not %s:\n\t%s" % \
                          (line_number,options.rootDeprel,line)
                    handleProblem(infile, 'root', msg, options)
                rootLines.append(line)

        else:
            msg = "%s: Error: line %d: Illegal value for HEAD column:\n\t%s" % \
                  (infile,line_number,line)
            print >>sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            if datatype == 'train':
                exit_status = 1
            # else: system submissions: print but accept

    text_columns = (form, lemma, cpostag, postag, feats, deprel, pdeprel)

    # check that other fields are not empty
    # (can occur with tab but not with spaces as separator)
    if any(len(column) == 0 for column in text_columns):
        msg = "line %d: At least one column value is the empty string:\n\t%s" % \
              (line_number,line)
        handleProblem(infile, 'other', msg, options)

    # check that other fields do not contain whitespace
    ws = re.compile(r'\s')
    if any(ws.search(column) for column in text_columns):
        msg = "line %d: At least one column value contains whitespace:\n\t%s" % \
              (line_number,line)
        handleProblem(infile, 'whitespace', msg, options)

    terminal = Terminal(token_id,form,lemma,cpostag,postag,feats,deprel,
                        phead,pdeprel)
    # head is not a constructor argument; it is attached afterwards
    terminal.head = head
    return (terminal, error_flag, idmax, headmax, pheadmax)
예제 #3
0
def check_sentence(infile, options,
                   sent_start,
                   rootLines, token_list,
                   error_flag,
                   idmax, headmax, pheadmax):
    """Run the sentence-level checks once a sentence is complete.

    infile -- name used as a prefix in the messages
    options -- option object; datatype, encoding, punctPostag and
        rootDeprel are read
    sent_start -- number of the line on which the sentence starts
    rootLines -- lines of this sentence whose HEAD is 0
    token_list -- placeholder at index 0 followed by the tokens
    error_flag -- 1 if a token-level error was already reported
    idmax, headmax, pheadmax -- highest ID/HEAD/PHEAD in the sentence

    Problems are reported through handleProblem() or printed to
    stderr; for datatype 'train' the printed errors also set the
    global exit_status to 1.  Nothing is returned, and changes made to
    error_flag here stay local to this call.
    """
    global exit_status

    # check that there are tokens, i.e. not two
    # empty lines following each other
    if len(token_list) == 1: # just dummy
        msg = "line %d: More than one empty line separating sentences" % \
              (sent_start)
        handleProblem(infile, 'whitespace', msg, options)
        return

    # only these data types carry HEAD/PHEAD values
    has_heads = options.datatype in ('train', 'system')

    # The three checks below are all guarded by the error state at this
    # point: an error found by one of them does not suppress the others.
    if has_heads and error_flag == 0:
        # check that there is at least one root
        # NOTE: several tokens with HEAD=0 are accepted; there is no
        # "exactly one root" check.
        if len(rootLines) == 0:
            msg = "%s: Error: line %dff: no token has HEAD=0" % \
                   (infile, sent_start)
            print >>sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            if options.datatype == 'train':
                exit_status = 1
            # else: system submissions: print but accept

        # check that HEAD and PHEAD are not higher than highest ID
        if headmax > idmax:
            msg = "%s: Error: line %dff: too big HEAD value (%d>%d)" % \
                  (infile, sent_start, headmax, idmax)
            print >>sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            if options.datatype == 'train':
                exit_status = 1
            # else: system submissions: print but accept
        if pheadmax > idmax:
            msg = "%s: Error: line %dff: too big PHEAD value (%d>%d)" % \
                  (infile, sent_start, pheadmax, idmax)
            print >>sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            if options.datatype == 'train':
                exit_status = 1
            # else: system submissions: print but accept

    # if necessary, do punctuation checks
    # (only if no error occurred so far and a punctuation tag is given)
    if error_flag == 0 and options.punctPostag != '':
        punctRe = re.compile('^'+options.punctPostag+'$')
        non_punct_count = 0 # how many tokens are not punctuation
        for i in range(1,len(token_list)):
            token = token_list[i]
            if punctRe.search(token.cpostag):
                continue # punctuation linking to punctuation is fine
            non_punct_count += 1
            headID = token.head
            # Follow the link only when the data has real HEAD values.
            # For 'test_blind' the head is the placeholder string u'0',
            # which is != 0 and is not a valid list index.
            if has_heads and headID != 0: # not root
                if punctRe.search(token_list[headID].cpostag):
                    # links to punctuation
                    msg = "line %dff: token %d (%s) links to punctuation" % \
                          (sent_start, i, token.form)
                    handleProblem(infile, 'punct', msg, options)
        if non_punct_count == 0:
            msg = "line %dff: only punctuation tokens in sentence" % \
                  (sent_start)
            handleProblem(infile, 'punct', msg, options)

    # check for dependency cycles
    if has_heads and error_flag == 0: # only if no error occurred so far
        checkCycles_tmp2("%s line %dff" % (infile, sent_start),
                         options, token_list, options.rootDeprel)
        checkCyclesPhead("%s line %dff" % (infile, sent_start),
                         options, token_list, options.rootDeprel)
예제 #4
0
def check_sentence(infile, options, sent_start, rootLines, token_list,
                   error_flag, idmax, headmax, pheadmax):
    """Run the sentence-level checks once a sentence is complete.

    infile -- name used as a prefix in the messages
    options -- option object; datatype, encoding, punctPostag and
        rootDeprel are read
    sent_start -- number of the line on which the sentence starts
    rootLines -- lines of this sentence whose HEAD is 0
    token_list -- placeholder at index 0 followed by the tokens
    error_flag -- 1 if a token-level error was already reported
    idmax, headmax, pheadmax -- highest ID/HEAD/PHEAD in the sentence

    Problems are reported through handleProblem() or printed to
    stderr; for datatype 'train' the printed errors also set the
    global exit_status to 1.  Nothing is returned, and changes made to
    error_flag here stay local to this call.
    """
    global exit_status

    # check that there are tokens, i.e. not two
    # empty lines following each other
    if len(token_list) == 1:  # just dummy
        msg = "line %d: More than one empty line separating sentences" % \
              (sent_start)
        handleProblem(infile, 'whitespace', msg, options)
        return

    # only these data types carry HEAD/PHEAD values
    has_heads = options.datatype in ('train', 'system')

    # The three checks below are all guarded by the error state at this
    # point: an error found by one of them does not suppress the others.
    if has_heads and error_flag == 0:
        # check that there is at least one root
        # NOTE: several tokens with HEAD=0 are accepted; there is no
        # "exactly one root" check.
        if len(rootLines) == 0:
            msg = "%s: Error: line %dff: no token has HEAD=0" % \
                   (infile, sent_start)
            print >> sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            if options.datatype == 'train':
                exit_status = 1
            # else: system submissions: print but accept

        # check that HEAD and PHEAD are not higher than highest ID
        if headmax > idmax:
            msg = "%s: Error: line %dff: too big HEAD value (%d>%d)" % \
                  (infile, sent_start, headmax, idmax)
            print >> sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            if options.datatype == 'train':
                exit_status = 1
            # else: system submissions: print but accept
        if pheadmax > idmax:
            msg = "%s: Error: line %dff: too big PHEAD value (%d>%d)" % \
                  (infile, sent_start, pheadmax, idmax)
            print >> sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            if options.datatype == 'train':
                exit_status = 1
            # else: system submissions: print but accept

    # if necessary, do punctuation checks
    # (only if no error occurred so far and a punctuation tag is given)
    if error_flag == 0 and options.punctPostag != '':
        punctRe = re.compile('^' + options.punctPostag + '$')
        non_punct_count = 0  # how many tokens are not punctuation
        for i in range(1, len(token_list)):
            token = token_list[i]
            if punctRe.search(token.cpostag):
                continue  # punctuation linking to punctuation is fine
            non_punct_count += 1
            headID = token.head
            # Follow the link only when the data has real HEAD values.
            # For 'test_blind' the head is the placeholder string u'0',
            # which is != 0 and is not a valid list index.
            if has_heads and headID != 0:  # not root
                if punctRe.search(token_list[headID].cpostag):
                    # links to punctuation
                    msg = "line %dff: token %d (%s) links to punctuation" % \
                          (sent_start, i, token.form)
                    handleProblem(infile, 'punct', msg, options)
        if non_punct_count == 0:
            msg = "line %dff: only punctuation tokens in sentence" % \
                  (sent_start)
            handleProblem(infile, 'punct', msg, options)

    # check for dependency cycles
    if has_heads and error_flag == 0:  # only if no error occurred so far
        checkCycles_tmp2("%s line %dff" % (infile, sent_start),
                         options, token_list, options.rootDeprel)
        checkCyclesPhead("%s line %dff" % (infile, sent_start),
                         options, token_list, options.rootDeprel)
예제 #5
0
def validate(infile, instream, outstream, options):
    """Validate every sentence read from instream and report problems.

    Lines are counted from 1.  A line that contains nothing but
    whitespace closes the current sentence, which is handed to
    check_sentence().  Any other line is treated as a token: it is
    stripped, split into columns with the regular expression
    options.input_sep, and handed to check_line() when the number of
    columns lies between minNumCols and maxNumCols.  A wrong column
    count is printed to stderr, marks the sentence as erroneous and
    sets the global exit_status to 1.

    infile  -- name used as a prefix in the messages
    instream -- iterable yielding the input lines
    outstream -- accepted but not used by this function
    options -- option object (input_sep and encoding are read here)
    """
    global exit_status

    # per-sentence state; index 0 of token_list is a placeholder so
    # that token IDs (which start at 1) can be used as list indices
    idmax = headmax = pheadmax = 0
    token_list = ['dummy']
    rootLines = []
    error_flag = 0
    sent_start = 1

    line_number = 0
    for line in instream:
        line_number += 1

        # empty line ends sentence
        if re.search(u'^\s*$', line):
            check_sentence(infile, options, sent_start, rootLines,
                           token_list, error_flag, idmax, headmax, pheadmax)
            # start over for the next sentence
            idmax = headmax = pheadmax = 0
            token_list = ['dummy']
            rootLines = []
            error_flag = 0
            sent_start = line_number + 1  # line where next sentence starts
            continue

        # non-empty line, i.e. token
        if options.input_sep == ' +':
            # separator is spaces: drop leading and trailing whitespace
            line = line.strip()
        else:
            # keep leading whitespace, drop trailing one (e.g. \r, \n)
            line = line.rstrip()
        fields = re.split(options.input_sep, line)

        # a wrong number of columns is reported here; only lines of
        # acceptable width are looked at in detail by check_line()
        if len(fields) < minNumCols:
            msg = "%s: Error: line %d: Too few columns (%d<%d):\n\t%s" % \
                  (infile,line_number,len(fields),minNumCols,line)
        elif len(fields) > maxNumCols:
            msg = "%s: Error: line %d: Too many columns (%d>%d):\n\t%s" % \
                  (infile,line_number,len(fields),maxNumCols,line)
        else:
            msg = None

        if msg is None:
            (terminal, error_flag, idmax, headmax,
             pheadmax) = check_line(infile, line_number, line, options,
                                    rootLines, fields, token_list,
                                    error_flag, idmax, headmax, pheadmax)
        else:
            print >> sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            exit_status = 1
            terminal = 'dummy'  # keeps the token positions aligned

        token_list.append(terminal)

    if len(token_list) <= 1:  # just dummy: nothing left to process
        return

    # some unprocessed sentence is left
    msg = "line %d: No empty line after last sentence" % \
          (line_number)
    handleProblem(infile, 'whitespace', msg, options)

    check_sentence(infile, options, sent_start, rootLines, token_list,
                   error_flag, idmax, headmax, pheadmax)
예제 #6
0
def check_line(infile, line_number, line, options, rootLines, fields,
               token_list, error_flag, idmax, headmax, pheadmax):
    """Check the columns of one token line and build its Terminal.

    infile, line_number, line -- only used to build the messages
        (line is also appended to rootLines when HEAD is 0)
    options -- option object; datatype, encoding and rootDeprel are read
    rootLines -- list collecting the lines whose HEAD is 0 (mutated)
    fields -- the column values of the line
    token_list -- tokens read so far in this sentence, including the
        placeholder at index 0; its length is the expected ID
    error_flag -- 1 if an error was already seen in this sentence
    idmax, headmax, pheadmax -- highest ID/HEAD/PHEAD seen so far

    Returns (terminal, error_flag, idmax, headmax, pheadmax) with the
    updated values.  Errors are printed to stderr; the global
    exit_status is set to 1 for ID errors, and for HEAD/PHEAD errors
    when datatype is 'train'.  Raises ValueError for an unknown
    options.datatype.
    """
    global exit_status

    datatype = options.datatype
    if datatype == 'train':
        (token_id, form, lemma, cpostag, postag, feats, head, deprel, phead,
         pdeprel) = fields
    elif datatype == 'test_blind':
        (token_id, form, lemma, cpostag, postag, feats) = fields
        head = u'0'
        deprel = phead = pdeprel = emptyProjColumnString
    elif datatype == 'system':
        (token_id, form, lemma, cpostag, postag, feats, head,
         deprel) = fields[0:8]
        # PHEAD and PDEPREL are optional.  Default both first, so that
        # a line with 9 columns (PHEAD but no PDEPREL) does not leave
        # pdeprel unbound.
        phead = pdeprel = emptyProjColumnString
        if len(fields) > 8:
            phead = fields[8]
        if len(fields) > 9:
            pdeprel = fields[9]
    else:
        # without this, the first use of an unassigned column below
        # would fail with a confusing UnboundLocalError
        raise ValueError("unknown datatype: %r" % (datatype,))

    is_number = re.compile(u'^[0-9]+$').search

    # check that ID is integer > 0
    if token_id != u'0' and is_number(token_id):
        token_id = int(token_id)
        if token_id > idmax:
            idmax = token_id
        # check that ID is consecutive
        if token_id != len(token_list):
            msg = "%s: Error: line %d: Non-consecutive value for ID column (%d!=%d):\n\t%s" % \
                  (infile,line_number,token_id,len(token_list),line)
            print >> sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            exit_status = 1
    else:
        msg = "%s: Error: line %d: Illegal value for ID column:\n\t%s" % \
              (infile,line_number,line)
        print >> sys.stderr, msg.encode(options.encoding)
        error_flag = 1
        exit_status = 1

    if datatype == 'train' or datatype == 'system':
        # check that PHEAD is emptyProjColumnString or integer >= 0
        if phead != emptyProjColumnString:
            if is_number(phead):
                phead = int(phead)
                if phead > pheadmax:
                    pheadmax = phead
            else:
                msg = "%s: Error: line %d: Illegal value for PHEAD column:\n\t%s" % \
                      (infile,line_number,line)
                print >> sys.stderr, msg.encode(options.encoding)
                error_flag = 1
                if datatype == 'train':
                    exit_status = 1
                # else: system submissions: print but accept

        # check that HEAD is integer >= 0
        if is_number(head):
            head = int(head)
            if head > headmax:
                headmax = head
            if options.rootDeprel != '' and deprel == options.rootDeprel and head != 0:
                # check that HEAD is 0 if DEPREL is options.rootDeprel
                msg = ("line %d: HEAD is not 0:\n\t%s" % \
                       (line_number,line))
                handleProblem(infile, 'root', msg, options)
            if head == 0:  # root
                # check that DEPREL is options.rootDeprel if HEAD is 0
                if options.rootDeprel != '' and deprel != options.rootDeprel:
                    msg = "line %d: DEPREL is not %s:\n\t%s" % \
                          (line_number,options.rootDeprel,line)
                    handleProblem(infile, 'root', msg, options)
                rootLines.append(line)

        else:
            msg = "%s: Error: line %d: Illegal value for HEAD column:\n\t%s" % \
                  (infile,line_number,line)
            print >> sys.stderr, msg.encode(options.encoding)
            error_flag = 1
            if datatype == 'train':
                exit_status = 1
            # else: system submissions: print but accept

    text_columns = (form, lemma, cpostag, postag, feats, deprel, pdeprel)

    # check that other fields are not empty
    # (can occur with tab but not with spaces as separator)
    if any(len(column) == 0 for column in text_columns):
        msg = "line %d: At least one column value is the empty string:\n\t%s" % \
              (line_number,line)
        handleProblem(infile, 'other', msg, options)

    # check that other fields do not contain whitespace
    ws = re.compile(r'\s')
    if any(ws.search(column) for column in text_columns):
        msg = "line %d: At least one column value contains whitespace:\n\t%s" % \
              (line_number,line)
        handleProblem(infile, 'whitespace', msg, options)

    terminal = Terminal(token_id, form, lemma, cpostag, postag, feats, deprel,
                        phead, pdeprel)
    # head is not a constructor argument; it is attached afterwards
    terminal.head = head
    return (terminal, error_flag, idmax, headmax, pheadmax)