Beispiel #1
0
# Configure the (already-constructed) parser for this example.
# NOTE(review): names suggest a parse is accepted when its root category
# is one of these — confirm against the parser implementation.
parser.terminators = ["ROOT","S","S[wq]","S[q]","S[imp]"]
# NOTE(review): `Lexicon()` with no arguments — presumably an empty
# lexicon, unlike the `default_lexicon()` used in the other examples.
parser.lexicon = Lexicon()
# NOTE(review): empty-string separator — presumably used when joining
# tokens back into surface text; verify in the parser.
parser.concatenator = ""

def tokenize(s):
    """Split *s* on whitespace, detaching a trailing '.' as its own token.

    An empty string is returned unchanged (note: a str, not a list).
    """
    if not s:
        return s
    if s.endswith("."):
        # Peel the final period off and emit it as a separate token.
        return s[:-1].split() + ["."]
    return s.split()


if __name__ == "__main__":

    def _show(s):
        # Pass text through untouched on UTF-8 terminals; otherwise
        # escape it via repr() so it can be printed safely.
        return s if sys.stdout.encoding == 'UTF-8' else repr(s)

    for line in sys.stdin:
        line = line.strip().decode('utf-8')  # Python 2: stdin bytes -> unicode
        print(u"test:{}".format(_show(line)))
        # Report only the first parse found, as token/category pairs.
        for tree in parser.parse(tokenize(line)):
            for leaf in tree.leaves():
                print(u"{0}\t{1}".format(_show(leaf.token), leaf.catname))
            break
        print("")
Beispiel #2
0
        if len(line) == 0: continue
        if line[0] == "#": continue
        ls = line.split('\t')
        ret.setdefault(ls[0].decode('utf-8'), []).extend(ls[2].split(","))
    return ret


# Build and configure the module-level CCG parser used by the
# __main__ driver below.
parser = CCGParser()
# Rule set tried during parsing; the names suggest left/right
# application (LApp/RApp), composition variants (LB/RB/LBx),
# conjunction, comma skipping and type raising to NP — confirm in
# the combinator definitions.
parser.combinators = [LApp, RApp, LB, RB, LBx, Conj, SkipComma, RT("NP")]
# NOTE(review): presumably the root categories accepted as a finished
# parse (declarative, wh-question, yes/no question, imperative).
parser.terminators = ["ROOT", "S", "S[wq]", "S[q]", "S[imp]"]
parser.lexicon = default_lexicon()
# NOTE(review): empty-string separator — presumably used when joining
# tokens back into surface text; verify in the parser.
parser.concatenator = ""

if __name__ == "__main__":

    def __repr__(s):
        """Return s as-is on a UTF-8 terminal, else its ASCII-safe repr."""
        return s if sys.stdout.encoding == 'UTF-8' else repr(s)

    for raw in sys.stdin:
        text = raw.strip().decode('utf-8')  # Python 2: stdin bytes -> unicode
        print(u"test:{}".format(__repr__(text)))
        # Only the leaves of the first successful parse are printed.
        for tree in parser.parse(text):
            for leaf in tree.leaves():
                print(u"{0}\t{1}".format(__repr__(leaf.token), leaf.catname))
            break
        print("")
Beispiel #3
0

def tokenize(s):
    """Whitespace-tokenize *s*, splitting a trailing period into its own token.

    Examples:
        "a b."  -> ["a", "b", "."]
        "a b"   -> ["a", "b"]
        ""      -> []

    Fix: the original returned the empty *string* for empty input,
    giving the function an inconsistent return type; it now always
    returns a list of tokens (an empty list for empty input, which
    behaves identically when iterated by the parser).
    """
    if len(s) == 0:
        return []
    elif s[-1] == ".":
        # Detach the sentence-final period as its own token.
        tokens = s[:-1].split()
        tokens.append(s[-1])
        return tokens
    else:
        return s.split()


if __name__ == "__main__":

    def __repr__(s):
        # Emit raw text only when stdout can represent UTF-8;
        # otherwise fall back to an ASCII-safe repr().
        if sys.stdout.encoding == 'UTF-8':
            return s
        return repr(s)

    for raw_line in sys.stdin:
        sentence = raw_line.strip()
        sentence = sentence.decode('utf-8')  # Python 2: bytes -> unicode
        print(u"test:{}".format(__repr__(sentence)))
        trees = parser.parse(tokenize(sentence))
        # Print token/category pairs for the first parse only.
        for t in trees:
            for leaf in t.leaves():
                print(u"{0}\t{1}".format(__repr__(leaf.token), leaf.catname))
            break
        print("")
Beispiel #4
0
        ls = line.split('\t')
        ret.setdefault(ls[0].decode('utf-8'),[]).extend( ls[2].split(",") )
    return ret



# Module-level parser instance, configured once at import time and
# used by the __main__ driver below.
parser = CCGParser()
# Rules applied during parsing; see the combinator definitions for the
# exact semantics (the names suggest application, composition variants,
# conjunction, comma skipping and type raising to NP).
parser.combinators = [LApp,RApp,LB,RB,LBx,Conj,SkipComma,RT("NP")]
# NOTE(review): presumably root categories that count as a finished
# parse — confirm against the parser implementation.
parser.terminators = ["ROOT","S","S[wq]","S[q]","S[imp]"]
parser.lexicon = default_lexicon()
# NOTE(review): empty-string separator — presumably used when joining
# tokens back into surface text; verify in the parser.
parser.concatenator = ""


if __name__ == "__main__":

    # Print helper: UTF-8 terminals get the raw text, anything else
    # gets an ASCII-safe repr().
    def _fmt(s):
        return s if sys.stdout.encoding == 'UTF-8' else repr(s)

    for line in sys.stdin:
        line = line.strip()
        line = line.decode('utf-8')  # Python 2: decode stdin bytes
        print(u"test:{}".format(_fmt(line)))
        # Show only the first parse found, as token/category pairs.
        for parse_tree in parser.parse(line):
            for leaf in parse_tree.leaves():
                print(u"{0}\t{1}".format(_fmt(leaf.token), leaf.catname))
            break
        print("")