Exemple #1
0
def corpus_info(inputdir):
    """Print sentence and word counts for each entry of *inputdir*.

    Every directory entry is read document by document through a
    ``CoNLLXHandle``.  For each entry that is read without error, one line
    is printed with a running document counter (cumulative across all
    entries), the entry path and its sentence/word counts.  An entry that
    raises ``FormatError`` gets an error line instead and contributes
    nothing to the final total.  The overall sentence count is printed last.
    """
    reader = CoNLLXHandle()
    docs_seen = 0
    grand_total = 0
    for entry in os.listdir(inputdir):
        filepath = os.path.join(inputdir, entry)
        n_sents = n_words = 0
        try:
            for doc in reader.read_documents(filepath):
                n_sents += len(doc.sentences())
                n_words += len(doc.words())
                # Counted per document, so it keeps growing even when a
                # later document of the same entry fails.
                docs_seen += 1
        except FormatError as err:
            print('Error processing %s: %s' % (filepath, str(err)))
        else:
            grand_total += n_sents
            print('%d - %s: %d sentences, %d words'
                  % (docs_seen, filepath, n_sents, n_words))
    print("Total Sentence: %d" % (grand_total))
Exemple #2
0
def createTree(path, output, format="pdf"):
    """Render a dependency graph for every sentence found under *path*.

    The directory is walked twice:

    1. Sentence by sentence (``read_conllx``): the plain sentence and its
       dot graph are printed, and the graph is rendered into *output* under
       the file name ``<running index>_<lemma of the first word>``.  A
       per-file sentence/word summary is printed afterwards.
    2. Document by document (``read_documents``): only the per-file
       sentence/word summary is printed.

    Args:
        path: Directory whose entries are read through ``CoNLLXHandle``.
        output: Directory handed to ``render`` for the generated files.
        format: Currently unused; kept so existing callers keep working.
            NOTE(review): presumably meant to select the render output
            format -- confirm before wiring it into ``render``.

    A ``FormatError`` raised while reading an entry is reported on stderr
    and processing continues with the next entry.
    """
    conllx = CoNLLXHandle()
    i = 0  # running sentence index across all files, used in file names

    for name in os.listdir(path):
        f = os.path.join(path, name)
        try:
            sent_count, word_count = 0, 0
            for sentence in conllx.read_conllx(f):
                sent_count += 1
                word_count += len(sentence.words())

                print(sentence.to_normal_sentence())
                print("\n")

                dotgraph = sentence.as_dotgraph()
                print(dotgraph)
                # Use a dedicated name so the directory-entry loop variable
                # is not overwritten while its file is still being read.
                render_name = str(i) + '_' + sentence.words()[0].lemma
                dotgraph.render(filename=render_name, directory=output,
                                cleanup=True)
                i = i + 1

            print('%s: %d sentences, %d words' % (f, sent_count, word_count))

        except FormatError as e:
            # Write to stderr explicitly: print(sys.stderr, msg) would print
            # the stream object's repr to stdout instead.
            sys.stderr.write('Error processing %s: %s\n' % (f, str(e)))

    # process document at a time
    for name in os.listdir(path):
        f = os.path.join(path, name)
        try:
            sent_count, word_count = 0, 0
            for document in conllx.read_documents(f):
                sent_count += len(document.sentences())
                word_count += len(document.words())
            print('%s: %d sentences, %d words' % (f, sent_count, word_count))
        except FormatError as e:
            sys.stderr.write('Error processing %s: %s\n' % (f, str(e)))
Exemple #3
0
def readConllFile2(path):
    """Build a ``DTree`` for every sentence of every entry in *path*.

    Each directory entry is read sentence by sentence through
    ``CoNLLXHandle.read_conllx`` and every sentence is wrapped in a
    ``DTree``.

    Args:
        path: Directory whose entries are read.

    Returns:
        A single flat list holding the trees of all entries, in the order
        they were read.

    A ``FormatError`` raised while reading an entry is reported on stderr;
    trees already appended before the error are kept and processing
    continues with the next entry.
    """
    conllx = CoNLLXHandle()
    dtreeList = []
    for name in os.listdir(path):
        f = os.path.join(path, name)
        try:
            for sentence in conllx.read_conllx(f):
                dtreeList.append(DTree(sentence))
        except FormatError as e:
            # Write to stderr explicitly: print(sys.stderr, msg) would print
            # the stream object's repr to stdout instead.
            sys.stderr.write('Error processing %s: %s\n' % (f, str(e)))
        # TODO: the original placeholders mention generating CCG for the
        # trees and saving the CoNLL file content here; neither step is
        # implemented.
    return dtreeList
Exemple #4
0
def readConllFile(pathFile):
    """Return the result of ``CoNLLXHandle.read_documents`` for *pathFile*.

    A fresh ``CoNLLXHandle`` is created on every call.
    """
    return CoNLLXHandle().read_documents(pathFile)