Example #1
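These examples all time a pass over a corpus with time.time(), so each source file needs import time at the top; they also construct a Corpus class that comes from the surrounding project, whose import the excerpts do not show. A hypothetical stub of the Corpus interface they rely on is sketched after the last example.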
def main():
    t1 = time.time()

    new_corpus = Corpus(u'/home/gree-gorey/stupid/')

    spans = 0

    for text in new_corpus.texts(u'json'):
        text.sentence_splitter()
        for sentence in text.sentences:

            sentence.stupid_span_splitter()

            spans += len(sentence.spans)

            for span in sentence.spans:
                span.get_boundaries()

        text.write_stupid_clause_ann()

        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/stupid/')

    print(spans)

    t2 = time.time()

    print(t2 - t1)
Example #2
def main():
    t1 = time.time()

    new_corpus = Corpus('/home/gree-gorey/CorpusTest/')

    for text in new_corpus.texts('txt'):

        text.write_dummy_ann()

        text.copy_into_brat('/opt/brat-v1.3_Crunchy_Frog/data/left/', True)

    t2 = time.time()

    print(t2 - t1)
Example #3
def main():
    t1 = time.time()

    new_corpus = Corpus(u'/home/gree-gorey/CorpusTest/')

    for text in new_corpus.texts(u'txt'):

        text.write_dummy_ann()

        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/left/', True)

    t2 = time.time()

    print(t2 - t1)
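Example #3 is functionally identical to Example #2: the only difference is the u'...' string literals, which marked unicode strings under Python 2 and are redundant but harmless under Python 3.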
Example #4
def main():
    t1 = time.time()

    new_corpus = Corpus(u'/home/gree-gorey/CorpusTemp/')
    # new_corpus = Corpus(u'/home/gree-gorey/tested/')
    # new_corpus = Corpus(u'/home/gree-gorey/tested_tested/')

    for text in new_corpus.texts(u'json'):
        text.sentence_splitter()
        # print len(text.sentences)
        for sentence in text.sentences:
            # print sentence.tokens[0].content

            # for token in sentence.tokens:
            #     print token.pos, token.content, token.lex
            # print u'***************'

            # sentence.find_pp()

            # sentence.find_coordination()

            sentence.find_complimentizers()

            sentence.find_names()

            sentence.eliminate_pair_comma()

            # for token in sentence.tokens:
            #     print token.pos, token.content
            # print u'***************'
            # print

            sentence.span_splitter()

            sentence.get_shared_tokens()  # loop through all the spans 1

            sentence.split_double_complimentizers()  # loop through all the spans 2

            for span in sentence.spans:  # loop through all the spans 3

                # decide whether span is inserted or embedded or neither
                span.type()
                # print span.tokens[0].content, span.embedded_type

            # split embedded span if it contains > 1 predicate
            sentence.split_embedded()

            # for span in sentence.spans:
            #     print span.shared_tokens[0].content, span.tokens[0].content

            # walk through spans and join whenever possible
            sentence.restore_embedded()

            sentence.split_base()

            # for span in sentence.spans:
            #     print span.shared_tokens[0].content, span.tokens[0].content, span.finite()

            sentence.restore_base()

            # for span in sentence.spans:
            #     print span.shared_tokens[0].content, span.tokens[0].content, span.finite()

            for span in sentence.spans:
                span.get_boundaries()
                # print span.quasi_embedded, span.tokens[0].content

        text.write_clause_ann()

        text.copy_into_brat(u'/opt/brat-v1.3_Crunchy_Frog/data/right/')

    t2 = time.time()

    print(t2 - t1)
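For reference, here is a minimal, hypothetical stub of the Corpus interface the examples above rely on. It is only a sketch inferred from the calls shown: the Text class, the .ann file convention, and the copy_source flag are assumptions, not the real project's implementation. It is just enough to run Example #2 against a directory of .txt files.

import os
import shutil


class Text:
    """Hypothetical stand-in for one document in the corpus."""

    def __init__(self, path):
        self.path = path

    def write_dummy_ann(self):
        # brat pairs each .txt with a .ann annotation file; write an empty one.
        with open(os.path.splitext(self.path)[0] + '.ann', 'w') as ann:
            ann.write('')

    def copy_into_brat(self, brat_dir, copy_source=False):
        # Copy the .ann file (and, if asked, the source file) into brat's data directory.
        shutil.copy(os.path.splitext(self.path)[0] + '.ann', brat_dir)
        if copy_source:
            shutil.copy(self.path, brat_dir)


class Corpus:
    """Hypothetical stand-in for the project's Corpus class."""

    def __init__(self, root):
        self.root = root

    def texts(self, extension):
        # Yield a Text for every file under root with the given extension.
        for name in sorted(os.listdir(self.root)):
            if name.endswith('.' + extension):
                yield Text(os.path.join(self.root, name))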