Code example #1
File: jdsbd_1.0.py Project: hvn002/lancet
def get_text_data(text, expect_labels=True, tokenize=False):
    """
    get text, returning an instance of the Doc class
    doc.frag is the first frag, and each points to the next
    """
    
    frag_list = None
    word_index = 0
    frag_index = 0
    curr_words = []
    lower_words, non_abbrs = util.Counter(), util.Counter()

    for line in text.splitlines():
        if (not line.strip()) and (not curr_words) and frag_list:
            frag.ends_seg = True
        for word in line.split():
            curr_words.append(word)

            if is_sbd_hyp(word):
                frag = Frag(' '.join(curr_words))
                if not frag_list: frag_list = frag
                else: prev.next = frag
                
                ## get label; tokenize
                if expect_labels: frag.label = int('<S>' in word)
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                else: tokens = frag.orig
                tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                frag.tokenized = tokens
                
                frag_index += 1
                prev = frag
                curr_words = []
                
            word_index += 1

    ## last frag
    frag = Frag(' '.join(curr_words))
    if not frag_list: frag_list = frag
    else: prev.next = frag
    if expect_labels: frag.label = int('<S>' in word)
    if tokenize:
        tokens = word_tokenize.tokenize(frag.orig)
    else: tokens = frag.orig
    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
    frag.tokenized = tokens
    frag_index += 1
        
    sys.stderr.write(' words [%d] sbd hyps [%d]\n' % (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
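
A minimal usage sketch (not part of any project above): it assumes the surrounding splitta-style module provides Doc, Frag, word_tokenize, is_sbd_hyp, and util, and that labeled input attaches the <S> marker directly to the boundary word (e.g. "sentence.<S>"), which is what the int('<S>' in word) check expects.

labeled_text = 'First sentence.<S> Second sentence.<S>'
doc = get_text_data(labeled_text, expect_labels=True, tokenize=False)

## walk the fragment linked list, starting at doc.frag
frag = doc.frag
while frag:
    print('%d  %s' % (frag.label, frag.tokenized))
    frag = frag.next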
Code example #2
File: main.py Project: nsaphra/mllab
def read_doc(path):
    ## read a file and keep only its alphabetic tokens, cleaned;
    ## tokenize() and clean_token() are defined elsewhere in this module
    with open(path) as fh:
        doc = fh.read()
    tokens = []
    for token in tokenize(doc).split():
        if token.isalpha():
            tokens.append(clean_token(token))
    return tokens
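
A hypothetical call (the path is made up, and tokenize/clean_token must exist in the module as noted above):

tokens = read_doc('corpus/doc01.txt')
print('%d alphabetic tokens' % len(tokens))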
Code example #3
File: sbd.py Project: anurupborah2001/lancet
def get_data(files, expect_labels=True, tokenize=False, verbose=False):
    """
    load text from files, returning an instance of the Doc class
    doc.frag is the first frag, and each points to the next
    """

    if isinstance(files, str): files = [files]
    frag_list = None
    word_index = 0
    frag_index = 0
    curr_words = []
    lower_words, non_abbrs = sbd_util.Counter(), sbd_util.Counter()

    for file in files:
        sys.stderr.write('reading [%s]\n' % file)
        fh = open(file)
        for line in fh:

            ## deal with blank lines
            if (not line.strip()) and frag_list:
                if not curr_words: frag.ends_seg = True
                else:
                    frag = Frag(' '.join(curr_words))
                    frag.ends_seg = True
                    if expect_labels: frag.label = True
                    prev.next = frag
                    if tokenize:
                        tokens = word_tokenize.tokenize(frag.orig)
                    else:
                        ## fall back to the raw text; otherwise tokens would be
                        ## stale or undefined when tokenize is False
                        tokens = frag.orig
                    frag.tokenized = tokens
                    frag_index += 1
                    prev = frag
                    curr_words = []

            for word in line.split():
                curr_words.append(word)

                if is_sbd_hyp(word):
                    #if True: # hypothesize all words
                    frag = Frag(' '.join(curr_words))
                    if not frag_list: frag_list = frag
                    else: prev.next = frag

                    ## get label; tokenize
                    if expect_labels: frag.label = int('<S>' in word)
                    if tokenize:
                        tokens = word_tokenize.tokenize(frag.orig)
                    else:
                        tokens = frag.orig
                    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                    frag.tokenized = tokens

                    frag_index += 1
                    prev = frag
                    curr_words = []

                word_index += 1
        fh.close()

        ## last frag
        frag = Frag(' '.join(curr_words))
        if not frag_list: frag_list = frag
        else: prev.next = frag
        if expect_labels: frag.label = int('<S>' in word)
        if tokenize:
            tokens = word_tokenize.tokenize(frag.orig)
        else:
            tokens = frag.orig
        tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
        frag.tokenized = tokens
        frag.ends_seg = True
        frag_index += 1

    if verbose:
        sys.stderr.write(' words [%d] sbd hyps [%d]\n' %
                         (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
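
A hedged usage sketch for the file-based variant (paths are hypothetical; Doc, Frag, word_tokenize, is_sbd_hyp, and sbd_util come from the surrounding module):

doc = get_data(['data/train1.txt', 'data/train2.txt'],
               expect_labels=True, tokenize=True, verbose=True)

## count the sentence-boundary hypotheses collected across all files
n = 0
frag = doc.frag
while frag:
    n += 1
    frag = frag.next
print('%d fragments' % n)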
Code example #4
File: sbd.py Project: DevSinghSachan/splitta
def get_data(files, expect_labels=True, tokenize=False, verbose=False, files_already_opened=False):
    """
    load text from files, returning an instance of the Doc class
    doc.frag is the first frag, and each points to the next
    """
    
    if isinstance(files, str): files = [files]
    frag_list = None
    word_index = 0
    frag_index = 0
    curr_words = []
    lower_words, non_abbrs = sbd_util.Counter(), sbd_util.Counter()

    for file in files:
        sys.stderr.write('reading [%s]\n' %file)

        if files_already_opened:
            fh = file
        else:
            fh = open(file)

        for line in fh:

            ## deal with blank lines
            if (not line.strip()) and frag_list:
                if not curr_words: frag.ends_seg = True
                else:
                    frag = Frag(' '.join(curr_words))
                    frag.ends_seg = True
                    if expect_labels: frag.label = True
                    prev.next = frag
                    if tokenize:
                        tokens = word_tokenize.tokenize(frag.orig)
                    else:
                        ## fall back to the raw text; otherwise tokens would be
                        ## stale or undefined when tokenize is False
                        tokens = frag.orig
                    frag.tokenized = tokens
                    frag_index += 1
                    prev = frag
                    curr_words = []

            for word in line.split():
                curr_words.append(word)

                if is_sbd_hyp(word):
                    #if True: # hypothesize all words
                    frag = Frag(' '.join(curr_words))
                    if not frag_list: frag_list = frag
                    else: prev.next = frag
                    
                    ## get label; tokenize
                    if expect_labels: frag.label = int('<S>' in word)
                    if tokenize:
                        tokens = word_tokenize.tokenize(frag.orig)
                    else: tokens = frag.orig
                    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                    frag.tokenized = tokens
                    
                    frag_index += 1
                    prev = frag
                    curr_words = []

                word_index += 1

        if not files_already_opened:
            fh.close()

        ## last frag
        frag = Frag(' '.join(curr_words))
        if not frag_list: frag_list = frag
        else: prev.next = frag
        if expect_labels: frag.label = int('<S>' in word)
        if tokenize:
            tokens = word_tokenize.tokenize(frag.orig)
        else: tokens = frag.orig
        tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
        frag.tokenized = tokens
        frag.ends_seg = True
        frag_index += 1

    if verbose: sys.stderr.write(' words [%d] sbd hyps [%d]\n' % (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
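
A sketch of the pre-opened-handles path this variant adds (hypothetical path; any file-like object that iterates over lines should work):

fh = open('data/test.txt')
doc = get_data([fh], expect_labels=False, tokenize=False,
               files_already_opened=True)
fh.close()   ## the caller keeps ownership of handles it passed in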
Code example #5
File: sbd.py Project: katherinehuwu/Word_Fit
def get_text_data(text, expect_labels=True, tokenize=False, verbose=False):
    """
    get text, returning an instance of the Doc class
    doc.frag is the first frag, and each points to the next
    """
    
    frag_list = None
    word_index = 0
    frag_index = 0
    curr_words = []
    lower_words, non_abbrs = sbd_util.Counter(), sbd_util.Counter()

    for line in text.splitlines():

        ## deal with blank lines
        if (not line.strip()) and frag_list:
            if not curr_words: frag.ends_seg = True
            else:
                frag = Frag(' '.join(curr_words))
                frag.ends_seg = True
                if expect_labels: frag.label = True
                prev.next = frag
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                else:
                    ## fall back to the raw text; otherwise tokens would be
                    ## stale or undefined when tokenize is False
                    tokens = frag.orig
                frag.tokenized = tokens
                frag_index += 1
                prev = frag
                curr_words = []

        for word in line.split():
            curr_words.append(word)

            if is_sbd_hyp(word):
                frag = Frag(' '.join(curr_words))
                if not frag_list: frag_list = frag
                else: prev.next = frag
                
                ## get label; tokenize
                if expect_labels: frag.label = int('<S>' in word)
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                    # BJD possible hack, but pretty sure this is needed:
                    # peel sentence-final punctuation (plus any trailing
                    # quotes/brackets) off the last token so it stands alone
                    tmp_tokens = tokens.split()
                    tokens = ' '.join(tmp_tokens[:-1] + re.split(r'([.?!]+["\')\]]*)$', tmp_tokens[-1]))
                else: tokens = frag.orig
                tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                frag.tokenized = tokens
                
                frag_index += 1
                prev = frag
                curr_words = []
                
            word_index += 1

    ## last frag
    frag = Frag(' '.join(curr_words))
    if not frag_list: frag_list = frag
    else: prev.next = frag
    if expect_labels: frag.label = int('<S>' in word)
    if tokenize:
        tokens = word_tokenize.tokenize(frag.orig)
    else: tokens = frag.orig
    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
    frag.tokenized = tokens
    frag.ends_seg = True
    frag_index += 1
        
    if verbose: sys.stderr.write(' words [%d] sbd hyps [%d]\n' % (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
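
To see what the re.split in the "BJD hack" above actually produces, here is a standalone check using only the stdlib:

import re
parts = re.split(r'([.?!]+["\')\]]*)$', 'stop."')
print(parts)   ## ['stop', '."', ''] -- final punctuation split into its own piece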