Example #1
import nltk

def pos_tag_corpus(input_path: str, output_path: str):
    # PersistantBalancedCorpusIO is a project-specific helper (not part of
    # NLTK): it yields the input corpus line by line and writes results to
    # the output file via out().
    io = PersistantBalancedCorpusIO(input_path, output_path)
    for line in io:
        # Tokenize the line and tag each token with its part of speech.
        tokens = nltk.word_tokenize(line)
        pos_tags = nltk.pos_tag(tokens)
        # Serialize each (token, tag) tuple as "token/tag".
        pos_tags_str = [nltk.tag.tuple2str(pos_tag) for pos_tag in pos_tags]
        result = ' '.join(pos_tags_str)
        io.out(result)
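
For context, a minimal usage sketch, assuming PersistantBalancedCorpusIO is importable from the surrounding project; the file paths and the sample sentence are illustrative, not from the source:

import nltk

# word_tokenize and pos_tag rely on NLTK data packages that must be
# downloaded once (package names as in classic NLTK releases):
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Hypothetical call: tag every line of corpus.txt and write the tagged
# lines to corpus.tagged.txt (both paths are made up for illustration).
pos_tag_corpus('corpus.txt', 'corpus.tagged.txt')

# With NLTK's default English tagger, an input line such as
#   The cat sat .
# would be written out roughly as
#   The/DT cat/NN sat/VBD ./.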
Example #2
import re

def normalize_acrotagged_corpus(input_file, output_file):
    # Acro-tags are spans delimited by "III" on both sides, e.g. "IIInasaIII".
    rx = re.compile('III[^I]+III')
    # PersistantBalancedCorpusIO is a project-specific helper (not part of
    # the standard library): it yields the input corpus line by line and
    # writes results to the output file via out().
    io = PersistantBalancedCorpusIO(input_file, output_file)
    for line in io:
        # split_tokens_and_pos / join_tokens_and_pos are project helpers that
        # separate "token/tag" pairs into parallel lists and recombine them.
        tokens, pos = split_tokens_and_pos(line.split())
        norm_tokens = []
        for token in tokens:
            m = rx.search(token)
            if m:
                # Normalize the text around the acro-tag but leave the tag
                # itself untouched. Note: rejoining the splits with the first
                # match assumes a token contains at most one distinct tag.
                match = m.group(0)
                splits = rx.split(token)
                norm_splits = [normalize(split) for split in splits]
                norm_token = match.join(norm_splits)
                norm_tokens.append(norm_token)
            else:
                norm_tokens.append(normalize(token))

        result_twp = join_tokens_and_pos(norm_tokens, pos)
        result = ' '.join(result_twp)
        io.out(result)
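
A hypothetical call shape, assuming the project helpers (split_tokens_and_pos, join_tokens_and_pos, normalize) are in scope; normalize()'s exact behavior isn't shown in the source, so the sketch only demonstrates the call and the tag format inferred from the regex:

# Hypothetical call: read the "token/tag" corpus produced by
# pos_tag_corpus above and write a normalized copy (paths illustrative).
normalize_acrotagged_corpus('corpus.tagged.txt', 'corpus.normalized.txt')

# A token carrying an acro-tag, e.g. "IIINASAIIIs/NNS", keeps the
# "IIINASAIII" span verbatim while the surrounding text passes through
# normalize(); tokens without a tag are normalized as a whole.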