def pos_tag_corpus(input_path: str, output_path: str):
    """Part-of-speech tag a corpus line by line.

    A ``PersistantBalancedCorpusIO`` is built from *input_path* and
    *output_path*. Every line it yields is tokenized with
    ``nltk.word_tokenize`` and tagged with ``nltk.pos_tag``; each resulting
    (token, tag) pair is rendered with ``nltk.tag.tuple2str`` and the
    rendered pairs are written back through ``out`` as one
    space-separated string per input line.
    """
    corpus_io = PersistantBalancedCorpusIO(input_path, output_path)
    for line in corpus_io:
        tagged_pairs = nltk.pos_tag(nltk.word_tokenize(line))
        corpus_io.out(
            ' '.join(nltk.tag.tuple2str(pair) for pair in tagged_pairs)
        )
def normalize_acrotagged_corpus(input_file, output_file):
    """Normalize the tokens of a tagged corpus, keeping ``III…III`` markers.

    Every line yielded by the ``PersistantBalancedCorpusIO`` built from
    *input_file* and *output_file* is whitespace-split and handed to
    ``split_tokens_and_pos``. Each token is then passed through
    ``normalize``, except for any span matching ``III[^I]+III``: those spans
    are copied to the output verbatim and only the text around them is
    normalized. The normalized tokens are recombined with their POS tags by
    ``join_tokens_and_pos`` and written out as one space-separated line.

    NOTE(review): the ``III…III`` spans are presumably placeholders inserted
    by an earlier tagging step -- confirm against the producer of the input.

    A token may contain several different markers; each one is preserved in
    its own position (re-joining the pieces with only the first marker
    found would overwrite every later marker with a copy of the first).
    """
    # The capturing group makes split() return the markers as well: even
    # indices hold the text between markers, odd indices hold the markers.
    rx = re.compile(r'(III[^I]+III)')
    io = PersistantBalancedCorpusIO(input_file, output_file)
    for line in io:
        tokens, pos = split_tokens_and_pos(line.split())
        norm_tokens = []
        for token in tokens:
            pieces = rx.split(token)
            if len(pieces) == 1:
                # No marker in this token: normalize it as a whole.
                norm_tokens.append(normalize(token))
                continue
            norm_tokens.append(''.join(
                piece if index % 2 else normalize(piece)
                for index, piece in enumerate(pieces)
            ))
        io.out(' '.join(join_tokens_and_pos(norm_tokens, pos)))