from file_operations import read_lines


def tokenize_tagged(input_file):
    """Split a tagged file into sentences, keeping only the token column."""
    lines = read_lines(input_file)
    sentence_tokens = []
    sentences = []
    for line in map(lambda line: line.split(' '), lines):
        # A line without a token/label pair marks a sentence boundary.
        if len(line) < 2:
            sentences.append(sentence_tokens)
            sentence_tokens = []
        else:
            sentence_tokens.append(line[0])
    # Flush the last sentence if the file does not end with a blank line.
    if len(sentence_tokens) > 0:
        sentences.append(sentence_tokens)
    return sentences
from file_operations import read_lines


def reformat_file(filename):
    """Collect sentences and their labels from a tagged file.

    Returns two parallel lists: space-joined words and space-joined labels,
    one entry per sentence.
    """
    lines = read_lines(filename)
    sentence = []
    words = []
    labels = []
    # The first two lines are skipped (file header).
    for line in lines[2:]:
        if line == '':
            # Blank line: end of sentence, join its tokens and labels.
            words.append(' '.join(
                map(lambda token: token.split(' ')[0], sentence)))
            labels.append(' '.join(
                map(lambda token: token.split(' ')[-1], sentence)))
            sentence = []
        else:
            sentence.append(line)
    return words, labels
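# A minimal usage sketch for reformat_file, assuming a tagged corpus file
# named 'train.txt' (the filename is only an example, not part of the repo):
# each returned pair is one sentence string and its label string.
if __name__ == "__main__":
    words, labels = reformat_file('train.txt')
    for sentence, sentence_labels in zip(words, labels):
        print(sentence)
        print(sentence_labels)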
import sys

from file_operations import read_lines

# Punctuation and quote characters whose tokenization may legitimately
# differ between the two files being compared.
ignored_sequences = ['—', '«', '»', '"', '(', ')', '…', '–']


def get_tokens(lines, separator=' '):
    return list(map(lambda line: line.split(separator)[0], lines))


if __name__ == "__main__":
    first_file = get_tokens(read_lines(sys.argv[1]))
    second_file = get_tokens(read_lines(sys.argv[2]))
    min_length = min(len(first_file), len(second_file))
    # Report the first position where the two token sequences diverge.
    for i in range(min_length):
        if first_file[i] != second_file[i] and first_file[i] not in ignored_sequences:
            print(f'{i:4d}\t{first_file[i]:10s}\t{second_file[i]:10s}')
            break
from file_operations import read_lines


def get_labels(input_file):
    """Return the label column for every non-empty line of a tagged file."""
    return list(
        map(lambda line: line.split(' ')[1],
            [line for line in read_lines(input_file) if line != '']))
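# A small usage sketch, assuming a tagged file 'train.txt' (hypothetical
# name): count how often each label occurs, e.g. to inspect class balance.
if __name__ == "__main__":
    from collections import Counter
    print(Counter(get_labels('train.txt')))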