Example #1
from file_operations import read_lines


def tokenize_tagged(input_file):
    """Split a tagged file into sentences, keeping only the token column."""
    lines = read_lines(input_file)
    sentence_tokens = []
    sentences = []
    for line in map(lambda line: line.split(' '), lines):
        if len(line) < 2:
            # A line without a token/tag pair marks a sentence boundary.
            sentences.append(sentence_tokens)
            sentence_tokens = []
        else:
            # The first field is the token; the remaining fields are tags.
            sentence_tokens.append(line[0])
    if len(sentence_tokens) > 0:
        # Flush the last sentence if the file does not end with a blank line.
        sentences.append(sentence_tokens)
    return sentences
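A minimal usage sketch, assuming read_lines returns the file's lines with newlines stripped; tokens.conll is a placeholder name for a file with one "token TAG" pair per line and blank lines between sentences:

# 'tokens.conll' is a placeholder; each inner list holds one sentence's tokens.
for sentence in tokenize_tagged('tokens.conll'):
    print(' '.join(sentence))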
Example #2
from file_operations import read_lines


def reformat_file(filename):
    """Collect per-sentence word strings and label strings from a tagged file."""
    lines = read_lines(filename)
    sentence = []
    words = []
    labels = []
    # The first two lines are treated as a header and skipped.
    for line in lines[2:]:
        if line == '':
            # Blank line: close the current sentence.
            words.append(' '.join(
                map(lambda token: token.split(' ')[0], sentence)))
            labels.append(' '.join(
                map(lambda token: token.split(' ')[-1], sentence)))
            sentence = []
        else:
            sentence.append(line)
    if sentence:
        # Flush the last sentence if the file does not end with a blank line.
        words.append(' '.join(
            map(lambda token: token.split(' ')[0], sentence)))
        labels.append(' '.join(
            map(lambda token: token.split(' ')[-1], sentence)))
    return words, labels
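A possible call, assuming the input starts with two header lines, separates sentences with blank lines, and that read_lines strips trailing newlines; dataset.txt is a placeholder name:

words, labels = reformat_file('dataset.txt')  # placeholder file name
for word_line, label_line in zip(words, labels):
    print(word_line)
    print(label_line)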
Example #3
import sys

from file_operations import read_lines

# Punctuation-like sequences whose mismatches should not be reported.
ignored_sequences = ['—', '«', '»', '"', '(', ')', '…', '–']


def get_tokens(lines, separator=' '):
    # Keep only the first field (the token) of every line.
    return list(map(lambda line: line.split(separator)[0], lines))


if __name__ == "__main__":
    # Compare the token columns of two files given on the command line and
    # report the first mismatch that is not an ignored sequence.
    first_file = get_tokens(read_lines(sys.argv[1]))
    second_file = get_tokens(read_lines(sys.argv[2]))
    min_length = min(len(first_file), len(second_file))
    for i in range(min_length):
        if (first_file[i] != second_file[i]
                and first_file[i] not in ignored_sequences):
            print(f'{i:4d}\t{first_file[i]:10s}\t{second_file[i]:10s}')
            break
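A sketch of calling the helper directly instead of through sys.argv; gold.txt and predicted.txt are placeholder names:

tokens_a = get_tokens(read_lines('gold.txt'))        # placeholder file names
tokens_b = get_tokens(read_lines('predicted.txt'))
print(len(tokens_a), len(tokens_b))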
Example #4
from file_operations import read_lines


def get_labels(input_file):
    # Keep the second field (the label) of every non-empty line.
    return list(
        map(lambda line: line.split(' ')[1],
            [i for i in read_lines(input_file) if i != '']))
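A minimal usage sketch, assuming every non-empty line has the form "token label"; tagged.txt is a placeholder name:

labels = get_labels('tagged.txt')  # placeholder file name
print(labels[:10])                 # first ten labels in file order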