Esempio n. 1
0
    def read_file(self, filename):
        """Load tab-separated sentence pairs and build both vocabularies.

        Reads ``data/<filename>.txt`` as UTF-8 and splits it into lines. Each
        line is split on tabs, every field is passed through
        ``seq_utils.normalizeString``, and the first two fields are kept as
        a pair. The pairs are then filtered with ``self.filterPairs``.

        Args:
            filename: Base name of the file inside ``data/``, without the
                ``.txt`` extension.

        Returns:
            A ``(source, target, pairs)`` tuple. ``source`` and ``target``
            are ``Language`` objects populated only from pairs accepted by
            ``self.is_valid_pair``; ``pairs`` is the full list returned by
            ``self.filterPairs`` (it is not narrowed by ``is_valid_pair``).
        """
        print("Reading lines...")

        # Read the file and split into lines. The context manager guarantees
        # the handle is closed even if reading or decoding raises.
        with open('data/%s.txt' % (filename), encoding='utf-8') as data_file:
            lines = data_file.read().strip().split('\n')

        # Split every line into pairs and normalize
        pairs = [
            [seq_utils.normalizeString(field) for field in line.split('\t')][:2]
            for line in lines
        ]

        pairs = self.filterPairs(pairs)

        source = Language()
        target = Language()

        # Only valid pairs contribute words to the vocabularies.
        for pair in pairs:
            if self.is_valid_pair(pair[0], pair[1]):
                source.addSentence(pair[0])
                target.addSentence(pair[1])

        print(f'Source language counted words: {source.n_words}')
        print(f'Target language counted words: {target.n_words}')

        return source, target, pairs