Example #1
0
 def python_basic_english_normalize(input):
     """Tokenize *input* with basic-English normalization rules.

     Lowercases the text, applies the punctuation / whitespace
     substitution patterns through ``custom_replace``, and returns the
     resulting whitespace-split token list.
     """
     replacements = [
         (r'\'', ' \'  '),
         (r'\"', ''),
         (r'\.', ' . '),
         (r'<br \/>', ' '),
         (r',', ' , '),
         (r'\(', ' ( '),
         (r'\)', ' ) '),
         (r'\!', ' ! '),
         (r'\?', ' ? '),
         (r'\;', ' '),
         (r'\:', ' '),
         (r'\s+', ' '),
     ]
     transform = custom_replace(replacements)
     # transform works on an iterable of strings; unpack the single result.
     normalized, = transform([input.lower()])
     return normalized.split()
Example #2
0
 def test_custom_replace(self):
     """custom_replace applies its patterns in order: fold 'S' to 's',
     then collapse runs of whitespace to a single space."""
     transform = custom_replace([(r'S', 's'), (r'\s+', ' ')])
     samples = [
         'test     cuStom   replace', 'with   uSer   instruction'
     ]
     expected = ['test custom replace', 'with user instruction']
     self.assertEqual(list(transform(samples)), expected)
Example #3
0
             (r'\|right', ''), (r'\|\d+px', ''), (r'\[\[image:[^\[\]]*\|', ''),
             (r'\[\[category:([^|\]]*)[^]]*\]\]', '[[$1]]'),
             (r'\[\[[a-z\-]*:[^\]]*\]\]', ''), (r'\[\[[^\|\]]*\|', '[['),
             (r'\{\{[^\}]*\}\}', ''), (r'\{[^\}]*\}', ''), (r'\[', ''),
             (r'\]', ''), (r'&[^;]*;', ' '), (r'A', 'a'), (r'B', 'b'),
             (r'C', 'c'), (r'D', 'd'), (r'E', 'e'), (r'F', 'f'), (r'G', 'g'),
             (r'H', 'h'), (r'I', 'i'), (r'J', 'j'), (r'K', 'k'), (r'L', 'l'),
             (r'M', 'm'), (r'N', 'n'), (r'O', 'o'), (r'P', 'p'), (r'Q', 'q'),
             (r'R', 'r'), (r'S', 's'), (r'T', 't'), (r'U', 'u'), (r'V', 'v'),
             (r'W', 'w'), (r'X', 'x'), (r'Y', 'y'), (r'Z', 'z'),
             (r'0', ' zero '), (r'1', ' one '), (r'2', ' two '),
             (r'3', ' three '), (r'4', ' four '), (r'5', ' five '),
             (r'6', ' six '), (r'7', ' seven '), (r'8', ' eight '),
             (r'9', ' nine '), (r'[^a-z\n]+', ' '), (r'\n ', ''),
             (r'\s+', ' '), (r'\n\s*\n', r'\n')]
# Reusable transform over an iterable of enwik9 lines: applies the
# pattern list above (wiki-markup stripping, A-Z -> a-z folding,
# digit spelling, whitespace collapsing) in order.
enwik9_norm_transform = custom_replace(_patterns)


def generate_offsets(filename):
    """Return the stream positions of every line start in *filename*.

    The list starts at position 0 and records the position reached
    after each line is read, so the final entry is the end-of-file
    position. An empty file yields ``[0]``.
    """
    with open(filename) as fp:
        positions = [fp.tell()]
        # iter(callable, sentinel): readline() returns '' at EOF.
        for _ in iter(fp.readline, ''):
            positions.append(fp.tell())
    return positions


def read_lines_from_iterator(data_path, offsets, begin_line, num_lines):
    with open(data_path) as f:
        f.seek(offsets[begin_line])
        for i in range(num_lines):