コード例 #1
0
 def test_remove_punctuation_empty_input(self):
     # Edge case: an empty string has nothing to strip, so the function
     # is expected to hand back an empty string as well.
     cleaned = remove_punctuation('')
     self.assertEqual(cleaned, '')
コード例 #2
0
 def test_remove_punctuation_all_punctuations(self):
     # Input consisting solely of ASCII punctuation marks; every character
     # is expected to be removed, leaving an empty string.
     # NOTE(review): this literal contains no backtick (`), unlike
     # string.punctuation -- confirm whether the omission is intentional.
     only_marks = '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~'
     cleaned = remove_punctuation(only_marks)
     self.assertEqual(cleaned, '')
コード例 #3
0
 def test_remove_punctuation(self):
     # Typical sentence: the marks ('!!!' and '.') are expected to vanish
     # while the words and the single space between them are kept.
     sentence = 'Hello!!! Welcome.'
     cleaned = remove_punctuation(sentence)
     self.assertEqual(cleaned, 'Hello Welcome')
コード例 #4
0
 def test_remove_punctuation_no_punctuations(self):
     # Text without any punctuation is expected to pass through unchanged,
     # so the same literal serves as both input and expected result.
     plain = 'Hello world'
     cleaned = remove_punctuation(plain)
     self.assertEqual(cleaned, 'Hello world')
import sys
import os
import codecs
from text_preprocessing import remove_punctuation, remove_double_spaces, remove_noisy_digits, remove_dash_and_minus_signs, reichstag_patterns, extract_meeting_protocols_reichstag
import logging
# Root logger: INFO and above, each record rendered as
# "<timestamp> :: <level> :: <message>".
logging.basicConfig(
    format='%(asctime)s :: %(levelname)s :: %(message)s',
    level=logging.INFO,
)

# Directory containing this script (symlinks resolved by realpath).
ROOT_DIR = os.path.dirname(os.path.realpath(__file__))
# Absolute path of the "data" folder that sits next to the script.
tpath = os.path.abspath(os.path.join(ROOT_DIR, "data"))
# NOTE: runs at import time as well as on direct execution; from here on
# the process's working directory is the data folder.
os.chdir(tpath)

if __name__ == "__main__":
    # Command-line entry point.
    # Usage: <script> <file name, resolved relative to the data folder>
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaces as a bare
        # IndexError traceback; exit with a readable message instead.
        sys.exit("usage: {} <input file inside the data folder>".format(
            os.path.basename(sys.argv[0])))

    input_name = sys.argv[1]

    # Read the whole file as a list of lines. The context manager closes
    # the handle as soon as reading is done (previously it was left open
    # for the rest of the run). codecs.open is kept on purpose so that
    # line splitting behaves exactly as before.
    with codecs.open(os.path.join(tpath, input_name), 'r',
                     encoding='utf-8') as source_file:
        text = source_file.readlines()

    # Do some minor text cleaning before protocols are extracted
    text = remove_punctuation(text)
    text = remove_double_spaces(text)
    text = remove_noisy_digits(text)
    text = remove_dash_and_minus_signs(text)

    # Extract protocols and save them in ./data folder
    patterns = reichstag_patterns()
    # The last argument is the first four characters of the input file
    # name -- presumably a year prefix; TODO confirm against
    # extract_meeting_protocols_reichstag.
    extract_meeting_protocols_reichstag(text, *patterns, input_name[:4])