def test_inconsistent_issues(self):
    """Check that lines with unbalanced MQM issue tags give the empty result.

    Two inputs are parsed: one in which issue 4280 is opened by a
    ``startIssue`` tag but never closed, and one in which issue 4281 is
    closed by an ``endIssue`` tag without ever being opened.  Both calls
    are expected to return a tuple of two empty strings and two empty
    lists.

    ``sys.stderr`` is replaced by an in-memory stream while parsing and is
    always put back afterwards, so that a failing assertion cannot leave
    the process writing to a closed stream.
    """
    empty_result = ("", "", [], [])
    unclosed_issue_line = """z/2012/12/01/198819-18_de_MT2\tDas Wort orientiert sich an "Bang Dakuan".\tThe word <mqm:startIssue type="Terminology" severity="critical" note="" agent="annot16" id="4279"/>orientates<mqm:endIssue id="4279"/> itself <mqm:startIssue type="Function words" severity="critical" note="" agent="annot16" id="4280"/>by "Bang Dakuan"."""
    unopened_issue_line = """derstandart.at/2012/12/01/141907-37_de_MT2\tDas ist billig und zeiteffizient.\tThis is cheap and time-efficient<mqm:endIssue id="4281"/>."""

    saved_stderr = sys.stderr
    a_stream = StringIO.StringIO()
    sys.stderr = a_stream
    try:
        self.assertEqual(
            preprocess_wmt.parse_line(unclosed_issue_line), empty_result)
        self.assertEqual(
            preprocess_wmt.parse_line(unopened_issue_line), empty_result)
        # NOTE: the exact stderr text is not asserted; the disabled check was:
        # self.assertTrue( a_stream.getvalue() == "Inconsistent error(s): 4280\nInconsistent error 4281\n" )
        # Single-argument form: prints the same text (label, space, value)
        # whether ``print`` is the statement or the function.
        print('inconsistent issues:  ' + a_stream.getvalue())
    finally:
        # Restore the real stderr before closing the capture stream.
        sys.stderr = saved_stderr
        a_stream.close()
def test_inconsistent_issues(self):
    """Check that lines with unbalanced MQM issue tags give the empty result.

    Two inputs are parsed: one in which issue 4280 is opened by a
    ``startIssue`` tag but never closed, and one in which issue 4281 is
    closed by an ``endIssue`` tag without ever being opened.  Both calls
    are expected to return a tuple of two empty strings and two empty
    lists.

    ``sys.stderr`` is replaced by an in-memory stream while parsing and is
    always put back afterwards, so that a failing assertion cannot leave
    the process writing to a closed stream.
    """
    empty_result = ("", "", [], [])
    unclosed_issue_line = """z/2012/12/01/198819-18_de_MT2\tDas Wort orientiert sich an "Bang Dakuan".\tThe word <mqm:startIssue type="Terminology" severity="critical" note="" agent="annot16" id="4279"/>orientates<mqm:endIssue id="4279"/> itself <mqm:startIssue type="Function words" severity="critical" note="" agent="annot16" id="4280"/>by "Bang Dakuan"."""
    unopened_issue_line = """derstandart.at/2012/12/01/141907-37_de_MT2\tDas ist billig und zeiteffizient.\tThis is cheap and time-efficient<mqm:endIssue id="4281"/>."""

    saved_stderr = sys.stderr
    a_stream = StringIO.StringIO()
    sys.stderr = a_stream
    try:
        self.assertEqual(
            preprocess_wmt.parse_line(unclosed_issue_line), empty_result)
        self.assertEqual(
            preprocess_wmt.parse_line(unopened_issue_line), empty_result)
        # NOTE: the exact stderr text is not asserted; the disabled check was:
        # self.assertTrue( a_stream.getvalue() == "Inconsistent error(s): 4280\nInconsistent error 4281\n" )
        # Single-argument form: prints the same text (label, space, value)
        # whether ``print`` is the statement or the function.
        print('inconsistent issues:  ' + a_stream.getvalue())
    finally:
        # Restore the real stderr before closing the capture stream.
        sys.stderr = saved_stderr
        a_stream.close()
def test_invalid_xml(self):
    """Check that a line with a malformed issue tag gives the empty result.

    The input's ``startIssue`` tag is written with a space between ``/``
    and ``>`` (presumably the malformation the test name refers to).  The
    call is expected to return a tuple of two empty strings and two empty
    lists.  Whatever was written to stderr is printed, not asserted.

    ``sys.stderr`` is replaced by an in-memory stream while parsing and is
    always put back afterwards, so that a failing assertion cannot leave
    the process writing to a closed stream.
    """
    malformed_tag_line = """faz/2012/12/01/198819-55_de_MT2\tMan muss höllisch aufpassen\tOne must pay attention <mqm:startIssue type="Mistranslation" severity="critical" note="" agent="annot16" id="4290"/ >like hell<mqm:endIssue id="4290"/>"""

    saved_stderr = sys.stderr
    a_stream = StringIO.StringIO()
    sys.stderr = a_stream
    try:
        self.assertEqual(
            preprocess_wmt.parse_line(malformed_tag_line), ("", "", [], []))
        # Single-argument form: prints the same text (label, space, value)
        # whether ``print`` is the statement or the function.
        print('invalid xml:  ' + a_stream.getvalue())
    finally:
        # Restore the real stderr before closing the capture stream.
        sys.stderr = saved_stderr
        a_stream.close()
def test_wrong_format(self):
    """Check the result and the stderr message for a badly formatted line.

    The input contains no tab characters.  The call is expected to return
    a tuple of two empty strings and two empty lists, and to leave exactly
    ``"Wrong format\\n"`` on stderr.

    ``sys.stderr`` is replaced by an in-memory stream while parsing and is
    always put back afterwards, so that a failing assertion cannot leave
    the process writing to a closed stream.
    """
    saved_stderr = sys.stderr
    a_stream = StringIO.StringIO()
    sys.stderr = a_stream
    try:
        self.assertEqual(
            preprocess_wmt.parse_line("This is not a valid format"),
            ("", "", [], []))
        self.assertEqual(a_stream.getvalue(), "Wrong format\n")
    finally:
        # Restore the real stderr before closing the capture stream.
        sys.stderr = saved_stderr
        a_stream.close()
def test_tokenizer(self):
    """Check the token list and the issue spans produced for a tricky line.

    The target text mixes punctuation, a URL, numbers, contractions,
    quotes and a currency amount, and carries two annotated issues
    (ids 4279 and 4280).  The third element of the result is compared
    with the expected tokens; the ``start``/``end`` attributes of the
    first two entries of the fourth element are compared with fixed
    positions.
    """
    line = """z/2012/12/01/198819-18_de_MT2\tDas Wort orientiert sich an "Bang Dakuan".\tThis, sentence, very-very <mqm:startIssue type="Terminology" severity="critical" note="" agent="annot16" id="4279"/>http://website.com (complicated)<mqm:endIssue id="4279"/> 10,000 and 0.1: to tokenize; don't <mqm:startIssue type="Function words" severity="critical" note="" agent="annot16" id="4280"/>and wouldn't e.g.<mqm:endIssue id="4280"/> he'll "many" John's $200 other things."""
    expected_tokens = [
        u'This', u',', u'sentence', u',', u'very', u'-', u'very',
        u'http://website.com', u'(', u'complicated', u')',
        u'10,000', u'and', u'0.1', u':', u'to', u'tokenize', u';',
        u'do', u"n't", u'and', u'would', u"n't", u'e.g.',
        u'he', u"'ll", u'"', u'many', u'"', u'John', u"'s",
        u'$', u'200', u'other', u'things', u'.',
    ]

    # Only the last two elements of the result are inspected.
    _first, _second, tokens, spans = preprocess_wmt.parse_line(line)

    self.assertTrue(np.array_equal(tokens, expected_tokens))

    first_span = spans[0]
    self.assertTrue(first_span.start == 7)
    self.assertTrue(first_span.end == 11)

    second_span = spans[1]
    self.assertTrue(second_span.start == 20)
    self.assertTrue(second_span.end == 24)
def test_tokenizer(self):
    """Check the token list and the issue spans produced for a tricky line.

    The target text mixes punctuation, a URL, numbers, contractions,
    quotes and a currency amount, and carries two annotated issues
    (ids 4279 and 4280).  The third element of the result is compared
    with the expected tokens; the ``start``/``end`` attributes of the
    first two entries of the fourth element are compared with fixed
    positions.
    """
    line = """z/2012/12/01/198819-18_de_MT2\tDas Wort orientiert sich an "Bang Dakuan".\tThis, sentence, very-very <mqm:startIssue type="Terminology" severity="critical" note="" agent="annot16" id="4279"/>http://website.com (complicated)<mqm:endIssue id="4279"/> 10,000 and 0.1: to tokenize; don't <mqm:startIssue type="Function words" severity="critical" note="" agent="annot16" id="4280"/>and wouldn't e.g.<mqm:endIssue id="4280"/> he'll "many" John's $200 other things."""
    expected_tokens = [
        u'This', u',', u'sentence', u',', u'very', u'-', u'very',
        u'http://website.com', u'(', u'complicated', u')',
        u'10,000', u'and', u'0.1', u':', u'to', u'tokenize', u';',
        u'do', u"n't", u'and', u'would', u"n't", u'e.g.',
        u'he', u"'ll", u'"', u'many', u'"', u'John', u"'s",
        u'$', u'200', u'other', u'things', u'.',
    ]

    # Only the last two elements of the result are inspected.
    _first, _second, tokens, spans = preprocess_wmt.parse_line(line)

    self.assertTrue(np.array_equal(tokens, expected_tokens))

    first_span = spans[0]
    self.assertTrue(first_span.start == 7)
    self.assertTrue(first_span.end == 11)

    second_span = spans[1]
    self.assertTrue(second_span.start == 20)
    self.assertTrue(second_span.end == 24)
def test_cyrillic_str(self):
    """Parse a line whose target text is Cyrillic with two annotated issues.

    Nothing is asserted about the result; the test only exercises the
    call, so it fails only if ``parse_line`` raises.
    """
    # Earlier variant, kept for reference (read the line from a file):
    # preprocess.parse_line( open('test_data.txt').readline()[:-1] )
    cyrillic_line = """z/2012/12/01/198819-18_de_MT2\tDas Wort orientiert sich an "Bang Dakuan".\tФарш невозможно <mqm:startIssue type="Terminology" severity="critical" note="" agent="annot16" id="4279"/>провернуть<mqm:endIssue id="4279"/> назад <mqm:startIssue type="Function words" severity="critical" note="" agent="annot16" id="4280"/>и<mqm:endIssue id="4280"/> мясо из котлет не востановишь."""
    preprocess_wmt.parse_line(cyrillic_line)
def test_invalid_xml(self):
    """Check that a line with a malformed issue tag gives the empty result.

    The input's ``startIssue`` tag is written with a space between ``/``
    and ``>`` (presumably the malformation the test name refers to).  The
    call is expected to return a tuple of two empty strings and two empty
    lists.  Whatever was written to stderr is printed, not asserted.

    ``sys.stderr`` is replaced by an in-memory stream while parsing and is
    always put back afterwards, so that a failing assertion cannot leave
    the process writing to a closed stream.
    """
    malformed_tag_line = """faz/2012/12/01/198819-55_de_MT2\tMan muss höllisch aufpassen\tOne must pay attention <mqm:startIssue type="Mistranslation" severity="critical" note="" agent="annot16" id="4290"/ >like hell<mqm:endIssue id="4290"/>"""

    saved_stderr = sys.stderr
    a_stream = StringIO.StringIO()
    sys.stderr = a_stream
    try:
        self.assertEqual(
            preprocess_wmt.parse_line(malformed_tag_line), ("", "", [], []))
        # Single-argument form: prints the same text (label, space, value)
        # whether ``print`` is the statement or the function.
        print('invalid xml:  ' + a_stream.getvalue())
    finally:
        # Restore the real stderr before closing the capture stream.
        sys.stderr = saved_stderr
        a_stream.close()
def test_wrong_format(self):
    """Check the result and the stderr message for a badly formatted line.

    The input contains no tab characters.  The call is expected to return
    a tuple of two empty strings and two empty lists, and to leave exactly
    ``"Wrong format\\n"`` on stderr.

    ``sys.stderr`` is replaced by an in-memory stream while parsing and is
    always put back afterwards, so that a failing assertion cannot leave
    the process writing to a closed stream.
    """
    saved_stderr = sys.stderr
    a_stream = StringIO.StringIO()
    sys.stderr = a_stream
    try:
        self.assertEqual(
            preprocess_wmt.parse_line("This is not a valid format"),
            ("", "", [], []))
        self.assertEqual(a_stream.getvalue(), "Wrong format\n")
    finally:
        # Restore the real stderr before closing the capture stream.
        sys.stderr = saved_stderr
        a_stream.close()