def setUp(self):
    """Build the shared instance, token lists, and alignments used by the tests."""
    raw = ("balhd dfds dkkdf ldf\n"
           "experiment this is a\n"
           "this is a test")
    self.inst = raw_txt_to_inst(raw)
    self.gloss_tokens = tokenize_string("experiment this is a")
    self.trans_tokens = tokenize_string("this is a test")
    self.a = Alignment([(2, 3), (1, 2), (3, 4)])
    self.a2 = Alignment([(2, 3), (1, 2), (3, 4), (4, 1)])
def create_words_tier_from_string(string):
    """Return a WORDS_TYPE Tier holding one Item per whitespace token of *string*."""
    tier = Tier(type=WORDS_TYPE)
    for tok in tokenize_string(string, tokenizer=whitespace_tokenizer):
        # ask_item_id is queried against the tier built so far, so the id
        # reflects the items already appended.
        tier.append(Item(id=ask_item_id(tier), text=tok.value()))
    return tier
def add_word_tag(self, word, tag, prev_word=None, next_word=None, count=1):
    """Record *tag* for every morpheme of *word*, with its word context.

    For each morpheme (the token's ``.seq``), the tag's entry in
    ``self.sw_dict`` accumulates a list of ``(prev_word, next_word)``
    context pairs and a running occurrence count.
    """
    for subword in tokenize_string(word, tokenizer=morpheme_tokenizer):
        key = subword.seq
        # setdefault replaces the original `tag not in d.keys()` check-then-
        # branch: a single lookup that yields an identical resulting state
        # whether or not the tag was already present.
        entry = self.sw_dict[key].setdefault(tag, {'contexts': [], 'count': 0})
        entry['contexts'].append((prev_word, next_word))
        entry['count'] += count
def remove_tags(source_path, target_path):
    """Strip tags from each line of *source_path*, writing the text to *target_path*.

    Every input line is run through the tag tokenizer and the tokens' text
    is written out, one output line per input line.
    """
    # `with` guarantees both handles are closed even if tokenization raises;
    # the original leaked the open files on any exception.
    with open(source_path, 'r', encoding='utf-8') as source_f, \
         open(target_path, 'w', encoding='utf-8') as target_f:
        for line in source_f:
            tokens = tokenize_string(line, tokenizer=tag_tokenizer)
            target_f.write(tokens.text() + '\n')
def clean_new_trans_test(self):
    """Check clean_trans_string output and its subsequent sentence tokenization."""
    orig = '"I don\'t understand any of it; I don\'t understand it at all"'
    expected = " I don\'t understand any of it; I don\'t understand it at all "
    cleaned = clean_trans_string(orig)
    self.assertEqual(cleaned, expected)
    retokenized = tokenize_string(cleaned, tokenizer=sentence_tokenizer).text()
    self.assertEqual(
        retokenized,
        "I don't understand any of it ; I don't understand it at all",
    )
def from_giza_lines(cls, tgt, aln):
    """
    Return the target-to-source alignment from the target and aln lines of giza.
    """
    # Target tokens come straight from the provided target line.
    tgt_tokens = tokenize_string(tgt, whitespace_tokenizer)
    # The alignment pairs are encoded on the aln line.
    a = Alignment.from_giza(aln)
    # The aln line also carries the source tokens as "word ({ ... })" groups.
    # Raw string: the original non-raw pattern relied on Python passing
    # unknown escapes (\S, \(, \{) through, which now raises SyntaxWarning.
    alignments = re.findall(r'(\S+) \(\{(.*?)\}\)', aln)
    # Skip the first group — presumably the NULL token giza prepends
    # (TODO confirm against actual giza output).
    src_tokens = [group[0] for group in alignments[1:]]
    return cls(src_tokens, tgt_tokens, a)
def stanford_stdout_handler(output, queue):
    """Tokenize *output* with the tag tokenizer and append the result to *queue*."""
    tokenized = tokenize_string(output, tokenizer=tag_tokenizer)
    queue.append(tokenized)
def runTest(self):
    """Construct an AlignedSent from two tokenized sentences and an alignment."""
    src = tokenize_string('This is a test sentence')
    tgt = tokenize_string('test sentence this is')
    pairs = Alignment([(1, 3), (2, 4), (4, 1), (5, 2)])
    a_sent = AlignedSent(src, tgt, pairs)