def _fill_lang(self, tuv, seg):
    d = dict()
    d['source_language'] = TMUtils.lang2short(self._get_lang(tuv[0]))
    d['target_language'] = TMUtils.lang2short(self._get_lang(tuv[1]))
    d['source_text'] = self._get_text(seg[0])
    if isinstance(d['source_text'], bytes):
        # Decode raw bytes so downstream code always sees str
        d['source_text'] = d['source_text'].decode('utf8')
    d['target_text'] = self._get_text(seg[1])
    if isinstance(d['target_text'], bytes):
        d['target_text'] = d['target_text'].decode('utf8')
    d['source_metadata'] = self._parse_metadata(tuv[0])
    d['target_metadata'] = self._parse_metadata(tuv[1])
    return d
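# A minimal sketch of the document _fill_lang builds for an en->es tu
# (assumptions: `parser` is an instance of this class, and `tuv_en`/`seg_en`
# etc. are TMX <tuv>/<seg> elements; all values are illustrative):
#
#   d = parser._fill_lang((tuv_en, tuv_es), (seg_en, seg_es))
#   # d == {'source_language': 'en', 'target_language': 'es',
#   #       'source_text': 'Hello', 'target_text': 'Hola',
#   #       'source_metadata': {...}, 'target_metadata': {...}}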
def _segment2doc(self, segment, ftype):
    doc = {'text': getattr(segment, ftype + '_text')}
    # Optional fields (POS, tokenized); guard with hasattr before reading
    if hasattr(segment, ftype + '_pos'):
        doc['pos'] = getattr(segment, ftype + '_pos')
    op_ftype = 'source' if ftype == 'target' else 'target'
    # Auxiliary field to facilitate language matrix generation
    doc['target_language'] = [TMUtils.lang2short(TMUtils.str2list(getattr(segment, op_ftype + '_language'))[0])]
    doc['token_cnt'] = self.token_count(getattr(segment, ftype + '_text'),
                                        getattr(segment, ftype + '_language'))
    return doc
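# Sketch of the resulting document (assumption: a hypothetical `segment` with
# source_text='Hello world', source_pos='NN NN', source_language='en' and
# target_language='es'; the exact token_cnt depends on self.token_count):
#
#   self._segment2doc(segment, 'source')
#   # -> {'text': 'Hello world', 'pos': 'NN NN',
#   #     'target_language': ['es'], 'token_cnt': 2}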
def _gen_lang_pairs(self, tuv, seg):
    # Group the tuv/seg elements of the given tu by (short) language code
    lang_map = dict()
    for tu_el, seg_el in zip(tuv, seg):
        lang = TMUtils.lang2short(self._get_lang(tu_el))
        if lang not in lang_map:
            lang_map[lang] = []
        lang_map[lang].append((tu_el, seg_el))
    # Generate all requested pairs (note: one tu can contain multiple
    # translations for the same language)
    for s_lang, t_lang in self.lang_pairs:
        for s_tuv, s_seg in lang_map.get(s_lang, []):
            for t_tuv, t_seg in lang_map.get(t_lang, []):
                yield self._fill_lang((s_tuv, t_tuv), (s_seg, t_seg))
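# A standalone sketch of the cross-product pairing above; `demo_pairs` and the
# sample data are hypothetical, introduced only to illustrate how one tu with
# several tuvs in the same language yields one pair per combination.
def demo_pairs():
    tuvs = [('en', 'Hello'), ('es', 'Hola'), ('es', 'Buenas')]
    lang_pairs = [('en', 'es')]
    # Group texts by language, mirroring lang_map above
    lang_map = {}
    for lang, text in tuvs:
        lang_map.setdefault(lang, []).append(text)
    # Yield every (source, target) combination for each requested pair
    for s_lang, t_lang in lang_pairs:
        for s in lang_map.get(s_lang, []):
            for t in lang_map.get(t_lang, []):
                yield (s, t)

# list(demo_pairs()) -> [('Hello', 'Hola'), ('Hello', 'Buenas')]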