def init_dataset(src_path, tgt_path, src_out, tgt_out):
    """Convert two parallel files of serialized sentences into dataset files.

    The two input files are read in lockstep, one serialized ``Sentence``
    per line. For every pair of lines:

    * ``src_out`` receives ``<source text>|<key>``, where the source text is
      taken from the sentence in ``src_path`` and the key from the sentence
      in ``tgt_path``;
    * ``tgt_out`` receives the source text of the sentence in ``tgt_path``.

    Iteration stops at the end of the shorter input file (``zip``
    semantics). Both output files are opened in ``'w'`` mode, so existing
    content is overwritten.

    Args:
        src_path: Path of the input file holding serialized source sentences.
        tgt_path: Path of the input file holding serialized target sentences.
        src_out: Path of the source-side output file.
        tgt_out: Path of the target-side output file.
    """
    with open(src_path) as src_reader, \
            open(tgt_path) as tgt_reader, \
            open(src_out, 'w') as src_writer, \
            open(tgt_out, 'w') as tgt_writer:
        line_pairs = zip(src_reader, tgt_reader)
        for src_line, tgt_line in tqdm(line_pairs):
            src_sentence = Sentence.deserialize(src_line)
            tgt_sentence = Sentence.deserialize(tgt_line)

            # The source-side line carries the target key after a '|'.
            src_record = "{}|{}\n".format(src_sentence.source, tgt_sentence.key)
            tgt_record = "{}\n".format(tgt_sentence.source)

            src_writer.write(src_record)
            tgt_writer.write(tgt_record)
def iter_operators(self, apply_direction=True):
    """Lazily yield sentence pairs, from the cache files when they exist.

    When ``self.cache_exists`` is false, pairs come from iterating ``self``
    and ``self._apply_operators`` is called on the ``fr`` item and then on
    the ``en`` item before the pair is yielded. When it is true, pairs are
    rebuilt with ``Sentence.deserialize`` from the lines of
    ``self.cache_fr`` and ``self.cache_en`` read in lockstep, and
    ``_apply_operators`` is not called (presumably the cache already holds
    processed sentences -- confirm against the code that writes it).

    Args:
        apply_direction: If true, yield the result of
            ``self._apply_direction(fr, en)``; otherwise yield the plain
            ``(fr, en)`` tuple.

    Yields:
        One item per sentence pair, as described for ``apply_direction``.
    """
    def emit(fr, en):
        # Single place deciding the shape of what is handed to the caller.
        if apply_direction:
            return self._apply_direction(fr, en)
        return fr, en

    if self.cache_exists:
        with open(self.cache_fr) as fr_file, open(self.cache_en) as en_file:
            for fr_line, en_line in zip(fr_file, en_file):
                fr = Sentence.deserialize(fr_line)
                en = Sentence.deserialize(en_line)
                yield emit(fr, en)
        return

    for fr, en in self:
        self._apply_operators(fr)
        self._apply_operators(en)
        yield emit(fr, en)
def find(self, sentence):
    """Look up the stored counterparts of ``sentence``.

    Selects the ``translations_units`` rows whose ``text_hash`` and
    ``language`` match ``self.hash(sentence.key)`` and
    ``sentence.language``, joins through ``translations`` to the unit in
    the other language, and returns the ``mapping`` column of those units,
    ordered by descending unit id.

    The other language is ``'en'`` when ``sentence.language`` is ``'fr'``
    and ``'fr'`` in every other case. Both language codes are formatted
    into the ``id_<lang>`` column names; the hash and the language value
    are passed to ``execute`` as bound parameters, using ``self.C`` as the
    placeholder token.

    Args:
        sentence: Object exposing ``language`` and ``key`` attributes.

    Returns:
        A list with one ``Sentence.deserialize(row['mapping'])`` result per
        fetched row.
    """
    source_lang = sentence.language
    target_lang = 'en' if source_lang == 'fr' else 'fr'
    placeholder = self.C

    # Only column suffixes and placeholder tokens are formatted in here;
    # the actual values travel separately as execute() parameters.
    query = (
        "SELECT TU_tgt.mapping FROM ( "
        "SELECT translations.id_{} as id FROM translations_units "
        "INNER JOIN translations ON translations.id_{} = translations_units.id "
        "AND translations_units.text_hash = {} "
        "AND translations_units.language = {} "
        ") as trans "
        "INNER JOIN translations_units as TU_tgt ON trans.id = TU_tgt.id "
        "ORDER BY TU_tgt.id DESC"
    ).format(target_lang, source_lang, placeholder, placeholder)

    with self.connection.cursor() as c:
        bound_values = (self.hash(sentence.key), sentence.language)
        c.execute(query, bound_values)
        rows = c.fetchall()
        return [Sentence.deserialize(row['mapping']) for row in rows]