def handle(srcfs, srtsf_base, srttf_base, srtsf, srttf, tgtf):
    """Map every line of ``srcfs`` through a lookup table and write to ``tgtf``.

    The table is built from two line-aligned file pairs. The pair
    ``srtsf_base`` / ``srttf_base`` is read first and ``srtsf`` / ``srttf``
    second, so an entry from the second pair replaces a base entry that has
    the same key. Rows where either side is blank are ignored.

    Args:
        srcfs: Path of the file whose lines are looked up.
        srtsf_base: Path of the base key file.
        srttf_base: Path of the base value file, aligned with ``srtsf_base``.
        srtsf: Path of the overriding key file.
        srttf: Path of the overriding value file, aligned with ``srtsf``.
        tgtf: Path of the output file; one line is written per input line.
    """
    lookup = {}

    # Order matters: later pairs override earlier ones on duplicate keys.
    for key_path, value_path in ((srtsf_base, srttf_base), (srtsf, srttf)):
        with open(key_path, "rb") as key_file, open(value_path, "rb") as value_file:
            for raw_key, raw_value in zip(key_file, value_file):
                raw_key, raw_value = raw_key.strip(), raw_value.strip()
                if not (raw_key and raw_value):
                    continue
                key = clean_str(raw_key.decode("utf-8"))
                value = clean_str(raw_value.decode("utf-8"))
                lookup[key] = value

    newline = b"\n"
    with open(srcfs, "rb") as reader, open(tgtf, "wb") as writer:
        for raw_line in reader:
            content = raw_line.strip()
            if content:
                # Unknown keys produce an empty output line.
                found = lookup.get(clean_str(content.decode("utf-8")), "")
                writer.write(found.encode("utf-8"))
            # Always terminate the line so output stays aligned with input.
            writer.write(newline)
def handle(srcfs, srcft, srcfg, tgtfs, tgtft, tgtfg):
    """Copy aligned line triples whose second and third entries differ.

    The three input files are read in lockstep. A triple is considered only
    when all three lines are non-blank after stripping; it is written to the
    three output files when the cleaned line from ``srcft`` is not equal to
    the cleaned line from ``srcfg``. A summary of kept/considered counts is
    printed at the end.

    Args:
        srcfs, srcft, srcfg: Paths of the three aligned input files.
        tgtfs, tgtft, tgtfg: Paths of the corresponding output files.
    """
    newline = b"\n"
    considered = 0
    kept = 0

    with open(srcfs, "rb") as fs, open(srcft, "rb") as ft, \
            open(srcfg, "rb") as fg, open(tgtfs, "wb") as fsw, \
            open(tgtft, "wb") as ftw, open(tgtfg, "wb") as fgw:
        writers = (fsw, ftw, fgw)
        for raw_triple in zip(fs, ft, fg):
            stripped = [raw.strip() for raw in raw_triple]
            if not all(stripped):
                continue
            cleaned = [clean_str(raw.decode("utf-8")) for raw in stripped]
            considered += 1
            if cleaned[1] == cleaned[2]:
                continue
            for writer, text in zip(writers, cleaned):
                writer.write(text.encode("utf-8"))
                writer.write(newline)
            kept += 1

    if considered > 0:
        ratio = float(kept) / float(considered) * 100.0
    else:
        ratio = 0.0
    print("%d in %d data keeped with ratio %.2f" % (kept, considered, ratio))
def handle(srcf, tgtf, rankf, rssf, rstf, threshold):
    """Keep aligned line pairs whose score does not exceed ``threshold``.

    ``srcf``, ``tgtf`` and ``rankf`` are read in lockstep; each line of
    ``rankf`` is parsed as a float. Rows with any blank field are skipped.
    A pair is written to ``rssf`` / ``rstf`` when ``score <= threshold``.
    A summary of kept/considered counts is printed at the end.

    Args:
        srcf: Path of the first input file.
        tgtf: Path of the second input file, aligned with ``srcf``.
        rankf: Path of the file holding one numeric score per line.
        rssf: Output path for kept lines of ``srcf``.
        rstf: Output path for kept lines of ``tgtf``.
        threshold: Upper bound (inclusive) on the score of kept pairs.
    """
    newline = b"\n"
    considered = 0
    kept = 0

    with open(srcf, "rb") as frs, open(tgtf, "rb") as frt, \
            open(rankf, "rb") as frank, open(rssf, "wb") as fws, \
            open(rstf, "wb") as fwt:
        for raw_src, raw_tgt, raw_score in zip(frs, frt, frank):
            raw_src = raw_src.strip()
            raw_tgt = raw_tgt.strip()
            raw_score = raw_score.strip()
            if not (raw_src and raw_tgt and raw_score):
                continue
            src_text = clean_str(raw_src.decode("utf-8"))
            tgt_text = clean_str(raw_tgt.decode("utf-8"))
            value = float(raw_score.decode("utf-8"))
            considered += 1
            if value > threshold:
                continue
            fws.write(src_text.encode("utf-8"))
            fws.write(newline)
            fwt.write(tgt_text.encode("utf-8"))
            fwt.write(newline)
            kept += 1

    if considered > 0:
        ratio = float(kept) / float(considered) * 100.0
    else:
        ratio = 0.0
    print("%d in %d data keeped with ratio %.2f" % (kept, considered, ratio))
def __call__(self, paragraphs):
    """Run ``self.flow`` over the text in ``paragraphs`` and reassemble it.

    The input is split on newlines; every non-empty piece is stripped and
    passed through ``clean_str``. Each paragraph is followed by a ``"\\n"``
    marker in the working list. When ``self.sent_split`` is set, a paragraph
    is first split by it, and the cleaned pieces are filtered through
    ``clean_list``. The list is passed through ``sorti``, then through each
    callable in ``self.flow`` in order, and ``restore`` maps the results
    back onto the original list.

    Args:
        paragraphs: Newline-separated text.

    Returns:
        The restored pieces joined by single spaces, with any space directly
        before or after a newline removed.
    """
    _paras = [
        clean_str(tmpu.strip())
        for tmpu in paragraphs.strip().split("\n")
        if tmpu
    ]

    _tmp = []
    if self.sent_split is None:
        # Fixed: this loop previously iterated over the undefined name
        # ``paras`` and raised NameError when no splitter was configured.
        for _tmpu in _paras:
            _tmp.append(_tmpu)
            _tmp.append("\n")
    else:
        for _tmpu in _paras:
            _tmp.extend(
                clean_list([clean_str(_tmps) for _tmps in self.sent_split(_tmpu)])
            )
            _tmp.append("\n")

    # ``_tmpi`` keeps the flow's input so ``restore`` can pair it with the
    # flow's output.
    _tmp_o = _tmpi = sorti(_tmp)
    for pu in self.flow:
        _tmp_o = pu(_tmp_o)

    _tmp = restore(_tmp, _tmpi, _tmp_o)

    return " ".join(_tmp).replace(" \n", "\n").replace("\n ", "\n")
def handle(srcfs, srcfm, srtsf, srtmf, srttf, tgtf):
    """Look up aligned line pairs in a table keyed by two fields.

    The table is built from the aligned files ``srtsf``, ``srtmf`` and
    ``srttf``: the cleaned lines of the first two form the key and the
    cleaned line of the third is the value. Rows are skipped when the line
    from ``srtsf`` or from ``srttf`` is blank (the ``srtmf`` line may be
    blank). Each aligned pair of lines from ``srcfs`` / ``srcfm`` is then
    looked up and the result written to ``tgtf``.

    Args:
        srcfs: Path of the file providing the first key component to look up.
        srcfm: Path of the file providing the second key component.
        srtsf: Path of the table file holding the first key component.
        srtmf: Path of the table file holding the second key component.
        srttf: Path of the table file holding the values.
        tgtf: Path of the output file; one line is written per input pair.
    """
    lookup = {}
    with open(srtsf, "rb") as first_file, open(srtmf, "rb") as second_file, \
            open(srttf, "rb") as value_file:
        for raw_first, raw_second, raw_value in zip(first_file, second_file, value_file):
            first = raw_first.strip()
            second = raw_second.strip()
            value = raw_value.strip()
            if not (first and value):
                continue
            first = clean_str(first.decode("utf-8"))
            second = clean_str(second.decode("utf-8"))
            value = clean_str(value.decode("utf-8"))
            lookup[first, second] = value

    newline = b"\n"
    with open(srcfs, "rb") as first_file, open(srcfm, "rb") as second_file, \
            open(tgtf, "wb") as writer:
        for raw_first, raw_second in zip(first_file, second_file):
            first, second = raw_first.strip(), raw_second.strip()
            if first:
                first = clean_str(first.decode("utf-8"))
                second = clean_str(second.decode("utf-8"))
                # Unknown keys produce an empty output line.
                found = lookup.get((first, second), "")
                writer.write(found.encode("utf-8"))
            # Always terminate the line so output stays aligned with input.
            writer.write(newline)
def restore(src, tsrc, trs):
    """Rebuild paragraph-level strings from per-line results.

    ``tsrc`` and ``trs`` are walked in parallel to build a mapping from each
    stripped ``tsrc`` item to the cleaned, stripped ``trs`` item (pairs with
    a blank side are ignored). ``src`` is then scanned in order: consecutive
    lines with a non-empty mapped result are collected and joined with a
    single space. The collected group is emitted when a blank line or a line
    without a mapped result is met. A blank line met while nothing is
    collected yields an empty string.

    Args:
        src: Iterable of strings in their original order.
        tsrc: Iterable of strings used as mapping keys.
        trs: Iterable of strings aligned with ``tsrc``, used as values.

    Returns:
        A list of joined groups and empty-string placeholders.
    """
    mapping = {}
    for key_text, value_text in zip(tsrc, trs):
        key = key_text.strip()
        value = value_text.strip()
        if key and value:
            mapping[key] = clean_str(value)

    output = []
    pending = []

    for line in src:
        stripped = line.strip()
        if stripped:
            mapped = mapping.get(clean_str(stripped), "").strip()
        else:
            mapped = ""

        if mapped:
            pending.append(mapped)
        elif pending:
            # A blank line or an unmapped line closes the current group.
            output.append(" ".join(pending))
            pending = []
        elif not stripped:
            # Blank line with nothing pending: keep an empty placeholder.
            output.append("")

    if pending:
        output.append(" ".join(pending))

    return output
def __call__(self, paragraph):
    """Run ``self.flow`` over ``paragraph`` and return the joined result.

    The input is split on newlines and each piece is stripped. Three cases:

    * ``self.sent_split`` is set and there is more than one line: every
      non-empty line is expanded with ``self.sent_split`` and an empty
      string is inserted after each line except the last. The list is
      passed through ``sorti`` before the flow, and ``restore`` maps the
      flow's output back afterwards.
    * ``self.sent_split`` is set and there is a single line: only
      ``clean_str`` is applied; ``self.sent_split`` is not called.
    * ``self.sent_split`` is ``None``: every line is passed through
      ``clean_str``.

    Returns:
        The pieces joined with newlines when ``restore`` was used (more
        than one entry in the split list), otherwise joined with spaces.
    """
    _tmp = [tmpu.strip() for tmpu in paragraph.strip().split("\n")]
    _rs = []
    _tmpi = None
    if self.sent_split is not None:
        # Index of the last line. NOTE(review): the local name ``np``
        # would shadow a module-level numpy alias if the file has one.
        np = len(_tmp) - 1
        if np > 0:
            for _i, _tmpu in enumerate(_tmp):
                if _tmpu:
                    _rs.extend(self.sent_split(_tmpu))
                # Empty-string separator after every line but the last.
                if _i < np:
                    _rs.append("")
            # ``_tmpi`` keeps the flow's input so ``restore`` can pair it
            # with the flow's output below.
            _tmpi = sorti(_rs)
            _tmp = _tmpi
        else:
            _tmp = [clean_str(_tmp[0])]
    else:
        _tmp = [clean_str(tmpu) for tmpu in _tmp]
    # Apply each processing unit in order; each takes and returns the list.
    for pu in self.flow:
        _tmp = pu(_tmp)
    # ``_rs`` is only filled in the multi-line, sentence-split case.
    if len(_rs) > 1:
        _tmp = restore(_rs, _tmpi, _tmp)
        return "\n".join(_tmp)
    return " ".join(_tmp)
def restore(src, tsrc, trs):
    """Replace each item of ``src`` by its mapped counterpart when known.

    ``tsrc`` and ``trs`` are walked in parallel to build a mapping from each
    stripped ``tsrc`` item to the cleaned, stripped ``trs`` item; pairs with
    a blank side are ignored. Every line of ``src`` is then looked up by its
    cleaned, stripped form.

    Args:
        src: Iterable of strings in their original order.
        tsrc: Iterable of strings used as mapping keys.
        trs: Iterable of strings aligned with ``tsrc``, used as values.

    Returns:
        A list with one entry per item of ``src``: the mapped value, or the
        original item unchanged when no mapping exists.
    """
    mapping = {}
    for key_text, value_text in zip(tsrc, trs):
        key = key_text.strip()
        value = value_text.strip()
        if key and value:
            mapping[key] = clean_str(value)

    restored = []
    for line in src:
        lookup_key = clean_str(line.strip())
        restored.append(mapping.get(lookup_key, line))
    return restored
def handle(srcfl, srtfl, tgtf):
    """Look up aligned rows from ``srcfl`` in a table built from ``srtfl``.

    The files opened from ``srtfl`` are read in lockstep. For every row in
    which all fields are non-blank, the cleaned fields except the last form
    the key and the last field, UTF-8 encoded, is the value. The files
    opened from ``srcfl`` are then read in lockstep. A row whose fields are
    all non-blank and whose cleaned tuple is a known key has its value
    written to ``tgtf``. A newline is written for every row, so misses and
    blank rows leave an empty line.

    Args:
        srcfl: Paths of the aligned files to look up, passed to ``FileList``.
        srtfl: Paths of the aligned table files, passed to ``FileList``; the
            last one holds the values.
        tgtf: Path of the output file.
    """
    lookup = {}
    with FileList(srtfl, "rb") as table_files:
        for raw_row in zip(*table_files):
            row = tuple(field.strip() for field in raw_row)
            if not all(row):
                continue
            row = tuple(clean_str(field.decode("utf-8")) for field in row)
            lookup[row[:-1]] = row[-1].encode("utf-8")

    newline = b"\n"
    with FileList(srcfl, "rb") as query_files, open(tgtf, "wb") as writer:
        for raw_row in zip(*query_files):
            row = tuple(field.strip() for field in raw_row)
            if all(row):
                key = tuple(clean_str(field.decode("utf-8")) for field in row)
                # Stored values are always bytes, so None means "no entry".
                found = lookup.get(key)
                if found is not None:
                    writer.write(found)
            # Always terminate the line so output stays aligned with input.
            writer.write(newline)
def handle(srcfl, rsfl):
    """Shuffle aligned rows from several files and write them back out.

    The files opened from ``srcfl`` are read in lockstep; each row holds the
    stripped, decoded and cleaned line of every file. The rows are shuffled
    as units, so lines stay aligned across files, and each column is written
    to the corresponding path in ``rsfl``.

    Args:
        srcfl: Paths of the aligned input files, passed to ``FileList``.
        rsfl: Output paths, matched positionally with the input columns.
    """
    with FileList(srcfl, "rb") as readers:
        rows = [
            [clean_str(raw.strip().decode("utf-8")) for raw in raw_row]
            for raw_row in zip(*readers)
        ]

    shuffle(rows)

    newline = b"\n"
    # Transpose rows back into one column per output file.
    columns = zip(*rows)
    for column, out_path in zip(columns, rsfl):
        with open(out_path, "wb") as writer:
            writer.write("\n".join(column).encode("utf-8"))
            writer.write(newline)
def handle(srcfl, rsfl):
    """Shuffle aligned rows from several files and write them back out.

    The files in ``srcfl`` are read in lockstep; each row holds the
    stripped, decoded and cleaned line of every file. The rows are shuffled
    as units, so lines stay aligned across files, and each column is written
    to the corresponding path in ``rsfl``.

    All files are managed by ``ExitStack`` so they are closed even when
    reading, decoding or cleaning raises, and even when there are fewer data
    columns than output paths. Previously such output files were opened but
    never closed.

    Args:
        srcfl: Paths of the aligned input files.
        rsfl: Output paths, matched positionally with the input columns.
            Every path is opened for writing (and therefore created or
            truncated) even if no data is written to it.
    """
    # Local import keeps this block self-contained.
    from contextlib import ExitStack

    with ExitStack() as stack:
        readers = [stack.enter_context(open(path, "rb")) for path in srcfl]
        data = [
            [clean_str(raw.strip().decode("utf-8")) for raw in raw_row]
            for raw_row in zip(*readers)
        ]

    shuffle(data)

    newline = b"\n"
    with ExitStack() as stack:
        writers = [stack.enter_context(open(path, "wb")) for path in rsfl]
        # Transpose rows back into one column per output file.
        for column, writer in zip(zip(*data), writers):
            writer.write("\n".join(column).encode("utf-8"))
            writer.write(newline)