def handle(srcfs, srcft, tgtfs, tgtft, max_len=256, remove_same=False, shuf=True, max_remove=False):
	"""Length-sort a parallel corpus and rewrite it, optionally filtering/shuffling.

	Reads aligned lines from srcfs/srcft, keeps pairs whose cleaned token
	counts both fit within max_len - 2, groups them via dict_insert_list
	keyed on total and target length, then writes the groups in
	iter_dict_sort order to tgtfs/tgtft.
	"""

	effective_max = max(1, max_len - 2)
	buckets = {}
	with open(srcfs, "rb") as f_src, open(srcft, "rb") as f_tgt:
		for raw_s, raw_t in zip(f_src, f_tgt):
			raw_s, raw_t = raw_s.strip(), raw_t.strip()
			if not (raw_s and raw_t):
				continue
			sent_s, n_s = clean_liststr_lentok(raw_s.decode("utf-8").split())
			sent_t, n_t = clean_liststr_lentok(raw_t.decode("utf-8").split())
			if n_s <= effective_max and n_t <= effective_max:
				# key on combined length first, target length second
				buckets = dict_insert_list(buckets, (sent_s, sent_t,), n_s + n_t, n_t)
	newline = "\n".encode("utf-8")
	with open(tgtfs, "wb") as f_src, open(tgtft, "wb") as f_tgt:
		for group in iter_dict_sort(buckets):
			src_lines, tgt_lines = zip(*group)
			if len(src_lines) > 1:
				if remove_same:
					src_lines, tgt_lines = maxfreq_filter(src_lines, tgt_lines, max_remove)
				if shuf:
					src_lines, tgt_lines = shuffle_pair(src_lines, tgt_lines)
			f_src.write("\n".join(src_lines).encode("utf-8"))
			f_src.write(newline)
			f_tgt.write("\n".join(tgt_lines).encode("utf-8"))
			f_tgt.write(newline)
def paral_reader(fsrc, ftgt):
	"""Yield cleaned parallel pairs from two aligned files.

	Yields (src_text, tgt_text, total_len, tgt_len) for every pair of
	non-blank aligned lines. Iteration stops at the shorter file.

	Fix: the original opened the files without a context manager, so they
	leaked whenever the generator was abandoned before exhaustion; `with`
	closes them on GeneratorExit as well as on normal completion.
	"""
	with open(fsrc, "rb") as srcf, open(ftgt, "rb") as tgtf:
		# zip stops at the shorter file, matching the original readline loop.
		for src, tgt in zip(srcf, tgtf):
			src, tgt = src.strip(), tgt.strip()
			if src and tgt:
				src, lsrc = clean_liststr_lentok(src.decode("utf-8").split())
				tgt, ltgt = clean_liststr_lentok(tgt.decode("utf-8").split())
				yield src, tgt, ltgt + lsrc, ltgt
def paral_reader(srcfl):
	"""Yield cleaned aligned rows from a list of files.

	For each row where every aligned line is non-blank, yields
	(encoded_lines_tuple, total_length, *reversed(per-file lengths
	excluding the first file)).
	"""
	with FileList(srcfl, "rb") as handles:
		for row in zip(*handles):
			stripped = [entry.strip() for entry in row]
			if not all(stripped):
				continue
			cleaned = [clean_liststr_lentok(entry.decode("utf-8").split()) for entry in stripped]
			texts, lengths = zip(*cleaned)
			encoded = tuple(text.encode("utf-8") for text in texts)
			yield (encoded, sum(lengths), *reversed(lengths[1:]))
def handle(srcfs, srcft, tgtfs, tgtft, maxlen=256):
	"""Copy parallel line pairs whose cleaned token counts both fit maxlen.

	Reads aligned lines from srcfs/srcft, writes the kept pairs to
	tgtfs/tgtft, and prints a keep/total summary.

	Fix: the summary divided by zero when the input contained no usable
	line pair; the ratio now defaults to 0.0 in that case.
	"""
	ens = "\n".encode("utf-8")
	with open(srcfs, "rb") as fs, open(srcft, "rb") as ft, open(tgtfs, "wb") as fsw, open(tgtft, "wb") as ftw:
		total = keep = 0
		for ls, lt in zip(fs, ft):
			ls, lt = ls.strip(), lt.strip()
			if ls and lt:
				ls, lt = ls.decode("utf-8"), lt.decode("utf-8")
				ls, lens = clean_liststr_lentok(ls.split())
				lt, lent = clean_liststr_lentok(lt.split())
				if (lens <= maxlen) and (lent <= maxlen):
					fsw.write(ls.encode("utf-8"))
					fsw.write(ens)
					ftw.write(lt.encode("utf-8"))
					ftw.write(ens)
					keep += 1
				total += 1
	# Guard against empty input: avoid ZeroDivisionError when total == 0.
	ratio = (float(keep) / float(total) * 100.0) if total > 0 else 0.0
	print("%d in %d data keeped with ratio %.2f" % (keep, total, ratio))
def handle(srcfl, tgtfl, max_len=256, drop_tail=False):
	"""Deduplicate aligned rows across files and write survivors through.

	drop_tail=True dedups on the first file's line only; otherwise the full
	encoded tuple of aligned lines is the dedup key.

	Fixes: removed the unused local `_max_len` (the filter keeps comparing
	against max_len, as the original did — NOTE(review): confirm whether
	max_len - 2 was intended) and merged the two nearly identical branch
	bodies into one loop that only differs in the dedup key.
	"""
	seen = set()
	ens = "\n".encode("utf-8")
	with FileList(srcfl, "rb") as frl, FileList(tgtfl, "wb") as fwl:
		for lines in zip(*frl):
			lines = [line.strip() for line in lines]
			if not all(lines):
				continue
			lines, lens = zip(*[
				clean_liststr_lentok(line.decode("utf-8").split())
				for line in lines
			])
			if not all_le(lens, max_len):
				continue
			encoded = tuple(line.encode("utf-8") for line in lines)
			# Only the dedup key differs between the two original branches.
			key = encoded[0] if drop_tail else encoded
			if key not in seen:
				for du, f in zip(encoded, fwl):
					f.write(du)
					f.write(ens)
				seen.add(key)
def handle(srcfl, tgtd, max_len=256, remove_same=False, cache_token=500000000):
	"""Length-sort aligned rows, spilling sorted chunks to numbered cache files.

	Rows within max_len are inserted into a sort dict; once roughly
	cache_token tokens are buffered, the dict is flushed to
	"<file-index>.<chunk-index>.txt" files under tgtd and the buffer reset.
	remove_same=True dedups identical rows via dict_insert_set.

	Fixes: removed the unused local `_max_len` and factored the duplicated
	cache-path list expression into a helper.
	"""

	def save_cache(cache, tgtfl):
		# Write each sorted group, one output file per input file.
		ens = "\n".encode("utf-8")
		with FileList(tgtfl, "wb") as wfl:
			for tmp in iter_dict_sort(cache):
				lines = zip(*tmp)
				for du, f in zip(lines, wfl):
					f.write(ens.join(du))
					f.write(ens)

	def cache_paths(findex):
		# One "<file-index>.<chunk-index>.txt" path per source file.
		return [pjoin(tgtd, "%d.%d.txt" % (i, findex,)) for i in range(num_files)]

	_insert_func = dict_insert_set if remove_same else dict_insert_list
	data = {}
	mem_token = curf = 0
	num_files = len(srcfl)
	with FileList(srcfl, "rb") as fl:
		for lines in zip(*fl):
			lines = [line.strip() for line in lines]
			if all(lines):
				lines, lens = zip(*[
					clean_liststr_lentok(line.decode("utf-8").split())
					for line in lines
				])
				if all_le(lens, max_len):
					lgth = sum(lens)
					data = _insert_func(
						data,
						tuple(line.encode("utf-8") for line in lines),
						lgth, *reversed(lens[1:]))
					mem_token += lgth
					if mem_token >= cache_token:
						save_cache(data, cache_paths(curf))
						data = {}
						mem_token = 0
						curf += 1
	if data:
		save_cache(data, cache_paths(curf))
def handle(srcfl, tgtfl, max_len=256, remove_same=False, shuf=True, max_remove=False):
	"""Sort aligned rows keyed by the first file's leading token and lengths,
	then rewrite them, optionally max-frequency-filtering and shuffling groups.

	Fixes: the original extracted the leading token with ls[:ls.find(" ")],
	which returns -1 for single-token lines and silently dropped their last
	character; str.split handles that case correctly. Also removed the
	unused local `_max_len`.
	"""
	_insert_func = dict_insert_set if remove_same and (
		not max_remove) else dict_insert_list
	data = {}
	with FileList(srcfl, "rb") as fl:
		for lines in zip(*fl):
			lines = [line.strip() for line in lines]
			if all(lines):
				lines, lens = zip(*[
					clean_liststr_lentok(line.decode("utf-8").split())
					for line in lines
				])
				if all_le(lens, max_len):
					lgth = sum(lens)
					ls = lines[0]
					# First token of the first file's line (whole line when
					# there is no space — bug fix vs. ls[:ls.find(" ")]).
					first_tok = ls.split(" ", 1)[0]
					data = _insert_func(
						data,
						tuple(line.encode("utf-8") for line in lines),
						first_tok, lgth, *reversed(lens[1:]))
	ens = "\n".encode("utf-8")
	with FileList(tgtfl, "wb") as fl:
		for tmp in iter_dict_sort(data):
			lines = zip(*tmp)
			if len(tmp) > 1:
				if max_remove:
					lines = maxfreq_filter(*lines)
				if shuf:
					lines = shuffle_pair(*lines)
			for du, f in zip(lines, fl):
				f.write(ens.join(du))
				f.write(ens)
def handle(srcfs, tgtfs, max_len=1048576):
	"""Deduplicate lines of a single file, bucket them by cleaned token
	count (capped at max_len - 2), and write the buckets in sorted order.

	Fix (idiom, behavior-preserving): set.add is idempotent, so the
	original's membership test before insertion and the explicit
	key-exists branch collapse into one setdefault call.
	"""
	data = {}
	_max_len = max(1, max_len - 2)
	with open(srcfs, "rb") as fs:
		for ls in fs:
			ls = ls.strip()
			if ls:
				ls, lgth = clean_liststr_lentok(ls.decode("utf-8").split())
				if lgth <= _max_len:
					data.setdefault(lgth, set()).add(ls)
	ens = "\n".encode("utf-8")
	with open(tgtfs, "wb") as fs:
		for tmp in iter_dict_sort(data):
			fs.write("\n".join(tmp).encode("utf-8"))
			fs.write(ens)
def handle(srcfl, tgtfl, max_len=256, remove_same=False, shuf=True, max_remove=False):
	"""Group blank-line-delimited documents across aligned files, sort the
	documents by sentence count and max/total token statistics, and rewrite
	them separated by blank lines, optionally filtering/shuffling groups.

	Fixes: the original never flushed the trailing document after the read
	loop, silently dropping it unless the input ended with a blank or
	over-long row (the paired-file variant of this tool does flush);
	the duplicated flush block is now a helper; removed the unused locals
	`_max_len` and `lgth`.
	"""
	_insert_func = dict_insert_set if remove_same and (
		not max_remove) else dict_insert_list
	data = {}
	cache = []

	def flush(data, cache):
		# Collapse the cached document into one text block per file and
		# insert it keyed by sentence count, then max and total token stats.
		nsent = len(cache)
		lines, lens = zip(*cache)
		lines = zip(*lines)
		lens = zip(*lens)
		mxlens = [max(mu) for mu in lens]
		slens = [sum(mu) for mu in lens]
		lines = tuple("\n".join(lu) for lu in lines)
		return _insert_func(
			data,
			tuple(line.encode("utf-8") for line in lines),
			nsent, sum(mxlens), *reversed(mxlens[1:]),
			sum(slens), *reversed(slens[1:]))

	with FileList(srcfl, "rb") as fl:
		for lines in zip(*fl):
			lines = [line.strip() for line in lines]
			if all(lines):
				lines, lens = zip(*[
					clean_liststr_lentok(line.decode("utf-8").split())
					for line in lines
				])
				if all_le(lens, max_len):
					cache.append((lines, lens,))
				elif cache:
					# An over-long row terminates the current document.
					data = flush(data, cache)
					cache = []
			elif cache:
				# A blank row terminates the current document.
				data = flush(data, cache)
				cache = []
	# Bug fix: flush the trailing document left in the cache after EOF.
	if cache:
		data = flush(data, cache)
		cache = []
	ens = "\n\n".encode("utf-8")
	with FileList(tgtfl, "wb") as fl:
		for tmp in iter_dict_sort(data):
			lines = zip(*tmp)
			if len(tmp) > 1:
				if max_remove:
					lines = maxfreq_filter(*lines)
				if shuf:
					lines = shuffle_pair(*lines)
			for du, f in zip(lines, fl):
				f.write(ens.join(du))
				f.write(ens)
def handle(srcfs, srcft, tgtfs, tgtft, max_len=256):
	"""Two-pass mutual max-frequency cleaning of a parallel corpus.

	Pass 1: for every source sentence with several targets, keep only the
	target(s) tied for the highest count. Pass 2: for every surviving
	target, keep only the source(s) tied for the highest count, then write
	the resulting pairs.

	Refactor (behavior-preserving): the "keys tied for the maximum count"
	scan and the nested counter-insert were each duplicated two to three
	times; both are now helpers.
	"""

	def _max_keys(freqs):
		# Keys tied for the highest count, in insertion order.
		best = []
		top = 0
		for key, value in freqs.items():
			if value > top:
				top = value
				best = [key]
			elif value == top:
				best.append(key)
		return best

	def _count(table, key, subkey):
		# table[key][subkey] += 1, creating inner dicts on demand.
		if key in table:
			table[key][subkey] = table[key].get(subkey, 0) + 1
		else:
			table[key] = {subkey: 1}

	_max_len = max(1, max_len - 2)
	data = {}
	with open(srcfs, "rb") as fs, open(srcft, "rb") as ft:
		for ls, lt in zip(fs, ft):
			ls, lt = ls.strip(), lt.strip()
			if ls and lt:
				ls, slen = clean_liststr_lentok(ls.decode("utf-8").split())
				lt, tlen = clean_liststr_lentok(lt.decode("utf-8").split())
				if (slen <= _max_len) and (tlen <= _max_len):
					_count(data, ls, lt)
	_clean = {}
	for ls, v in data.items():
		# Ambiguous sources keep only their most frequent target(s).
		targets = _max_keys(v) if len(v) > 1 else list(v.keys())
		for lt in targets:
			_count(_clean, lt, ls)
	data = _clean
	ens = "\n".encode("utf-8")
	with open(tgtfs, "wb") as fs, open(tgtft, "wb") as ft:
		for lt, v in data.items():
			if len(v) > 1:
				rls = _max_keys(v)
				# Repeat the target once per surviving source.
				rlt = "\n".join([lt for i in range(len(rls))])
				rls = "\n".join(rls)
			else:
				rlt = lt
				rls = list(v.keys())[0]
			fs.write(rls.encode("utf-8"))
			fs.write(ens)
			ft.write(rlt.encode("utf-8"))
			ft.write(ens)
def handle(srcfs, srcft, tgtfs, tgtft, remove_same=False, shuf=True, max_remove=False):
	"""Group blank-line-delimited parallel documents, sort them by sentence
	count and max/total token statistics, and rewrite them separated by
	blank lines, optionally filtering/shuffling sorted groups.

	Refactor (behavior-preserving): the document-flush block was duplicated
	verbatim after the loop; it is now a single closure over the running
	statistics.
	"""
	data = {}
	cache = []
	mxtoks = mxtokt = ntoks = ntokt = 0

	def flush(data):
		# Join the cached document and insert it keyed by sentence count and
		# the max/total token statistics accumulated alongside the cache.
		nsent = len(cache)
		ls, lt = zip(*cache)
		_tmp = ("\n".join(ls), "\n".join(lt),)
		return dict_insert_set(data, _tmp, nsent, mxtoks + mxtokt, mxtokt,
			ntoks + ntokt, ntokt)

	with open(srcfs, "rb") as fs, open(srcft, "rb") as ft:
		for ls, lt in zip(fs, ft):
			ls, lt = ls.strip(), lt.strip()
			if ls and lt:
				ls, slen = clean_liststr_lentok(ls.decode("utf-8").split())
				lt, tlen = clean_liststr_lentok(lt.decode("utf-8").split())
				cache.append((ls, lt,))
				mxtoks = max(mxtoks, slen)
				mxtokt = max(mxtokt, tlen)
				ntoks += slen
				ntokt += tlen
			elif cache:
				# A blank row terminates the current document.
				data = flush(data)
				cache = []
				mxtoks = mxtokt = ntoks = ntokt = 0
	if cache:
		# Flush the trailing document left after EOF.
		data = flush(data)
		cache = []
		mxtoks = mxtokt = ntoks = ntokt = 0
	ens = "\n\n".encode("utf-8")
	with open(tgtfs, "wb") as fs, open(tgtft, "wb") as ft:
		for tmp in iter_dict_sort(data):
			ls, lt = zip(*tmp)
			if len(ls) > 1:
				if remove_same:
					ls, lt = maxfreq_filter(ls, lt, max_remove)
				if shuf:
					ls, lt = shuffle_pair(ls, lt)
			fs.write("\n\n".join(ls).encode("utf-8"))
			fs.write(ens)
			ft.write("\n\n".join(lt).encode("utf-8"))
			ft.write(ens)