def handle(srcfs, srcft, tgtfs, tgtft, max_len=256, remove_same=False, shuf=True, max_remove=False):
    """Length-sort a sentence-aligned parallel corpus.

    Reads srcfs/srcft line-by-line in lockstep, keeps pairs where both
    sides fit the length budget, groups them by total and target token
    counts, then writes the groups to tgtfs/tgtft in iter_dict_sort order.
    remove_same/max_remove enable frequency filtering of a group; shuf
    shuffles each group before writing.
    """
    # Budget reserves 2 tokens (presumably BOS/EOS -- confirm upstream).
    budget = max(1, max_len - 2)
    grouped = {}
    with open(srcfs, "rb") as f_src, open(srcft, "rb") as f_tgt:
        for raw_s, raw_t in zip(f_src, f_tgt):
            raw_s, raw_t = raw_s.strip(), raw_t.strip()
            if not (raw_s and raw_t):
                continue
            sent_s, n_src = clean_liststr_lentok(raw_s.decode("utf-8").split())
            sent_t, n_tgt = clean_liststr_lentok(raw_t.decode("utf-8").split())
            if n_src <= budget and n_tgt <= budget:
                grouped = dict_insert_list(grouped, (sent_s, sent_t,), n_src + n_tgt, n_tgt)
    newline = "\n".encode("utf-8")
    with open(tgtfs, "wb") as f_src, open(tgtft, "wb") as f_tgt:
        for bucket in iter_dict_sort(grouped):
            src_lines, tgt_lines = zip(*bucket)
            if len(src_lines) > 1:
                if remove_same:
                    src_lines, tgt_lines = maxfreq_filter(src_lines, tgt_lines, max_remove)
                if shuf:
                    src_lines, tgt_lines = shuffle_pair(src_lines, tgt_lines)
            f_src.write("\n".join(src_lines).encode("utf-8"))
            f_src.write(newline)
            f_tgt.write("\n".join(tgt_lines).encode("utf-8"))
            f_tgt.write(newline)
def sorti(lin):
    """Group the non-empty lines of *lin* by token count (deduplicated)
    and return them as a list in iter_dict_sort order."""
    buckets = {}
    for raw in lin:
        raw = raw.strip()
        if not raw:
            continue
        buckets = dict_insert_set(buckets, raw, len(raw.split()))
    return list(iter_dict_sort(buckets))
def save_cache(cache, tgtfl):
    """Write the cached line groups to the files listed in *tgtfl*.

    Each cache entry is a group of aligned tuples; the group is transposed
    so that column i (already-encoded bytes lines) goes to output file i.
    """
    newline = "\n".encode("utf-8")
    with FileList(tgtfl, "wb") as wfl:
        for bucket in iter_dict_sort(cache):
            # Transpose the group: one column of bytes lines per output file.
            for column, out_f in zip(zip(*bucket), wfl):
                out_f.write(newline.join(column))
                out_f.write(newline)
def save_cache(cache, srcf, tgtf):
    """Write cached (source, target) string groups to srcf/tgtf in
    iter_dict_sort order, one group per newline-joined chunk."""
    newline = "\n".encode("utf-8")
    with open(srcf, "wb") as f_src, open(tgtf, "wb") as f_tgt:
        for bucket in iter_dict_sort(cache):
            src_lines, tgt_lines = zip(*bucket)
            for out_f, group in ((f_src, src_lines), (f_tgt, tgt_lines)):
                out_f.write("\n".join(group).encode("utf-8"))
                out_f.write(newline)
def write_data(data, fs, ft, ens, rsame, shuf, mclean):
    """Emit the grouped parallel data to the open binary files fs/ft.

    Groups come out in iter_dict_sort order; multi-element groups are
    optionally frequency-filtered (rsame/mclean) and shuffled (shuf)
    before being newline-joined and terminated with *ens*.
    """
    for bucket in iter_dict_sort(data):
        src, tgt = zip(*bucket)
        if len(src) > 1:
            if rsame:
                src, tgt = maxfreq_filter(src, tgt, mclean)
            if shuf:
                src, tgt = shuffle_pair(src, tgt)
        for out_f, group in ((fs, src), (ft, tgt)):
            out_f.write("\n".join(group).encode("utf-8"))
            out_f.write(ens)
def handle(srcfl, tgtfl, max_len=256, remove_same=False, shuf=True, max_remove=False):
    """Length-sort a multi-file parallel corpus.

    Reads the files in *srcfl* in lockstep via FileList, keeps line tuples
    whose token counts all fit the length budget, inserts them keyed by the
    first token of the first line plus length statistics, and writes the
    groups to the files in *tgtfl* in iter_dict_sort order.  remove_same
    with (not max_remove) selects deduplicating set insertion; max_remove
    and shuf control group-level filtering/shuffling at write time.
    """
    # Budget reserves 2 tokens (presumably BOS/EOS -- confirm upstream).
    _max_len = max(1, max_len - 2)
    _insert_func = dict_insert_set if remove_same and (not max_remove) else dict_insert_list
    data = {}
    with FileList(srcfl, "rb") as fl:
        for lines in zip(*fl):
            lines = [line.strip() for line in lines]
            if all(lines):
                lines, lens = zip(*[clean_liststr_lentok(line.decode("utf-8").split()) for line in lines])
                # Fix: filter with the reduced budget; the original compared
                # against max_len, leaving _max_len computed but unused
                # (the sibling pair-level handle() filters with the budget).
                if all_le(lens, _max_len):
                    lgth = sum(lens)
                    ls = lines[0]
                    # Key on the first token of the first line.  NOTE(review):
                    # ls.find(" ") is -1 for a single-token line, which then
                    # drops that token's last character -- verify intended.
                    data = _insert_func(data, tuple(line.encode("utf-8") for line in lines), ls[:ls.find(" ")], lgth, *reversed(lens[1:]))
    ens = "\n".encode("utf-8")
    with FileList(tgtfl, "wb") as fl:
        for tmp in iter_dict_sort(data):
            lines = zip(*tmp)
            if len(tmp) > 1:
                if max_remove:
                    lines = maxfreq_filter(*lines)
                if shuf:
                    lines = shuffle_pair(*lines)
            for du, f in zip(lines, fl):
                f.write(ens.join(du))
                f.write(ens)
def handle(srcfs, tgtfs):
    """Deduplicate the non-empty lines of *srcfs* and write them to
    *tgtfs* grouped and ordered by token count (iter_dict_sort order)."""
    data = {}
    with open(srcfs, "rb") as fs:
        for ls in fs:
            ls = ls.strip()
            if ls:
                ls, lgth = clean_list_len(ls.decode("utf-8").split())
                # setdefault replaces the original's manual in/else branch;
                # the extra "ls not in data[lgth]" test was redundant since
                # set.add is already a no-op for duplicates.
                data.setdefault(lgth, set()).add(ls)
    ens = "\n".encode("utf-8")
    with open(tgtfs, "wb") as fs:
        for tmp in iter_dict_sort(data):
            fs.write("\n".join(tmp).encode("utf-8"))
            fs.write(ens)
def handle(srcfl, tgtfl, max_len=256, remove_same=False, shuf=True, max_remove=False):
    """Sort a document-aligned multi-file corpus by document statistics.

    Documents are runs of aligned non-blank lines separated by blank lines.
    Each document is cached sentence by sentence, then flushed as one
    multi-line record per file, keyed by sentence count plus max-length and
    total-length statistics.  Groups are written to *tgtfl* separated by
    blank lines.
    """
    # Budget reserves 2 tokens (presumably BOS/EOS -- confirm upstream).
    _max_len = max(1, max_len - 2)
    _insert_func = dict_insert_set if remove_same and (not max_remove) else dict_insert_list

    def _flush(data, cache):
        # Join the cached sentences of one document into a single multi-line
        # record per file and insert it under its sorting statistics.
        nsent = len(cache)
        lines, lens = zip(*cache)
        lines = zip(*lines)
        lens = zip(*lens)
        mxlens = [max(mu) for mu in lens]
        slens = [sum(mu) for mu in lens]
        lines = tuple("\n".join(lu) for lu in lines)
        return _insert_func(data, tuple(line.encode("utf-8") for line in lines), nsent, sum(mxlens), *reversed(mxlens[1:]), sum(slens), *reversed(slens[1:]))

    data = {}
    cache = []
    with FileList(srcfl, "rb") as fl:
        for lines in zip(*fl):
            lines = [line.strip() for line in lines]
            if all(lines):
                lines, lens = zip(*[clean_liststr_lentok(line.decode("utf-8").split()) for line in lines])
                # Fix: filter with the reduced budget (_max_len was computed
                # but unused in the original, which compared against max_len).
                # The original's dead "lgth = sum(lens)" is dropped.
                if all_le(lens, _max_len):
                    cache.append((lines, lens,))
            else:
                # Blank line: document boundary -- flush the cached document.
                if cache:
                    data = _flush(data, cache)
                    cache = []
        # Flush a trailing document not followed by a blank line (the
        # sibling pair-level document handle() does the same).
        if cache:
            data = _flush(data, cache)
            cache = []
    ens = "\n\n".encode("utf-8")
    with FileList(tgtfl, "wb") as fl:
        for tmp in iter_dict_sort(data):
            lines = zip(*tmp)
            if len(tmp) > 1:
                if max_remove:
                    lines = maxfreq_filter(*lines)
                if shuf:
                    lines = shuffle_pair(*lines)
            for du, f in zip(lines, fl):
                f.write(ens.join(du))
                f.write(ens)
def handle(srcfs, srcft, tgtfs, tgtft, remove_same=False, shuf=True, max_remove=False):
    """Sort a document-aligned parallel corpus by document statistics.

    Documents are runs of aligned non-blank line pairs separated by blank
    lines.  Each document is flushed as one multi-line record per side,
    keyed by sentence count, max token lengths and total token counts, and
    the sorted groups are written to tgtfs/tgtft separated by blank lines.
    """

    def _flush_doc(data, cache, mxtoks, mxtokt, ntoks, ntokt):
        # Join the cached sentence pairs of one document into a single
        # (source, target) multi-line record and insert it under its stats.
        # Extracted because the original duplicated this block verbatim at
        # the document boundary and at end-of-input.
        nsent = len(cache)
        ls, lt = zip(*cache)
        _tmp = ("\n".join(ls), "\n".join(lt),)
        return dict_insert_set(data, _tmp, nsent, mxtoks + mxtokt, mxtokt, ntoks + ntokt, ntokt)

    data = {}
    cache = []
    mxtoks = mxtokt = ntoks = ntokt = 0
    with open(srcfs, "rb") as fs, open(srcft, "rb") as ft:
        for ls, lt in zip(fs, ft):
            ls, lt = ls.strip(), lt.strip()
            if ls and lt:
                ls, slen = clean_liststr_lentok(ls.decode("utf-8").split())
                lt, tlen = clean_liststr_lentok(lt.decode("utf-8").split())
                cache.append((ls, lt,))
                if slen > mxtoks:
                    mxtoks = slen
                if tlen > mxtokt:
                    mxtokt = tlen
                ntoks += slen
                ntokt += tlen
            else:
                # Blank line: document boundary -- flush and reset the stats.
                if cache:
                    data = _flush_doc(data, cache, mxtoks, mxtokt, ntoks, ntokt)
                    cache = []
                    mxtoks = mxtokt = ntoks = ntokt = 0
        # Flush a trailing document not followed by a blank line.
        if cache:
            data = _flush_doc(data, cache, mxtoks, mxtokt, ntoks, ntokt)
            cache = []
            mxtoks = mxtokt = ntoks = ntokt = 0
    ens = "\n\n".encode("utf-8")
    with open(tgtfs, "wb") as fs, open(tgtft, "wb") as ft:
        for tmp in iter_dict_sort(data):
            ls, lt = zip(*tmp)
            if len(ls) > 1:
                if remove_same:
                    ls, lt = maxfreq_filter(ls, lt, max_remove)
                if shuf:
                    ls, lt = shuffle_pair(ls, lt)
            fs.write("\n\n".join(ls).encode("utf-8"))
            fs.write(ens)
            ft.write("\n\n".join(lt).encode("utf-8"))
            ft.write(ens)