Ejemplo n.º 1
0
def handle(srcfs, srcft, tgtfs, tgtft, max_len=256, remove_same=False, shuf=True, max_remove=False):
	"""Bucket a parallel corpus by length and rewrite it sorted.

	Reads the aligned source/target files (srcfs, srcft), keeps only pairs
	whose cleaned token counts both fit inside the budget (max_len minus
	two reserved positions), groups them with dict_insert_list keyed by
	combined length, then writes the sorted buckets to tgtfs/tgtft,
	optionally de-duplicating (remove_same/max_remove) and shuffling
	(shuf) each bucket.
	"""

	# Two positions of the requested budget are held back (presumably for
	# BOS/EOS markers -- TODO confirm against the tokenizer).
	token_budget = max(1, max_len - 2)

	buckets = {}

	with open(srcfs, "rb") as src_in, open(srcft, "rb") as tgt_in:
		for raw_src, raw_tgt in zip(src_in, tgt_in):
			raw_src, raw_tgt = raw_src.strip(), raw_tgt.strip()
			if not (raw_src and raw_tgt):
				continue
			src_line, src_len = clean_liststr_lentok(raw_src.decode("utf-8").split())
			tgt_line, tgt_len = clean_liststr_lentok(raw_tgt.decode("utf-8").split())
			if (src_len <= token_budget) and (tgt_len <= token_budget):
				buckets = dict_insert_list(buckets, (src_line, tgt_line,), src_len + tgt_len, tgt_len)

	newline = "\n".encode("utf-8")

	with open(tgtfs, "wb") as src_out, open(tgtft, "wb") as tgt_out:
		for bucket in iter_dict_sort(buckets):
			src_lines, tgt_lines = zip(*bucket)
			if len(src_lines) > 1:
				# Only multi-pair buckets can contain duplicates or need shuffling.
				if remove_same:
					src_lines, tgt_lines = maxfreq_filter(src_lines, tgt_lines, max_remove)
				if shuf:
					src_lines, tgt_lines = shuffle_pair(src_lines, tgt_lines)
			src_out.write("\n".join(src_lines).encode("utf-8"))
			src_out.write(newline)
			tgt_out.write("\n".join(tgt_lines).encode("utf-8"))
			tgt_out.write(newline)
Ejemplo n.º 2
0
def sorti(lin):
	"""Return the non-empty stripped lines of *lin*, sorted.

	Each line is inserted into a dict of sets via dict_insert_set keyed by
	its whitespace token count, then emitted in iter_dict_sort order.
	"""

	length_sets = {}

	for raw in lin:
		raw = raw.strip()
		if not raw:
			continue
		length_sets = dict_insert_set(length_sets, raw, len(raw.split()))

	return list(iter_dict_sort(length_sets))
Ejemplo n.º 3
0
    def save_cache(cache, tgtfl):
        """Write the sorted contents of *cache* to the files listed in tgtfl.

        Each bucket from iter_dict_sort holds tuples of parallel lines;
        the bucket is transposed so that the i-th column goes to the i-th
        output file of the FileList, one line per entry.
        """

        newline = "\n".encode("utf-8")

        with FileList(tgtfl, "wb") as out_files:
            for bucket in iter_dict_sort(cache):
                for column, out_f in zip(zip(*bucket), out_files):
                    out_f.write(newline.join(column))
                    out_f.write(newline)
Ejemplo n.º 4
0
    def save_cache(cache, srcf, tgtf):
        """Flush the sorted *cache* of (source, target) pairs to srcf/tgtf.

        Buckets come from iter_dict_sort; each bucket is transposed into a
        source column and a target column, joined with newlines, and
        appended to the corresponding output file.
        """

        newline = "\n".encode("utf-8")

        with open(srcf, "wb") as src_out, open(tgtf, "wb") as tgt_out:
            for bucket in iter_dict_sort(cache):
                src_lines, tgt_lines = zip(*bucket)
                src_out.write("\n".join(src_lines).encode("utf-8"))
                src_out.write(newline)
                tgt_out.write("\n".join(tgt_lines).encode("utf-8"))
                tgt_out.write(newline)
Ejemplo n.º 5
0
    def write_data(data, fs, ft, ens, rsame, shuf, mclean):
        """Emit the sorted pair buckets in *data* to the open files fs/ft.

        For every bucket with more than one pair, optionally drop
        duplicates (rsame, with mclean forwarded to maxfreq_filter) and
        optionally shuffle (shuf) before writing; *ens* is the encoded
        bucket separator appended after each bucket.
        """

        for bucket in iter_dict_sort(data):
            src_lines, tgt_lines = zip(*bucket)
            if len(src_lines) > 1:
                if rsame:
                    src_lines, tgt_lines = maxfreq_filter(src_lines, tgt_lines, mclean)
                if shuf:
                    src_lines, tgt_lines = shuffle_pair(src_lines, tgt_lines)
            fs.write("\n".join(src_lines).encode("utf-8"))
            fs.write(ens)
            ft.write("\n".join(tgt_lines).encode("utf-8"))
            ft.write(ens)
Ejemplo n.º 6
0
def handle(srcfl,
           tgtfl,
           max_len=256,
           remove_same=False,
           shuf=True,
           max_remove=False):
    """Length-sort an N-way parallel corpus given as a list of files.

    Lines are read in lockstep from every file in *srcfl*, cleaned with
    clean_liststr_lentok, filtered so that every side fits within the
    token budget, inserted into sort buckets, and finally written to the
    files in *tgtfl* (optionally de-duplicated and shuffled per bucket).
    """

    # Two positions of the requested budget are reserved (presumably for
    # BOS/EOS markers -- TODO confirm against the tokenizer).
    _max_len = max(1, max_len - 2)

    # Sets de-duplicate on insert, so they are only safe when duplicate
    # removal is requested without the max_remove variant.
    _insert_func = dict_insert_set if remove_same and (
        not max_remove) else dict_insert_list
    data = {}

    with FileList(srcfl, "rb") as fl:
        for lines in zip(*fl):
            lines = [line.strip() for line in lines]
            if all(lines):
                lines, lens = zip(*[
                    clean_liststr_lentok(line.decode("utf-8").split())
                    for line in lines
                ])
                # BUG FIX: previously compared against max_len, leaving the
                # reserved-token budget _max_len computed but unused
                # (the two-file variant of handle filters on _max_len).
                if all_le(lens, _max_len):
                    lgth = sum(lens)
                    ls = lines[0]
                    # Keyed by first token of the first side, then total
                    # length, then the remaining per-side lengths reversed.
                    data = _insert_func(
                        data, tuple(line.encode("utf-8") for line in lines),
                        ls[:ls.find(" ")], lgth, *reversed(lens[1:]))

    ens = "\n".encode("utf-8")

    with FileList(tgtfl, "wb") as fl:
        for tmp in iter_dict_sort(data):
            lines = zip(*tmp)
            if len(tmp) > 1:
                if max_remove:
                    lines = maxfreq_filter(*lines)
                if shuf:
                    lines = shuffle_pair(*lines)
            # Transpose the bucket: one column per output file.
            for du, f in zip(lines, fl):
                f.write(ens.join(du))
                f.write(ens)
Ejemplo n.º 7
0
def handle(srcfs, tgtfs):
    """Sort the unique non-empty lines of *srcfs* by length into *tgtfs*.

    Each cleaned line is bucketed by its token count (as returned by
    clean_list_len); buckets are emitted in iter_dict_sort order, one
    line per entry.
    """

    data = {}

    with open(srcfs, "rb") as fs:
        for ls in fs:
            ls = ls.strip()
            if ls:
                ls, lgth = clean_list_len(ls.decode("utf-8").split())
                # setdefault replaces the manual key-test/branch insert;
                # set.add is already a no-op for duplicates, so the extra
                # membership check was redundant.
                data.setdefault(lgth, set()).add(ls)

    ens = "\n".encode("utf-8")

    with open(tgtfs, "wb") as fs:
        for tmp in iter_dict_sort(data):
            fs.write("\n".join(tmp).encode("utf-8"))
            fs.write(ens)
Ejemplo n.º 8
0
def handle(srcfl,
           tgtfl,
           max_len=256,
           remove_same=False,
           shuf=True,
           max_remove=False):
    """Document-level length-sort of an N-way parallel corpus.

    Consecutive in-budget sentence tuples accumulate in a cache; the
    cache is flushed as one multi-line document whenever an empty or
    over-long line is met (both act as document boundaries), and the
    documents are then written out sorted (optionally de-duplicated and
    shuffled per bucket).
    """

    # Two positions of the requested budget are reserved (presumably for
    # BOS/EOS markers -- TODO confirm against the tokenizer).
    _max_len = max(1, max_len - 2)

    # Sets de-duplicate on insert, so they are only safe when duplicate
    # removal is requested without the max_remove variant.
    _insert_func = dict_insert_set if remove_same and (
        not max_remove) else dict_insert_list
    data = {}
    cache = []

    def _flush(data, cache):
        # Collapse the cached document into one "\n"-joined string per
        # side and insert it keyed by sentence count, summed/maximum
        # per-side lengths (this logic was previously duplicated at both
        # boundary branches).
        nsent = len(cache)
        lines, lens = zip(*cache)
        lines = zip(*lines)
        lens = zip(*lens)
        mxlens = [max(mu) for mu in lens]
        slens = [sum(mu) for mu in lens]
        lines = tuple("\n".join(lu) for lu in lines)
        return _insert_func(
            data, tuple(line.encode("utf-8") for line in lines), nsent,
            sum(mxlens), *reversed(mxlens[1:]), sum(slens),
            *reversed(slens[1:]))

    with FileList(srcfl, "rb") as fl:
        for lines in zip(*fl):
            lines = [line.strip() for line in lines]
            if all(lines):
                lines, lens = zip(*[
                    clean_liststr_lentok(line.decode("utf-8").split())
                    for line in lines
                ])
                # BUG FIX: previously compared against max_len, leaving the
                # reserved-token budget _max_len computed but unused.
                if all_le(lens, _max_len):
                    cache.append((
                        lines,
                        lens,
                    ))
                elif cache:
                    # Over-long tuple: it is dropped and ends the document.
                    data = _flush(data, cache)
                    cache = []
            elif cache:
                # Blank line on any side ends the current document.
                data = _flush(data, cache)
                cache = []
        # BUG FIX: flush the final document; previously a cache still
        # pending at EOF was silently dropped (the two-file document-level
        # variant of handle does flush its trailing cache).
        if cache:
            data = _flush(data, cache)
            cache = []

    ens = "\n\n".encode("utf-8")
    with FileList(tgtfl, "wb") as fl:
        for tmp in iter_dict_sort(data):
            lines = zip(*tmp)
            if len(tmp) > 1:
                if max_remove:
                    lines = maxfreq_filter(*lines)
                if shuf:
                    lines = shuffle_pair(*lines)
            # Transpose the bucket: one column per output file.
            for du, f in zip(lines, fl):
                f.write(ens.join(du))
                f.write(ens)
Ejemplo n.º 9
0
def handle(srcfs,
           srcft,
           tgtfs,
           tgtft,
           remove_same=False,
           shuf=True,
           max_remove=False):
    """Document-level length-sort of a two-file parallel corpus.

    Consecutive non-empty sentence pairs accumulate in a cache together
    with running max/sum token statistics; an empty line on either side
    (or EOF) flushes the cache as one document.  Documents are written
    out sorted, separated by blank lines, optionally de-duplicated
    (remove_same/max_remove) and shuffled (shuf) per bucket.
    """

    data = {}
    cache = []
    mxtoks = mxtokt = ntoks = ntokt = 0

    def _flush(data, cache, mxtoks, mxtokt, ntoks, ntokt):
        # Collapse the cached document into a ("src", "tgt") pair of
        # "\n"-joined strings and insert it keyed by sentence count and
        # the max/sum token statistics (this logic was previously
        # duplicated at the boundary branch and at EOF).
        nsent = len(cache)
        ls, lt = zip(*cache)
        _tmp = (
            "\n".join(ls),
            "\n".join(lt),
        )
        return dict_insert_set(data, _tmp, nsent, mxtoks + mxtokt, mxtokt,
                               ntoks + ntokt, ntokt)

    with open(srcfs, "rb") as fs, open(srcft, "rb") as ft:
        for ls, lt in zip(fs, ft):
            ls, lt = ls.strip(), lt.strip()
            if ls and lt:
                ls, slen = clean_liststr_lentok(ls.decode("utf-8").split())
                lt, tlen = clean_liststr_lentok(lt.decode("utf-8").split())
                cache.append((
                    ls,
                    lt,
                ))
                if slen > mxtoks:
                    mxtoks = slen
                if tlen > mxtokt:
                    mxtokt = tlen
                ntoks += slen
                ntokt += tlen
            elif cache:
                # Blank line on either side ends the current document.
                data = _flush(data, cache, mxtoks, mxtokt, ntoks, ntokt)
                cache = []
                mxtoks = mxtokt = ntoks = ntokt = 0
        # Flush the document still pending at EOF.
        if cache:
            data = _flush(data, cache, mxtoks, mxtokt, ntoks, ntokt)
            cache = []
            mxtoks = mxtokt = ntoks = ntokt = 0

    ens = "\n\n".encode("utf-8")

    with open(tgtfs, "wb") as fs, open(tgtft, "wb") as ft:
        for tmp in iter_dict_sort(data):
            ls, lt = zip(*tmp)
            if len(ls) > 1:
                if remove_same:
                    ls, lt = maxfreq_filter(ls, lt, max_remove)
                if shuf:
                    ls, lt = shuffle_pair(ls, lt)
            # Documents are themselves multi-line, so both the joiner and
            # the trailing separator are blank lines.
            fs.write("\n\n".join(ls).encode("utf-8"))
            fs.write(ens)
            ft.write("\n\n".join(lt).encode("utf-8"))
            ft.write(ens)