Esempio n. 1
0
def handle(srcfs, srcft, tgtfs, tgtft, max_len=256, remove_same=False, shuf=True, max_remove=False):
	"""Filter a parallel corpus by length and rewrite it in sorted groups.

	srcfs/srcft are read line by line in lockstep; a pair is kept when both
	sides are non-empty after stripping and both token counts are at most
	max(1, max_len - 2). Kept pairs are stored through dict_insert_list under
	the keys (source length + target length, target length) and written to
	tgtfs/tgtft group by group in the order produced by iter_dict_sort.

	For groups holding more than one pair, remove_same routes the group
	through maxfreq_filter (with max_remove forwarded) and shuf routes it
	through shuffle_pair; both helpers are defined outside this block.
	"""

	limit = max(1, max_len - 2)
	buckets = {}

	with open(srcfs, "rb") as src_in, open(srcft, "rb") as tgt_in:
		for raw_src, raw_tgt in zip(src_in, tgt_in):
			raw_src = raw_src.strip()
			raw_tgt = raw_tgt.strip()
			# a pair with an empty side is dropped
			if not (raw_src and raw_tgt):
				continue
			src, n_src = clean_liststr_lentok(raw_src.decode("utf-8").split())
			tgt, n_tgt = clean_liststr_lentok(raw_tgt.decode("utf-8").split())
			if (n_src > limit) or (n_tgt > limit):
				continue
			buckets = dict_insert_list(buckets, (src, tgt,), n_src + n_tgt, n_tgt)

	newline = "\n".encode("utf-8")

	with open(tgtfs, "wb") as src_out, open(tgtft, "wb") as tgt_out:
		for group in iter_dict_sort(buckets):
			src_side, tgt_side = zip(*group)
			# filtering and shuffling are only meaningful for multi-pair groups
			if len(src_side) > 1:
				if remove_same:
					src_side, tgt_side = maxfreq_filter(src_side, tgt_side, max_remove)
				if shuf:
					src_side, tgt_side = shuffle_pair(src_side, tgt_side)
			src_out.write("\n".join(src_side).encode("utf-8") + newline)
			tgt_out.write("\n".join(tgt_side).encode("utf-8") + newline)
Esempio n. 2
0
 def paral_reader(fsrc, ftgt):
     """Yield cleaned sentence pairs read in lockstep from two files.

     For every line pair where both sides are non-empty after stripping,
     yields (source, target, source length + target length, target length),
     with text and lengths as returned by clean_liststr_lentok. Reading
     stops as soon as either file is exhausted.
     """
     # The context manager closes both files even when the consumer abandons
     # the generator early, an exception is raised, or the second open fails.
     with open(fsrc, "rb") as srcf, open(ftgt, "rb") as tgtf:
         for src, tgt in zip(srcf, tgtf):
             src, tgt = src.strip(), tgt.strip()
             if src and tgt:
                 src, lsrc = clean_liststr_lentok(src.decode("utf-8").split())
                 tgt, ltgt = clean_liststr_lentok(tgt.decode("utf-8").split())
                 yield src, tgt, ltgt + lsrc, ltgt
Esempio n. 3
0
	def paral_reader(srcfl):
		"""Yield cleaned, UTF-8 encoded line tuples from parallel files.

		The files named in srcfl are opened through FileList and read in
		lockstep. A row is skipped when any of its lines is empty after
		stripping. Each yielded item is a flat tuple: the tuple of encoded
		cleaned lines, the sum of all token counts, then the token counts of
		every file but the first, in reverse order.
		"""

		with FileList(srcfl, "rb") as fl:
			for row in zip(*fl):
				stripped = [item.strip() for item in row]
				if not all(stripped):
					continue
				cleaned = [clean_liststr_lentok(item.decode("utf-8").split()) for item in stripped]
				texts, lens = zip(*cleaned)
				encoded = tuple(text.encode("utf-8") for text in texts)
				yield encoded, sum(lens), *reversed(lens[1:])
Esempio n. 4
0
def handle(srcfs, srcft, tgtfs, tgtft, maxlen=256):
	"""Copy the sentence pairs whose token counts do not exceed maxlen.

	srcfs/srcft are read in lockstep. Pairs with an empty side (after
	stripping) are ignored and not counted. Every remaining pair is cleaned
	with clean_liststr_lentok and written to tgtfs/tgtft when both token
	counts are <= maxlen. A summary of kept versus considered pairs is
	printed at the end.
	"""

	ens = "\n".encode("utf-8")
	total = keep = 0

	with open(srcfs, "rb") as fs, open(srcft, "rb") as ft, open(tgtfs, "wb") as fsw, open(tgtft, "wb") as ftw:
		for ls, lt in zip(fs, ft):
			ls, lt = ls.strip(), lt.strip()
			if ls and lt:
				ls, lens = clean_liststr_lentok(ls.decode("utf-8").split())
				lt, lent = clean_liststr_lentok(lt.decode("utf-8").split())
				if (lens <= maxlen) and (lent <= maxlen):
					fsw.write(ls.encode("utf-8"))
					fsw.write(ens)
					ftw.write(lt.encode("utf-8"))
					ftw.write(ens)
					keep += 1
				total += 1

	# Empty or all-blank input leaves total at 0; report a 0.00 ratio instead
	# of failing with ZeroDivisionError.
	ratio = float(keep) / float(total) * 100.0 if total else 0.0
	print("%d in %d data keeped with ratio %.2f" % (keep, total, ratio))
Esempio n. 5
0
def handle(srcfl, tgtfl, max_len=256, drop_tail=False):
    """Remove duplicate rows from a set of parallel files.

    The files in srcfl are read in lockstep through FileList. A row is
    considered only when every line is non-empty after stripping and every
    token count passes all_le against max(1, max_len - 2). The first
    occurrence of each row is written to the corresponding files in tgtfl;
    later duplicates are skipped.

    drop_tail selects the duplicate key: when true only the first file's
    cleaned line identifies a row, otherwise the whole tuple of lines does.
    """

    # The length bound is reduced by two, as the other handlers in this
    # collection do. Previously this value was computed but the raw max_len
    # was passed to all_le, leaving the reduction without effect.
    _max_len = max(1, max_len - 2)
    seen = set()
    ens = "\n".encode("utf-8")

    with FileList(srcfl, "rb") as frl, FileList(tgtfl, "wb") as fwl:
        for lines in zip(*frl):
            lines = [line.strip() for line in lines]
            if not all(lines):
                continue
            lines, lens = zip(*[
                clean_liststr_lentok(line.decode("utf-8").split())
                for line in lines
            ])
            if not all_le(lens, _max_len):
                continue
            lines = tuple(line.encode("utf-8") for line in lines)
            key = lines[0] if drop_tail else lines
            if key in seen:
                continue
            for du, f in zip(lines, fwl):
                f.write(du)
                f.write(ens)
            seen.add(key)
Esempio n. 6
0
def handle(srcfl, tgtd, max_len=256, remove_same=False, cache_token=500000000):
    """Sort parallel files by length, writing the result in shards.

    Rows are read in lockstep from the files in srcfl. A row is kept when all
    of its lines are non-empty after stripping and all token counts pass
    all_le against max(1, max_len - 2). Kept rows are accumulated through
    dict_insert_set (remove_same) or dict_insert_list, keyed by the total
    token count followed by the counts of every file but the first in
    reverse order.

    Whenever the accumulated token count reaches cache_token, the buffer is
    written to "<i>.<shard>.txt" files under tgtd (i is the index of the
    input file) and a new shard is started; leftover rows are written as the
    final shard.
    """

    def save_cache(cache, tgtfl):
        # Write one shard: groups in iter_dict_sort order, one line per row.
        ens = "\n".encode("utf-8")

        with FileList(tgtfl, "wb") as wfl:
            for tmp in iter_dict_sort(cache):
                lines = zip(*tmp)
                for du, f in zip(lines, wfl):
                    f.write(ens.join(du))
                    f.write(ens)

    def shard_files(shard):
        # One output path per input file for the given shard index.
        return [pjoin(tgtd, "%d.%d.txt" % (i, shard,)) for i in range(num_files)]

    # The length bound is reduced by two, as the other handlers in this
    # collection do. Previously this value was computed but the raw max_len
    # was passed to all_le, leaving the reduction without effect.
    _max_len = max(1, max_len - 2)

    _insert_func = dict_insert_set if remove_same else dict_insert_list
    data = {}
    mem_token = curf = 0
    num_files = len(srcfl)
    with FileList(srcfl, "rb") as fl:
        for lines in zip(*fl):
            lines = [line.strip() for line in lines]
            if not all(lines):
                continue
            lines, lens = zip(*[
                clean_liststr_lentok(line.decode("utf-8").split())
                for line in lines
            ])
            if not all_le(lens, _max_len):
                continue
            lgth = sum(lens)
            data = _insert_func(
                data, tuple(line.encode("utf-8") for line in lines),
                lgth, *reversed(lens[1:]))
            mem_token += lgth
            if mem_token >= cache_token:
                save_cache(data, shard_files(curf))
                data = {}
                mem_token = 0
                curf += 1
    if data:
        save_cache(data, shard_files(curf))
Esempio n. 7
0
def handle(srcfl,
           tgtfl,
           max_len=256,
           remove_same=False,
           shuf=True,
           max_remove=False):
    """Sort parallel files by the first token of the first file, then length.

    Rows are read in lockstep from the files in srcfl. A row is kept when all
    of its lines are non-empty after stripping and all token counts pass
    all_le against max(1, max_len - 2). Kept rows are stored under the keys
    (first token of the first file's cleaned line, total token count, token
    counts of every file but the first in reverse order) and written to the
    files in tgtfl group by group in iter_dict_sort order.

    dict_insert_set is used when remove_same is set and max_remove is not;
    otherwise dict_insert_list. For groups with more than one row, max_remove
    applies maxfreq_filter and shuf applies shuffle_pair.
    """

    # The length bound is reduced by two, as the other handlers in this
    # collection do. Previously this value was computed but the raw max_len
    # was passed to all_le, leaving the reduction without effect.
    _max_len = max(1, max_len - 2)

    _insert_func = dict_insert_set if remove_same and (
        not max_remove) else dict_insert_list
    data = {}

    with FileList(srcfl, "rb") as fl:
        for lines in zip(*fl):
            lines = [line.strip() for line in lines]
            if not all(lines):
                continue
            lines, lens = zip(*[
                clean_liststr_lentok(line.decode("utf-8").split())
                for line in lines
            ])
            if not all_le(lens, _max_len):
                continue
            lgth = sum(lens)
            # partition keeps a single-token line intact; slicing with
            # find(" ") would cut its last character because find returns -1
            # when there is no space.
            first_token = lines[0].partition(" ")[0]
            data = _insert_func(
                data, tuple(line.encode("utf-8") for line in lines),
                first_token, lgth, *reversed(lens[1:]))

    ens = "\n".encode("utf-8")

    with FileList(tgtfl, "wb") as fl:
        for tmp in iter_dict_sort(data):
            lines = zip(*tmp)
            if len(tmp) > 1:
                if max_remove:
                    lines = maxfreq_filter(*lines)
                if shuf:
                    lines = shuffle_pair(*lines)
            for du, f in zip(lines, fl):
                f.write(ens.join(du))
                f.write(ens)
Esempio n. 8
0
def handle(srcfs, tgtfs, max_len=1048576):
    """Write the unique cleaned lines of srcfs to tgtfs, grouped by length.

    Lines that are empty after stripping are skipped. Each remaining line is
    cleaned with clean_liststr_lentok and kept when its token count is at
    most max(1, max_len - 2). Kept lines are collected in one set per token
    count, and the sets are written out in the order produced by
    iter_dict_sort, one line per entry.
    """

    limit = max(1, max_len - 2)
    by_length = {}

    with open(srcfs, "rb") as reader:
        for raw in reader:
            raw = raw.strip()
            if not raw:
                continue
            text, ntok = clean_liststr_lentok(raw.decode("utf-8").split())
            if ntok > limit:
                continue
            # the set takes care of duplicate lines of the same length
            by_length.setdefault(ntok, set()).add(text)

    newline = "\n".encode("utf-8")

    with open(tgtfs, "wb") as writer:
        for group in iter_dict_sort(by_length):
            writer.write("\n".join(group).encode("utf-8") + newline)
Esempio n. 9
0
def handle(srcfl,
           tgtfl,
           max_len=256,
           remove_same=False,
           shuf=True,
           max_remove=False):
    """Sort documents of parallel files by size.

    Rows are read in lockstep from the files in srcfl. Consecutive rows whose
    lines are all non-empty and whose token counts pass all_le against
    max(1, max_len - 2) are buffered as one document. A row with an empty
    line, or one that fails the length check, ends the current document; the
    offending row itself is dropped.

    Each finished document is stored, with its rows joined by "\n" per file,
    under the keys (number of rows, sum of per-file maximum lengths,
    per-file maximum lengths of every file but the first in reverse order,
    sum of per-file total lengths, per-file total lengths of every file but
    the first in reverse order). Documents are written to the files in tgtfl
    separated by a blank line, group by group in iter_dict_sort order.

    dict_insert_set is used when remove_same is set and max_remove is not;
    otherwise dict_insert_list. For groups with more than one document,
    max_remove applies maxfreq_filter and shuf applies shuffle_pair.
    """

    # The length bound is reduced by two, as the other handlers in this
    # collection do. Previously this value was computed but the raw max_len
    # was passed to all_le, leaving the reduction without effect.
    _max_len = max(1, max_len - 2)

    _insert_func = dict_insert_set if remove_same and (
        not max_remove) else dict_insert_list

    def flush(data, cache):
        # Store the buffered rows as one document and return the updated dict.
        nsent = len(cache)
        lines, lens = zip(*cache)
        # Materialise the per-file length columns: they are traversed twice,
        # and a bare zip iterator would be exhausted after the first pass,
        # leaving the totals empty.
        lens = tuple(zip(*lens))
        mxlens = [max(mu) for mu in lens]
        slens = [sum(mu) for mu in lens]
        doc = tuple("\n".join(lu).encode("utf-8") for lu in zip(*lines))
        return _insert_func(
            data, doc,
            nsent, sum(mxlens), *reversed(mxlens[1:]),
            sum(slens), *reversed(slens[1:]))

    data = {}
    cache = []

    with FileList(srcfl, "rb") as fl:
        for lines in zip(*fl):
            lines = [line.strip() for line in lines]
            if all(lines):
                lines, lens = zip(*[
                    clean_liststr_lentok(line.decode("utf-8").split())
                    for line in lines
                ])
                if all_le(lens, _max_len):
                    cache.append((
                        lines,
                        lens,
                    ))
                    continue
            # a blank or over-long row closes the current document
            if cache:
                data = flush(data, cache)
                cache = []

    # Input that does not end with a blank row still has a pending document.
    if cache:
        data = flush(data, cache)

    ens = "\n\n".encode("utf-8")
    with FileList(tgtfl, "wb") as fl:
        for tmp in iter_dict_sort(data):
            lines = zip(*tmp)
            if len(tmp) > 1:
                if max_remove:
                    lines = maxfreq_filter(*lines)
                if shuf:
                    lines = shuffle_pair(*lines)
            for du, f in zip(lines, fl):
                f.write(ens.join(du))
                f.write(ens)
Esempio n. 10
0
def handle(srcfs, srcft, tgtfs, tgtft, max_len=256):
	"""Keep, for every sentence, only its most frequent counterpart(s).

	srcfs/srcft are read in lockstep; pairs with an empty side after
	stripping, or with a token count above max(1, max_len - 2) on either
	side, are ignored. Target frequencies are counted per source sentence
	and only the targets tied for the highest count are retained. The
	retained pairs are then regrouped per target sentence and the same
	selection is applied to the sources before writing to tgtfs/tgtft.
	"""

	def most_frequent(counts):
		# Keys tied for the highest count, in dict iteration order.
		best = 0
		winners = []
		for key, freq in counts.items():
			if freq > best:
				best = freq
				winners = [key]
			elif freq == best:
				winners.append(key)
		return winners

	limit = max(1, max_len - 2)

	# source sentence -> {target sentence: number of occurrences}
	by_source = {}

	with open(srcfs, "rb") as src_in, open(srcft, "rb") as tgt_in:
		for raw_src, raw_tgt in zip(src_in, tgt_in):
			raw_src, raw_tgt = raw_src.strip(), raw_tgt.strip()
			if not (raw_src and raw_tgt):
				continue
			src, n_src = clean_liststr_lentok(raw_src.decode("utf-8").split())
			tgt, n_tgt = clean_liststr_lentok(raw_tgt.decode("utf-8").split())
			if (n_src > limit) or (n_tgt > limit):
				continue
			tgt_counts = by_source.setdefault(src, {})
			tgt_counts[tgt] = tgt_counts.get(tgt, 0) + 1

	# target sentence -> {source sentence: count}, built from retained pairs
	by_target = {}
	for src, tgt_counts in by_source.items():
		for tgt in most_frequent(tgt_counts):
			src_counts = by_target.setdefault(tgt, {})
			src_counts[src] = src_counts.get(src, 0) + 1

	newline = "\n".encode("utf-8")

	with open(tgtfs, "wb") as src_out, open(tgtft, "wb") as tgt_out:
		for tgt, src_counts in by_target.items():
			kept_sources = most_frequent(src_counts)
			# the target line is repeated once per retained source
			repeated_target = "\n".join([tgt] * len(kept_sources))
			src_out.write("\n".join(kept_sources).encode("utf-8"))
			src_out.write(newline)
			tgt_out.write(repeated_target.encode("utf-8"))
			tgt_out.write(newline)
Esempio n. 11
0
def handle(srcfs,
           srcft,
           tgtfs,
           tgtft,
           remove_same=False,
           shuf=True,
           max_remove=False):
    """Sort the documents of a two-file parallel corpus by size.

    srcfs/srcft are read in lockstep. Consecutive pairs with both sides
    non-empty after stripping are buffered as one document; a pair with an
    empty side ends the document. Each document is stored through
    dict_insert_set, with its lines joined by "\n" per side, under the keys
    (number of pairs, max source length + max target length, max target
    length, total source + target tokens, total target tokens).

    Documents are written to tgtfs/tgtft separated by a blank line, group by
    group in iter_dict_sort order. For groups with more than one document,
    remove_same applies maxfreq_filter (with max_remove forwarded) and shuf
    applies shuffle_pair.
    """

    def store(docs, pairs, max_s, max_t, tot_s, tot_t):
        # Insert the buffered pairs as one document and return the dict.
        src_lines, tgt_lines = zip(*pairs)
        doc = (
            "\n".join(src_lines),
            "\n".join(tgt_lines),
        )
        return dict_insert_set(docs, doc, len(pairs), max_s + max_t, max_t,
                               tot_s + tot_t, tot_t)

    docs = {}
    pending = []
    max_s = max_t = tot_s = tot_t = 0

    with open(srcfs, "rb") as src_in, open(srcft, "rb") as tgt_in:
        for raw_src, raw_tgt in zip(src_in, tgt_in):
            raw_src, raw_tgt = raw_src.strip(), raw_tgt.strip()
            if raw_src and raw_tgt:
                src, n_src = clean_liststr_lentok(
                    raw_src.decode("utf-8").split())
                tgt, n_tgt = clean_liststr_lentok(
                    raw_tgt.decode("utf-8").split())
                pending.append((
                    src,
                    tgt,
                ))
                if n_src > max_s:
                    max_s = n_src
                if n_tgt > max_t:
                    max_t = n_tgt
                tot_s += n_src
                tot_t += n_tgt
            elif pending:
                # a blank line on either side closes the current document
                docs = store(docs, pending, max_s, max_t, tot_s, tot_t)
                pending = []
                max_s = max_t = tot_s = tot_t = 0
        # the last document may not be followed by a blank line
        if pending:
            docs = store(docs, pending, max_s, max_t, tot_s, tot_t)

    separator = "\n\n".encode("utf-8")

    with open(tgtfs, "wb") as src_out, open(tgtft, "wb") as tgt_out:
        for group in iter_dict_sort(docs):
            src_docs, tgt_docs = zip(*group)
            if len(src_docs) > 1:
                if remove_same:
                    src_docs, tgt_docs = maxfreq_filter(src_docs, tgt_docs,
                                                        max_remove)
                if shuf:
                    src_docs, tgt_docs = shuffle_pair(src_docs, tgt_docs)
            src_out.write("\n\n".join(src_docs).encode("utf-8"))
            src_out.write(separator)
            tgt_out.write("\n\n".join(tgt_docs).encode("utf-8"))
            tgt_out.write(separator)