import os
import random

import numpy as np
import pandas as pd
from tqdm import tqdm

import mytools


def main():
    # Load the set of items already processed, if any
    already_finished = set()
    if os.path.exists("file/already_finished.txt"):
        already_finished = set(
            mytools.load_from_txt("file/already_finished.txt"))

    # Read data
    data = np.array(
        pd.read_csv("../2_text_maching/baseline/file/submission.csv"))
    data = data.tolist()

    refine_data = []
    if os.path.exists("file/bleu_after_optim.csv"):
        refine_data = getSavedCsv()

    for idx, d in tqdm(enumerate(data), total=len(data)):
        # Checkpoint every 1000 rows and report progress
        if idx % 1000 == 999:
            refine_data, all_bleu = save_to_csv(refine_data,
                                                default_path="after_optim")
            print("len of refine_data:", len(refine_data))
            print("current bleu:", all_bleu)

        if " ".join(d) + "\n" in already_finished:
            continue

        tmp_data = compute_column_score(d)
        refine_data.extend(tmp_data)

    data = np.array(pd.read_csv("file/bleu_after_optim.csv"))
    print("After Optim:", np.sum(np.transpose(data)[0]))


def randomGetTarget():
    root_source = "../data/processData/it/"
    source_dir = root_source + random.choice(os.listdir(root_source)) + "/"
    source_file = source_dir + random.choice(os.listdir(source_dir))
    source_ctx = mytools.load_from_txt(source_file)

    try:
        # random.randint is inclusive on both ends, so cap at len - 1
        itemPath, start2end, text = source_ctx[random.randint(
            0, len(source_ctx) - 1)].split(" | ")
        itemPath = itemPath.replace("../sourceData/", "")
        return itemPath, start2end, text
    except (ValueError, IndexError):
        return None, None, None
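
A quick usage sketch (not from the source): since randomGetTarget() returns (None, None, None) for malformed lines, a caller would typically retry until a valid triple comes back.

# Illustrative retry loop around randomGetTarget, defined above.
item_path, start2end, text = randomGetTarget()
while text is None:
    item_path, start2end, text = randomGetTarget()
print(item_path, start2end, text)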
Example #3
def readDict(path="new_dict.txt"):
    print("Reading Directory ...")

    # 读取中译日词典
    ctxs = mytools.load_from_txt(path)

    # 解析每一条并保转换为json形式
    directory = {}
    for ctx in ctxs:
        try:
            source, target = ctx.replace("\n", "").split(" | ")
            directory[source] = target
        except:
            pass

    print("Directory load completed .")
    return directory
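
A minimal usage sketch for readDict, assuming new_dict.txt follows the "source | target" line format parsed above; the lookup key is a placeholder.

# Hypothetical usage of readDict; "你好" is only an illustrative key.
translation_dict = readDict("new_dict.txt")
print(translation_dict.get("你好", "unknown"))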
    def __init__(self, train="train", transform=None):
        print("Reading data...")

        self.img_path = "../data/argumented_data/"

        if train == "train":
            data_path = config.train_path
        elif train == "val":
            data_path = config.val_path
        else:
            data_path = config.test_path

        # Read the data
        self.data = mytools.load_from_txt(data_path)

        self.transform = transform

        self.train = train
    ctx = replace_all_blank(ctx)

    # if len(ctx.split()) <= 4 and lang == 'zh':
    #     return None

    # word_tokens = word_tokenize(ctx)  # tokenize
    word_tokens = jieba.lcut(ctx)
    # filtered_sentence = [w for w in word_tokens if w not in stop_words]  # remove stopwords
    # stem_words = [ps.stem(w) for w in filtered_sentence]
    # return stem_words
    return word_tokens


if __name__ == "__main__":
    file_name = "sorted_data/sorted_" + lang + ".txt"
    ctxs = mytools.load_from_txt(file_name)

    filtered_results = []
    for ctx in tqdm(ctxs):
        try:
            html, location, text = ctx.split(" | ")

            # Decide whether to keep this entry, and what text to save
            text = save_decision(text, lang)
            if text is not None:
                filtered_results.append({
                    "html":
                    html.replace("data/it\\",
                                 "../zhit-0825/it/2020-04-17/").replace(
                                     "zh/2020-04-18/",
                                     "../zhit-0825/zh/2020-04-18/"),
Example #6
    data = pd.DataFrame(results[:, 1:],
                        index=None,
                        columns=[
                            "file_source", "location_source", "file_target",
                            "location_target"
                        ])
    data.to_csv("file/" + default_path + ".csv", index=None)
    print("Save finished")

    if return_min:
        return results.tolist(), all_bleu, float(results[-1][0])
    else:
        return results.tolist(), all_bleu


ctxs = mytools.load_from_txt("new_dict_lower.txt")
DICT = {}
for ctx in ctxs:
    try:
        zh_, it_ = ctx.replace("\n", "").split(" | ")
    except ValueError:
        # Line contains extra " | " separators; keep the first two fields
        items = ctx.replace("\n", "").split(" | ")
        zh_, it_ = items[0], items[1]

    DICT[zh_] = it_

csv_path = "bleu_submission.csv"
data = np.array(pd.read_csv(csv_path)).tolist()

goodwords = mytools.load_from_json("good_words.json")
for goodword in goodwords:
Example #7
    Strip every non-letter character from value, including punctuation,
    whitespace, newlines, underscores, and so on.
    :param value: the string to clean
    :return: the cleaned string
    """
    # \W matches any character that is not a letter, digit, or underscore
    result = re.sub(r'\W+', ' ', value).replace("_", ' ').replace(r"\u", " ")

    result = re.sub(pat1, '', result)

    # Keep the result only if cleaning removed at most 30% of the
    # characters; otherwise mark the value as unusable with "_"
    if (len(value) - len(result)) <= 0.3 * len(value):
        return result
    else:
        return "_"


ctxs = mytools.load_from_txt("../filter_data/sorted_data/sorted_it.txt")

counts = {}
for ctx in tqdm(ctxs):
    html, location, text = ctx.replace("\n", "").split(" | ")

    text = replace_all_blank(text)
    word_tokens = word_tokenize(text)  # tokenize
    # filtered_sentence = [w for w in word_tokens if w not in stop_words]  # remove stopwords
    # filtered_sentence = [word.lower() for word in filtered_sentence]
    # stem_words = [ps.stem(w) for w in filtered_sentence]

    stem_words = [word.lower() for word in word_tokens]

    for w in stem_words:
        counts[w] = counts.get(w, 0) + 1
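
Not in the source, but a plausible follow-up: persisting the counts in the "word | freq" format that the vocabulary scripts below consume (the output filename is an assumption).

# Hypothetical: dump counts as "word | freq" lines, most frequent first.
sorted_counts = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
mytools.log_to_txt([w + " | " + str(c) + "\n" for w, c in sorted_counts],
                   "all_it_vocab.txt")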
import mytools
import os
from tqdm import tqdm

root = "../../data/processedData/zh/"
for file in tqdm(os.listdir(root)):
    ctxs = mytools.load_from_txt(root+file)

    new_ctxs = []
    for ctx in ctxs:
        try:
            html, location, texts = ctx.replace("\n", "").split(" | ")
        except ValueError:
            continue

        start, end = map(int, location.split(":"))

        # Split long texts into 11-character windows with a stride of 7,
        # so consecutive chunks overlap by 4 characters
        while len(texts) > 11:
            new_texts = texts[:11]
            new_ctxs.append(
                html + " | " + str(start) + ":" + str(start + 11) + " | " +
                new_texts + "\n")
            texts = texts[7:]
            start = start + 7
        new_ctxs.append(
            html + " | " + str(start) + ":" + str(end) + " | " + texts + "\n")
    mytools.log_to_txt(new_ctxs, root.replace("zh", "optim_zh") + file)
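
A quick trace of the windowing above with illustrative values: a 25-character text starting at offset 0 produces overlapping 11-character chunks at stride 7.

# Illustrative trace of the 11-wide / 7-stride windowing (not from the source).
texts, start = "abcdefghijklmnopqrstuvwxy", 0  # 25 characters
while len(texts) > 11:
    print(start, start + 11, texts[:11])
    texts, start = texts[7:], start + 7
print(start, start + len(texts), texts)
# 0 11 abcdefghijk
# 7 18 hijklmnopqr
# 14 25 opqrstuvwxy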
Example #9
import mytools
import os

ctxs = mytools.load_from_txt("all_it_vocab.txt")

remove_words = "0123456789,./;'[]\\-=<>?:\"{}|~!@#$%^&*()_+"
smooth = []
for ctx in ctxs:
    try:
        word, freq = ctx.replace("\n", "").split(" | ")

        # Keep the word only if it contains no digits or punctuation
        # and is at most 18 characters long
        flag = True
        for tmp in word:
            if tmp in remove_words:
                flag = False
                break
        if len(word) > 18:
            flag = False

        if flag:
            smooth.append(word + " | " + freq + "\n")
    except ValueError:
        print("error")

mytools.log_to_txt(smooth, "smooth_it_vocab.txt")
Example #10
import mytools, os
from tqdm import tqdm

ctxs = mytools.load_from_txt("sorted_data/zh.txt")

# Convert "text ||| html | location || ..." lines back into
# "html | location | text" form, keeping only the first position
for ctx in tqdm(ctxs):
    text, pos = ctx.split(" ||| ")
    html, location = pos.replace("\n", "").split(" || ")[0].split(" | ")
    ctx = html + " | " + location + " | " + text + "\n"
    mytools.log_to_txt(ctx, "sorted_data/sorted_zh.txt")
import mytools, os
from tqdm import tqdm

root_path = "../filter_data/sorted_data/zh/"
saved_results = {}
all_idx = 0
for file in tqdm(os.listdir(root_path)):
    try:
        ctxs = mytools.load_from_txt(root_path + file)

        # Group every (html, location) position that shares the same text
        for ctx in ctxs:
            all_idx += 1
            html, location, text = ctx.replace("\n", "").split(" | ")

            if text not in saved_results:
                saved_results[text] = [html + " | " + location]
            else:
                saved_results[text].append(html + " | " + location)
    except Exception:
        pass

print(all_idx)
print(len(saved_results.keys()))

for k in tqdm(saved_results.keys()):
    mytools.log_to_txt(k + " ||| " + " || ".join(saved_results[k]) + "\n",
                       "saved_results/zh.txt")
"""
1202420
582393
"""

Example #12
import os, sys
from ots_python3_demo.ots_python3_demo import WebOTS
import time
import mytools

host = "ntrans.xfyun.cn"
# Initialize the translation client
gClass = WebOTS.get_result(host)

# text = "hello world"
# respData = gClass.call_url(text=text)
# print(respData["data"]["result"]["trans_result"]['dst'])

ctxs = mytools.load_from_txt("smooth_it_vocab.txt")

for ctx in ctxs:
    word = ctx.replace("\n", "").split(" | ")[0]
    try:
        # ans = translate(fromLang='zh', toLang='it', q=word)
        ans = gClass.call_url(
            text=word)["data"]["result"]["trans_result"]['dst']
        if ans is None:
            ans = "unknown"
    except Exception:
        ans = "unknown"

    mytools.log_to_txt(word + " | " + ans + "\n", "smooth_xunfei_dict.txt")
import mytools

ctxs = mytools.load_from_txt("goodwords_dict.txt")

remove_ = "|»,# ?."

for ctx in ctxs:
    try:
        zh_, it_ = ctx.replace("\n", "").split(" | ")
    except ValueError:
        # Line contains extra " | " separators; keep the first two fields
        items = ctx.replace("\n", "").split(" | ")
        zh_, it_ = items[0], items[1]

    it_ = it_.lower()
    for i in remove_:
        it_ = it_.replace(i, "")
    mytools.log_to_txt(zh_ + " | " + it_ + "\n", "new_dict_lower_.txt")