def main():
    # Read already finished rows so they can be skipped
    already_finished = []
    if os.path.exists("file/already_finished.txt"):
        already_finished = mytools.load_from_txt("file/already_finished.txt")
    # Read data
    data = np.array(pd.read_csv("../2_text_maching/baseline/file/submission.csv"))
    data = data.tolist()
    refine_data = []
    if os.path.exists("file/bleu_after_optim.csv"):
        refine_data = getSavedCsv()
    for idx, d in tqdm(enumerate(data)):
        # Checkpoint the refined results every 1000 rows
        if idx % 1000 == 999:
            refine_data, all_bleu = save_to_csv(refine_data, default_path="after_optim")
            print("lens of refine_data:", len(refine_data))
            print("now bleu is: ", all_bleu)
        if " ".join(d) + "\n" in already_finished:
            continue
        tmp_data = compute_column_score(d)
        for t in tmp_data:
            refine_data.append(t)
    data = np.array(pd.read_csv("file/bleu_after_optim.csv"))
    print("After Optim:", np.sum(np.transpose(data)[0]))
def randomGetTarget():
    root_source = "../data/processData/it/"
    source_dir = root_source + random.choice(os.listdir(root_source)) + "/"
    source_file = source_dir + random.choice(os.listdir(source_dir))
    source_ctx = mytools.load_from_txt(source_file)
    try:
        # random.randrange avoids the off-by-one IndexError that
        # random.randint(0, len(source_ctx)) could raise
        itemPath, start2end, text = source_ctx[random.randrange(len(source_ctx))].split(" | ")
        itemPath = itemPath.replace("../sourceData/", "")
        return itemPath, start2end, text
    except Exception:
        return None, None, None
def readDict(path="new_dict.txt"):
    print("Reading dictionary ...")
    # Load the Chinese-to-Italian translation dictionary
    ctxs = mytools.load_from_txt(path)
    # Parse each "source | target" entry into a key-value mapping
    directory = {}
    for ctx in ctxs:
        try:
            source, target = ctx.replace("\n", "").split(" | ")
            directory[source] = target
        except Exception:
            pass
    print("Dictionary load completed.")
    return directory
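# Hypothetical usage sketch (not part of the original source): readDict returns a
# plain dict keyed on the source term, so a lookup is a single .get() call. This
# assumes new_dict.txt holds one "source | target" pair per line, which is the
# format parsed above.
# zh2it = readDict("new_dict.txt")
# zh2it.get("苹果", "unknown")  # -> the Italian translation if the entry exists, else "unknown"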
def __init__(self, train="train", transform=None):
    print("Reading data...")
    self.img_path = "../data/argumented_data/"
    if train == "train":
        data_path = config.train_path
    elif train == "val":
        data_path = config.val_path
    else:
        data_path = config.test_path
    # Load the data
    self.data = mytools.load_from_txt(data_path)
    self.transform = transform
    self.train = train
    ctx = replace_all_blank(ctx)
    # if len(ctx.split()) <= 4 and lang == 'zh':
    #     return None
    # word_tokens = word_tokenize(ctx)  # tokenize
    word_tokens = jieba.lcut(ctx)
    # filtered_sentence = [w for w in word_tokens if w not in stop_words]  # remove stop words
    # stem_words = [ps.stem(w) for w in filtered_sentence]
    # return stem_words
    return word_tokens


if __name__ == "__main__":
    file_name = "sorted_data/sorted_" + lang + ".txt"
    ctxs = mytools.load_from_txt(file_name)
    filted_results = []
    for ctx in tqdm(ctxs):
        try:
            html, location, text = ctx.split(" | ")
            # Decide whether to keep this text, and in what form
            text = save_decision(text, lang)
            if text is not None:
                filted_results.append({
                    "html": html.replace("data/it\\", "../zhit-0825/it/2020-04-17/").replace(
                        "zh/2020-04-18/", "../zhit-0825/zh/2020-04-18/"),
    data = pd.DataFrame(results[:, 1:], index=None, columns=[
        "file_source", "location_source", "file_target", "location_target"
    ])
    data.to_csv("file/" + default_path + ".csv", index=None)
    print("Save finished")
    if return_min:
        return results.tolist(), all_bleu, float(results[-1][0])
    else:
        return results.tolist(), all_bleu


ctxs = mytools.load_from_txt("new_dict_lower.txt")
DICT = {}
for ctx in ctxs:
    try:
        zh_, it_ = ctx.replace("\n", "").split(" | ")
    except:
        # Entry contains extra " | " separators; keep only the first two fields
        items = ctx.replace("\n", "").split(" | ")
        zh_, it_ = items[0], items[1]
    DICT[zh_] = it_

csv_path = "bleu_submission.csv"
data = np.array(pd.read_csv(csv_path)).tolist()
goodwords = mytools.load_from_json("good_words.json")
for goodword in goodwords:
    Remove all non-letter content from value, including punctuation,
    whitespace, line breaks, underscores, etc.
    :param value: the content to process
    :return: the processed content
    """
    # \W matches any character that is not a letter, digit, or underscore
    result = re.sub(r'\W+', ' ', value).replace("_", ' ').replace(r"\u", " ")
    result = re.sub(pat1, '', result)
    if (len(value) - len(result)) <= 0.3 * len(value):
        return result
    else:
        return "_"


ctxs = mytools.load_from_txt("../filter_data/sorted_data/sorted_it.txt")
counts = {}
for ctx in tqdm(ctxs):
    html, location, text = ctx.replace("\n", "").split(" | ")
    text = replace_all_blank(text)
    word_tokens = word_tokenize(text)  # tokenize
    # filtered_sentence = [w for w in word_tokens if w not in stop_words]  # remove stop words
    # filtered_sentence = [word.lower() for word in filtered_sentence]
    # stem_words = [ps.stem(w) for w in filtered_sentence]
    stem_words = [word.lower() for word in word_tokens]
    for w in stem_words:
        counts[w] = counts.get(w, 0) + 1
import mytools, os
from tqdm import tqdm

root = "../../data/processedData/zh/"
for file in tqdm(os.listdir(root)):
    ctxs = mytools.load_from_txt(root + file)
    new_ctxs = []
    for ctx in ctxs:
        # print("-----------------------------------------")
        # print(ctx)
        try:
            html, location, texts = ctx.replace("\n", "").split(" | ")
        except:
            continue
        start, end = int(location.split(":")[0]), int(location.split(":")[1])
        # Slice over-long segments into overlapping windows of 11 characters
        # with a stride of 7, keeping the location offsets consistent
        while len(texts) > 11:
            new_texts = texts[:11]
            new_ctxs.append(html + " | " + str(start) + ":" + str(start + 11) + " | " + new_texts + "\n")
            # print(html + " | " + str(start) + ":" + str(start + 11) + " | " + new_texts)
            texts = texts[7:]
            start = start + 7
        new_ctxs.append(html + " | " + str(start) + ":" + str(end) + " | " + texts + "\n")
        # print(html + " | " + str(start) + ":" + str(end) + " | " + texts)
        # exit(0)
    mytools.log_to_txt(new_ctxs, root.replace("zh", "optim_zh/") + file)
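# Illustrative sketch (an assumption, not part of the original script): the loop
# above slices each over-long segment into overlapping windows of 11 characters
# with a stride of 7 while tracking location offsets. The hypothetical helper
# below restates that logic in isolation.
def split_into_windows(text, start, end, width=11, stride=7):
    """Return (window_start, window_end, chunk) tuples mirroring the chunking loop above."""
    pieces = []
    while len(text) > width:
        pieces.append((start, start + width, text[:width]))
        text = text[stride:]
        start += stride
    pieces.append((start, end, text))
    return pieces

# Example: a 25-character segment located at 100:125 yields windows
# (100, 111), (107, 118) and (114, 125).
# print(split_into_windows("a" * 25, 100, 125))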
import mytools
import os

ctxs = mytools.load_from_txt("all_it_vocab.txt")
remove_words = "0123456789,./;'[]\\-=<>?:\"{}|~!@#$%^&*()_+"
smooth = []
for ctx in ctxs:
    try:
        word, freq = ctx.replace("\n", "").split(" | ")
        flag = True
        for tmp in word:
            if tmp in remove_words:
                flag = False
                break
        if len(word) > 18:
            flag = False
        if flag:
            smooth.append(word + " | " + freq + "\n")
    except:
        print("error")
mytools.log_to_txt(smooth, "smooth_it_vocab.txt")
import mytools, os
from tqdm import tqdm

ctxs = mytools.load_from_txt("sorted_data/zh.txt")
for ctx in tqdm(ctxs):
    # Keep only the first "html | location" position of each entry and
    # rewrite the line as "html | location | text"
    text, pos = ctx.split(" ||| ")
    html, location = pos.replace("\n", "").split(" || ")[0].split(" | ")
    ctx = html + " | " + location + " | " + text + "\n"
    mytools.log_to_txt(ctx, "sorted_data/sorted_zh.txt")
import mytools, os
from tqdm import tqdm

root_path = "../filter_data/sorted_data/zh/"
saved_results = {}
all_idx = 0
for file in tqdm(os.listdir(root_path)):
    try:
        ctxs = mytools.load_from_txt(root_path + file)
        # try:
        for ctx in ctxs:
            all_idx += 1
            html, location, text = ctx.replace("\n", "").split(" | ")
            if text not in saved_results:
                saved_results[text] = [html + " | " + location]
            else:
                saved_results[text].append(html + " | " + location)
    except:
        pass
print(all_idx)
print(len(saved_results.keys()))
for k in tqdm(saved_results.keys()):
    mytools.log_to_txt(k + " ||| " + " || ".join(saved_results[k]) + "\n",
                       "saved_results/zh.txt")
"""
1202420
582393
import os, sys
from ots_python3_demo.ots_python3_demo import WebOTS
import time
import mytools

host = "ntrans.xfyun.cn"
# Initialize the translation client
gClass = WebOTS.get_result(host)
# text = "hello world"
# respData = gClass.call_url(text=text)
# print(respData["data"]["result"]["trans_result"]['dst'])

ctxs = mytools.load_from_txt("smooth_it_vocab.txt")
i = 0
for ctx in ctxs:
    try:
        # print(ctx.replace("\n", "").split(" | ")[0])
        # ans = translate(fromLang='zh', toLang='it', q=ctx.replace("\n", "").split(" | ")[0])
        ans = gClass.call_url(text=ctx.replace("\n", "").split(" | ")[0])["data"]["result"]["trans_result"]['dst']
        # print(ans)
        if ans is None:
            ans = "unknown"
    except:
        ans = "unknown"
    tmp = ctx.replace("\n", "").split(" | ")[0] + " | " + ans + "\n"
    mytools.log_to_txt(tmp, "smooth_xunfei_dict.txt")
import mytools

ctxs = mytools.load_from_txt("goodwords_dict.txt")
remove_ = "|»,# ?."
for ctx in ctxs:
    try:
        zh_, it_ = ctx.replace("\n", "").split(" | ")
    except:
        items = ctx.replace("\n", "").split(" | ")
        zh_, it_ = items[0], items[1]
    it_ = it_.lower()
    for i in remove_:
        it_ = it_.replace(i, "")
    mytools.log_to_txt(zh_ + " | " + it_ + "\n", "new_dict_lower_.txt")
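# The scripts above all rely on a local `mytools` helper module that is not shown
# in this section. The sketch below is an assumption inferred from the call sites
# (load_from_txt returning raw lines with their trailing newlines, log_to_txt
# accepting a string or a list of strings and appending to a file, load_from_json
# returning the parsed object); it is illustrative only, not the original implementation.
import json


def load_from_txt(path, encoding="utf-8"):
    """Return the file's lines, newline characters included (assumed behaviour)."""
    with open(path, "r", encoding=encoding) as f:
        return f.readlines()


def log_to_txt(content, path, encoding="utf-8"):
    """Append a string, or each string in a list, to the target file (assumed behaviour)."""
    with open(path, "a", encoding=encoding) as f:
        if isinstance(content, str):
            f.write(content)
        else:
            f.writelines(content)


def load_from_json(path, encoding="utf-8"):
    """Return the parsed JSON object stored at path (assumed behaviour)."""
    with open(path, "r", encoding=encoding) as f:
        return json.load(f)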