def main():
    """Parse the saved page three times and report changes against the cache.

    Each pass extracts the stripped text of every ``td`` under ``.t_fsz``
    and compares it with the list cached for the page URL:

    * if it differs, the change in comment count and the index of every
      changed comment are printed, and the cache entry is replaced;
    * if it is identical, the first cached comment is altered on purpose so
      that the next pass exercises the "changed" branch.

    The live fetch is disabled; ``test.html`` in the working directory is
    parsed instead. When the passes are done the ``q`` debugger console is
    opened and the tornado IOLoop is stopped.
    """
    WEBPAGE_CACHE.update(load_cache())
    for _ in range(3):
        page_url = WEBPAGE[0]
        # Close the file deterministically instead of leaking the handle.
        with open("test.html", "rb") as fp:
            result = fp.read().decode("utf8")
        ehtml = parse_content_to_dom(result)
        content_tds = ehtml.find(".t_fsz").find("td")
        content_texts = [td.text().strip() for _i, td in iter_eles(content_tds)]

        # Detect changes against the previously cached snapshot.
        content_texts_old = WEBPAGE_CACHE.get(page_url)
        if content_texts_old != content_texts:
            if content_texts_old:
                if len(content_texts) != len(content_texts_old):
                    print("[PAGE] comment amount add: %s" % (len(content_texts) - len(content_texts_old)))
                # zip stops at the shorter list, like range(min(len, len)).
                for idx, (old, new) in enumerate(zip(content_texts_old, content_texts)):
                    if old != new:
                        print("[COMMENT] comment %s changed." % (idx))
            WEBPAGE_CACHE[page_url] = content_texts
        else:
            # Simulate an edit so the next pass sees a difference.
            WEBPAGE_CACHE[page_url][0] = WEBPAGE_CACHE[page_url][0].replace('5672427', '123456')
    q.d()
    tornado.ioloop.IOLoop.current().stop()
def work(self):
    """Re-indent the script sentence by sentence into ``<file_path>.format.j``.

    Sentences come from ``self.read_next_sentense()`` until it raises an
    exception whose message is ``"EOF"``. The indentation of each sentence
    depends on its lower-cased first word:

    * in ``self.PAD_TWICE``: one level less than the current depth, depth
      unchanged;
    * in ``self.PAD_PLUS``: current depth, then depth is increased;
    * in ``self.PAD_MINUS``: one level less, then depth is decreased;
    * otherwise: current depth.

    ``self.fp_write`` and ``self.fp_debug`` are kept as attributes because
    other methods may write to them; both are closed even if processing
    fails part-way.
    """
    self.fp_write = open("%s.format.j" % self.arg["file_path"], "w")
    self.fp_debug = None
    try:
        self.fp_debug = open("%s.debug.j" % self.arg["file_path"], "w")
        while True:
            try:
                sentense, first_word = self.read_next_sentense()
            except Exception as e:
                if str(e) == "EOF":
                    print("EOF!")
                    break
                print(traceback.format_exc())
                q.d()
                # Nothing was read this round: do not fall through and
                # re-emit the previous sentence (or hit an unbound name).
                continue

            keyword = first_word.lower()
            if keyword in self.PAD_TWICE:
                pad_this_line = (self.line_pad - 1) * self.PAD
            elif keyword in self.PAD_PLUS:
                pad_this_line = self.line_pad * self.PAD
                self.line_pad += 1
            elif keyword in self.PAD_MINUS:
                pad_this_line = (self.line_pad - 1) * self.PAD
                self.line_pad -= 1
            else:
                pad_this_line = self.line_pad * self.PAD

            self.fp_write.write(pad_this_line)
            self.fp_write.write(sentense)
    finally:
        self.fp_write.close()
        if self.fp_debug is not None:
            self.fp_debug.close()
def end_sentense(self):
    """Forward the end-of-sentence event to the wrapped worker.

    Any exception raised by the worker is swallowed and the ``q`` debugger
    console is opened instead.
    """
    try:
        self._worker.end_sentense()
    except Exception:
        q.d()
def test_try_decode_content():
    """Manual check: fetch jd.com raw, gunzip it and try several codecs.

    The raw (undecoded) response body is read, passed through ``ungzip``
    and then decoded with a series of Chinese codecs; any codec that cannot
    decode the payload raises. Finally ``try_decode_content`` is run on the
    bytes. The ``q`` console opens right after the gunzip step.
    """
    target_url = "https://jd.com/"
    browser_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
        "Pragma": "no-cache",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
    }
    res = requests.get(target_url, headers=browser_headers, stream=True)
    text = ungzip(res.raw.read())
    q.d()
    # Round-trip through gb18030 after a utf8 decode.
    cc = text.decode("utf8")
    cc.encode("gb18030").decode("gb18030")
    # Each decode is executed only for its side effect of raising on failure.
    for codec in ("gb2312", "gbk", "gb18030", "big5", "big5hkscs"):
        text.decode(codec)
    text = try_decode_content(text)
def dummy_prep(data, method=None):
    """Dummy-encode the categorical columns of ``data``.

    Only columns of dtype ``category`` are encoded; other columns are not
    part of the result.

    :param data: input ``pandas.DataFrame``.
    :param method: encoding scheme:

        * falsy (default): plain one-hot encoding;
        * ``"drop_first"``: one-hot with the first level of each variable
          dropped;
        * ``"deviation"``: deviation (sum) coding. For each variable the
          first level is dropped and rows belonging to that level get
          ``-1`` in the variable's remaining columns.
    :returns: the encoded ``DataFrame``; ``None`` for an unrecognised
        ``method`` (kept for backward compatibility).
    """
    categorical = data.select_dtypes(include="category")
    if not method:
        return pd.get_dummies(categorical)
    if method == "drop_first":
        return pd.get_dummies(categorical, drop_first=True)
    if method == "deviation":
        if categorical.shape[1] == 0:
            return pd.get_dummies(categorical)
        encoded = []
        # Encode one variable at a time so that the columns of a variable are
        # known exactly; a substring test on column names confuses variables
        # whose names contain each other (e.g. "a" and "ab").
        for var in categorical.columns:
            # A signed integer dtype is required: the default dummy dtype
            # (uint8 / bool depending on the pandas version) cannot hold -1,
            # and uint8 silently wraps it to 255.
            block = pd.get_dummies(categorical[[var]], dtype=int)
            if block.shape[1]:
                dropout = block.columns[0]
                keepers = list(block.columns[1:])
                if keepers:
                    block.loc[block[dropout] == 1, keepers] = -1
                del block[dropout]
            encoded.append(block)
        return pd.concat(encoded, axis=1)
    return None
def grep_string(self):
    """Collect every string handed to the ``translate`` hook while reading.

    A collector is installed as ``COMMON_TRANSLATE_WRAP["translate"]``; it
    splits each word on newlines, appends the pieces to
    ``self.result_list`` and returns the word unchanged. All sentences are
    then read until ``read_next_sentense`` raises an exception whose
    message is ``"EOF"``; any other exception is printed and the ``q``
    console is opened before reading continues.

    :returns: ``self.result_list``.
    """
    self.result_list = []

    def _collect(word):
        self.result_list.extend(word.split("\n"))
        return word

    COMMON_TRANSLATE_WRAP["translate"] = _collect

    reached_eof = False
    while not reached_eof:
        try:
            # The sentence itself is not needed; reading drives the hook.
            _sentence, _first = self.read_next_sentense()
        except Exception as err:
            if str(err) == "EOF":
                print("EOF!")
                reached_eof = True
            else:
                print(traceback.format_exc())
                q.d()
    return self.result_list
def get_start():
    """Fetch the URL given in the JSON request body and return its article.

    Reads ``url`` from ``request.json``, downloads it, runs
    ``extract_article`` on the response body and returns a JSON string with
    the title and the first 520 characters of the article text.

    :returns: JSON string ``{"desc", "data", "code"}``. On any failure
        ``desc`` carries the exception text and ``data`` is ``None``.
    """
    try:
        url = request.json["url"]
        # NOTE(review): the URL is caller-supplied and fetched server-side
        # without validation or timeout -- confirm this endpoint is not
        # exposed to untrusted clients.
        response = requests.get(url)
        print("url:", url)
        title, article = extract_article(response.content)
        print("title:", title)
        print("article:", article)
        article = article[:520]
        # No interactive breakpoint here: it would block every request
        # until someone attends the server console.
        return json.dumps({
            "desc": "success",
            "data": {
                "title": title,
                "article": article
            },
            "code": 200
        })
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        # NOTE(review): failures also report code 200; kept as-is because
        # existing clients may rely on it -- confirm whether that is intended.
        return json.dumps({"desc": str(e), "data": None, "code": 200})
def write_translate_string(self, wf, line, string):
    """Write ``line`` to ``wf`` with ``string`` replaced by its translation.

    The translatable fragments of ``string`` are obtained from
    ``self.take_string_from_sentense`` (lower-cased, de-duplicated). Those
    present in ``self.trans_result_keys`` are replaced, longest first and
    case-insensitively, by ``self.trans_result[key]``. Spans matching
    ``self.RSTRING`` are protected through ``self.re_escape`` before the
    replacement and restored through ``self.re_unescape`` afterwards. If
    nothing can be translated, ``line`` is written unchanged.

    :param wf: writable text file object.
    :param line: the full source line.
    :param string: the part of ``line`` that may be translated.
    """
    trans_line = string
    trans_strings = {x.lower() for x in self.take_string_from_sentense(string)}
    if trans_strings:
        good_key = list(self.trans_result_keys & trans_strings)
        if good_key:
            if len(good_key) != len(trans_strings):
                # Some fragments have no translation available.
                print("len is not good", "TranslateWorkerForWts")
                q.d()
            good_trans = {x: self.trans_result[x] for x in good_key}
            # Longest keys first so a short key cannot clobber part of a
            # longer one.
            good_key.sort(key=lambda x: -len(x))
            trans_line = re.sub(self.RSTRING, self.re_escape, trans_line)
            for key in good_key:
                # Use a function as replacement: a plain string would be
                # treated as a template, so backslashes in the translation
                # ("\n", "\1", "\c"...) would be rewritten or raise re.error.
                trans_line = re.sub(
                    re.escape(key),
                    lambda _m, _repl=good_trans[key]: _repl,
                    trans_line,
                    flags=re.I,
                )
            self.re_cache.reverse()
            trans_line = re.sub(r"[\0]", self.re_unescape, trans_line)
    wf.write(line.replace(string, trans_line))
def read_next_sentense(self):
    """Read words up to the next line end and return ``(text, first_word)``.

    Words come from ``self.read_next_word()``, which yields
    ``(word, line_end)``; a truthy ``line_end`` terminates the sentence.
    Every word is also fed to a fresh ``SentenseWorker``. When the first
    word is ``"call"`` the returned text is rebuilt as
    ``"call " + worker.get_translate_result() + line_end`` instead of the
    raw concatenation of the words.

    Exceptions from ``read_next_word`` are not handled here.
    """
    head = ""
    text = ""
    worker = SentenseWorker()
    line_end = "\n"
    while True:
        word, line_end = self.read_next_word()
        if head:
            worker.add_next_word(word)
        else:
            # No non-empty first word seen yet: register this one as first.
            head = word
            worker.add_first_word(word)
        text += word
        if not line_end:
            continue
        text += line_end
        worker.end_sentense()
        break

    if head == "call":
        rebuilt = "call " + worker.get_translate_result()
        rebuilt += line_end
        text = rebuilt
        if DEBUG:
            # NOTE(review): text was rebound to rebuilt just above, so this
            # comparison can never differ and the breakpoint never fires.
            if text.replace(" ", "") != rebuilt.replace(" ", ""):
                print("x" * 16)
                q.d()
    return text, head
def extract_text(files=[], outfile='-', no_laparams=False, all_texts=None,
                 detect_vertical=None, word_margin=None, char_margin=None,
                 line_margin=None, boxes_flow=None, output_type='text',
                 codec='utf-8', strip_control=False, maxpages=0,
                 page_numbers=None, password="", scale=1.0, rotation=0,
                 layoutmode='normal', output_dir=None, debug=False,
                 disable_caching=False, **kwargs):
    """Extract the text of each PDF in ``files`` into ``outfile``.

    :param files: paths of the PDF files to process; must be non-empty.
    :param outfile: output path, or ``"-"`` for ``sys.stdout``.
    :param no_laparams: when true, no ``LAParams`` object is built and
        ``laparams`` is passed on as ``None``.
    :param output_type: output format; when left at ``"text"`` and an
        output file is given, it is overridden by the matching entry of
        ``OUTPUT_TYPES`` based on the file name suffix.
    :returns: the output file object (``sys.stdout`` or the file opened in
        binary mode). The caller is responsible for closing it.
    :raises ValueError: if ``files`` is empty.

    The remaining parameters are not used directly here; they reach
    ``pdfminer.high_level.extract_text_to_fp`` through ``**locals()``.

    NOTE: local variable names in this function are part of its behavior,
    because ``**locals()`` forwards every local as a keyword argument.
    Do not add or rename locals without checking the callee.
    """
    if not files:
        raise ValueError("Must provide files to work upon!")

    # If any LAParams group arguments were passed,
    # create an LAParams object and
    # populate with given args. Otherwise, set it to None.
    if not no_laparams:
        laparams = pdfminer.layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            paramv = locals().get(param, None)
            if paramv is not None:
                setattr(laparams, param, paramv)
    else:
        laparams = None

    if output_type == "text" and outfile != "-":
        for override, alttype in OUTPUT_TYPES:
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    for fname in files:
        with open(fname, "rb") as fp:
            pdfminer.high_level.extract_text_to_fp(fp, **locals())
    # The leftover interactive breakpoint that used to sit here has been
    # removed so the function can run unattended.
    return outfp
def test_stream():
    """Manual check: stream an image, read its first bytes and show the peer.

    Reads the first 44 bytes of the raw response body, prints them, then
    digs the underlying socket out of the response to print the remote
    address and port, and finally opens the ``q`` console.
    """
    logo_url = "https://ss0.bdstatic.com/5aV1bjqh_Q23odCf/static/superman/img/logo/bd_logo1_31bdc765.png"
    res = requests.get(logo_url, timeout=(3, 3), stream=True)
    # Only body content is returned by raw.read (no headers).
    head = res.raw.read(44)
    print("head:", head)
    sock = res.raw._fp.fp.raw._sock
    peer = sock.getpeername()
    print("ip:", peer[0], peer[1])
    q.d()
def main():
    """Compute the table grid, cut the text into cells and print the result.

    The ``q`` debugger console is opened after the cells are computed and
    before they are printed.
    """
    table_h, table_w = calc_table()
    result = cut_text(table_h, table_w)
    q.d()
    # A stray bare ``table_text`` expression (a no-op left over from a
    # debugging session) and a redundant ``return`` were removed here.
    print("\nresult:\n", result)
def write_translate_string(self, wf, line, string):
    """Write ``line`` to ``wf`` with ``string`` replaced by its translation.

    Only part of a line is translatable, for example::

        line        => aaa = "who am i?"
        string      => "who am i?"
        translation => <translated text>
        written     => aaa = "<translated text>"

    The translatable fragments of ``string`` come from
    ``self.take_string_from_sentense`` (lower-cased, de-duplicated). They
    are replaced, longest first and case-insensitively, by
    ``self.trans_result[key]``. Spans matching ``self.RSTRING`` are
    protected through ``self.re_escape`` and restored through
    ``self.re_unescape``. Afterwards every `` |n`` is collapsed to ``|n``.

    :param wf: writable text file object.
    :param line: the full source line.
    :param string: the part of ``line`` that may be translated.
    :raises ValueError: if some, but not all, fragments of ``string`` have
        a translation (raised after the ``q`` console has been opened).
    """
    trans_line = string
    trans_strings = {x.lower() for x in self.take_string_from_sentense(string)}
    if trans_strings:
        good_key = list(self.trans_result_keys & trans_strings)
        if good_key:
            if len(good_key) != len(trans_strings):
                print("len is not good", "TranslateWorkerForIni")
                q.d()
                # The original ``raise("1")`` raised a str, which Python 3
                # turns into an unrelated TypeError; raise a real exception
                # that says what went wrong.
                raise ValueError(
                    "translation missing for %r"
                    % sorted(trans_strings - set(good_key))
                )
            good_trans = {x: self.trans_result[x] for x in good_key}
            # Longest keys first so a short key cannot clobber part of a
            # longer one.
            good_key.sort(key=lambda x: -len(x))
            trans_line = re.sub(self.RSTRING, self.re_escape, trans_line)
            for key in good_key:
                # Use a function as replacement: a plain string would be
                # treated as a template, so backslashes in the translation
                # would be rewritten or raise re.error.
                trans_line = re.sub(
                    re.escape(key),
                    lambda _m, _repl=good_trans[key]: _repl,
                    trans_line,
                    flags=re.I,
                )
            self.re_cache.reverse()
            trans_line = re.sub(r"[\0]", self.re_unescape, trans_line)
            # Repeat until stable so runs of several spaces are removed too.
            while " |n" in trans_line:
                trans_line = trans_line.replace(" |n", "|n")
    wf.write(line.replace(string, trans_line))
def test():
    """Manual check: flag config strings longer than 800 UTF-8 bytes.

    Every string returned by ``Worker.grep_string_from_config`` is measured
    in UTF-8 bytes; for each one above 800 the size is printed and the
    ``q`` console is opened.
    """
    # Other inputs used during development:
    #   /mine/war3work/(2)Game of Life and Death-v2/map/war3map.wts
    #   /mine/war3work/The-Chosen-Ones-1.0_x/map/war3campaign.wts
    #   /mine/war3work/The-Chosen-Ones-1.0_x.bak/map/war3campaign.wts
    config = {
        "file_path": "/mine/war3work/The-Chosen-Ones-1.0_x/map/war3campaign.wts.mta2.cache"
    }
    wk = Worker(config)
    for text in wk.grep_string_from_config():
        size = len(text.encode("utf8"))
        if size > 800:
            print(size)
            q.d()
def make_up_leak():
    """Decrypt and export lessons that were missed by the main pass.

    Walks ``<course>/<lesson>`` directories below the crawl root. Lessons
    that already contain ``temp.ts`` or ``temp.mp4`` are skipped. For the
    others, ``decrypt_and_join_all_ts`` is called to build the joined file
    and return its suffix; the result ``temp<suffix>`` is then copied to
    the matching path under the output root, named ``<lesson><suffix>``.

    A lesson whose decryption fails is reported and skipped. If the
    destination file already exists the ``q`` console is opened instead of
    overwriting it.
    """
    # Raw crawl results.
    course_path = "/f_data/dxy/dxy/"
    # Where the tidied-up results go.
    new_root_dir = "/f_data/dxy_done/"

    # Collect every <course>/<lesson> directory.
    all_lesson_path = []
    for course in Path(course_path).glob("*"):
        course_path_name = "%s" % (course)
        if not os.path.isdir(course_path_name):
            continue
        for lesson_path in Path(course_path_name).glob("*"):
            lesson_path_name = "%s" % (lesson_path)
            if not os.path.isdir(lesson_path_name):
                continue
            all_lesson_path.append(lesson_path_name)

    # Copy and rename the videos.
    print("Total is:", len(all_lesson_path))
    for lesson_path_name in all_lesson_path:
        already_joined = (
            os.path.isfile("%s/%s" % (lesson_path_name, "/temp.ts"))
            or os.path.isfile("%s/%s" % (lesson_path_name, "/temp.mp4"))
        )
        if already_joined:
            continue

        print("nothing to do:", lesson_path_name)
        try:
            out_fix = decrypt_and_join_all_ts(lesson_path_name)
        except Exception:
            print("Error lesson_path_name:", lesson_path_name)
            print(traceback.format_exc())
            # No joined file was produced, so there is nothing to copy;
            # carrying on would crash on a non-existent "temp" file.
            continue

        new_lesson_path_name = lesson_path_name.replace(course_path, new_root_dir)
        new_lesson_path_name = "%s%s" % (new_lesson_path_name, out_fix)
        if os.path.isfile(new_lesson_path_name):
            print("Not really happen")
            q.d()
        else:
            shutil.copy("%s/temp%s" % (lesson_path_name, out_fix), new_lesson_path_name)
def parse(self):
    """Main entry point for parsers.

    super() implementation will call to split_records and parse_record to
    process the file. This override does not delegate to it: it loads
    ``self.filename`` (ISO-8859-8 encoded HTML) and looks for the
    transaction table with id ``trBlueOnWhite12``.

    :returns: a new, empty ``Statement``. The located table is not
        processed here.
    :raises ParseError: if no table with that id exists in the file.
    """
    # BeautifulSoup consumes the file object on construction, so the file
    # only needs to stay open for this statement.
    with open(self.filename, "r", encoding='iso-8859-8') as f:
        soup = BeautifulSoup(f, 'lxml')

    statement = Statement()
    table = soup.find_all('table', id='trBlueOnWhite12')
    if not table:
        # The message now names the same id that is searched for.
        raise ParseError(0, "'trBlueOnWhite12' table not found")
    return statement
def rewrite_j(self, _wrap, new_file_path):
    """Rewrite the script into ``new_file_path`` using ``_wrap`` as hook.

    ``_wrap`` is installed as ``COMMON_TRANSLATE_WRAP["translate"]``. Every
    sentence returned by ``self.read_next_sentense()`` is then written to
    the new file until an exception whose message is ``"EOF"`` is raised.

    :param _wrap: callable installed as the ``translate`` hook.
    :param new_file_path: path of the file to write.
    """
    COMMON_TRANSLATE_WRAP["translate"] = _wrap
    self.fp_write = open(new_file_path, "w")
    try:
        while True:
            try:
                sentense, _first_word = self.read_next_sentense()
            except Exception as e:
                if str(e) == "EOF":
                    print("EOF!")
                    break
                print(traceback.format_exc())
                q.d()
                # Nothing was read this round: do not write the previous
                # sentence again (or hit an unbound name on the first one).
                continue
            self.fp_write.write(sentense)
    finally:
        # Close the output even when processing fails part-way.
        self.fp_write.close()
def work(self):
    """Copy the script sentence by sentence into ``<file_path>.translate.j``.

    Sentences come from ``self.read_next_sentense()`` until it raises an
    exception whose message is ``"EOF"``. ``self.fp_write`` and
    ``self.fp_debug`` (``<file_path>.debug.2.j``) are kept as attributes
    because other methods may write to them; both are closed even if
    processing fails part-way.
    """
    self.fp_write = open("%s.translate.j" % self.arg["file_path"], "w")
    self.fp_debug = None
    try:
        self.fp_debug = open("%s.debug.2.j" % self.arg["file_path"], "w")
        while True:
            try:
                sentense, _first_word = self.read_next_sentense()
            except Exception as e:
                if str(e) == "EOF":
                    print("EOF!")
                    break
                print(traceback.format_exc())
                q.d()
                # Nothing was read this round: do not write the previous
                # sentence again (or hit an unbound name on the first one).
                continue
            self.fp_write.write(sentense)
    finally:
        self.fp_write.close()
        if self.fp_debug is not None:
            self.fp_debug.close()
def main():
    """Recompute discounted prices for tracked JD items and print bargains.

    Generator-style coroutine (it yields the futures returned by the
    ``mongodb`` helper). For every document of ``jingdong_price`` whose
    ``itemid`` is present in ``jingdong_itemid``:

    1. ``JDDiscount.calc(item)`` is run; it is expected to set
       ``calc_price`` and ``calc_advice`` on the item, which are then
       stored back on the document;
    2. if ``0 < calc_price < good_price`` the item is printed.

    Items for which step 1 fails are reported (traceback plus ``q``
    console) and skipped. The tornado IOLoop is stopped at the end.
    """
    yield mongodb.init({"db_wm": {
        "HOST": "127.0.0.1",
        "PORT": 27017,
        "DATABASE_NAME": "wm",
        "USERNAME": "",
        "PASSWORD": "",
    }})
    itemid = yield mongodb.DBS["db_wm"]["jingdong_itemid"].find({}).to_list(length=None)
    item_list = yield mongodb.DBS["db_wm"]["jingdong_price"].find({}).to_list(length=None)
    itemid_map = {x["itemid"]: x for x in itemid}
    print("item_list.length:", len(item_list))
    for item in item_list:
        tracked = itemid_map.get(item["itemid"])
        if not tracked:
            continue
        try:
            JDDiscount.calc(item)
            yield mongodb.DBS["db_wm"]["jingdong_price"].update_one(
                {
                    "_id": ObjectId(item["_id"]),
                },
                {
                    "$set": {
                        "calc_price": item["calc_price"],
                        "calc_advice": item["calc_advice"],
                    }
                },
            )
        except Exception:
            print(traceback.format_exc())
            q.d()
            # calc_price / calc_advice may be missing after a failure;
            # reading them below would raise KeyError and abort the run.
            continue
        good_price = tracked.get("good_price", 0)
        if item["calc_price"] > 0 and item["calc_price"] < good_price:
            print(item.get("url"))
            print(item.get("name"), "好价:", good_price)
            print(item["price"], "=>", item["calc_price"], "|", item["calc_advice"])
            print()
    tornado.ioloop.IOLoop.current().stop()
def cut_text2(table_h, table_w):
    """Distribute the text fragments of ``table_text`` into table cells.

    :param table_h: horizontal grid line positions (``len - 1`` rows).
    :param table_w: vertical grid line positions (``len - 1`` columns).
    :returns: object array of shape ``(len(table_h) - 1, len(table_w) - 1)``
        in which each cell holds the concatenation of ``line[4]`` for every
        fragment that ``Loca.do`` places in that cell (``""`` if none).

    Fragments that cannot be placed are reported (exception text plus ``q``
    console) and skipped.
    """
    # ``np.object`` was only an alias of the builtin and has been removed
    # from NumPy; the builtin ``object`` works on every version.
    result = np.empty((len(table_h) - 1, len(table_w) - 1), dtype=object)
    result[:] = ""
    loca = Loca(table_h, table_w)
    for line in table_text:
        try:
            h, w = loca.do(line)
            result[h, w] = result[h, w] + line[4]
        except Exception as e:
            print("e:", e)
            q.d()
    return result
def test2():
    """Manual check: run ``JDDiscount.calc`` on a fixed sample item.

    The sample is passed as both arguments of ``calc``. After the ``q``
    console, the price, coupons, promotions and the computed price/advice
    of the returned item are printed.
    """
    sample = {
        "itemid": "56746195796",
        "good_price": 800,
        "url": "https://item.jd.com/56746195796.html",
        "name": "尼康(Nikon)AF-S DX 尼克尔 35mm f/1.8G 标准定焦镜头",
        "cat": "652,654,834",
        "venderId": "10008806",
        "shopId": "843487",
        "presale": False,
        "datetime": "2020-09-23 15:21:52",
        "price": 1399.0,
        "vender": "尼康官方旗舰店",
        "stock": "现货",
        "promote": [
            [
                "满2件,总价打9.80折,包邮(限中国内地)",
                "https://search.jd.com/Search?activity_id=101782838006",
                "2020-09-01 16:10:53 ~ 2020-09-30 23:59:59",
            ],
        ],
        "gift": [
            ["限购", "2020-09-30 23:59:59"],
        ],
        "quan": [],
        "feedback": "",
        "ads": [
            "尼康Z5全新上市!部分商品下单立减900元,相机&镜头任意两件98折,还有分享有礼等丰富活动,<a href=\"https://pro.m.jd.com/mall/active/2xtN2NXJ6Z55PtYvSGJWmLEx5txf/index.html\" target=\"_blank\">更多优惠,</a>",
        ],
    }
    new_item = JDDiscount.calc(sample, sample)
    q.d()
    for field in ("price", "quan", "promote"):
        print(new_item[field])
    print(new_item["calc_price"], new_item["calc_advice"])
def train(c):
    """Train the network described by config ``c`` under a distiller schedule.

    Outline, as far as the code shows:

    1. Build the net and optimizer, restore them via ``c.init_model`` and
       create the learning-rate stepper and the data iterators.
    2. Evaluate on the validation and test splits and print the results,
       once before and once after the distiller compression scheduler is
       created from ``c.compress``.
    3. Record, for 2-D/4-D weight and bias parameters, a mask of the
       entries that are exactly zero at this point.
    4. Run the training loop until ``s.step_max``, calling the
       compression scheduler hooks around each minibatch and re-zeroing
       the masked entries after every optimizer step.
    5. Every ``c.step_eval`` steps, evaluate again and attach the results
       to the step record passed to ``c.on_step_end``.

    Any exception inside the loop is logged (or printed) and swallowed;
    ``c.on_train_end(s)`` always runs.
    """
    net = get_net(c)
    opt = get_opt(c, net)
    net, opt, step = c.init_model(net, opt=opt, step='max', train=True)
    step_lr = scheduler(c, opt, step)
    # In debug mode training samples are drawn from the 'valid' split.
    data_tr = SampleIterator(c, c.train_batch, split='valid' if c.debug else 'train')
    iter_tr = iter(data_tr)
    data_val = SequentialIterator(c, c.eval_batch, split='valid')
    data_test = SequentialIterator(c, c.eval_batch, split='test')

    print('Before quantization')
    tbl, sparsity = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
    step_result = pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
    # NOTE(review): Series.append was removed in pandas 2.0 -- confirm the
    # pinned pandas version, or switch to pd.concat.
    step_result = step_result.append(
        pd.Series(evaluate(c, data_test, net)).add_prefix('test_')
    )
    step_result['sparsity'] = sparsity
    print(step_result)

    compression_scheduler = distiller.config.file_config(net, opt, c.compress)

    print('After initial quantization')
    s = Namespace(net=net, opt=opt, step=step)
    # Presumably on_train_start populates s.results and s.step_max, which
    # are read below -- TODO confirm.
    c.on_train_start(s)
    tbl, sparsity = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
    step_result = pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
    step_result = step_result.append(
        pd.Series(evaluate(c, data_test, net)).add_prefix('test_')
    )
    step_result['sparsity'] = sparsity
    print(step_result)

    # (name, parameter, zero-mask) for each 2-D/4-D weight or bias tensor;
    # the mask is computed once here and reused after every optimizer step.
    npm = []
    for name, param in net.named_parameters():
        if param.dim() in [2, 4] and any(type in name for type in ['weight', 'bias']):
            npm.append((name, param, param.abs() == 0))

    best_val_loss = np.inf
    if s.results is not None and 'val_loss' in s.results.columns:
        # NOTE(review): this takes the max of past validation losses, and
        # best_val_loss is never updated in the loop -- confirm intended.
        best_val_loss = s.results['val_loss'].dropna().max()
    try:
        # One "epoch" for the compression scheduler is one evaluation period.
        steps_per_epoch = c.step_eval
        while step < s.step_max:
            epoch = step // steps_per_epoch
            batch = step % steps_per_epoch

            if batch == 0:
                compression_scheduler.on_epoch_begin(epoch)
            compression_scheduler.on_minibatch_begin(epoch, batch, steps_per_epoch)

            step_lr(step)

            x = to_torch(next(iter_tr), c.device).t()

            t_s = time()
            # Next-token setup: targets are the inputs shifted by one along dim 0.
            inputs, labels = x[:-1], x[1:]
            loss, _, lam, theta = net(inputs, labels)

            compression_scheduler.before_backward_pass(epoch, batch, steps_per_epoch, loss, False)

            opt.zero_grad()
            # Drop into the q console when the loss becomes NaN.
            if torch.isnan(loss):
                import q; q.d()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), c.get('clip_grad', 0.5))

            compression_scheduler.before_parameter_optimization(epoch, batch, steps_per_epoch, opt)
            opt.step()
            # Force entries that were zero at start-up back to zero.
            for name, param, mask in npm:
                param.data[mask] = 0
            compression_scheduler.on_minibatch_end(epoch, batch, steps_per_epoch)

            if (batch + 1) == steps_per_epoch:
                compression_scheduler.on_epoch_end(epoch)

            time_model = np.round(time() - t_s, 5)

            loss = from_torch(loss)
            # Perplexity is reported as NaN when the loss exceeds 5.
            perplexity = np.nan if loss > 5 else np.e ** loss
            step_result = pd.Series(dict(
                loss=loss,
                perplexity=perplexity,
                time=time_model,
            )).add_prefix('train_')
            step_result['lr'] = next(iter(opt.param_groups))['lr']
            step_result['theta'] = from_torch(theta)
            step_result['lambda'] = from_torch(lam)

            s.step = step = step + 1
            if step % c.step_eval == 0:
                tbl, sparsity = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
                step_result = step_result.append(
                    pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
                )
                step_result = step_result.append(
                    pd.Series(evaluate(c, data_test, net)).add_prefix('test_')
                )
                step_result['sparsity'] = sparsity
                s.record_step = step_result['val_loss'] < best_val_loss
                clear_gpu_memory()
            s.step_result = step_result
            c.on_step_end(s)
    except Exception as e:
        # Training errors are reported but not re-raised.
        import traceback
        err = traceback.format_exc()
        if c.main:
            c.log(err)
        else:
            print(err)
    finally:
        c.on_train_end(s)
def write_translate_string(self, wf, line, rawstring, key_index, mark):
    """Translate the string arguments found in a JASS ``line`` and write it.

    :param wf: writable text file object.
    :param line: the source line (or the relevant part of a multi-line
        statement).
    :param rawstring: the text to analyse; trailing newlines are ignored.
        An empty value or a ``TRIGSTR_<digits>`` placeholder makes the
        line be written untouched through ``self.write_raw_string``.
    :param key_index: key into ``War3mapJWorker.LINE_PLACE``; all entries
        but the last are the argument positions holding translatable
        strings, the last one is used as an offset for ``mark == "end"``.
    :param mark: which part of a string-carrying statement ``line`` is:
        ``"inside"`` (whole call on one line), ``"start"`` / ``"end"``
        (first / last line of a string spanning lines) or ``"middle"``.
    :returns: the result of ``self.write_raw_string`` for ignored strings,
        otherwise ``None``.

    Candidate strings are translated one by one with the same scheme as
    the other workers (protect ``self.RSTRING`` spans, replace known keys
    longest first and case-insensitively, unprotect) and substituted back
    into ``line``, which is written once at the end.
    """
    # Accept platform line endings: ignore trailing newlines.
    while rawstring[-1:] == "\n":
        rawstring = rawstring[:-1]
    strings = []
    # Skip empty strings and fixed placeholders such as TRIGSTR_xx.
    if not rawstring or re.match(r"^TRIGSTR_[\d]+$", rawstring):
        return self.write_raw_string(wf, line)
    elif mark == "middle":
        strings.append(line)
    elif mark == "start":
        # NOTE(review): the "\3" substitution for escaped quotes stays in
        # the written line unless restored elsewhere -- confirm.
        line = line.replace("\\\"", "\3")
        i_temp = line.rindex("\"")
        r_string = line[i_temp:]
        strings.append(r_string)
        # Close the call artificially so the argument list can be split.
        eline = line[:i_temp] + ")"
        rline = eline.replace("\\\"", "\3")
        rline = rline[rline.find("(") + 1: rline.rfind(")")]
        rline = self.fuck_reg(rline)
        rline_list = rline.split(",")
        for i in War3mapJWorker.LINE_PLACE[key_index][:-1]:
            if len(rline_list) > i and rline_list[i]:
                strings += self.fuck_reg2(rline_list[i])
    elif mark == "end":
        line = line.replace("\\\"", "\3")
        i_temp = line.index("\"")
        l_string = line[:i_temp + 1]
        strings.append(l_string)
        # Open the call artificially so the argument list can be split.
        eline = "(\"" + line[i_temp:]
        rline = eline.replace("\\\"", "\3")
        rline = rline[rline.find("(") + 1: rline.rfind(")")]
        rline = self.fuck_reg(rline)
        rline_list = rline.split(",")
        ll = War3mapJWorker.LINE_PLACE[key_index][-1]
        for i in War3mapJWorker.LINE_PLACE[key_index][:-1]:
            # Positions are counted from the end of the argument list here.
            if -len(rline_list) < i - ll and rline_list[i - ll]:
                strings += self.fuck_reg2(rline_list[i - ll])
    elif mark == "inside":
        # Hide escaped double quotes (those inside strings).
        rline = rawstring.replace("\\\"", "\3")
        # Drop the outer parentheses.
        rline = rline[rline.find("(") + 1: rline.rfind(")")]
        # Hide commas that are inside strings.
        rline = self.fuck_reg(rline)
        # Pick the target arguments (nested calls are not handled).
        rline_list = rline.split(",")
        for i in War3mapJWorker.LINE_PLACE[key_index][:-1]:
            if len(rline_list) > i and rline_list[i]:
                strings += self.fuck_reg2(rline_list[i])

    for string in strings:
        # Restore the commas hidden as "\4".
        string = string.strip().replace("\4", ",")
        # Keep only non-empty string literals: at least 3 characters and
        # quoted on the side(s) that `mark` says must be present.
        is_literal = len(string) >= 3 and (
            (mark == "start" and string[0] == "\"")
            or (mark == "end" and string[-1] == "\"")
            or (mark == "middle")
            or (mark == "inside" and string[0] == string[-1] == "\"")
        )
        if not is_literal:
            print("SKip string in J:", string, mark)
            string = ""
        trans_line = string
        trans_strings = ""
        if string:
            trans_strings = list(set([x.lower() for x in self.take_string_from_sentense(string)]))
        if trans_strings:
            good_key = self.trans_result_keys & set(trans_strings)
            good_key = list(good_key)
            if good_key:
                if not len(good_key) == len(set(trans_strings)):
                    print("len is not good")
                    q.d()
                good_trans = {x: self.trans_result[x] for x in good_key}
                # Longest keys first so a short key cannot clobber part of
                # a longer one.
                good_key.sort(key=lambda x: -len(x))
                trans_line = re.sub(self.RSTRING, self.re_escape, trans_line)
                for key in good_key:
                    # Function replacement: a plain string would be treated
                    # as a template, so backslashes in the translation would
                    # be rewritten or raise re.error.
                    trans_line = re.sub(
                        re.escape(key),
                        lambda _m, _repl=good_trans[key]: _repl,
                        trans_line,
                        flags=re.I,
                    )
                self.re_cache.reverse()
                trans_line = re.sub(r"[\0]", self.re_unescape, trans_line)
                # Same reasoning: insert the translated text literally.
                line = re.sub(
                    re.escape(string),
                    lambda _m, _repl=trans_line: _repl,
                    line,
                    flags=re.I,
                )
    wf.write(line)
async def test_tool():
    """Manual check: run ``netstat -ano`` through ``execute_command``.

    Opens the ``q`` console once the command has completed, then prints
    what the command returned.
    """
    output = await execute_command("netstat -ano")
    q.d()
    print(output)
def main(args):
    """Compare two files line by line and stop at every difference.

    :param args: mapping with the two file paths under ``"fa"`` and
        ``"fb"``.

    Both files are read in binary mode in lock-step. Progress is printed
    every 1000 lines. Whenever the two lines differ, their ``repr`` is
    printed and the ``q`` console is opened. The loop ends when both files
    are exhausted.

    The former second half of the loop (a looser, text-based comparison)
    sat after an unconditional ``continue`` and could never run; it has
    been removed.
    """
    lineno = 0
    with open(args["fa"], "rb") as ra, open(args["fb"], "rb") as rb:
        while True:
            la = ra.readline()
            lb = rb.readline()
            lineno += 1
            if lineno % 1000 == 999:
                print("lineno:", lineno)
            if la != lb:
                print(repr(la), repr(lb))
                q.d()
            # readline() returns b"" only at end of file.
            if not la and not lb:
                break
def draw_the_line(dn):
    """Guess table grid lines from the blank bands of a 2-D array.

    Rows/columns of ``dn`` with no truthy entry are "blank". Runs of
    consecutive blank rows (columns), scanned while leaving a margin of
    ``Glb.PADDING_HEIGHT`` at both ends, are candidate separators. A run
    is only recorded when a non-blank row/column follows it. Runs at least
    as wide as the threshold become grid lines at their midpoint:

    * rows: ``max(2 * padding + 1, narrowest run)``;
    * columns: ``max(padding + 1, narrowest run)``.

    The chosen lines are also painted into a mask saved as ``tmp.dc.png``.

    :returns: ``(fin_h, fin_w)``, the midpoints of the accepted row and
        column separators.
    :raises ValueError: if no blank run is found along an axis (``min`` of
        an empty list).
    """
    pad = Glb.PADDING_HEIGHT
    n_rows, n_cols = dn.shape
    # Whether each row / column contains any character pixels.
    row_has_ink = [(dn[r]).any() for r in range(n_rows)]
    col_has_ink = [(dn[:, c]).any() for c in range(n_cols)]

    def _blank_runs(flags):
        """Return ([(start, stop)], [width]) of blank runs inside the margins."""
        spans, widths, run = [], [], 0
        for pos in range(pad, len(flags) - pad):
            if flags[pos]:
                if run:
                    spans.append((pos - run, pos))
                    widths.append(run)
                run = 0
            else:
                run += 1
        return spans, widths

    # Horizontal cuts.
    h_cut_pare, h_cut_width = _blank_runs(row_has_ink)
    h_cut = max(2 * pad + 1, min(h_cut_width))
    print("h_cut_width:", h_cut_width, h_cut)

    # Vertical cuts.
    w_cut_pare, w_cut_width = _blank_runs(col_has_ink)
    q.d()
    w_cut = max(pad + 1, min(w_cut_width))
    print("w_cut_width:", w_cut_width, w_cut)

    # Keep the runs that are wide enough and paint them into the mask.
    fin_w, fin_h = [], []
    dc = np.zeros(dn.shape)
    for start, stop in w_cut_pare:
        if stop - start >= w_cut:
            mid = (stop + start) / 2
            fin_w.append(mid)
            dc[:, int(mid - pad / 2): int(mid + pad / 2)] = 1
    for start, stop in h_cut_pare:
        if stop - start >= h_cut:
            mid = (stop + start) / 2
            fin_h.append(mid)
            dc[int(mid - pad / 2): int(mid + pad / 2)] = 1
    save_png(dc, "tmp.dc.png")
    print("fin_h:", fin_h)
    print("fin_w:", fin_w)
    return fin_h, fin_w
def my_writer(self, file_path, work_item):
    """Write a translated copy of an ini-like file to ``<file_path>.mta2.cache``.

    The file is read as bytes, patched for a known bad trailing byte, and
    decoded as UTF-8 line by line. Each line is then classified by a small
    state machine and written either untouched (``write_raw_string``) or
    through ``write_translate_string`` when the current key (lower-cased)
    is in ``work_item``.

    :param file_path: path of the file to translate.
    :param work_item: container of lower-case key names whose values
        should be translated.

    State flags:

    * ``wait_string``: inside a top-level ``[=[ ... ]=]`` multi-line string;
    * ``wait_end``: inside a ``{ ... }`` object value;
    * ``wait_string2``: inside a multi-line string within an object.

    Errors on a single line are printed, the ``q`` console is opened, and
    processing continues with the next line.
    """
    with open(file_path, "rb") as rf:
        contents_tmp = rf.readlines()
    contents = []
    i = 0
    for c in contents_tmp:
        i = i + 1
        # w3x2lni cannot handle (\xd0, \xd1): a dangling lead byte right
        # before the closing quote is replaced by a space.
        if c[-4:] in (b"\xd0\"\r\n", b"\xd1\"\r\n"):
            print("Line", i, "is bad char:", repr(c[-4]))
            c = c[:-4] + b" \"\r\n"
        try:
            contents.append(c.decode("utf8"))
        except Exception as e:
            # Undecodable lines are dropped from the output.
            print(e, "is bad char")
            q.d()
    with open("%s.mta2.cache" % file_path, "w") as wf:
        wait_end = False
        key_name = None
        wait_string = False
        wait_string2 = False
        for lineno, rawline in enumerate(contents):
            line = rawline
            try:
                # Inside a top-level multi-line string.
                if wait_string:
                    append_string = line
                    # A line holding only the terminator is copied as-is.
                    if line.strip() == "]=]" or line.strip() == "]=],":
                        wait_string = False
                        self.write_raw_string(wf, rawline)
                        continue
                    # Terminator at the end of a content line: translate
                    # only the part before it.
                    if line.strip()[-3:] == "]=]":
                        wait_string = False
                        append_string = line.strip()[:-3].strip()
                    elif line.strip()[-4:] == "]=],":
                        wait_string = False
                        append_string = line.strip()[:-4].strip()
                    if append_string:
                        if key_name.lower() in work_item:
                            self.write_translate_string(wf, rawline, append_string)
                        else:
                            self.write_raw_string(wf, rawline)
                    continue
                # Inside an object value.
                if wait_end:
                    # Inside a multi-line string within the object.
                    if wait_string2:
                        append_string2 = line
                        if line.strip() == "]=]" or line.strip() == "]=],":
                            wait_string2 = False
                            self.write_raw_string(wf, rawline)
                            continue
                        if line.strip()[-3:] == "]=]":
                            wait_string2 = False
                            append_string2 = line.strip()[:-3].strip()
                        elif line.strip()[-4:] == "]=],":
                            wait_string2 = False
                            append_string2 = line.strip()[:-4].strip()
                        if append_string2:
                            if key_name.lower() in work_item:
                                self.write_translate_string(wf, rawline, append_string2)
                            else:
                                self.write_raw_string(wf, rawline)
                        continue
                    # Does the object end here?
                    if line.strip()[-1:] == "}":
                        wait_end = False
                        key_name = None
                        self.write_raw_string(wf, rawline)
                    # NOTE(review): `line` still carries its line ending
                    # here, so this comparison looks like it can only match
                    # a final line without newline -- confirm.
                    elif line[-3:] == "[=[":
                        wait_string2 = True
                        self.write_raw_string(wf, rawline)
                    else:
                        if key_name.lower() in work_item:
                            self.write_translate_string(wf, rawline, line)
                        else:
                            self.write_raw_string(wf, rawline)
                    continue
                else:
                    line = line.strip()
                    # Comments and blank lines are copied as-is.
                    if not line or line[:2] == "--":
                        self.write_raw_string(wf, rawline)
                        continue
                    # Value is an object: remember the key until "}".
                    if line[-1:] == "{":
                        wait_end = True
                        key_name = line.split("=")[0].strip()
                        self.write_raw_string(wf, rawline)
                    # Value is a multi-line string: remember the key.
                    elif line[-3:] == "[=[":
                        wait_string = True
                        key_name = line.split("=")[0].strip()
                        self.write_raw_string(wf, rawline)
                    # Section headers, from strictest to loosest pattern.
                    # A single line containing both [=[ and ]=] is not
                    # supported.
                    elif re.match(r"^\[([a-z0-9]+)\]$", line, flags=re.I):
                        self.write_raw_string(wf, rawline)
                    elif re.match(r"^\[([^\[\]\"\']+)\]$", line, re.I):
                        self.write_raw_string(wf, rawline)
                    elif re.match(r"^\[(.+)\]$", line, re.I):
                        self.write_raw_string(wf, rawline)
                    # Plain `key = value` assignment.
                    elif "=" in line:
                        key_name = line.split("=")[0]
                        key_val = line[len(key_name) + 1:]
                        key_name = key_name.strip()
                        if key_name.lower() in work_item:
                            self.write_translate_string(wf, rawline, key_val)
                        else:
                            self.write_raw_string(wf, rawline)
                    else:
                        # Unrecognised lines are reported and copied as-is.
                        print("[SKIP]", line)
                        self.write_raw_string(wf, rawline)
            except Exception:
                print(traceback.format_exc())
                q.d()
a = Article(url, language='zh') # Chinese a.download() a.parse() print(a.title) print(a.text) response = requests.get(url) doc = Document(response.content) title = doc.title() html = doc.summary(True) article = Article(url, language='zh') article.download(input_html=html) article.parse() q.d() print(article.title) print(article.text) exit(1) response = requests.get(url) doc = Document(response.content) title = doc.title() html = doc.summary(True) q.d() text = fulltext(html) print(text)
def request(session, method, uri, *args, **kwargs):
    """Send a request to the URI and return the response

    The generic form of the function signature is as follows:

    .. code-block:: python

        args = [o['name'] for o in parameters if o['in'] == 'path']
        function(*args, model=None, query=None)

    If accountId is in the list of args and the value is not supplied
    then the function will automatically insert the discovered account_id
    for the session.

    :param session: session object passed to ``get_api`` and to the HTTP
        helper; its ``account_id`` is used for the automatic injection
    :param method: the http method to call
    :type method: str
    :param uri: the relative uri to call
    :type: uri: str
    :param args: values for the required path parameters, in spec order,
        optionally followed by the model
    :param kwargs: path parameters by snake_case name, ``model`` and
        ``query``

    :returns: the response data; when the API spec declares a response
        schema, the data (or each item of a list) is loaded into the
        matching model class

    :raises PureportError: if the method is not supported for the uri, an
        unknown query key or unexpected argument is given, or a required
        argument is missing
    """
    api = get_api(session)

    method = method.lower()
    path = api['paths'][uri].get(method)

    if path is None:
        raise PureportError("method {} not supported for uri {}".format(
            method, uri))

    # Required path parameters become positional names; query parameters
    # are only recorded so supplied query keys can be validated.
    parameters = list()
    query = {}

    for item in path.get('parameters', []):
        if item.get('in', 'path') == 'path' and item.get('required', True) is True:
            parameters.append(to_snake_case(item['name']))
        elif item.get('in') == 'query':
            query[to_snake_case(item['name'])] = None

    # Resolve the model class expected as request body, if any.
    cls = None
    ref = get_value('requestBody.content.application/json.schema.$ref', path)
    if ref:
        clsname = ref.split('/')[-1]
        schema = getattr(models, clsname)._schema
        if schema.discriminator:
            # Polymorphic body: pick the concrete class from the model's
            # discriminator property.
            # NOTE(review): this reads kwargs['model'], so a model passed
            # positionally raises KeyError here -- confirm intended.
            propval = getattr(kwargs['model'], schema.discriminator['propertyName'])
            clsname = schema.discriminator['mapping'].get(propval).split(
                '/')[-1]
        cls = getattr(models, clsname, None)
        log.debug("connection class is {}".format(cls))
        parameters.append('model')

    query_values = kwargs.pop('query', None)
    if query_values:
        # TODO need to validate query inputs against api spec
        if not set(query_values).issubset(query):
            raise PureportError("unknown query value provided")

    # Bind positional arguments first, then fill the rest from kwargs.
    variables = dict(zip(parameters, args))

    for item in parameters:
        if item not in variables:
            variables[item] = kwargs.pop(to_snake_case(item), None)

    model = variables.get('model')
    body = None
    if cls and isinstance(model, cls):
        body = models.dump(model)

    if kwargs:
        raise PureportError("unexpected keyword arguments")

    if set(args).issubset(variables.values()) is False:
        raise PureportError("unexpected positional arguments")

    for p in parameters:
        if variables.get(p) is None:
            # inject the session accountId automatically into the variables
            # if it is the only parameter that doesn't have a supplied value.
            if p == 'account_id':
                log.debug("automatically injecting account_id argument")
                variables['account_id'] = session.account_id
            else:
                # A leftover interactive breakpoint used to sit here; a
                # library call must fail fast instead of waiting on a
                # console.
                raise PureportError("missing required argument: {}".format(p))

    # The HTTP helper is the module-level function named after the method.
    func = globals().get(method)
    data = func(session, uri, body=body, variables=variables, query=query_values)

    schema = get_value('responses.default.content.application/json.schema', path)

    if schema:
        # NOTE(review): if the schema is neither a $ref nor an array of
        # items, clsname keeps whatever the request-body step left in it
        # (or is unbound) -- confirm the spec never produces that shape.
        if '$ref' in schema:
            clsname = schema['$ref'].split('/')[-1]
        elif schema.get('type') == 'array' and 'items' in schema:
            clsname = schema['items']['$ref'].split('/')[-1]

        if isinstance(data, list):
            data = [models.load(clsname, item) for item in data]
        else:
            data = models.load(clsname, data)

    return data
def _split_long_string_close(raw_line):
    """Look for a long-string terminator at the end of *raw_line*.

    Returns ``(closed, text)``. When the stripped line ends with ``]=]`` or
    ``]=],`` the terminator is removed and the remainder is stripped;
    otherwise the line is handed back untouched with ``closed`` False.
    """
    trimmed = raw_line.strip()
    for terminator in ("]=]", "]=],"):
        if trimmed.endswith(terminator):
            return True, trimmed[:-len(terminator)].strip()
    return False, raw_line


def _read_section_header(text):
    """Return the id of a ``[id]`` section header, or ``None``.

    Ids that are not four alphanumeric characters are still accepted, but
    a warning is printed for them.
    """
    strict = re.match(r"^\[([a-z0-9]{4})\]$", text, re.I)
    if strict:
        return strict.group(1)

    loose = re.match(r"^\[([^\[\]\"\']+)\]$", text, re.I)
    if loose:
        print("warn:", "Be not 'abcd' type ID:", text)
        return loose.group(1)

    anything = re.match(r"^\[(.+)\]$", text, re.I)
    if anything:
        print("warn:", "Very bad [\"\\\\I0F\"] type ID:", text)
        return anything.group(1)

    return None


def MyReader(file_path):
    """Parse an ini-like text file into ``{section: {key: value}}``.

    The file is read as bytes and decoded line by line as UTF-8. Values
    come in three shapes:

    * ``key = value``        -> the stripped text after the first ``=``
    * ``key = [=[`` ... ``]=]`` -> a list with the lines of the long string
    * ``key = {`` ... ``}``  -> a list of the raw lines of the object; a
      long string opened inside the object becomes a nested list

    Lines starting with ``--`` and blank lines are ignored outside of
    objects and long strings. Lines that fail to decode are reported and
    skipped; any error while parsing a line is printed with its traceback.
    In both cases ``q.d()`` is called afterwards.

    :param file_path: path of the file to read
    :returns: dict mapping section ids to dicts of parsed keys
    """
    ini_obj = {}

    with open(file_path, "rb") as handle:
        raw_lines = handle.readlines()

    decoded = []
    for number, raw in enumerate(raw_lines, 1):
        # w3x2lni cannot handle a trailing \xd0 / \xd1 byte right before
        # the closing quote, so that byte is replaced with a space.
        if raw[-4:] in (b"\xd0\"\r\n", b"\xd1\"\r\n"):
            print("Line", number, "is bad char")
            raw = raw[:-4] + b" \"\r\n"
        try:
            text = raw.decode("utf8")
        except Exception as e:
            print(e, "is bad char")
            q.d()
        else:
            decoded.append(text)

    in_table = False          # inside a ``key = {`` ... ``}`` object
    in_long_string = False    # inside a top-level ``[=[`` ... ``]=]``
    in_table_string = False   # inside a long string nested in an object
    section = None
    key = None
    slot = 0                  # index of the nested long string in the object

    for line in decoded:
        # Drop trailing newlines and any leading byte-order marks.
        line = line.rstrip("\n").lstrip("\ufeff")
        try:
            # Continuation of a top-level multi-line string.
            if in_long_string:
                closed, text = _split_long_string_close(line)
                if closed:
                    in_long_string = False
                if text:
                    ini_obj[section][key].append(text)
                continue

            # Inside an object: lines are collected verbatim.
            if in_table:
                if in_table_string:
                    closed, text = _split_long_string_close(line)
                    if closed:
                        in_table_string = False
                    if text:
                        ini_obj[section][key][slot].append(text)
                elif line.strip() == "}":
                    # End of the object.
                    in_table = False
                    key = None
                elif line.endswith("[=["):
                    in_table_string = True
                    slot = len(ini_obj[section][key])
                    ini_obj[section][key].append([])
                else:
                    ini_obj[section][key].append(line)
                continue

            line = line.strip()
            # Skip comments and empty lines.
            if not line or line.startswith("--"):
                continue

            if line.endswith("{"):
                # The value is an object spanning several lines.
                in_table = True
                key = line.partition("=")[0].strip()
                ini_obj[section][key] = []
            elif line.endswith("[=["):
                # The value is a multi-line string. A ``[=[`` and ``]=]``
                # on the same line is not supported.
                in_long_string = True
                key = line.partition("=")[0].strip()
                ini_obj[section][key] = []
            else:
                header = _read_section_header(line)
                if header is not None:
                    # Start of a new section.
                    section = header
                    ini_obj[section] = {}
                elif "=" in line:
                    # Plain single-line assignment.
                    name, _, value = line.partition("=")
                    key = name.strip()
                    ini_obj[section][key] = value.strip()
                else:
                    print("[!CANNOT PARSE]", line)
        except Exception:
            print(traceback.format_exc())
            q.d()

    return ini_obj