Beispiel #1
0
def main():
    """Compare a page's comment texts with the cached copy and report changes.

    Loads the persisted cache into ``WEBPAGE_CACHE`` and then runs three
    passes over the locally saved ``test.html``.  Each pass collects the
    stripped text of every ``td`` below ``.t_fsz`` and compares that list with
    the cached list for ``WEBPAGE[0]``:

    * if they differ, the difference in length and the indexes of changed
      entries are printed (only when a previous list exists) and the cache
      is replaced;
    * if they are equal, the first cached entry is rewritten
      (``'5672427'`` -> ``'123456'``), presumably so the next pass has a
      difference to report.

    Finally the ``q`` debugger is opened and the current Tornado IOLoop is
    stopped.
    """
    WEBPAGE_CACHE.update(load_cache())

    for _ in range(3):
        page_url = WEBPAGE[0]
        # result = yield crawler(page_url)

        # Context manager so the file handle is released on every pass.
        with open("test.html", "rb") as fp:
            result = fp.read().decode("utf8")

        ehtml = parse_content_to_dom(result)
        content_tds = ehtml.find(".t_fsz").find("td")
        content_texts = [td.text().strip() for _, td in iter_eles(content_tds)]

        # Detect changes against the cached texts.
        content_texts_old = WEBPAGE_CACHE.get(page_url)
        if content_texts_old != content_texts:
            if content_texts_old:
                if len(content_texts) != len(content_texts_old):
                    print("[PAGE] comment amount add: %s" % (len(content_texts) - len(content_texts_old)))

                # zip() stops at the shorter list, like the former min() bound.
                for idx, (old, new) in enumerate(zip(content_texts_old, content_texts)):
                    if old != new:
                        print("[COMMENT] comment %s changed." % (idx))

            WEBPAGE_CACHE[page_url] = content_texts

        elif content_texts:
            # Equal and non-empty: the guard avoids an IndexError on [0] when
            # the page has no comments at all.
            WEBPAGE_CACHE[page_url][0] = WEBPAGE_CACHE[page_url][0].replace('5672427', '123456')

    q.d()
    tornado.ioloop.IOLoop.current().stop()
Beispiel #2
0
    def work(self):
        """Re-indent the input and write it to ``<file_path>.format.j``.

        Sentences are pulled from ``self.read_next_sentense()`` until it
        raises an exception whose message is ``"EOF"``.  Each sentence is
        written prefixed with ``self.PAD`` repeated according to
        ``self.line_pad``; the lower-cased first word decides the depth:

        * in ``PAD_TWICE``: written one level shallower, depth unchanged;
        * in ``PAD_PLUS``: written at the current depth, then depth + 1;
        * in ``PAD_MINUS``: written one level shallower, then depth - 1;
        * otherwise: written at the current depth.

        A ``<file_path>.debug.j`` file is opened alongside and kept in
        ``self.fp_debug``; nothing in this method writes to it.

        Any other exception from the reader is printed, the ``q`` debugger is
        opened and the sentence is skipped.  Both files are closed even when
        the loop is left by an exception.
        """
        with open("%s.format.j" % self.arg["file_path"], "w") as self.fp_write, \
                open("%s.debug.j" % self.arg["file_path"], "w") as self.fp_debug:
            while True:
                try:
                    sentense, first_word = self.read_next_sentense()
                except Exception as e:
                    if str(e) == "EOF":
                        print("EOF!")
                        break

                    print(traceback.format_exc())
                    q.d()
                    # Nothing new was read: do not fall through and re-use the
                    # previous (or still unbound) sentence.
                    continue

                keyword = first_word.lower()
                if keyword in self.PAD_TWICE:
                    pad_this_line = (self.line_pad - 1) * self.PAD
                elif keyword in self.PAD_PLUS:
                    pad_this_line = self.line_pad * self.PAD
                    self.line_pad += 1
                elif keyword in self.PAD_MINUS:
                    pad_this_line = (self.line_pad - 1) * self.PAD
                    self.line_pad -= 1
                else:
                    pad_this_line = self.line_pad * self.PAD

                self.fp_write.write(pad_this_line)
                self.fp_write.write(sentense)
Beispiel #3
0
    def end_sentense(self):
        """Forward the end-of-sentence signal to ``self._worker``.

        An exception raised by the worker is not propagated; the ``q``
        debugger is opened instead.
        """
        try:
            self._worker.end_sentense()
        except Exception as e:  # `e` stays bound while the debugger is open
            q.d()
Beispiel #4
0
def test_try_decode_content():
    """Fetch jd.com as a raw stream and try decoding it with several codecs.

    The body is read undecoded from ``res.raw``, passed through ``ungzip``,
    then (after a ``q`` breakpoint) decoded as UTF-8, round-tripped through
    GB18030 and decoded with a series of Chinese codecs before being handed
    to ``try_decode_content``.  The first decode that fails raises.
    """
    target_url = "https://jd.com/"
    browser_headers = {
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding":
        "gzip, deflate, br",
        "Accept-Language":
        "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
        "Pragma":
        "no-cache",
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
    }
    res = requests.get(target_url, headers=browser_headers, stream=True)
    text = ungzip(res.raw.read())

    q.d()
    cc = text.decode("utf8")
    cc.encode("gb18030").decode("gb18030")

    # Same codecs, same order as before; results are discarded.
    for codec in ("gb2312", "gbk", "gb18030", "big5", "big5hkscs"):
        text.decode(codec)
    text = try_decode_content(text)
Beispiel #5
0
def dummy_prep(data, method=None):
    """Dummy-encode the ``category`` columns of ``data``.

    Args:
        data: DataFrame; only columns whose dtype is ``category`` are encoded.
        method: Falsy for plain one-hot columns, ``"drop_first"`` to drop
            each variable's first level, or ``"deviation"`` for deviation
            coding: the first level's column is dropped and the rows that
            belonged to it are set to -1 in the variable's remaining columns.

    Returns:
        DataFrame with the dummy columns.  For ``"deviation"`` the columns
        are integers so that -1 can be stored.

    Raises:
        ValueError: if ``method`` is not one of the supported values.
    """
    categorical = data.loc[:, data.dtypes == "category"]
    if not method:
        return pd.get_dummies(categorical)
    if method == "drop_first":
        return pd.get_dummies(categorical, drop_first=True)
    if method != "deviation":
        raise ValueError(f"unknown method: {method!r}")

    encoded = []
    for var in categorical.columns:
        # Encode one variable at a time so its columns are known exactly;
        # matching column names by substring would let variable "a" also
        # claim the columns of variable "ab".  A signed integer dtype is
        # requested because unsigned/bool dummies cannot hold -1 (uint8
        # wraps it to 255).
        block = pd.get_dummies(categorical[var], prefix=var, dtype=int)
        if block.shape[1] == 0:
            continue
        dropout = block.columns[0]
        keepers = list(block.columns[1:])
        if keepers:
            block.loc[block[dropout] == 1, keepers] = -1
        encoded.append(block.drop(columns=[dropout]))

    if not encoded:
        # No categorical columns (or no levels): same empty frame as before.
        return pd.get_dummies(categorical, dtype=int)
    return pd.concat(encoded, axis=1)
Beispiel #6
0
    def grep_string(self):
        """Collect every string passed to the translate hook while reading.

        Installs a pass-through hook under ``COMMON_TRANSLATE_WRAP["translate"]``
        that records each word, split on newlines, in ``self.result_list``.
        Sentences are then read until ``read_next_sentense`` raises an
        exception whose message is ``"EOF"``.  Other exceptions are printed
        and open the ``q`` debugger, after which reading continues.

        Returns:
            list: ``self.result_list``.
        """
        self.result_list = []

        def _collect(word):
            # self.result_list.append(word)
            self.result_list.extend(word.split("\n"))
            return word

        COMMON_TRANSLATE_WRAP["translate"] = _collect

        reading = True
        while reading:
            try:
                _sentense, _first_word = self.read_next_sentense()
            except Exception as e:
                if str(e) == "EOF":
                    print("EOF!")
                    reading = False
                else:
                    print(traceback.format_exc())
                    q.d()

        return self.result_list
Beispiel #7
0
def get_start():
    """Download the URL from the request JSON and return its extracted article.

    Reads ``request.json["url"]``, fetches it with ``requests.get``, runs
    ``extract_article`` on the response body and returns a JSON string with
    the title and the first 520 characters of the article.  Any exception is
    printed and reported as a JSON string whose ``desc`` is the error text.

    NOTE(review): the failure reply also carries ``"code": 200`` - confirm
    that clients do not rely on ``code`` to detect errors.
    """
    try:
        url = request.json["url"]
        response = requests.get(url)
        print("url:", url)

        title, article = extract_article(response.content)
        print("title:", title)
        print("article:", article)

        # article = article.replace(" ", "")

        article = article[:520]
        q.d()

        payload = {
            "desc": "success",
            "data": {"title": title, "article": article},
            "code": 200,
        }
        return json.dumps(payload)
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        failure = {"desc": str(e), "data": None, "code": 200}
        return json.dumps(failure)
Beispiel #8
0
    def write_translate_string(self, wf, line, string):
        """Write ``line`` to ``wf`` with ``string`` replaced by its translation.

        The candidate texts returned by ``self.take_string_from_sentense(string)``
        are lower-cased and intersected with ``self.trans_result_keys``.  Every
        matching key is replaced case-insensitively, longest key first, by
        ``self.trans_result[key]``.  Before the replacements, matches of
        ``self.RSTRING`` are passed through ``self.re_escape``; afterwards NUL
        characters are passed through ``self.re_unescape`` (with
        ``self.re_cache`` reversed in between).  When nothing matches, ``line``
        is written unchanged.

        If only some of the candidates have a translation, a message is
        printed and the ``q`` debugger is opened before continuing.
        """
        translated = string
        wanted = {piece.lower() for piece in self.take_string_from_sentense(string)}
        matched = list(self.trans_result_keys & wanted) if wanted else []
        if matched:
            if len(matched) != len(wanted):
                print("len is not good", "TranslateWorkerForWts")
                q.d()
            replacements = {key: self.trans_result[key] for key in matched}
            # Longest keys first so a short key cannot split a longer one.
            matched.sort(key=len, reverse=True)

            translated = re.sub(self.RSTRING, self.re_escape, translated)
            for key in matched:
                translated = re.sub(re.escape(key), replacements[key], translated, flags=re.I)

            self.re_cache.reverse()
            translated = re.sub(r"[\0]", self.re_unescape, translated)

        wf.write(line.replace(string, translated))
Beispiel #9
0
    def read_next_sentense(self):
        """Read words up to the next line end and return them as one sentence.

        Words come from ``self.read_next_word()``, which returns
        ``(word, line_end)``; a truthy ``line_end`` terminates the sentence.
        Every word is also fed to a fresh ``SentenseWorker``.  Until a
        non-empty word has been seen, each word is passed to
        ``add_first_word``; all later words go to ``add_next_word``.

        For sentences whose first word is ``"call"``, the returned text is
        rebuilt as ``"call " + sw.get_translate_result() + line_end``.  With
        ``DEBUG`` on, the rebuilt text is first compared (ignoring spaces)
        with what was actually read, and the ``q`` debugger is opened when
        they differ.

        Returns:
            tuple: ``(sentence_text, first_word)``.
        """
        first_word = ""
        buf = ""
        sw = SentenseWorker()
        while True:
            word, line_end = self.read_next_word()

            if not first_word:
                first_word = word
                sw.add_first_word(word)
            else:
                sw.add_next_word(word)

            buf += word

            if line_end:
                buf += line_end
                sw.end_sentense()
                break

        if first_word == "call":
            ree = "call " + sw.get_translate_result() + line_end

            # The comparison has to happen BEFORE buf is overwritten;
            # comparing afterwards compares ree with itself and never fires.
            if DEBUG and buf.replace(" ", "") != ree.replace(" ", ""):
                print("x" * 16)
                q.d()

            buf = ree

        return buf, first_word
Beispiel #10
0
def extract_text(files=[],
                 outfile='-',
                 no_laparams=False,
                 all_texts=None,
                 detect_vertical=None,
                 word_margin=None,
                 char_margin=None,
                 line_margin=None,
                 boxes_flow=None,
                 output_type='text',
                 codec='utf-8',
                 strip_control=False,
                 maxpages=0,
                 page_numbers=None,
                 password="",
                 scale=1.0,
                 rotation=0,
                 layoutmode='normal',
                 output_dir=None,
                 debug=False,
                 disable_caching=False,
                 **kwargs):
    """Extract text from each file in ``files`` into ``outfile``.

    Each input file is opened in binary mode and handed to
    ``pdfminer.high_level.extract_text_to_fp`` together with every local
    name of this function (``**locals()``).

    Args:
        files: Paths of the input files; must be non-empty.
        outfile: Output path, or ``'-'`` for ``sys.stdout``.
        no_laparams: When true, no ``LAParams`` object is created and
            ``laparams`` is ``None``.
        all_texts, detect_vertical, word_margin, char_margin, line_margin,
        boxes_flow: Copied onto the ``LAParams`` object when not ``None``.
        output_type: Output format; when it is ``"text"`` and ``outfile`` is
            a real path, it is replaced by the type that ``OUTPUT_TYPES``
            associates with the path's suffix, if any.
        codec: Output codec; reset to ``'utf-8'`` when writing to a stdout
            that reports an encoding.
        Remaining parameters are not used here other than being forwarded.

    Returns:
        The output file object.  A file opened for ``outfile`` is returned
        still open; this function does not close it.

    Raises:
        ValueError: if ``files`` is empty.

    NOTE(review): because of ``**locals()``, every local (including loop
    variables such as ``param`` or ``fname``) is forwarded as a keyword
    argument; renaming or adding locals changes the call.
    """
    if not files:
        raise ValueError("Must provide files to work upon!")

    # If any LAParams group arguments were passed,
    # create an LAParams object and
    # populate with given args. Otherwise, set it to None.
    if not no_laparams:
        laparams = pdfminer.layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            # Look the argument up by name among this function's locals.
            paramv = locals().get(param, None)
            if paramv is not None:
                setattr(laparams, param, paramv)
    else:
        laparams = None

    # Infer the output type from the file suffix; no `break`, so the last
    # matching entry of OUTPUT_TYPES wins.
    if output_type == "text" and outfile != "-":
        for override, alttype in OUTPUT_TYPES:
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'
    else:
        # Opened in binary mode and handed back to the caller unclosed.
        outfp = open(outfile, "wb")

    for fname in files:
        with open(fname, "rb") as fp:
            # Forwards ALL locals as keyword arguments (see docstring note).
            pdfminer.high_level.extract_text_to_fp(fp, **locals())

    # Debug breakpoint (q console) before returning.
    q.d()
    return outfp
Beispiel #11
0
def test_stream():
    """Stream a remote image, print its first 44 bytes and the peer address.

    The peer address is taken from the socket underneath the raw response,
    reached through private attributes of the response object.  Ends in a
    ``q`` breakpoint.
    """
    logo_url = "https://ss0.bdstatic.com/5aV1bjqh_Q23odCf/static/superman/img/logo/bd_logo1_31bdc765.png"
    response = requests.get(logo_url, timeout=(3, 3), stream=True)

    # only about content
    head = response.raw.read(44)
    print("head:", head)

    remote = response.raw._fp.fp.raw._sock.getpeername()
    host, port = remote[0], remote[1]
    print("ip:", host, port)
    q.d()
Beispiel #12
0
def main():
    """Compute the table grid, cut the text into it and print the result.

    ``calc_table()`` supplies the two grid sequences that are passed to
    ``cut_text``; a ``q`` breakpoint is hit before the result is printed.
    """
    table_h, table_w = calc_table()
    result = cut_text(table_h, table_w)

    # The bare `table_text` expression that used to follow the breakpoint
    # had no effect and was removed.
    q.d()

    print("\nresult:\n", result)
Beispiel #13
0
    def write_translate_string(self, wf, line, string):
        """Write ``line`` to ``wf`` with ``string`` replaced by its translation.

        Only part of a line needs translating, for example::

            line       => aaa = "who am i?"
            string     => "who am i?"
            trans_line => the translated text
            result     => aaa = "<translated text>"

        The candidate texts returned by ``self.take_string_from_sentense(string)``
        are lower-cased and intersected with ``self.trans_result_keys``.  Every
        matching key is replaced case-insensitively, longest key first, by
        ``self.trans_result[key]``.  Before the replacements, matches of
        ``self.RSTRING`` are passed through ``self.re_escape``; afterwards NUL
        characters are passed through ``self.re_unescape`` (with
        ``self.re_cache`` reversed in between).  Finally spaces directly in
        front of ``|n`` are removed.  When nothing matches, ``line`` is
        written unchanged.

        Raises:
            ValueError: if only some of the candidate texts have a
                translation (after printing a message and opening the ``q``
                debugger).
        """
        trans_line = string
        trans_strings = {x.lower() for x in self.take_string_from_sentense(string)}
        if trans_strings:
            good_key = list(self.trans_result_keys & trans_strings)
            if good_key:
                if len(good_key) != len(trans_strings):
                    print("len is not good", "TranslateWorkerForIni")
                    q.d()
                    # `raise("1")` is not valid: a str is not an exception
                    # and Python would raise an unrelated TypeError instead.
                    raise ValueError(
                        "no translation for: %r" % sorted(trans_strings - set(good_key)))
                good_trans = {x: self.trans_result[x] for x in good_key}
                # Longest keys first so a short key cannot split a longer one.
                good_key.sort(key=lambda x: -len(x))

                trans_line = re.sub(self.RSTRING, self.re_escape, trans_line)
                for key in good_key:
                    trans_line = re.sub(re.escape(key), good_trans[key], trans_line, flags=re.I)

                self.re_cache.reverse()
                trans_line = re.sub(r"[\0]", self.re_unescape, trans_line)

                # Repeat: one pass leaves " |n" behind when several spaces
                # precede the marker.
                while " |n" in trans_line:
                    trans_line = trans_line.replace(" |n", "|n")

        wf.write(line.replace(string, trans_line))
Beispiel #14
0
def test():
    """Break into ``q`` for every config string longer than 800 UTF-8 bytes.

    Builds a ``Worker`` for the cached ``war3campaign.wts`` file, obtains its
    strings with ``grep_string_from_config()`` and, for each string whose
    UTF-8 encoding exceeds 800 bytes, prints the byte length and opens the
    debugger.
    """
    wk = Worker({
        "file_path":
        "/mine/war3work/The-Chosen-Ones-1.0_x/map/war3campaign.wts.mta2.cache"
    })
    for text in wk.grep_string_from_config():
        size = len(text.encode("utf8"))
        if size > 800:
            print(size)
            q.d()
Beispiel #15
0
def make_up_leak():
    """Process the lessons that have no merged video yet and copy the result.

    Walks ``<course_path>/<course>/<lesson>/`` directories.  Lessons that
    already contain ``temp.ts`` or ``temp.mp4`` are skipped.  For the others
    ``decrypt_and_join_all_ts`` is called; its return value is used as the
    file suffix (presumably ``".ts"`` or ``".mp4"`` - confirm against that
    helper) and ``<lesson>/temp<suffix>`` is copied to the same relative
    location below ``new_root_dir``, named ``<lesson><suffix>``.

    A lesson whose decryption fails is reported and skipped.  If the target
    file already exists, a message is printed and the ``q`` debugger opened.
    """
    # Directory holding the raw crawl results.
    course_path = "/f_data/dxy/dxy/"
    # Directory receiving the organised output.
    new_root_dir = "/f_data/dxy_done/"

    # Collect every lesson directory.
    all_lesson_path = []
    for course in Path(course_path).glob("*"):
        if not course.is_dir():
            continue
        for lesson in course.glob("*"):
            if lesson.is_dir():
                all_lesson_path.append(str(lesson))

    # Copy and rename the videos.
    print("Total is:", len(all_lesson_path))
    for lesson_path_name in all_lesson_path:
        # A merged file is already there: this lesson needs no make-up pass.
        if (os.path.isfile(os.path.join(lesson_path_name, "temp.ts"))
                or os.path.isfile(os.path.join(lesson_path_name, "temp.mp4"))):
            continue

        print("nothing to do:", lesson_path_name)
        try:
            out_fix = decrypt_and_join_all_ts(lesson_path_name)
        except Exception:
            print("Error lesson_path_name:", lesson_path_name)
            print(traceback.format_exc())
            # No merged file was produced, so there is nothing to copy;
            # falling through would try to copy a non-existent "temp".
            continue

        new_lesson_path_name = lesson_path_name.replace(course_path, new_root_dir)
        new_lesson_path_name = "%s%s" % (new_lesson_path_name, out_fix)

        if os.path.isfile(new_lesson_path_name):
            print("Not really happen")
            q.d()
        else:
            shutil.copy("%s/temp%s" % (lesson_path_name, out_fix), new_lesson_path_name)
    def parse(self):
        """Main entry point for parsers.

        Opens ``self.filename`` as ISO-8859-8 text, parses it with
        BeautifulSoup (``lxml``) and looks for tables whose id is
        ``trBlueOnWhite12``.  The tables themselves are not processed here:
        after a ``q`` breakpoint an empty ``Statement`` is returned.

        Raises:
            ParseError: if no such table is found.
        """
        with open(self.filename, "r", encoding='iso-8859-8') as handle:
            soup = BeautifulSoup(handle, 'lxml')
            statement = Statement()
            tables = soup.find_all('table', id='trBlueOnWhite12')
            if not tables:
                raise ParseError(0, "'trBlueonWhite12' table not found")
            q.d()
            return statement
Beispiel #17
0
    def rewrite_j(self, _wrap, new_file_path):
        """Rewrite the input to ``new_file_path`` with ``_wrap`` as translate hook.

        ``_wrap`` is installed under ``COMMON_TRANSLATE_WRAP["translate"]``.
        Sentences are then read with ``self.read_next_sentense()`` and written
        to the new file until the reader raises an exception whose message is
        ``"EOF"``.

        Any other exception from the reader is printed, the ``q`` debugger is
        opened and the sentence is skipped.  The output file is closed even
        when the loop is left by an exception.

        Args:
            _wrap: Callable installed as the translate hook.
            new_file_path: Path of the file to write.
        """
        COMMON_TRANSLATE_WRAP["translate"] = _wrap

        with open(new_file_path, "w") as self.fp_write:
            while True:
                try:
                    sentense, first_word = self.read_next_sentense()
                except Exception as e:
                    if str(e) == "EOF":
                        print("EOF!")
                        break

                    print(traceback.format_exc())
                    q.d()
                    # Nothing new was read: do not write the previous (or
                    # still unbound) sentence again.
                    continue

                self.fp_write.write(sentense)
Beispiel #18
0
    def work(self):
        """Copy the input, sentence by sentence, to ``<file_path>.translate.j``.

        Sentences are read with ``self.read_next_sentense()`` and written
        until the reader raises an exception whose message is ``"EOF"``.  A
        ``<file_path>.debug.2.j`` file is opened alongside and kept in
        ``self.fp_debug``; nothing in this method writes to it.

        Any other exception from the reader is printed, the ``q`` debugger is
        opened and the sentence is skipped.  Both files are closed even when
        the loop is left by an exception.
        """
        with open("%s.translate.j" % self.arg["file_path"], "w") as self.fp_write, \
                open("%s.debug.2.j" % self.arg["file_path"], "w") as self.fp_debug:
            while True:
                try:
                    sentense, first_word = self.read_next_sentense()
                except Exception as e:
                    if str(e) == "EOF":
                        print("EOF!")
                        break

                    print(traceback.format_exc())
                    q.d()
                    # Nothing new was read: do not write the previous (or
                    # still unbound) sentence again.
                    continue

                self.fp_write.write(sentense)
Beispiel #19
0
    def main():
        """Recompute the price of every stored item and print the good deals.

        Generator-style coroutine (it ``yield``s the database operations).
        It connects to the local ``wm`` database, loads all documents of
        ``jingdong_itemid`` and ``jingdong_price`` and, for every price
        document whose ``itemid`` is known:

        1. calls ``JDDiscount.calc(item)``, which is expected to fill
           ``item["calc_price"]`` and ``item["calc_advice"]`` in place, and
           stores both fields back into ``jingdong_price``;
        2. prints the item when ``0 < calc_price < good_price``.

        A failure in step 1 is printed and opens the ``q`` debugger; the item
        is then only reported if a ``calc_price`` is present.  Stops the
        current Tornado IOLoop when done.
        """
        yield mongodb.init({"db_wm": {
            "HOST": "127.0.0.1",
            "PORT": 27017,
            "DATABASE_NAME": "wm",
            "USERNAME": "",
            "PASSWORD": "",
        }})
        itemid = yield mongodb.DBS["db_wm"]["jingdong_itemid"].find({}).to_list(length=None)
        item_list = yield mongodb.DBS["db_wm"]["jingdong_price"].find({
            # "_id": ObjectId("5dafb4bb632c52b5bc74d004"),
        }).to_list(length=None)
        itemid_map = {x["itemid"]: x for x in itemid}
        print("item_list.length:", len(item_list))
        for item in item_list:
            tracked = itemid_map.get(item["itemid"])
            if not tracked:  # or tracked["status"] != 1:
                continue
            try:
                JDDiscount.calc(item)
                yield mongodb.DBS["db_wm"]["jingdong_price"].update_one(
                    {
                        "_id": ObjectId(item["_id"]),
                    }, {
                        "$set": {
                            "calc_price": item["calc_price"],
                            "calc_advice": item["calc_advice"],
                        }
                    },
                )
            except Exception:
                print(traceback.format_exc())
                q.d()

            # After a failed calc the field may be missing; indexing it would
            # raise KeyError and abort the whole run despite the except above.
            calc_price = item.get("calc_price")
            if calc_price is None:
                continue

            good_price = tracked.get("good_price", 0)
            if 0 < calc_price < good_price:
                print(item.get("url"))
                print(item.get("name"), "好价:", good_price)
                print(item["price"], "=>", calc_price, "|", item.get("calc_advice"))
                # print(item)
                print()

        tornado.ioloop.IOLoop.current().stop()
Beispiel #20
0
def cut_text2(table_h, table_w):
    """Distribute the entries of ``table_text`` over a grid of string cells.

    Args:
        table_h: Sequence whose length minus one gives the number of rows.
        table_w: Sequence whose length minus one gives the number of columns.

    Returns:
        numpy.ndarray: object array of shape
        ``(len(table_h) - 1, len(table_w) - 1)``; each cell holds the
        concatenation of ``line[4]`` for every entry that ``Loca.do`` placed
        in that cell, or ``""`` when there was none.

    Entries that cannot be placed are printed and open the ``q`` debugger;
    processing then continues with the next entry.
    """
    # The builtin `object` replaces the `np.object` alias, which was
    # deprecated in NumPy 1.20 and removed in 1.24.
    result = np.empty((len(table_h) - 1, len(table_w) - 1), dtype=object)
    result[:] = ""

    loca = Loca(table_h, table_w)

    for line in table_text:
        try:
            h, w = loca.do(line)
            result[h, w] = result[h, w] + line[4]
        except Exception as e:
            print("e:", e)
            q.d()

    return result
Beispiel #21
0
def test2():
    """Run ``JDDiscount.calc`` on one hard-coded item and print the outcome.

    The same dict is passed as both arguments of ``calc``.  After a ``q``
    breakpoint the price, coupons, promotions and the computed price and
    advice of the returned item are printed.
    """
    item = {
        "itemid": "56746195796",
        "good_price": 800,
        "url": "https://item.jd.com/56746195796.html",
        "name": "尼康(Nikon)AF-S DX 尼克尔 35mm f/1.8G 标准定焦镜头",
        "cat": "652,654,834",
        "venderId": "10008806",
        "shopId": "843487",
        "presale": False,
        "datetime": "2020-09-23 15:21:52",
        "price": 1399.0,
        "vender": "尼康官方旗舰店",
        "stock": "现货",
        "promote": [
            [
                "满2件,总价打9.80折,包邮(限中国内地)",
                "https://search.jd.com/Search?activity_id=101782838006",
                "2020-09-01 16:10:53 ~ 2020-09-30 23:59:59",
            ],
        ],
        "gift": [
            ["限购", "2020-09-30 23:59:59"],
        ],
        "quan": [],
        "feedback": "",
        "ads": [
            "尼康Z5全新上市!部分商品下单立减900元,相机&镜头任意两件98折,还有分享有礼等丰富活动,&lt;a href=\"https://pro.m.jd.com/mall/active/2xtN2NXJ6Z55PtYvSGJWmLEx5txf/index.html\" target=\"_blank\"&gt;更多优惠,&lt;/a&gt;",
        ],
    }
    new_item = JDDiscount.calc(item, item)
    q.d()
    for field in ("price", "quan", "promote"):
        print(new_item[field])
    print(new_item["calc_price"], new_item["calc_advice"])
def train(c):
    """Train ``net`` under a Distiller compression schedule.

    Steps, in order:

    1. Build the network and optimizer, restore them with ``c.init_model``
       and create the train / validation / test iterators.
    2. Print validation and test metrics plus total weight sparsity, first
       under the heading 'Before quantization' and again, after the
       compression scheduler is created from ``c.compress`` and
       ``c.on_train_start`` has run, under 'After initial quantization'.
    3. Record, for every 2-D or 4-D parameter whose name contains ``weight``
       or ``bias``, a mask of the entries that are exactly zero; after each
       optimizer step those entries are reset to zero.
    4. Train until ``s.step_max``, calling the compression scheduler hooks
       around each mini-batch and evaluating every ``c.step_eval`` steps.

    ``s.results`` and ``s.step_max`` are read but not set here; presumably
    ``c.on_train_start`` fills them in - confirm.  Exceptions raised during
    training are logged (``c.log`` when ``c.main`` is set, otherwise printed)
    and not re-raised; ``c.on_train_end(s)`` always runs.

    Args:
        c: Project configuration object providing the settings and the
            callbacks used above.
    """
    net = get_net(c)

    opt = get_opt(c, net)
    net, opt, step = c.init_model(net, opt=opt, step='max', train=True)

    step_lr = scheduler(c, opt, step)
    data_tr = SampleIterator(c, c.train_batch, split='valid' if c.debug else 'train')
    iter_tr = iter(data_tr)
    data_val = SequentialIterator(c, c.eval_batch, split='valid')
    data_test = SequentialIterator(c, c.eval_batch, split='test')

    def eval_summary():
        """Return val_/test_ metrics and total weight sparsity as one Series."""
        _, sparsity = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        summary = pd.concat([
            pd.Series(evaluate(c, data_val, net)).add_prefix('val_'),
            pd.Series(evaluate(c, data_test, net)).add_prefix('test_'),
        ])
        summary['sparsity'] = sparsity
        return summary

    print('Before quantization')
    print(eval_summary())

    compression_scheduler = distiller.config.file_config(net, opt, c.compress)

    print('After initial quantization')
    s = Namespace(net=net, opt=opt, step=step)
    c.on_train_start(s)

    print(eval_summary())

    # (name, parameter, mask of entries that are exactly zero right now)
    npm = []
    for name, param in net.named_parameters():
        if param.dim() in [2, 4] and any(kind in name for kind in ('weight', 'bias')):
            npm.append((name, param, param.abs() == 0))

    best_val_loss = np.inf
    if s.results is not None and 'val_loss' in s.results.columns:
        # NOTE(review): this takes the MAX recorded val_loss as the "best";
        # a minimum looks more plausible - confirm before changing.
        best_val_loss = s.results['val_loss'].dropna().max()
    try:
        steps_per_epoch = c.step_eval
        while step < s.step_max:
            epoch = step // steps_per_epoch
            batch = step % steps_per_epoch

            if batch == 0:
                compression_scheduler.on_epoch_begin(epoch)
            compression_scheduler.on_minibatch_begin(epoch, batch, steps_per_epoch)

            step_lr(step)

            x = to_torch(next(iter_tr), c.device).t()

            t_s = time()
            # Inputs and labels are the same sequence shifted by one position.
            inputs, labels = x[:-1], x[1:]
            loss, _, lam, theta = net(inputs, labels)

            compression_scheduler.before_backward_pass(epoch, batch, steps_per_epoch, loss, False)

            opt.zero_grad()
            if torch.isnan(loss):
                import q
                q.d()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), c.get('clip_grad', 0.5))

            compression_scheduler.before_parameter_optimization(epoch, batch, steps_per_epoch, opt)
            opt.step()
            # Put the entries that were zero at start-up back to zero.
            for _name, param, mask in npm:
                param.data[mask] = 0
            compression_scheduler.on_minibatch_end(epoch, batch, steps_per_epoch)

            if (batch + 1) == steps_per_epoch:
                compression_scheduler.on_epoch_end(epoch)

            time_model = np.round(time() - t_s, 5)

            loss = from_torch(loss)
            # Perplexity is reported as NaN while the loss is above 5.
            perplexity = np.nan if loss > 5 else np.e ** loss
            step_result = pd.Series(dict(
                loss=loss,
                perplexity=perplexity,
                time=time_model,
            )).add_prefix('train_')
            step_result['lr'] = next(iter(opt.param_groups))['lr']
            step_result['theta'] = from_torch(theta)
            step_result['lambda'] = from_torch(lam)

            s.step = step = step + 1
            if step % c.step_eval == 0:
                step_result = pd.concat([step_result, eval_summary()])
                s.record_step = step_result['val_loss'] < best_val_loss
                clear_gpu_memory()
            s.step_result = step_result
            c.on_step_end(s)
    except Exception:
        import traceback
        err = traceback.format_exc()
        if c.main:
            c.log(err)
        else:
            print(err)
    finally:
        c.on_train_end(s)
Beispiel #23
0
    def write_translate_string(self, wf, line, rawstring, key_index, mark):
        """Translate the string arguments found in ``line`` and write it to ``wf``.

        ``mark`` selects how the candidate strings are located:

        * ``"middle"``: the whole ``line`` is the candidate;
        * ``"start"``: the text from the last double quote of ``line`` to its
          end, plus the selected arguments of the part before that quote;
        * ``"end"``: the text up to and including the first double quote of
          ``line``, plus the selected arguments of the part from that quote;
        * ``"inside"``: the selected arguments of ``rawstring``.

        "Selected arguments" are the comma-separated items, between the first
        "(" and the last ")", at the positions listed in
        ``War3mapJWorker.LINE_PLACE[key_index][:-1]``.  For ``"end"`` the
        positions are shifted by the last entry of that list.

        Empty ``rawstring`` values and ``TRIGSTR_<digits>`` are written
        unchanged through ``self.write_raw_string``.  Candidates that are
        shorter than three characters or lack the quote expected for their
        ``mark`` are reported and skipped.

        Placeholders: escaped double quotes are replaced by chr(3) while
        splitting, and chr(4) in a candidate is turned back into a comma
        (presumably inserted by ``self.fuck_reg`` - confirm).

        NOTE(review): for ``"start"`` and ``"end"`` the chr(3) substitution is
        applied to ``line`` itself and no restore is visible in this method
        before ``wf.write(line)`` - verify where it is undone.
        """

        # Tolerate platform newline formats: drop trailing newlines.
        while rawstring[-1:] == "\n":
            rawstring = rawstring[:-1]

        strings = []

        # Ignore empty strings and fixed strings such as TRIGSTR_xx.
        if not rawstring or re.match(r"^TRIGSTR_[\d]+$", rawstring):
            return self.write_raw_string(wf, line)

        elif mark == "middle":
            strings.append(line)

        elif mark == "start":
            # Hide escaped quotes so rindex() finds a real string delimiter.
            line = line.replace("\\\"", "\3")
            i_temp = line.rindex("\"")
            # r_string = line[i_temp:] + "\""
            r_string = line[i_temp:]
            strings.append(r_string)
            # Close the part before the open string so it can be cut at ")".
            eline = line[:i_temp] + ")"

            rline = eline.replace("\\\"", "\3")
            rline = rline[rline.find("(") + 1: rline.rfind(")")]

            # The regex below was wrong: it only matched the string inside the
            # first pair of quotes and failed when there were several.
            # rline = re.sub(r"(\".*?\"){1,}", self.do_re_sub, rline, flags=re.I)
            rline = self.fuck_reg(rline)

            rline_list = rline.split(",")
            for i in War3mapJWorker.LINE_PLACE[key_index][:-1]:
                if len(rline_list) > i and rline_list[i]:
                    # strings.append(rline_list[i])
                    strings += self.fuck_reg2(rline_list[i])

        elif mark == "end":

            # Hide escaped quotes so index() finds a real string delimiter.
            line = line.replace("\\\"", "\3")
            i_temp = line.index("\"")
            # l_string = "\"" + line[:i_temp + 1]
            l_string = line[:i_temp + 1]
            strings.append(l_string)
            # Prefix an opening "(" and quote so the rest can be cut at "(".
            eline = "(\"" + line[i_temp:]

            rline = eline.replace("\\\"", "\3")
            rline = rline[rline.find("(") + 1: rline.rfind(")")]

            # The regex below was wrong: it only matched the string inside the
            # first pair of quotes and failed when there were several.
            # rline = re.sub(r"(\".*?\")", self.do_re_sub, rline, flags=re.I)
            rline = self.fuck_reg(rline)

            rline_list = rline.split(",")
            # Last entry of LINE_PLACE[key_index]; used to shift the positions
            # so they index rline_list from its end.
            ll = War3mapJWorker.LINE_PLACE[key_index][-1]
            for i in War3mapJWorker.LINE_PLACE[key_index][:-1]:
                if -len(rline_list) < i - ll and rline_list[i - ll]:
                    # strings.append(rline_list[i - ll])
                    strings += self.fuck_reg2(rline_list[i - ll])

        elif mark == "inside":

            # Neutralise fake double quotes (escaped ones inside strings).
            rline = rawstring.replace("\\\"", "\3")

            # Strip the surrounding parentheses.
            rline = rline[rline.find("(") + 1: rline.rfind(")")]

            # Neutralise fake commas (commas inside strings).
            # rline = re.sub(r"(\".*?\")", self.do_re_sub, rline, flags=re.I)
            rline = self.fuck_reg(rline)

            # Pick the target strings (the x-th arguments); hopefully nested
            # calls never show up here.
            rline_list = rline.split(",")
            for i in War3mapJWorker.LINE_PLACE[key_index][:-1]:
                if len(rline_list) > i and rline_list[i]:
                    # temp_string = rline_list[i]
                    # strings.append(temp_string)
                    strings += self.fuck_reg2(rline_list[i])

        # It gets split later on.
        # string = ".,".join(strings)
        for string in strings:

            # Restore the string: the comma placeholder becomes "," again.
            string = string.strip().replace("\4", ",")

            # Check the length; discard tokens that are not quoted strings
            # and empty strings.
            if len(string) >= 3 and ((
                mark == "start" and string[0] == "\"") or (
                mark == "end" and string[-1] == "\"") or (
                mark == "middle") or (
                mark == "inside" and string[0] == string[-1] == "\""
            )):
                # Strip the leading/trailing double quotes (disabled).
                # string = string[1:-1]
                pass

            else:
                print("SKip string in J:", string, mark)
                string = ""

            trans_line = string
            trans_strings = ""
            if string:
                trans_strings = list(set([x.lower() for x in self.take_string_from_sentense(string)]))

            if trans_strings:
                # print("trans_strings:", set(trans_strings))
                good_key = self.trans_result_keys & set(trans_strings)
                good_key = list(good_key)
                if good_key:
                    # print("good_key:", good_key)
                    # Only some of the candidates have a translation: report
                    # it and break into the debugger, then carry on.
                    if not len(good_key) == len(set(trans_strings)):
                        print("len is not good")
                        q.d()
                    good_trans = {x: self.trans_result[x] for x in good_key}
                    # print("good_trans:", good_trans)
                    # Longest keys first so a short key cannot split a longer one.
                    good_key.sort(key=lambda x: -len(x))
                    # print("trans_line:", trans_line.strip())
                    trans_line = re.sub(self.RSTRING, self.re_escape, trans_line)
                    for key in good_key:
                        # NOTE(review): the translation is used as a regex
                        # replacement template, so backslashes in it are
                        # interpreted - confirm this is intended.
                        trans_line = re.sub(re.escape(key), good_trans[key], trans_line, flags=re.I)
                        # trans_line = trans_line.replace(key, good_trans[key])

                    self.re_cache.reverse()
                    trans_line = re.sub(r"[\0]", self.re_unescape, trans_line)

            # Replace the candidate inside the line (case-insensitive); for a
            # skipped candidate both sides are empty and nothing changes.
            line = re.sub(re.escape(string), trans_line, line, flags=re.I)

        wf.write(line)
Beispiel #24
0
async def test_tool():
    """Await ``execute_command("netstat -ano")``, break into q, print the result."""
    output = await execute_command("netstat -ano")
    q.d()
    print(output)
Beispiel #25
0
def main(args):
    """Compare two files line by line and stop in ``q.d()`` on every mismatch.

    Both files are opened in binary mode and read in lock step until both are
    exhausted.  The current line number is printed whenever
    ``lineno % 1000 == 999``.  When the two lines differ, their ``repr`` is
    printed and ``q.d()`` is called before the comparison goes on.

    The earlier version carried a second, finer-grained comparison after an
    unconditional ``continue``; that code could never run and has been
    removed together with the counter only it used.

    :param args: mapping with the keys ``"fa"`` and ``"fb"`` holding the paths
        of the two files to compare.
    :returns: ``None``
    """
    with open(args["fa"], "rb") as ra, open(args["fb"], "rb") as rb:
        lineno = 0
        while True:
            la = ra.readline()
            lb = rb.readline()
            lineno += 1

            if lineno % 1000 == 999:
                print("lineno:", lineno)

            # Comparing the bytes directly is equivalent to comparing their
            # repr(), which is what the output shows.
            if la != lb:
                print(repr(la), repr(lb))
                q.d()

            # readline() returns b"" at EOF; stop once both files are done.
            if not la and not lb:
                break
Beispiel #26
0
    def draw_the_line(dn):
        """Find candidate table lines in the blank gaps of a 2-D array.

        Rows and columns of ``dn`` that contain no truthy entry are treated as
        blank (the original comments call truthy entries "characters").
        Consecutive blank rows/columns form a gap; gaps that are wide enough
        are taken as possible table lines and their midpoints are returned.

        Side effects: prints the gap widths and results, calls ``q.d()`` after
        the column scan, and writes a mask of the chosen lines to
        ``tmp.dc.png`` through ``save_png``.

        :param dn: 2-D array supporting ``.shape``, row/column slicing and
            ``.any()`` (presumably a numpy array -- confirm with the caller).
        :returns: ``(fin_h, fin_w)`` -- midpoints of the accepted horizontal
            and vertical gaps.
        """
        min_line_width = Glb.PADDING_HEIGHT

        def blank_runs(occupied):
            """Return ``(spans, widths)`` of blank runs that are followed by an occupied entry.

            The first and last ``min_line_width`` entries are not scanned, and
            a run that is still open when the scan ends is not reported.
            """
            spans, widths, run = [], [], 0
            for i in range(min_line_width, len(occupied) - min_line_width):
                if not occupied[i]:
                    run += 1
                elif run:
                    spans.append((i - run, i))
                    widths.append(run)
                    run = 0
            return spans, widths

        # Determine which rows and columns contain any character.
        h, w = dn.shape
        hl = [(dn[i]).any() for i in range(h)]
        wl = [(dn[:, i]).any() for i in range(w)]

        # Horizontal cuts: a long enough run of blank rows may be a table line.
        # default=0 keeps the threshold computable when no gap exists at all
        # (a bare min() raised ValueError on the empty list).
        h_cut_pare, h_cut_width = blank_runs(hl)
        h_cut = max(2 * min_line_width + 1, min(h_cut_width, default=0))
        print("h_cut_width:", h_cut_width, h_cut)

        # Vertical cuts: same scan over the columns, with a smaller threshold.
        w_cut_pare, w_cut_width = blank_runs(wl)
        q.d()
        w_cut = max(min_line_width + 1, min(w_cut_width, default=0))
        print("w_cut_width:", w_cut_width, w_cut)

        fin_w, fin_h = [], []

        # Keep only the gaps that reach the threshold and paint a band of
        # min_line_width around each midpoint into the mask.
        dc = np.zeros(dn.shape)
        for wp in w_cut_pare:
            if wp[1] - wp[0] < w_cut:
                continue

            minw = (wp[1] + wp[0]) / 2
            fin_w.append(minw)
            dc[:, int(minw - min_line_width / 2): int(minw + min_line_width / 2)] = 1

        for hp in h_cut_pare:
            if hp[1] - hp[0] < h_cut:
                continue

            minh = (hp[1] + hp[0]) / 2
            fin_h.append(minh)
            dc[int(minh - min_line_width / 2): int(minh + min_line_width / 2)] = 1

        save_png(dc, "tmp.dc.png")
        print("fin_h:", fin_h)
        print("fin_w:", fin_w)

        return fin_h, fin_w
Beispiel #27
0
    def my_writer(self, file_path, work_item):
        """Copy an ini-like file to ``<file_path>.mta2.cache``, translating selected keys.

        The file is read as bytes, lines ending in a lone 0xD0/0xD1 byte
        followed by a double quote and CRLF are patched, and every line is
        decoded as UTF-8.  The decoded lines are then walked with a small state
        machine (plain line / inside a ``{`` object / inside a ``[=[`` string).
        Values belonging to a key whose lower-cased name is in ``work_item``
        are passed to ``self.write_translate_string``; every other line is
        passed to ``self.write_raw_string``.

        Any exception raised while handling a line is printed and followed by
        ``q.d()``; processing then continues with the next line.

        :param file_path: path of the file to read; the output path is derived
            from it.
        :param work_item: container of lower-case key names, tested with ``in``.
        """

        with open(file_path, "rb") as rf:
            contents_tmp = rf.readlines()
            contents = []
            i = 0
            for c in contents_tmp:
                i = i + 1

                # w3x2lni cannot handle (\xd0, \xd1)
                # Replace the stray byte in front of the closing quote by a space.
                if c[-4:] in (b"\xd0\"\r\n", b"\xd1\"\r\n"):
                    print("Line", i, "is bad char:", repr(c[-4]))
                    c = c[:-4] + b" \"\r\n"

                try:
                    contents.append(c.decode("utf8"))
                except Exception as e:
                    # A line that cannot be decoded is reported and left out.
                    print(e, "is bad char")
                    q.d()

        with open("%s.mta2.cache" % file_path, "w") as wf:
            wait_end = False       # inside a "{ ... }" object
            key_name = None        # key the current value belongs to
            wait_string = False    # inside a top-level "[=[ ... ]=]" string
            wait_string2 = False   # inside a "[=[ ... ]=]" string nested in an object
            for lineno, rawline in enumerate(contents):
                line = rawline
                try:

                    # Inside a multi-line string
                    if wait_string:
                        append_string = line
                        # zz = ""

                        # A line holding only the terminator is copied unchanged.
                        if line.strip() == "]=]" or line.strip() == "]=],":
                            wait_string = False
                            self.write_raw_string(wf, rawline)
                            continue

                        # Terminator at the end of a text line: cut it off.
                        if line.strip()[-3:] == "]=]":
                            wait_string = False
                            append_string = line.strip()[:-3].strip()
                            # zz = line[line.find(append_string) + len(append_string):]
                        elif line.strip()[-4:] == "]=],":
                            wait_string = False
                            append_string = line.strip()[:-4].strip()
                            # zz = line[line.find(append_string) + len(append_string):]

                        if append_string:
                            if key_name.lower() in work_item:
                                self.write_translate_string(wf, rawline, append_string)
                                # if zz:
                                #     self.write_raw_string(wf, zz)
                            else:
                                self.write_raw_string(wf, rawline)
                            # ini_obj[sect_name][key_name].append(append_string)
                            # self.write_translate_string(wf, rawline, append_string)

                        continue

                    # Inside an object
                    if wait_end:

                        # Inside a multi-line string nested in the object
                        if wait_string2:
                            append_string2 = line
                            # zz = ""

                            if line.strip() == "]=]" or line.strip() == "]=],":
                                wait_string2 = False
                                self.write_raw_string(wf, rawline)
                                continue

                            if line.strip()[-3:] == "]=]":
                                wait_string2 = False
                                append_string2 = line.strip()[:-3].strip()
                                # zz = line[line.find(append_string2) + len(append_string2):]
                            elif line.strip()[-4:] == "]=],":
                                wait_string2 = False
                                append_string2 = line.strip()[:-4].strip()
                                # zz = line[line.find(append_string2) + len(append_string2):]

                            if append_string2:

                                if key_name.lower() in work_item:
                                    self.write_translate_string(wf, rawline, append_string2)
                                    # if zz:
                                    #     self.write_raw_string(wf, zz)
                                else:
                                    self.write_raw_string(wf, rawline)
                                # ini_obj[sect_name][key_name].append(append_string2)
                                # ini_obj[sect_name][key_name][w_index].append(append_string2)
                                # self.write_translate_string(wf, rawline, append_string2)

                            continue

                        # Check whether the object ends on this line
                        if line.strip()[-1:] == "}":
                            wait_end = False
                            key_name = None
                            self.write_raw_string(wf, rawline)
                        # NOTE(review): `line` is not stripped here and still
                        # carries its line terminator, so this comparison looks
                        # like it can only match a line without a trailing
                        # newline -- confirm whether line.strip() was intended.
                        elif line[-3:] == "[=[":
                            wait_string2 = True
                            # w_index = len(ini_obj[sect_name][key_name])
                            # ini_obj[sect_name][key_name].append([])
                            self.write_raw_string(wf, rawline)
                        else:
                            if key_name.lower() in work_item:
                                self.write_translate_string(wf, rawline, line)
                            else:
                                self.write_raw_string(wf, rawline)
                            # ini_obj[sect_name][key_name].append(line)
                            # self.write_translate_string(wf, rawline, line)

                        continue

                    else:
                        line = line.strip()
                        # Ignore comments and blank lines
                        if not line or line[:2] == "--":
                            self.write_raw_string(wf, rawline)
                            continue

                    # Check whether the value is an object
                    if line[-1:] == "{":
                        wait_end = True
                        key_name = line.split("=")[0].strip()
                        # ini_obj[sect_name][key_name] = []
                        self.write_raw_string(wf, rawline)
                    elif line[-3:] == "[=[":
                        wait_string = True
                        key_name = line.split("=")[0].strip()
                        # ini_obj[sect_name][key_name] = []
                        self.write_raw_string(wf, rawline)

                    # Check whether this is a new section
                    # Not handled: "[=[" and "]=]" both on a single line
                    elif re.match(r"^\[([a-z0-9]+)\]$", line, flags=re.I):
                        # sect_name = re.match(r"^\[([a-z0-9]+)\]$", line, re.I).group(1)
                        # ini_obj[sect_name] = {}
                        # print("Add new section: [%s]" % sect_name)
                        self.write_raw_string(wf, rawline)

                    elif re.match(r"^\[([^\[\]\"\']+)\]$", line, re.I):
                        # print("warn:", "Be not english:", line)
                        # sect_name = re.match(r"^\[([^\[\]\"\']+)\]$", line, re.I).group(1)
                        # ini_obj[sect_name] = {}
                        self.write_raw_string(wf, rawline)

                    elif re.match(r"^\[(.+)\]$", line, re.I):
                        # print("warn:", "Very bad [\"\\\\I0F\"] type ID:", line)
                        self.write_raw_string(wf, rawline)

                    # Direct assignment: "key = value" on one line
                    elif "=" in line:
                        key_name = line.split("=")[0]
                        # Everything after the first "=" is the value.
                        key_val = line[len(key_name) + 1:]
                        key_name = key_name.strip()

                        # if "рыцарь" in rawline:
                        #     print("".join(traceback.format_stack()))
                        #     q.d()
                        # ??????
                        if key_name.lower() in work_item:
                            self.write_translate_string(wf, rawline, key_val)
                        else:
                            self.write_raw_string(wf, rawline)
                    else:
                        # Unrecognised lines are reported and copied unchanged.
                        print("[SKIP]", line)
                        self.write_raw_string(wf, rawline)
                        # raise Exception("[CANNOT PARSE]")

                except Exception:
                    print(traceback.format_exc())
                    q.d()
Beispiel #28
0
# Scratch script comparing ways of extracting the title and text of the page
# at `url`.  `Article`, `Document`, `fulltext`, `requests`, `q` and `url` are
# defined outside this snippet (presumably newspaper and readability -- confirm
# against the file's imports).

# 1) Let Article download and parse the page itself.
a = Article(url, language='zh')  # Chinese

a.download()
a.parse()

print(a.title)
print(a.text)

# 2) Fetch the page manually, reduce it with Document.summary(), then hand
#    that HTML to Article instead of letting it download again.
response = requests.get(url)
doc = Document(response.content)
title = doc.title()
html = doc.summary(True)

article = Article(url, language='zh')
article.download(input_html=html)
article.parse()

# Debugging stop before the result of the second approach is printed.
q.d()
print(article.title)
print(article.text)
exit(1)

# NOTE(review): assuming `exit` is the builtin, everything below is
# unreachable because exit(1) above ends the script.
# 3) Same fetch + summary, but the text is extracted with fulltext().
response = requests.get(url)

doc = Document(response.content)
title = doc.title()
html = doc.summary(True)
q.d()
text = fulltext(html)
print(text)
Beispiel #29
0
def request(session, method, uri, *args, **kwargs):
    """Send a request to the URI and return the response

    The generic form of the function signature is as follows:

    .. code-block:: python

        args = [o['name'] for o in parameters if o['in'] == 'path']
        function(*args, model=None, query=None)


    If accountId is in the list of args and the value is not supplied then
    the function will automatically insert the discovered account_id
    for the session.

    :param session: the session handed to ``get_api`` and to the http method
        helper; its ``account_id`` attribute fills a missing ``account_id``

    :param method: the http method to call
    :type method: str

    :param uri: the relative uri to call
    :type uri: str

    :returns: the value returned by the http method helper, converted with
        ``models.load`` when the response schema names a model class;
        otherwise the helper's value unchanged

    :raises PureportError: if the method is not defined for the uri, an
        unknown query key is given, unexpected positional or keyword
        arguments are supplied, or a required argument is missing
    """
    api = get_api(session)

    method = method.lower()
    path = api['paths'][uri].get(method)

    if path is None:
        raise PureportError("method {} not supported for uri {}".format(
            method, uri))

    parameters = list()
    query = {}

    # Required path parameters become positional arguments (in spec order);
    # query parameters are only collected so that their names can be checked.
    for item in path.get('parameters', []):
        if item.get('in',
                    'path') == 'path' and item.get('required', True) is True:
            parameters.append(to_snake_case(item['name']))
        elif item.get('in') == 'query':
            query[to_snake_case(item['name'])] = None

    cls = None

    ref = get_value('requestBody.content.application/json.schema.$ref', path)
    if ref:
        clsname = ref.split('/')[-1]
        schema = getattr(models, clsname)._schema

        if schema.discriminator:
            # NOTE(review): this reads kwargs['model'] before positional
            # arguments are mapped below, so a model passed positionally
            # raises KeyError here -- confirm whether that is intended.
            propval = getattr(kwargs['model'],
                              schema.discriminator['propertyName'])
            clsname = schema.discriminator['mapping'].get(propval).split(
                '/')[-1]

        cls = getattr(models, clsname, None)
        log.debug("connection class is {}".format(cls))
        parameters.append('model')

    query_values = kwargs.pop('query', None)
    if query_values:
        # TODO need to validate query inputs against api spec
        if not set(query_values).issubset(query):
            raise PureportError("unknown query value provided")

    # Positional arguments are bound first; whatever is left is looked up
    # among the keyword arguments.
    variables = dict(zip(parameters, args))

    for item in parameters:
        if item not in variables:
            variables[item] = kwargs.pop(to_snake_case(item), None)

    model = variables.get('model')

    body = None

    if cls and isinstance(model, cls):
        body = models.dump(model)

    if kwargs:
        raise PureportError("unexpected keyword arguments")

    if set(args).issubset(variables.values()) is False:
        raise PureportError("unexpected positional arguments")

    for p in parameters:
        if variables.get(p) is None:
            # inject the session accountId automatically into the variables
            # if it is the only parameter that doesn't have a supplied value.
            if p == 'account_id':
                log.debug("automatically injecting account_id argument")
                variables['account_id'] = session.account_id
            else:
                # No debugger stop here: the caller gets the error directly.
                raise PureportError("missing required argument: {}".format(p))

    func = globals().get(method)
    data = func(session,
                uri,
                body=body,
                variables=variables,
                query=query_values)

    schema = get_value('responses.default.content.application/json.schema',
                       path)
    if schema:
        # Resolve the response class on its own.  Reusing ``clsname`` from the
        # request body would decode the response with the wrong class (or
        # fail with an unbound name) when the schema carries no ``$ref``.
        response_clsname = None
        if '$ref' in schema:
            response_clsname = schema['$ref'].split('/')[-1]
        elif schema.get('type') == 'array' and 'items' in schema:
            item_ref = schema['items'].get('$ref')
            if item_ref:
                response_clsname = item_ref.split('/')[-1]

        if response_clsname is not None:
            if isinstance(data, list):
                data = [models.load(response_clsname, item) for item in data]
            else:
                data = models.load(response_clsname, data)

    return data
Beispiel #30
0
def MyReader(file_path):
    """Parse an ini-like text file into a dictionary of sections.

    The file is read as bytes and decoded line by line as UTF-8.  The result
    maps each ``[section]`` name to a dictionary of its keys, where a value is

    * a stripped string for ``key = value`` lines,
    * a list of lines for a ``key = {`` ... ``}`` object (a ``[=[`` block
      inside the object becomes a nested list), or
    * a list of lines for a ``key = [=[`` ... ``]=]`` multi-line string.

    Comment lines (``--``) and blank lines outside objects are ignored.  Any
    exception raised while handling a line is printed and followed by
    ``q.d()``; parsing then continues with the next line.

    :param file_path: path of the file to read.
    :returns: the parsed dictionary.
    """
    decoded_lines = []
    with open(file_path, "rb") as rf:
        for number, raw in enumerate(rf.readlines(), 1):
            # w3x2lni cannot handle (\xd0, \xd1): swap the stray byte in
            # front of the closing quote for a space.
            if raw[-4:] in (b"\xd0\"\r\n", b"\xd1\"\r\n"):
                print("Line", number, "is bad char")
                raw = raw[:-4] + b" \"\r\n"

            try:
                decoded_lines.append(raw.decode("utf8"))
            except Exception as e:
                print(e, "is bad char")
                q.d()

    # Section header patterns, tried in this order, each with the warning
    # text that is printed when it is the one that matches.
    section_rules = (
        (r"^\[([a-z0-9]{4})\]$", None),
        (r"^\[([^\[\]\"\']+)\]$", "Be not 'abcd' type ID:"),
        (r"^\[(.+)\]$", "Very bad [\"\\\\I0F\"] type ID:"),
    )

    ini_obj = {}
    in_object = False          # between "key = {" and "}"
    in_string = False          # inside a top-level "[=[ ... ]=]" string
    in_nested_string = False   # inside a "[=[ ... ]=]" string within an object
    section = None
    key = None
    nested_index = 0

    for text in decoded_lines:
        # Drop trailing newlines and a leading byte-order mark.
        text = text.rstrip("\n").lstrip("\ufeff")

        try:
            # Continuation of a top-level multi-line string.
            if in_string:
                piece = text
                trimmed = text.strip()
                if trimmed.endswith("]=]"):
                    in_string = False
                    piece = trimmed[:-3].strip()
                elif trimmed.endswith("]=],"):
                    in_string = False
                    piece = trimmed[:-4].strip()

                if piece:
                    ini_obj[section][key].append(piece)
                continue

            if in_object:
                # Continuation of a multi-line string inside the object.
                if in_nested_string:
                    piece = text
                    trimmed = text.strip()
                    if trimmed.endswith("]=]"):
                        in_nested_string = False
                        piece = trimmed[:-3].strip()
                    elif trimmed.endswith("]=],"):
                        in_nested_string = False
                        piece = trimmed[:-4].strip()

                    if piece:
                        ini_obj[section][key][nested_index].append(piece)
                    continue

                # Either the object ends, a nested string starts, or the
                # line is one more element of the object.
                if text.strip() == "}":
                    in_object = False
                    key = None
                elif text.endswith("[=["):
                    in_nested_string = True
                    nested_index = len(ini_obj[section][key])
                    ini_obj[section][key].append([])
                else:
                    ini_obj[section][key].append(text)
                continue

            text = text.strip()
            # Skip blank lines and comments.
            if not text or text.startswith("--"):
                continue

            # The value is an object.
            if text.endswith("{"):
                in_object = True
                key = text.split("=")[0].strip()
                ini_obj[section][key] = []
                continue

            # The value is a multi-line string.
            if text.endswith("[=["):
                in_string = True
                key = text.split("=")[0].strip()
                ini_obj[section][key] = []
                continue

            # New section?  ("[=[" and "]=]" on one line is not supported.)
            header = None
            for pattern, warning in section_rules:
                header = re.match(pattern, text, re.I)
                if header:
                    if warning is not None:
                        print("warn:", warning, text)
                    break

            if header:
                section = header.group(1)
                ini_obj[section] = {}
            elif "=" in text:
                # Direct assignment: everything after the first "=" is the value.
                key = text.split("=")[0]
                value = text[len(key) + 1:]
                key = key.strip()
                ini_obj[section][key] = value.strip()
            else:
                print("[!CANNOT PARSE]", text)

        except Exception:
            print(traceback.format_exc())
            q.d()

    return ini_obj