def space_file(file):
    """Normalise bullets in *file* in place and write a pangu-spaced copy.

    The file is first rewritten with every ``'* '`` replaced by ``'+ '``.
    Each line of the rewritten file is then stripped, has ``blob/debug``
    replaced by ``blob/master``, is passed through ``pangu.spacing_text``
    and has any space adjacent to ``**`` or ``~~`` removed. The result is
    written, one line per input line, to ``../new_<file>``.

    Args:
        file: Path of the text file to process. It is also interpolated
            verbatim into the output path ``../new_{file}``.
    """
    with open(file, 'r') as src:
        txt = src.read()

    with open(file, 'w') as dst:
        dst.write(txt.replace('* ', '+ '))

    # Context managers close both handles even when pangu or a write raises.
    with open(f'../new_{file}', 'w') as out, open(file, 'r') as src:
        for line in src:
            tmp = line.strip().replace('blob/debug', 'blob/master')
            new = pangu.spacing_text(tmp)
            # Remove spaces next to the emphasis / strike-through markers.
            for spaced, tight in (('** ', '**'), (' **', '**'),
                                  ('~~ ', '~~'), (' ~~', '~~')):
                new = new.replace(spaced, tight)
            out.write(new + '\n')
def clearReplace(self, text):
    """Re-flow *text* into paragraphs and apply pangu spacing.

    Runs of blank lines separate paragraphs; paragraphs are re-joined with
    exactly one blank line. Inside a paragraph, line breaks become spaces.
    When ``self.flag_keep_period_replace == 1`` a line break is kept after
    any line that ends in one of the punctuation marks of the pattern
    below.
    """
    paragraph_break = re.compile(r'\n\s*\n')
    line_break = re.compile(r'\n')
    keep_sentence_breaks = self.flag_keep_period_replace == 1

    paragraphs = []
    for block in paragraph_break.split(text):
        block = block.strip()
        if not keep_sentence_breaks:
            paragraphs.append(block.replace('\n', ' '))
            continue

        pieces = []
        for raw_line in line_break.split(block):
            piece = raw_line.strip()
            # A line ending in sentence punctuation keeps its break;
            # any other line is joined to the next one with a space.
            if re.match(r'.*[.:?!。:?!]$', piece):
                pieces.append(piece + '\n')
            else:
                pieces.append(piece + ' ')
        paragraphs.append(''.join(pieces))

    # Any run of blank lines is kept as exactly one blank line.
    return pangu.spacing_text('\n\n'.join(paragraphs))
def highlight(text: str, keyword: str):
    """Return pangu-spaced *text* with occurrences of *keyword* highlighted.

    Matching is case-insensitive and literal: *keyword* is escaped, so
    regex metacharacters such as ``+`` or ``(`` are matched as plain text
    instead of raising ``re.error`` or matching something else. Each match
    is wrapped in ANSI colour codes and keeps its original casing.

    Args:
        text: Text to space and highlight.
        keyword: Literal string to highlight. An empty keyword highlights
            nothing.

    Returns:
        The spaced text with ANSI escape sequences around every match.
    """
    text = pangu.spacing_text(text)
    if not keyword:
        # An empty pattern would match between every character.
        return text

    def _wrap(match):
        # Built in a function so backslashes in the match are never
        # interpreted as replacement-template escapes.
        return "\33[0m" + "\33[93m" + match.group(0) + "\33[0m" + "\33[37m"

    return re.sub(re.escape(keyword), _wrap, text, flags=re.IGNORECASE)
def headersstr2headers(header_str):
    """Print a dated greeting followed by each non-empty line of *header_str*.

    Every non-empty line has one trailing ``.`` or ``。`` removed, is passed
    through ``pangu.spacing_text`` and is printed followed by ``\\r\\n``.

    Args:
        header_str: Multi-line text; it is split with ``str.splitlines``.
    """
    t = arrow.now()
    print('大家好,今天是公历 %d 月 %d 日,星期%s \r\n' % (t.month, t.day, week[t.weekday()]))
    for str_line in header_str.splitlines():
        if not str_line:
            continue
        # One raw-string pattern replaces the former pair of substitutions
        # ('.' -> '。', then drop a trailing '。'); the result is the same
        # and the invalid '\.' escape in a normal string literal is gone.
        str_line = re.sub(r'[.。]$', '', str_line)
        print(pangu.spacing_text(str_line) + '\r\n')
def youdao_api(words: str):
    """Look up *words* with the Youdao open API and print the result.

    Prints, in order: the query with its phonetic transcription (when the
    response has one), the first translation, the basic explanations and
    the numbered web phrases with the query highlighted. Any failure while
    fetching or printing is reported by printing ``ERR_MSG``.

    Args:
        words: The word or phrase to translate.
    """
    print()
    print(huepy.grey(" -------- "))
    print()
    # The query is sent through ``params`` so requests URL-encodes it;
    # text containing '&', '#' or spaces no longer corrupts the query.
    params = {
        "keyfrom": CONF.youdao_key_from,
        "key": CONF.youdao_key,
        "type": "data",
        "doctype": "json",
        "version": "1.1",
        "q": words,
    }
    try:
        resp = requests.get(
            "http://fanyi.youdao.com/openapi.do", params=params, headers=HEADERS
        ).json()

        basic = resp.get("basic", None)
        phonetic = ""
        if basic and basic.get("phonetic"):
            phonetic += huepy.purple(" [ " + basic.get("phonetic") + " ]")
        print(" " + words + phonetic + huepy.grey(" ~ fanyi.youdao.com"))
        print()

        translation = resp.get("translation", [])
        if translation:
            print(" - " + pangu.spacing_text(huepy.green(translation[0])))
        if basic and basic.get("explains", None):
            for item in basic.get("explains"):
                print(huepy.grey(" - ") + pangu.spacing_text(huepy.green(item)))
        print()

        web = resp.get("web", None)
        if web:
            for i, item in enumerate(web):
                print(
                    huepy.grey(
                        " " + str(i + 1) + ". " + highlight(item.get("key"), words)
                    )
                )
                print(" " + huepy.cyan(", ".join(item.get("value"))))
    except Exception:
        # Best-effort lookup: report the failure, but let KeyboardInterrupt
        # and SystemExit propagate (a bare ``except:`` swallowed them).
        print(" " + huepy.red(ERR_MSG))
def on_pangu_btn_clicked(self):
    """Space the text of ``before_txt`` and show the result in ``after_txt``.

    Nothing happens when the input is empty. The text is first run through
    ``self.handle_text``; if pangu then fails, the handled but unspaced
    text is shown instead.
    """
    text = self.before_txt.toPlainText()
    if not text:
        return
    text = self.handle_text(text)
    try:
        text = pangu.spacing_text(text)
    except Exception:
        # Deliberate best effort: keep the unspaced text. ``Exception``
        # (not a bare ``except:``) lets KeyboardInterrupt/SystemExit through.
        pass
    self.after_txt.setText(text)
def to_html(data, template_filename, fp):
    """Render *data* with a packaged Jinja2 template and write it to *fp*.

    Args:
        data: Mapping whose items are passed to the template as keyword
            arguments.
        template_filename: Name of a template inside the ``templates``
            resource of the ``crawler_book_info`` package.
        fp: Writable text file object that receives the pangu-spaced HTML.
    """
    # Resolve the packaged template directory and render from it.
    with _resources.path("crawler_book_info", "templates") as _path:
        env = Environment(loader=FileSystemLoader(searchpath=str(_path)))
        # Map the parsed data onto the template.
        rendered = env.get_template(template_filename).render(**data)

    # Write the HTML output.
    fp.write(pangu.spacing_text(rendered))
def google_api(words: str):
    """Translate *words* through translate.google.cn and print the result.

    Input containing any character in U+4E00..U+9FFF is translated to
    English (``"en"``); all other input is translated to ``"zh-cn"``.
    """
    print()
    contains_cjk = any("\u4e00" <= ch <= "\u9fff" for ch in words)
    target_language = "en" if contains_cjk else "zh-cn"

    translator = Translator(service_urls=["translate.google.cn"])
    translated = translator.translate(words, dest=target_language)
    text = pangu.spacing_text(translated.text)

    print(" " + words + huepy.grey(" ~ translate.google.cn"))
    print()
    print(" - " + huepy.cyan(text))
def modify_text(line):
    """Apply pangu spacing to *line*, then normalise its punctuation.

    The substitutions run in the listed order (curly quotes become corner
    brackets only after the spaces around them were removed) and the
    result is stripped of surrounding whitespace.
    """
    substitutions = (
        (' “', '“'),
        ('” ', '”'),
        ('“', '「'),
        ('”', '」'),
        ('・', '·'),
        (', ', ','),
        ('。 ', '。'),
        ('’', '\''),
        (' ', ' '),
    )
    result = pangu.spacing_text(line)
    for old, new in substitutions:
        result = result.replace(old, new)
    return result.strip()
def parse_html(content):
    """Convert ``<e ...>`` tags in *content* to Markdown-style text.

    Newlines are replaced by ``<br>`` first. If no ``<e ...>`` tag is found
    the text is only passed through ``pangu.spacing_text``. Otherwise each
    tag is replaced according to its ``type`` attribute: ``web`` becomes a
    ``[title](href)`` link, ``hashtag`` a back-quoted title, ``mention``
    the bare title; any other tag is left as it is.
    """
    content = content.replace("\n", "<br>")
    tags = re.findall(r"<e [^>]*>", content)
    if not tags:
        return pangu.spacing_text(content)

    for tag in tags:
        node = PyQuery(tag)
        kind = node.attr("type")
        if kind == "web":
            replacement = "[%s](%s)" % (
                parse.unquote(node.attr("title")),
                parse.unquote(node.attr("href")),
            )
        elif kind == "hashtag":
            replacement = " `%s` " % parse.unquote(node.attr("title"))
        elif kind == "mention":
            replacement = parse.unquote(node.attr("title"))
        else:
            replacement = tag
        content = content.strip().replace(tag, replacement)
    return content
def onClipboradChanged(self):
    """Mirror new clipboard text into both text boxes.

    Does nothing unless the ``listen_clip_board`` option is checked and the
    clipboard holds text. The text is run through ``self.handle_text`` and
    shown in ``before_txt``; its pangu-spaced form is shown in
    ``after_txt``. If pangu fails, the unspaced text is shown there.
    """
    if not self.listen_clip_board.isChecked():
        return
    text = QApplication.clipboard().text()
    if not text:
        return
    text = self.handle_text(text)
    self.before_txt.setText(text)
    try:
        text = pangu.spacing_text(text)
    except Exception:
        # Deliberate best effort: keep the unspaced text. ``Exception``
        # (not a bare ``except:``) lets KeyboardInterrupt/SystemExit through.
        pass
    self.after_txt.setText(text)
def parse_html(self, content):
    """Convert ``<e ...>`` tags in *content*, inlining linked articles.

    Newlines are replaced by ``<br>`` first. If no ``<e ...>`` tag is found
    the text is only passed through ``pangu.spacing_text``. Otherwise each
    tag is replaced according to its ``type`` attribute: ``web`` becomes a
    ``[title](url)`` link followed by a ``## title`` heading and the result
    of ``self.parse_article(url)``; ``hashtag`` becomes a back-quoted
    title; ``mention`` the bare title; any other tag is left as it is.
    """
    content = content.replace("\n", "<br>")
    tags = re.findall(r"<e [^>]*>", content)
    if not tags:
        return pangu.spacing_text(content)

    for tag in tags:
        node = PyQuery(tag)
        kind = node.attr("type")
        if kind == "web":
            title = parse.unquote(node.attr("title"))
            url = parse.unquote(node.attr("href"))
            replacement = (
                "[%s](%s)" % (title, url)
                + f"\n## {title}\n"
                + self.parse_article(url)
                + "\n"
            )
        elif kind == "hashtag":
            replacement = " `%s` " % parse.unquote(node.attr("title"))
        elif kind == "mention":
            replacement = parse.unquote(node.attr("title"))
        else:
            replacement = tag
        content = content.strip().replace(tag, replacement)
    return content
def format_python(self, text):
    """Return *text* pangu-spaced and then formatted by ``FormatCode``.

    Only the first element of ``FormatCode``'s result is returned.
    """
    spaced = pangu.spacing_text(text)
    formatted = FormatCode(spaced)
    return formatted[0]
def format_md(self, text):
    """Return *text* with pangu spacing applied; no other formatting is done."""
    spaced = pangu.spacing_text(text)
    return spaced
def format_json(self, text):
    """Return *text* pangu-spaced and then run through ``jsbeautifier.beautify``."""
    spaced = pangu.spacing_text(text)
    return jsbeautifier.beautify(spaced)
# Run the embedded Python program with /anaconda3/bin/python. The program is
# fed through a quoted heredoc (no shell expansion inside it) and read from
# stdin because of the "-" argument; this function's arguments are forwarded
# after it.
runpy3 () {
/anaconda3/bin/python << 'EOF' - "$@"
import sys
import pangu
import clipboard
import pyautogui

# NOTE(review): the original layout was flattened; the extent of this loop body
# is reconstructed and should be confirmed. The loop variable is not used, so
# the body runs once per entry of sys.argv.
for f in sys.argv:
    # Re-space the clipboard text with pangu and put it back on the clipboard.
    target = clipboard.paste()
    result = pangu.spacing_text(target)
    clipboard.copy(result)
    # Send Command+V to paste the result.
    pyautogui.keyDown('command')
    pyautogui.press('v')
    pyautogui.keyUp('command')
EOF
}
runpy3 "$@"
def get_html_from_blog(self, blog, rule):
    """Download a blog post, convert it to Markdown and save it under ``blogs/``.

    Args:
        blog: Object with a ``url`` attribute. Its ``title`` and ``content``
            attributes are set by this method.
        rule: Mapping with the keys ``encoding``, ``title_pattern``,
            ``content_type``, ``content_pattern``, ``content_replaces`` and
            ``md_replaces``. The two ``*_replaces`` entries are iterables of
            ``(regex, replacement)`` pairs.
    """
    # The session is a context manager so its connections are released.
    with requests.session() as s:
        r = s.get(blog.url, headers=headers)
        if rule['encoding'] is not None:
            # NOTE(review): the configured value is only used as a switch;
            # the encoding is always forced to UTF-8. Confirm this is intended.
            r.encoding = 'utf-8'
        # Fetch the text content.
        html = r.text
    soup = BeautifulSoup(html, 'lxml')

    # NOTE(review): disabled debugging aid. The original layout was
    # flattened, so the extent of this block is reconstructed.
    if False:
        # Add fenced-code markers around <code> elements.
        html = re.sub('<code.*?>', '<code>```\n', html)
        html = re.sub('</code>', '```\n</code>', html)
        with open('temp.html', 'w', encoding='utf-8') as f:
            f.write(html)

    # Extract the title with the configured regex.
    titles = re.findall(rule['title_pattern'], html, re.DOTALL)
    title = pangu.spacing_text(titles[0]) if titles else 'default'
    blog.title = title
    print('标题:', title)

    # Extract the body, by CSS selector ('bs') or by regex. Both branches
    # fall back to an empty body when nothing matches.
    if rule['content_type'] == 'bs':
        nodes = soup.select(rule['content_pattern'])
        content = str(nodes.pop()) if nodes else ''
    else:
        contents = re.findall(rule['content_pattern'], html, re.DOTALL)
        content = contents[0] if contents else ''

    content = '<h1><a href="{}">{}</a></h1><br><br>'.format(
        blog.url, blog.title) + content
    for src, dst in rule['content_replaces']:
        content = re.sub(src, dst, content)
    blog.content = content

    # Convert to Markdown.
    text_maker = ht.HTML2Text()
    md_content = text_maker.handle(content)

    # Drop carriage returns, trailing spaces and surplus blank lines.
    md_content = md_content.replace('\r', '')
    while ' \n' in md_content:
        md_content = md_content.replace(' \n', '\n')
    while '\n\n\n' in md_content:
        md_content = md_content.replace('\n\n\n', '\n\n')

    # Rule-specific regex replacements.
    for src, dst in rule['md_replaces']:
        md_content = re.sub(src, dst, md_content)

    # Add CJK/Latin spacing.
    md_content = pangu.spacing_text(md_content)

    # Remove the padding pangu may have put just inside *...* spans.
    for star_line in re.findall(r'\*(.*?)\*', md_content):
        md_content = md_content.replace(star_line, star_line.strip())

    # Re-join lines that were broken after a hyphen.
    md_content = re.sub('-\n', '-', md_content)

    # Replace characters that are illegal in file names. The previous
    # pattern '[\/:*?"<>|]' parsed '\/' as an escaped slash and therefore
    # never matched a backslash; it is now matched explicitly.
    title = re.sub(r'[\\/:*?"<>|]', '-', title)
    os.makedirs("blogs", exist_ok=True)
    with open("blogs" + os.sep + title + '.md', 'w', encoding='utf-8') as f:
        f.write(md_content)
def main():
    """Fetch a book page, render it with the inline template, write ``index.html``.

    ``get_data()`` must return a sequence whose first item is passed to the
    ``parser_book_*`` helpers and whose second item is used as the book URL.
    The rendered HTML is passed through ``pangu.spacing_text`` before it is
    written. Any exception is printed instead of propagating.
    """
    try:
        # Template with Jinja2.
        template = Template('''\
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width" />
<title> {{ title }} </title>
</head>
<body>
<p>
Buy:
<ul>
<li> <a href="{{ url }}" target="_blank">博客來</a> </li>
</ul>
</p>
<hr>
{{ full_title }}
<p>
<img src="{{ cover }}"/>
</p>
{{ info1 }}
{{ price }}
{{ info2 }}
<h2>商品描述</h2>
{{ desc }}
<h2>作者簡介</h2>
{{ author }}
<h2>目錄大綱</h2>
{{ outline }}
<h2>Memo</h2>
<h3>我想讀這本書的原因是什麼?</h3>
<ol>
<li> <FIXED_ME> </li>
</ol>
<h3>看完書封介紹和目錄大綱後,我覺得我可以從那邊得到什麼?</h3>
<ol>
<li> <FIXED_ME> </li>
</ol>
<h3>在買這本新書前,我曾讀過相關的主題的書籍嗎? 當時得到了什麼新知?</h3>
<ol>
<li> <FIXED_ME> </li>
</ol>
<footer style="text-align: center;">
Parser by
<a href="https://github.com/chusiang/crawler-book-info" target="_blank"> chusiang/crawler-book-info </a>
<hr>
</footer>
</body>
</html>
''')

        # Get data.
        data = get_data()
        page = data[0]

        # Parse the page and map the parsed data onto the template.
        result = template.render(
            title=parser_book_title(page),
            url=data[1],
            full_title=parser_book_full_title(page),
            cover=parser_book_cover(page),
            info1=parser_book_info1(page),
            price=parser_book_price(page),
            info2=parser_book_info2(page),
            desc=parser_book_desc(page),
            author=parser_book_author(page),
            outline=parser_book_outline(page),
        )

        # Write the HTML file. The page declares charset utf-8, so it is
        # written as UTF-8 regardless of the locale's default encoding; the
        # context manager closes the file even if the write fails.
        with open('index.html', 'w', encoding='utf-8') as f:
            f.write(pangu.spacing_text(result))

    except Exception as e:
        print(e)
def pangu_func(self, keyword):
    """Replace the clipboard text with its pangu-spaced form.

    ``keyword`` is accepted but not used.
    """
    clipboard_text = pyperclip.paste()
    pyperclip.copy(pangu.spacing_text(clipboard_text))
def format_sql(self, text):
    """Return *text* pangu-spaced and then passed through ``sqlparse.format``."""
    spaced = pangu.spacing_text(text)
    return sqlparse.format(spaced)
# Convert every .docx on the desktop to a .md file next to it, then rewrite
# that .md with pangu spacing and normalised punctuation.
for infile in glob.glob("/Users/Daglas/Desktop/*.docx"):
    filename, ext = os.path.splitext(infile)
    document = Document(filename + ".docx")
    md_path = filename + ".md"

    # Pass 1: dump the paragraph texts, each followed by a blank line.
    with open(md_path, 'w') as file_obj:
        for para in document.paragraphs:
            if para.text == '\n':
                continue
            file_obj.write(para.text + "\n\n")

    # Pass 2: read the dump back; the output path is the same file name.
    with open(md_path) as file_obj:
        lines = file_obj.readlines()

    punctuation_fixes = (
        (' “', '“'),
        ('” ', '”'),
        ('“', '「'),
        ('”', '」'),
        ('・', '·'),
        (', ', ','),
        ('。 ', '。'),
    )
    with open(md_path, 'w') as file_obj:
        for line in lines:
            if line == '\n':
                continue
            new_content = pangu.spacing_text(line)
            for old, new in punctuation_fixes:
                new_content = new_content.replace(old, new)
            file_obj.write(new_content + "\n\n")
def cjk_layout(text):
    """Return *text* after ``pangu.spacing_text`` has been applied to it."""
    spaced = pangu.spacing_text(text)
    return spaced
def test_spacing_text(self):
    # Latin words, numbers and a bracketed span inside CJK text must each be
    # separated from the neighbouring CJK characters by one space.
    raw = '請使用uname -m指令來檢查你的Linux作業系統是32位元或是[敏感词已被屏蔽]位元'
    expected = '請使用 uname -m 指令來檢查你的 Linux 作業系統是 32 位元或是 [敏感词已被屏蔽] 位元'
    self.assertEqual(pangu.spacing_text(raw), expected)
async def set_content(self, content):
    """Store the pangu-spaced *content* under the ``'content'`` key.

    Returns whatever ``self.set_props_by_key`` returns.
    """
    spaced = pangu.spacing_text(content)
    return await self.set_props_by_key('content', spaced)
#!/usr/bin/python3 # coding=utf-8 import pangu import pyperclip text = pyperclip.paste() print(text) new_text = pangu.spacing_text(text) pyperclip.copy(new_text) print(new_text)
def pangu_spacing(self):
    """Apply pangu spacing to ``self.text`` in place and return ``self``."""
    spaced = pangu.spacing_text(self.text)
    self.text = spaced
    return self
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Apply pangu spacing to the first command-line argument or to the clipboard.

With a non-empty first argument the spaced text is printed without a
trailing newline. Otherwise the clipboard text is spaced and copied back.
"""
import pangu
import pyperclip
import sys
from enum import Enum, unique


@unique
class QueryMode(Enum):
    Paste = 0  # format the clipboard text
    Selection = 1  # format the selected text


# Check the argument count first: indexing sys.argv[1] directly raised
# IndexError when the script was run without arguments, so the clipboard
# branch could never be reached that way.
if len(sys.argv) > 1 and sys.argv[1]:
    mode = QueryMode.Selection
    query = sys.argv[1]
else:
    mode = QueryMode.Paste
    query = pyperclip.paste()

ret = pangu.spacing_text(query)
if mode == QueryMode.Selection:
    print(ret, end='')  # no trailing newline in the output
else:
    pyperclip.copy(ret)
def wrapper(*args, **kwargs):
    # Call ``func`` with the positional argument at ``index`` replaced by
    # its pangu-spaced form; all other arguments are passed through.
    spaced_args = [*args]
    spaced_args[index] = pangu.spacing_text(spaced_args[index])
    return func(*spaced_args, **kwargs)
# Punctuation marks to be replaced, added to the list built earlier.
arr.extend(['。', '?', '!', ',', '.', '?', '!'])

# The replacement text is the third argument when the second is 'all',
# otherwise the second argument itself.
if sys.argv[2] == 'all':
    replace = sys.argv[3]
else:
    replace = sys.argv[2]

# NOTE(review): the items are joined with '|' inside a character class, so a
# literal '|' in the text is replaced as well. Confirm this is intended.
sub = "[" + "|".join(arr) + "]+"

# The context manager closes the file even if a line fails to process.
with open(script, 'r') as f1:
    lines = f1.readlines()
    for line in lines:
        if line.strip() == '':
            continue
        # The first field is kept as is; the rest is the text to clean.
        data = re.split('[\t ]', line)
        trans = ' '.join(data[1:])
        new = re.sub(sub, replace, trans) \
            .replace('[', replace) \
            .replace(']', replace) \
            .replace('FIL', replace) \
            .replace('SPK', replace) \
            .replace(' ', ' ')
        print(data[0] + '\t' + pangu.spacing_text(new).upper().strip())