def bcompare(self, file, sha1=0, sha2=None):
    """Diff two versions of a file in the Beyond Compare GUI.

    :param file: file path relative to the repository
    :param sha1: sha of the old version to pull out;
        0, 1, 2 index the last commit, the one before it, ...
        a negative number (e.g. -1) pulls out the earliest version of the file
    :param sha2: sha of the other version;
        None (default) compares against the current working-tree content;
        may also be an int index like sha1
    :return: nothing; opens the comparison in bcompare and also prints the
        file's commit history to the console
    """
    # version 1: always fetched from git history
    sha1_id = self.sha_id(sha1)
    content1 = self.show(file, sha1_id)

    # version 2: another commit, or the working-tree file itself
    if sha2 is None:
        sha2_id = None
        content2 = os.path.join(self.g.working_dir, file)  # a file path, not text content
    else:
        sha2_id = self.sha_id(sha2)
        content2 = self.show(file, sha2_id)

    # show what is being compared, then launch the GUI
    dprint(sha1, sha2, sha1_id, sha2_id)
    print(dataframe_str(self.list_commits(file=file)))
    bcompare(content1, content2)
def __init__(self, file, mode=None):
    """Open an archive file with the right extraction engine.

    :param file: archive file to process
    :param mode: compression format; when omitted it is inferred from the
        file extension:
        'zip' for .zip and .docx (docx files are zip containers)
        'rar' for .rar
    """
    # 1 decide the compression format
    ext = os.path.splitext(file)[1].lower()
    if not mode:
        ext2mode = {'.docx': 'zip', '.zip': 'zip', '.rar': 'rar'}
        if ext in ext2mode:
            mode = ext2mode[ext]
        else:
            dprint(ext)  # cannot infer the compression format from the extension
            raise ValueError
    self.mode = mode

    # 2 pick the extraction "engine"
    if mode == 'zip':
        self.proc = zipfile.ZipFile(file)
    elif mode == 'rar':
        # install notes: https://blog.csdn.net/code4101/article/details/79328636
        check_install_package('unrar')
        from unrar.rarfile import RarFile
        self.proc = RarFile(file)

    # 3 extraction directory; None means nothing extracted yet
    self.tempfolder = None
def smartsha(self, s, file=None):
    """Given a piece of input, intelligently resolve it to a commit sha.

    None --> None
    sha  --> the sha itself
    int  --> 0, 1, 2, 3 index the last commit, the one before it, ...
    text --> the first commit whose message contains the keyword

    :param file: restrict the search to commits touching this file
    :raise ValueError: integer index out of range, or keyword not found
    """
    if s is None:
        return s
    elif isinstance(s, str) and re.match(r'[a-z0-9]+', s):
        # treated as a raw sha string
        # NOTE(review): re.match only anchors at the start, so ANY string
        # beginning with a lowercase letter or digit takes this branch,
        # including plain keywords — confirm this is intended
        return s
    shas = self.commits_sha(file=file)
    num = len(shas)
    if isinstance(s, int):
        # integer index: 0 is HEAD, the most recent commit
        if s < num:
            return shas[s]
        else:
            dprint(num, s)  # index out of range
            raise ValueError
    else:
        # otherwise, match the keyword against commit messages
        names = self.commits_name(file=file)
        for i, t in enumerate(names):
            if s in t:
                return shas[i]
        else:
            dprint(s)  # no commit message contains this keyword
            raise ValueError
def gettag_attr(tagstr, attrname):
    r"""Extract attribute *attrname* from the first tag found in *tagstr*.

    :param tagstr: a string containing a tag
    :param attrname: name of the attribute to look up
    :return: the attribute value, or None when the attribute (or tag) is absent

    >>> gettag_attr('%<topic type=danxuan description=单选题> 123\n<a b=c></a>', 'type')
    'danxuan'
    >>> gettag_attr('%<topic type="dan xu an" description=单选题>', 'type')
    'dan xu an'
    >>> gettag_attr("%<topic type='dan xu an' description=单选题>", 'type')
    'dan xu an'
    >>> gettag_attr('%<topic type=dan xu an description=单选题>', 'description')
    '单选题'
    >>> gettag_attr('%<topic type=dan xu an description=单选题>', 'type')
    'dan'
    >>> gettag_attr('%<topic type=danxuan description=单选题 >', 'description')
    '单选题'
    >>> gettag_attr('%<topic type=danxuan description=单选题 >', 'description123') is None
    True
    """
    soup = BeautifulSoup(tagstr, 'lxml')
    try:
        children = soup.p.contents
    except AttributeError:
        # the parse produced no <p> wrapper to look into
        dprint(tagstr)
        return None
    for child in children:
        if isinstance(child, bs4.Tag):
            return child.get(attrname, None)
    return None
def treetable_flatten(df, *, reverse=False, childid_colname='id', parentid_colname='parent_id', format_colname=None):
    """Flatten a tree table into horizontal depth columns.

    Produces columns like depth-3, depth-2, depth-1 (reverse=True) meaning the
    3rd-to-last, 2nd-to-last and last ancestry level, or depth0, depth1, ...
    (reverse=False) from the root down.

    :param df: source DataFrame; modified in place and also returned
    :param reverse:
        False, list ancestry from the root: depth0, depth1, depth2, ...
        True, list ancestry backwards: depth-1 is the node itself,
        depth-2 its parent, and so on
    :param childid_colname: column holding each node's id
    :param parentid_colname: column holding each node's parent id
    :param format_colname: column whose value is displayed for a node
        None, defaults to childid_colname
        str, use that column's (possibly pre-formatted) value
    :return: df with the depth columns appended
    :raise ValueError: if df has duplicated index values
    """
    # 1 helper maps: parent link and display value per node
    if format_colname is None:
        # fix: previously defaulted to parentid_colname, which displayed a
        # node's *parent* id as the node's own label, contradicting the doc
        format_colname = childid_colname
    parentid = dict()  # parentid[k] = v: node k's parent is v
    nodeval = dict()  # nodeval[k] = v: display value for node k
    if df.index.duplicated().any():
        dprint(len(set(df.index)), len(df.index))  # duplicated index
        raise ValueError
    for idx, row in df.iterrows():
        parentid[row[childid_colname]] = row[parentid_colname]
        nodeval[row[childid_colname]] = str(row[format_colname])

    # 2 walk up from every node and collect all of its ancestors
    parents = []
    for idx, row in df.iterrows():
        ps = [nodeval[row[childid_colname]]]  # node itself plus every ancestor's label
        p = row[parentid_colname]
        while p in parentid:
            ps.append(nodeval[p])
            p = parentid[p]
        parents.append(ps)
    num_depth = max(map(len, parents), default=0)

    # 3 lay the ancestry chains out as columns
    df['parents'] = parents
    if reverse:
        for j in range(num_depth, 0, -1):
            df[f'depth-{j}'] = ''
        for idx, row in df.iterrows():
            for j in range(1, len(row.parents) + 1):
                df.loc[idx, f'depth-{j}'] = row.parents[j - 1]
    else:
        for j in range(num_depth):
            df[f'depth{j}'] = ''
        for idx, row in df.iterrows():
            for j in range(len(row.parents)):
                df.loc[idx, f'depth{j}'] = row.parents[-j - 1]
    df.drop('parents', axis=1, inplace=True)
    return df
def select_columns(_self, columns, column_name='searchkey'):
    r"""Collect the values of the matched worksheet columns as a DataFrame.

    :param columns: column header(s), looked up with re.search-style matching
        single name: 'attr1' — after locating the header, values are read
            downward to the last row
        multiple names: ['attr1', 'attr2', 'attr3'] — headers are located
            jointly and data starts from the largest header start row
            (TODO stop at the last non-empty value; for now openpyxl's
            max_row is used, which can overshoot)
        merged cells are read through their anchor cell's value
    :param column_name: how the returned df names its columns
        'origin', use the original header text
        'searchkey', use the search key used for the lookup
    :raise ValueError: for any other column_name value
    """
    if not isinstance(columns, (list, tuple)):
        columns = [columns]

    # 1 locate every header cell and determine the first data row
    cels, names, start_line = [], [], -1
    for search_name in columns:
        cel = _self.findcel(search_name)  # project extension: cell search by regex
        if cel:
            cels.append(cel)
            if column_name == 'searchkey':
                names.append(str(search_name))
            elif column_name == 'origin':
                if isinstance(search_name, (list, tuple)) and len(search_name) > 1:
                    # joint lookup: keep the path prefix, end with the real header text
                    names.append('/'.join(list(search_name[:-1]) + [str(cel.value)]))
                else:
                    names.append(str(cel.value))
            else:
                raise ValueError(f'{column_name}')
            # data begins below the deepest of the located headers
            start_line = max(start_line, cel.down().row)
        else:
            dprint(search_name)  # the requested column was not found

    # 2 read each located column's data
    datas = {}
    for k, cel in enumerate(cels):
        if cel:
            col = cel.column
            li = []
            for i in range(start_line, _self.max_row + 1):
                v = _self.cell(i, col).mcell().value  # mcell: resolve merged cells to their anchor
                li.append(v)
            datas[names[k]] = li
        else:
            # NOTE(review): this padding branch looks unreachable — step 1 only
            # appends cells it actually found, so every entry of cels is truthy
            datas[names[k]] = [None] * (_self.max_row + 1 - start_line)
    df = pd.DataFrame(datas)

    # 3 drop rows that are entirely empty
    df.dropna(how='all', inplace=True)
    return df
def page_add_ele(self):
    """Add elements to a page.

    Difference in the xref list before/after adding elements:
    https://paste.ubuntu.com/p/Dxhnzp4XJ2/
    """
    # keep only page 0 of the document
    self.doc.select([0])
    page = self.doc.loadPage(0)
    # page.insertText(fitz.Point(100, 200), 'test\ntest')
    file = File('a.pdf', Dir.TEMP).to_str()  # temp output path (project helper)
    dprint(file)
    # garbage=4: aggressive garbage collection on save — TODO confirm level semantics in fitz docs
    self.doc.save(file, garbage=4)
    browser(file)  # open the saved file for inspection (project helper)
def tag_name(self):
    """Name of a bs4 node.

    Returns tag.name for a Tag, the literal string 'NavigableString' for a
    NavigableString, and None (after a debug print) otherwise.
    """
    name = self.name
    if name:
        return name
    if isinstance(self, bs4.NavigableString):
        return 'NavigableString'
    dprint(self)  # failed to resolve the node's name
    return None
def pagetext(self):
    """Text content of a single page."""
    page = self.doc[0]

    # all text on the page; also supports arguments: html, dict, xml, xhtml, json
    text = page.getText('text')
    dprint(text)

    # all text on the page, returned as a dict object
    textdict = page.getText('dict')
    # drop the last block — NOTE(review): presumably an unwanted trailing
    # (image?) block; confirm against the document being tested
    textdict['blocks'] = textdict['blocks'][:-1]
    browser(pprint.pformat(textdict))
def inner(node, depth):
    """Recursively walk a bs4 node tree, tallying node kinds per depth.

    Relies on names from the enclosing scope (not visible here):
    `add(kind, depth)` presumably records one occurrence, and `tagname`
    is the name of the tag being analyzed — confirm in the outer function.
    """
    if isinstance(node, bs4.ProcessingInstruction):
        add('ProcessingInstruction', depth)
    elif isinstance(node, bs4.Tag):
        if node.name == tagname and depth:
            # a child tag with the same name nested inside tagname
            dprint(node, depth)
        add(node.name, depth)
        for t in node.children:
            inner(t, depth + 1)
    elif isinstance(node, bs4.NavigableString):
        add('NavigableString', depth)
    else:
        add('其他特殊结点', depth)
def getmembers(object, predicate=None):
    """A tweaked copy of inspect.getmembers.

    Unlike the stdlib version, attribute-access errors other than
    AttributeError are also tolerated and resolved through the MRO
    (190919: needed because sqlalchemy raises
    sqlalchemy.exc.InvalidRequestError on some attribute accesses).

    :param object: the object (or class) whose members are listed
    :param predicate: optional filter; only members for which
        predicate(value) is truthy are kept
    :return: list of (name, value) pairs, sorted by name

    TODO this implementation is rather convoluted; for plain members
    dir() alone might suffice?
    """
    from inspect import isclass, getmro
    import types

    if isclass(object):
        mro = (object, ) + getmro(object)
    else:
        mro = ()
    results = []
    processed = set()
    names = dir(object)
    # add any DynamicClassAttributes to the list of names if object is a class;
    # this may result in duplicate entries if, for example, a virtual
    # attribute with the same name as a DynamicClassAttribute exists
    try:
        for base in object.__bases__:
            for k, v in base.__dict__.items():
                if isinstance(v, types.DynamicClassAttribute):
                    names.append(k)
    except AttributeError:
        pass
    for key in names:
        # First try to get the value via getattr. Some descriptors don't
        # like calling their __get__ (see bug #1785), so fall back to
        # looking in the __dict__.
        try:
            value = getattr(object, key)
            # handle the duplicate key
            if key in processed:
                raise AttributeError
        except Exception:
            # deliberately broader than AttributeError (library-specific
            # access errors land here too), but narrowed from the original
            # bare `except:` so KeyboardInterrupt/SystemExit still propagate
            dprint(key)  # could not fetch this attribute via getattr
            for base in mro:
                if key in base.__dict__:
                    value = base.__dict__[key]
                    break
            else:
                # could be a (currently) missing slot member, or a buggy
                # __dir__; discard and move on
                continue
        if not predicate or predicate(value):
            results.append((key, value))
        processed.add(key)
    results.sort(key=lambda pair: pair[0])
    return results
def __init__(self, key=None):
    """
    :param key: optional substitution key; must be a permutation of the
        85 base85 characters, otherwise it is discarded (treated as None)
    """
    # 1 validate the key: right length AND exactly the base85 character set
    if key and (len(key) != 85 or set(key) != Base85Coder.CHARS_SET):
        dprint(key)  # the supplied key is invalid
        key = None
    self.key = key

    # 2 build the translation tables
    if key:
        self.encode_trantab = str.maketrans(Base85Coder.DEFAULT_KEY, key)
        self.decode_trantab = str.maketrans(key, Base85Coder.DEFAULT_KEY)
    else:
        self.encode_trantab = self.decode_trantab = None
def filetext_replace(files, func, *, count=-1, start=1, bc=False, write=False, if_exists=None):
    r"""Batch-process the text files under a directory.

    :param files: file matching rule, see filesmatch for details
    :param func: generic text-transform function applied to each file's content
    :param count: stop after this many modified files, in case too many
        files match and the run would never end; -1 means no limit
    :param start: index of the first file to inspect, handy when resuming
        after an interrupted run
    :param bc: review each change in Beyond Compare; bc has priority over
        write — when both are True only bc opens and nothing is written
    :param write: rewrite the file in place with the transformed content
    :param if_exists: backup policy, see the writefile parameter docs
    :return: list of files whose content was actually changed by func
    """
    ls = []
    total = 0
    for f in filesmatch(files):
        total += 1
        if total < start:
            continue
        before = File(f).read()
        after = func(before)
        if before == after:
            continue
        match = len(ls) + 1
        dprint(f, total, match)
        if bc:
            bcompare(f, after)
        elif write:
            # with bc on, the program never writes automatically
            File(f).write(after, if_exists=if_exists)
        ls.append(f)
        if len(ls) == count:
            break
    match_num = len(ls)
    dprint(total, match_num)
    return ls
def sha_id(self, s, file=None):
    """Like smartsha, but returns the sha's *index* instead of the sha.

    0 is the most recent commit, 1 the one before it, ...
    (this duplicates part of smartsha's logic; not very elegant)

    :param s: None -> 0; sha-prefix str -> index of the matching sha;
        int -> returned as-is; other str -> index of the first commit
        whose message contains the keyword
    :param file: restrict to commits touching this file
    :raise ValueError: when a keyword matches no commit message
    """
    shas = self.commits_sha(file=file)
    if s is None:
        return 0
    elif isinstance(s, str) and re.match(r'[a-z0-9]+', s):
        # a raw sha prefix; fix: the old pattern [a-z1-9]+ wrongly rejected
        # shas whose first character is the digit 0 (smartsha uses [a-z0-9]+)
        for i, t in enumerate(shas):
            if t.startswith(s):
                return i
        # NOTE(review): falls through returning None when no sha matches the
        # prefix — kept as-is for backward compatibility
    elif isinstance(s, int):
        return s
    else:
        # otherwise, match the keyword against commit messages
        names = self.commits_name(file=file)
        for i, t in enumerate(names):
            if s in t:
                return i
        else:
            dprint(s)  # no commit message contains this keyword
            raise ValueError
def demo_myspellchecker():
    """Demo of the project's MySpellChecker wrapper."""
    # class initialization takes roughly 0.4 seconds
    a = MySpellChecker()

    # loading/updating from sql takes roughly 1 second
    # hsql = HistudySQL('ckz', 'tr_develop')
    # df = hsql.query('SELECT * FROM spell_check')
    # a.update_by_dataframe(df)

    # dprint(a.correction_detail('d'))
    # dprint(a.correction_detail('wrod'))  # 'wrod' has many candidates, but 'word' has the largest weight
    # dprint(a.correction_detail('ckzckzckzckzckzckz'))
    # dprint(a.correction('ckzckzckzckzckzckz'))
    dprint(a.correction_detail('ike'))
    dprint(a.correction_detail('dean'))
    dprint(a.correction_detail('stud'))
    dprint(a.correction_detail('U'))
def message(self):
    """Print some basic information about the pdf document."""
    dprint(fitz.version)  # version of the fitz module
    dprint(self.doc.pageCount)  # number of pages in the pdf
    dprint(self.doc._getXrefLength())  # total number of objects in the document
def demo_openpyxl():
    """Walk-through of basic openpyxl usage plus this project's worksheet extensions."""
    # I. create a new workbook
    from openpyxl import Workbook
    wb = Workbook()

    # grab a worksheet; wb['Sheet'] indexes by a known name/position —
    # Excel itself is case-insensitive, but this index IS case-sensitive
    ws = wb.active

    # 1 two ways to index a cell; .value reads its content
    ws['A2'] = '123'
    dprint(ws.cell(2, 1).value)  # 123

    # 2 merged cells
    ws.merge_cells('A1:C2')
    dprint(ws['A1'].value)  # None — merging clears the old content of A2
    # print(ws['A2'].value)  # AttributeError: 'MergedCell' object has no attribute 'value'
    # ws.unmerge_cells('A1:A3')  # ValueError: list.remove(x): x not in list — must name the full merged range
    ws['A1'].value = '模块一'
    ws['A3'].value = '属性1'
    ws['B3'].value = '属性2'
    ws['C3'].value = '属性3'
    ws.merge_cells('D1:E2')
    ws['D1'].value = '模块二'
    ws['D3'].value = '属性1'
    ws['E3'].value = '属性2'
    # offset/down are navigation helpers (down presumably a project extension — confirm)
    dprint(ws['A1'].offset(1, 0).coordinate)  # A2
    dprint(ws['A1'].down().coordinate)  # A3

    # 3 cell styles and formats
    from openpyxl.comments import Comment
    cell = ws['A3']
    cell.font = Font(name='Courier', size=36)
    cell.comment = Comment(text="A comment", author="Author's Name")
    styles = [[
        'Number formats', 'Comma', 'Comma [0]', 'Currency', 'Currency [0]',
        'Percent'
    ], [
        'Informative', 'Calculation', 'Total', 'Note', 'Warning Text',
        'Explanatory Text'
    ], [
        'Text styles', 'Title', 'Headline 1', 'Headline 2', 'Headline 3',
        'Headline 4', 'Hyperlink', 'Followed Hyperlink', 'Linked Cell'
    ], [
        'Comparisons', 'Input', 'Output', 'Check Cell', 'Good', 'Bad',
        'Neutral'
    ], [
        'Highlights', 'Accent1', '20 % - Accent1', '40 % - Accent1',
        '60 % - Accent1', 'Accent2', 'Accent3', 'Accent4', 'Accent5',
        'Accent6', 'Pandas'
    ]]
    # first entry of each row is a category label, the rest are style names
    for i, name in enumerate(styles, start=4):
        ws.cell(i, 1, name[0])
        for j, v in enumerate(name[1:], start=2):
            ws.cell(i, j, v)
            ws.cell(i, j).style = v

    # II. test some extension features (search/findcol are project extensions — confirm)
    dprint(ws.search('模块二').coordinate)  # D1
    dprint(ws.search(['模块二', '属性1']).coordinate)  # D3
    dprint(ws.findcol(['模块一', '属性1'], direction=1))  # 0

    wb.save("demo_openpyxl.xlsx")
def demo_spellchecker(): """演示如何使用spellchecker库 官方介绍文档 pyspellchecker · PyPI: https://pypi.org/project/pyspellchecker/ 190909周一15:58,from 陈坤泽 """ # 1 创建对象 # 可以设置语言、大小写敏感、拼写检查的最大距离 # 默认'en'英语,大小写不敏感 spell = SpellChecker() # 如果是英语,SpellChecker会自动加载语言包site-packages\spellchecker\resources\en.json.gz,大概12万个词汇,包括词频权重 d = spell.word_frequency # 这里的d是WordFrequency对象,其底层用了Counter类进行数据存储 dprint(d.unique_words, d.total_words) # 词汇数,权重总和 # 2 修改词频表 spell.word_frequency dprint(d['ckz']) # 不存在的词汇直接输出0 d.add('ckz') # 可以添加ckz词汇的一次词频 d.load_words(['ckz', 'ckz', 'lyb']) # 可以批量添加词汇 dprint(d['ckz'], d['lyb']) # d['ckz']=3 d['lyb']=1 d.load_words(['ckz'] * 100 + ['lyb'] * 500) # 可以用这种技巧进行大权重的添加 dprint(d['ckz'], d['lyb']) # d['ckz']=103 d['lyb']=501 # 同理,去除也有remove和remove_words两种方法 d.remove('ckz') # d.remove_words(['ckz', 'lyb']) # 不过注意不能删除已经不存在的key('ckz'),否则会报KeyError dprint(d['ckz'], d['lyb']) # d['ckz']=0 d['lyb']=501 # remove是完全去除单词,如果只是要减权重可以访问底层的_dictionary对象操作 d._dictionary['lyb'] -= 100 # 当然不太建议直接访问下划线开头的成员变量~~ dprint(d['lyb']) # ['lyb']=401 # 还可以按阈值删除词频不超过设置阈值的词汇 d.remove_by_threshold(5) # 3 spell的基本功能 # (1)用unknown可以找到可能拼写错误的单词,再用correction可以获得最佳修改意见 misspelled = spell.unknown(['something', 'is', 'hapenning', 'here']) dprint(misspelled) # misspelled<set>={'hapenning'} for word in misspelled: # Get the one `most likely` answer dprint(spell.correction(word)) # <str>='happening' # Get a list of `likely` options dprint(spell.candidates( word)) # <set>={'henning', 'happening', 'penning'} # 注意默认的spell不区分大小写,如果词库存储了100次'ckz' # 此时判断任意大小写形式组合的'CKZ'都是返回原值 # 例如 spell.correction('ckZ') => 'ckZ' # (2)可以通过修改spell.word_frequency影响correction的计算结果 dprint(d['henning'], d['happening'], d['penning']) # d['henning']<int>=53 d['happening']<int>=4538 d['penning']<int>=23 d._dictionary['henning'] += 10000 dprint(spell.correction('hapenning')) # <str>='henning' # (3)词汇在整个字典里占的权重 dprint(spell.word_probability('henning')) # <float>=0.0001040741914298211