Beispiel #1
0
    def bcompare(self, file, sha1=0, sha2=None):
        """ Diff two versions of a repository file in the Beyond Compare tool.

        :param file: file path relative to the repository root
        :param sha1: sha (or index) of the old version to pull out;
            0, 1, 2 index the latest commit, the one before it, ...
            a negative number such as -1 pulls out the earliest version
        :param sha2: sha (or index) of the other version;
            None (default) compares against the current file on disk,
            a number indexes a sha just like sha1
        :return: nothing — opens the comparison result in bcompare

        Also prints this file's commit history on the command line.
        """
        # 1 content of the first version
        sha1_id = self.sha_id(sha1)
        s1 = self.show(file, sha1_id)

        # 2 second side: either another snapshot, or the working-tree file itself
        if sha2 is None:
            sha2_id = None
            s2 = os.path.join(self.g.working_dir, file)  # a file name rather than content
        else:
            sha2_id = self.sha_id(sha2)
            s2 = self.show(file, sha2_id)

        # 3 run the comparison and show the file's commit history
        dprint(sha1, sha2, sha1_id, sha2_id)
        print(dataframe_str(self.list_commits(file=file)))

        bcompare(s1, s2)
Beispiel #2
0
    def __init__(self, file, mode=None):
        """
        :param file: the archive file to process
        :param mode: archive format; when omitted, a small matching rule
            guesses it from the file extension
            'rar':
            'zip': files with a docx suffix default to zip extraction
        """
        # 1 determine the compression format
        ext = os.path.splitext(file)[1].lower()
        if not mode:
            if ext in ('.docx', '.zip'):
                mode = 'zip'
            elif ext == '.rar':
                mode = 'rar'
            else:
                dprint(ext)  # cannot infer the compression format from the extension
                raise ValueError
        self.mode = mode

        # 2 pick the extraction "engine"
        if mode == 'zip':
            self.proc = zipfile.ZipFile(file)
        elif mode == 'rar':
            # install notes: https://blog.csdn.net/code4101/article/details/79328636
            check_install_package('unrar')
            from unrar.rarfile import RarFile
            self.proc = RarFile(file)
        # 3 extraction target folder; None means not extracted yet
        self.tempfolder = None
Beispiel #3
0
    def smartsha(self, s, file=None):
        """ Take a piece of text and smartly resolve it to a sha.
            None  -->  None
            sha   -->  sha (returned unchanged)
            int   -->  0,1,2,3 index the last commit, the one before, ...
            text  -->  the commit whose message contains the keyword

        :param s: the value to resolve (see above)
        :param file: restrict to the commits that touched this file
        :raise ValueError: int index out of range, or keyword not found
        """
        if s is None:
            return s
        elif isinstance(s, str) and re.fullmatch(r'[a-f0-9]{4,40}', s):
            # Looks like a (possibly abbreviated, >= 4 hex chars) sha: return as-is.
            # Fix: the previous pattern re.match(r'[a-z0-9]+') was unanchored and
            # matched any string starting with a lowercase alnum, so the documented
            # keyword lookup below could never trigger for most keywords.
            return s

        shas = self.commits_sha(file=file)
        num = len(shas)

        if isinstance(s, int):  # an integer; 0 is HEAD, the most recent commit
            if s < num:
                return shas[s]
            else:
                dprint(num, s)  # index out of range
                raise ValueError
        else:  # anything else: match against the commit messages
            names = self.commits_name(file=file)
            for i, t in enumerate(names):
                if s in t:
                    return shas[i]
            else:
                dprint(s)  # no commit message contains this keyword
                raise ValueError
Beispiel #4
0
def gettag_attr(tagstr, attrname):
    r"""tagstr is a tag string, attrname the attribute name to look up.
    Returns the attribute value, or None when the attribute does not exist.

    >>> gettag_attr('%<topic type=danxuan description=单选题> 123\n<a b=c></a>', 'type')
    'danxuan'
    >>> gettag_attr('%<topic type="dan xu an" description=单选题>', 'type')
    'dan xu an'
    >>> gettag_attr("%<topic type='dan xu an' description=单选题>", 'type')
    'dan xu an'
    >>> gettag_attr('%<topic type=dan xu an description=单选题>', 'description')
    '单选题'
    >>> gettag_attr('%<topic type=dan xu an description=单选题>', 'type')
    'dan'
    >>> gettag_attr('%<topic type=danxuan description=单选题 >', 'description')
    '单选题'
    >>> gettag_attr('%<topic type=danxuan description=单选题 >', 'description123') is None
    True
    """
    soup = BeautifulSoup(tagstr, 'lxml')
    try:
        children = soup.p.contents
    except AttributeError:
        # no <p> node could be extracted from the string
        dprint(tagstr)
        return None
    for node in children:
        if isinstance(node, bs4.Tag):
            # first real tag wins; missing attribute yields None
            return node.get(attrname, None)
    return None
Beispiel #5
0
def treetable_flatten(df,
                      *,
                      reverse=False,
                      childid_colname='id',
                      parentid_colname='parent_id',
                      format_colname=None):
    """ Flatten a tree table horizontally: columns depth-3, depth-2, depth-1
    hold the 3rd-to-last, 2nd-to-last and last level of each node's chain.

    :param df: DataFrame; its index must contain no duplicates
    :param reverse:
        False: list node info as depth0, depth1, depth2... (root first)
        True: list levels backwards: depth-1 is the node itself,
            depth-2 its parent, and so on
    :param childid_colname: column holding the node id
    :param parentid_colname: column holding the parent id
    :param format_colname: column whose value is displayed for each node
        None: defaults to childid_colname (bug fix: the code previously
            defaulted to parentid_colname, contradicting this documented
            behavior and labelling every node with its parent's id)
        str: name of a column to display (lets callers pre-format values)
    :return: df, modified in place, with the depth columns added
    """
    # 1 build the helper maps
    if format_colname is None:
        format_colname = childid_colname  # was parentid_colname — see docstring
    parentid = {}  # parentid[k] = v: parent node v of node k
    nodeval = {}  # nodeval[k] = v: display value of node k
    if df.index.duplicated().any():
        dprint(len(set(df.index)), len(df.index))  # duplicated index
        raise ValueError

    for _, row in df.iterrows():
        parentid[row[childid_colname]] = row[parentid_colname]
        nodeval[row[childid_colname]] = str(row[format_colname])

    # 2 walk upwards from every node to collect its full ancestor chain
    parents = []
    for _, row in df.iterrows():
        ps = [nodeval[row[childid_colname]]]  # the node itself plus all ancestors
        p = row[parentid_colname]
        while p in parentid:
            ps.append(nodeval[p])
            p = parentid[p]
        parents.append(ps)
    num_depth = max(map(len, parents), default=0)

    # 3 materialize the display columns in the requested orientation
    df['parents'] = parents
    if reverse:
        for j in range(num_depth, 0, -1):
            df[f'depth-{j}'] = ''
        for idx, row in df.iterrows():
            for j in range(1, len(row.parents) + 1):
                df.loc[idx, f'depth-{j}'] = row.parents[j - 1]
    else:
        for j in range(num_depth):
            df[f'depth{j}'] = ''
        for idx, row in df.iterrows():
            for j in range(len(row.parents)):
                df.loc[idx, f'depth{j}'] = row.parents[-j - 1]
    df.drop('parents', axis=1, inplace=True)
    return df
Beispiel #6
0
    def select_columns(_self, columns, column_name='searchkey'):
        r""" Fetch the values of the given attribute columns; returns a DataFrame.

        :param columns: column header(s), located with re.search string matching
            a single column: 'attr1' — after the header cell is found, values
                are read downwards through the last non-empty value
            or several columns: ['attr1', 'attr2', 'attr3'] — headers are
                located together and data starts from the largest start row
                (TODO stopping at the last non-empty value is not implemented;
                openpyxl's max_row is used instead, which can overshoot)
            merged cells are resolved by reading the master cell's value
        :param column_name: naming scheme for the returned df's columns
            origin: the original header text
            searchkey: the search key that was used to find the column
        """
        if not isinstance(columns, (list, tuple)):
            columns = [columns]

        # 1 locate every header cell and derive the data start row
        # NOTE(review): findcel/down look like project extensions on top of
        # openpyxl cells — confirm their exact semantics against the project.
        cels, names, start_line = [], [], -1
        for search_name in columns:
            cel = _self.findcel(search_name)
            if cel:
                cels.append(cel)
                if column_name == 'searchkey':
                    names.append(str(search_name))
                elif column_name == 'origin':
                    if isinstance(search_name,
                                  (list, tuple)) and len(search_name) > 1:
                        names.append('/'.join(
                            list(search_name[:-1]) + [str(cel.value)]))
                    else:
                        names.append(str(cel.value))
                else:
                    raise ValueError(f'{column_name}')
                start_line = max(start_line, cel.down().row)
            else:
                dprint(search_name)  # the requested column was not found

        # 2 collect the data of each located column
        datas = {}
        for k, cel in enumerate(cels):
            if cel:
                col = cel.column
                li = []
                for i in range(start_line, _self.max_row + 1):
                    v = _self.cell(i, col).mcell().value  # mcell resolves merged cells
                    li.append(v)
                datas[names[k]] = li
            else:
                # NOTE(review): cels only ever holds truthy cells (see step 1),
                # so this fallback branch looks unreachable — confirm before removing.
                datas[names[k]] = [None] * (_self.max_row + 1 - start_line)
        df = pd.DataFrame(datas)

        # 3 drop rows that are entirely empty
        df.dropna(how='all', inplace=True)

        return df
Beispiel #7
0
 def page_add_ele(self):
     """Add elements to a pdf page.

     Diff of xrefstr before/after adding elements: https://paste.ubuntu.com/p/Dxhnzp4XJ2/
     """
     # keep only the first page, then load it for editing
     self.doc.select([0])
     page = self.doc.loadPage(0)
     # page.insertText(fitz.Point(100, 200), 'test\ntest')
     file = File('a.pdf', Dir.TEMP).to_str()
     dprint(file)
     self.doc.save(file, garbage=4)  # garbage=4 — presumably full object cleanup on save; see PyMuPDF docs
     browser(file)
Beispiel #8
0
 def tag_name(self):
     """Given a bs4 Tag or NavigableString, return tag.name,
     or the literal 'NavigableString' for a NavigableString node.
     """
     name = self.name
     if name:
         return name
     if isinstance(self, bs4.NavigableString):
         return 'NavigableString'
     dprint(self)  # failed to determine the node's name
     return None
Beispiel #9
0
    def pagetext(self):
        """Text on a single page."""
        page = self.doc[0]

        # all text on the page; other supported modes: html, dict, xml, xhtml, json
        text = page.getText('text')
        dprint(text)

        # all text on the page, as a dict object
        textdict = page.getText('dict')
        # drops the last block — presumably a non-text block; confirm against the document used
        textdict['blocks'] = textdict['blocks'][:-1]
        browser(pprint.pformat(textdict))
Beispiel #10
0
 def inner(node, depth):
     """Recursively tally node kinds at each depth via the enclosing scope's `add` helper."""
     if isinstance(node, bs4.ProcessingInstruction):
         # checked first: ProcessingInstruction is presumably a NavigableString
         # subclass in bs4, so the branch order here matters — confirm
         add('ProcessingInstruction', depth)
     elif isinstance(node, bs4.Tag):
         if node.name == tagname and depth:
             dprint(node, depth)  # tagname contains a child tag with the same name
         add(node.name, depth)
         for t in node.children:
             inner(t, depth + 1)
     elif isinstance(node, bs4.NavigableString):
         add('NavigableString', depth)
     else:
         add('其他特殊结点', depth)
Beispiel #11
0
def getmembers(object, predicate=None):
    """A modified rewrite of inspect.getmembers.

    Returns a sorted list of (name, value) pairs for the members of *object*,
    keeping only those for which *predicate* (when given) is true.

    TODO this implementation is rather involved — for plain members,
    wouldn't dir() alone suffice?
    """
    from inspect import isclass, getmro
    import types

    if isclass(object):
        mro = (object, ) + getmro(object)
    else:
        mro = ()
    results = []
    processed = set()
    names = dir(object)
    # add any DynamicClassAttributes to the list of names if object is a class;
    # this may result in duplicate entries if, for example, a virtual
    # attribute with the same name as a DynamicClassAttribute exists
    try:
        for base in object.__bases__:
            for k, v in base.__dict__.items():
                if isinstance(v, types.DynamicClassAttribute):
                    names.append(k)
    except AttributeError:
        pass
    for key in names:
        # First try to get the value via getattr.  Some descriptors don't
        # like calling their __get__ (see bug #1785), so fall back to
        # looking in the __dict__.
        try:
            value = getattr(object, key)
            # handle the duplicate key
            if key in processed:
                raise AttributeError
        except Exception:
            # Widened from AttributeError (190919) to also swallow e.g.
            # sqlalchemy.exc.InvalidRequestError.  Fix: use `except Exception`
            # rather than a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            dprint(key)  # could not fetch this attribute
            for base in mro:
                if key in base.__dict__:
                    value = base.__dict__[key]
                    break
            else:
                # could be a (currently) missing slot member, or a buggy
                # __dir__; discard and move on
                continue

        if not predicate or predicate(value):
            results.append((key, value))
        processed.add(key)
    results.sort(key=lambda pair: pair[0])
    return results
Beispiel #12
0
    def __init__(self, key=None):
        """key: optional custom alphabet; must be an 85-char permutation of Base85Coder.CHARS_SET."""
        # 1 validate the key; an invalid one is silently dropped
        if key and (len(key) != 85 or set(key) != Base85Coder.CHARS_SET):
            dprint(key)  # the supplied key is invalid
            key = None
        self.key = key

        # 2 build the translation tables (trantab)
        if key:
            self.encode_trantab = str.maketrans(Base85Coder.DEFAULT_KEY, key)
            self.decode_trantab = str.maketrans(key, Base85Coder.DEFAULT_KEY)
        else:
            self.encode_trantab = self.decode_trantab = None
Beispiel #13
0
def filetext_replace(files,
                     func,
                     *,
                     count=-1,
                     start=1,
                     bc=False,
                     write=False,
                     if_exists=None):
    r"""Batch-process the text files under a directory with a transform function.

    :param files: file matching rule; see filesmatch for the syntax
    :param func: generic text-processing function applied to each file's content
    :param count: stop after this many files changed, so an over-broad rule
        cannot run away
    :param start: skip files until this (1-based) ordinal; handy when resuming
        after an unexpected interruption during debugging
    :param bc: open the Beyond Compare tool on each change
        bc has priority over write: with both True, bc opens and nothing is written
    :param write: save the modified content back to the file in place
    :param if_exists: backup policy; see the parameter docs of writefile
    :return: the list of files whose content the function changed
    """
    ls = []
    total = 0
    for f in filesmatch(files):
        # if 'A4-Exam' in f:
        #     continue
        total += 1
        if total < start:
            continue
        s0 = File(f).read()
        s1 = func(s0)
        if s0 == s1:
            continue  # content unchanged: nothing to report
        match = len(ls) + 1
        dprint(f, total, match)
        if bc:
            bcompare(f, s1)
        elif write:  # with bc enabled, nothing is ever written automatically
            File(f).write(s1, if_exists=if_exists)
        ls.append(f)
        if len(ls) == count:
            break

    match_num = len(ls)
    dprint(total, match_num)
    return ls
Beispiel #14
0
    def sha_id(self, s, file=None):
        """ Like smartsha, but returns the *index* of the sha instead:
        0 is the latest commit, 1 the one before it, ...

        Somewhat redundant with smartsha's implementation; not very elegant~~

        :param s: None (-> 0), an int index (returned as-is), a hex sha
            prefix (-> its index), or a keyword matched against commit messages
        :param file: restrict to the commits that touched this file
        :raise ValueError: when a keyword matches no commit message
        """
        # cheap type-only branches first — no repo access needed for these
        if s is None:
            return 0
        if isinstance(s, int):
            return s

        shas = self.commits_sha(file=file)

        if isinstance(s, str) and re.fullmatch(r'[a-f0-9]+', s):
            # a hex sha prefix.  Fix: the previous pattern r'[a-z1-9]+' both
            # omitted the digit 0 (clearly a typo — the sibling smartsha uses
            # [a-z0-9]) and was unanchored, so nearly any keyword was
            # mistaken for a sha.
            for i, t in enumerate(shas):
                if t.startswith(s):
                    return i
            # no commit starts with this prefix; keep the original implicit
            # behavior, but make it explicit
            return None
        else:  # anything else: match against the commit messages
            names = self.commits_name(file=file)
            for i, t in enumerate(names):
                if s in t:
                    return i
            else:
                dprint(s)  # no commit message contains this keyword
                raise ValueError
Beispiel #15
0
def demo_myspellchecker():
    """Demo of MySpellChecker's correction_detail API."""
    # initializing the class takes roughly 0.4s
    a = MySpellChecker()

    # loading/updating from sql takes roughly 1s
    # hsql = HistudySQL('ckz', 'tr_develop')
    # df = hsql.query('SELECT * FROM spell_check')
    # a.update_by_dataframe(df)

    # dprint(a.correction_detail('d'))
    # dprint(a.correction_detail('wrod'))  # wrod has many candidates, but word carries the largest weight
    # dprint(a.correction_detail('ckzckzckzckzckzckz'))  # wrod has many candidates, but word carries the largest weight
    # dprint(a.correction('ckzckzckzckzckzckz'))  # wrod has many candidates, but word carries the largest weight
    dprint(a.correction_detail('ike'))
    dprint(a.correction_detail('dean'))
    dprint(a.correction_detail('stud'))
    dprint(a.correction_detail('U'))
Beispiel #16
0
 def message(self):
     """Print some basic information about the pdf document."""
     dprint(fitz.version)  # version of the fitz module
     dprint(self.doc.pageCount)  # number of pdf pages
     dprint(self.doc._getXrefLength())  # total number of objects in the document
Beispiel #17
0
def demo_openpyxl():
    """Demo of basic openpyxl usage plus some extended cell helpers."""
    # I. create a new workbook
    from openpyxl import Workbook
    wb = Workbook()

    # grab a worksheet
    ws = wb.active  # wb['Sheet'] indexes by known name/position; excel itself is case-insensitive, but this index is case-sensitive

    # 1 two ways of indexing a cell, and .value to read its content
    ws['A2'] = '123'
    dprint(ws.cell(2, 1).value)  # 123

    # 2 merge cells
    ws.merge_cells('A1:C2')
    dprint(ws['A1'].value)  # None — merging clears the original A2 content

    # print(ws['A2'].value)  # AttributeError: 'MergedCell' object has no attribute 'value'

    # ws.unmerge_cells('A1:A3')  # ValueError: list.remove(x): x not in list — the full merged range must be named, otherwise it raises
    ws['A1'].value = '模块一'
    ws['A3'].value = '属性1'
    ws['B3'].value = '属性2'
    ws['C3'].value = '属性3'

    ws.merge_cells('D1:E2')
    ws['D1'].value = '模块二'
    ws['D3'].value = '属性1'
    ws['E3'].value = '属性2'

    # NOTE(review): offset/down/search/findcol below look like project
    # extensions on top of openpyxl's cell API — confirm against the project.
    dprint(ws['A1'].offset(1, 0).coordinate)  # A2
    dprint(ws['A1'].down().coordinate)  # A3

    # 3 set cell styles and formats
    from openpyxl.comments import Comment
    cell = ws['A3']
    cell.font = Font(name='Courier', size=36)
    cell.comment = Comment(text="A comment", author="Author's Name")

    styles = [[
        'Number formats', 'Comma', 'Comma [0]', 'Currency', 'Currency [0]',
        'Percent'
    ],
              [
                  'Informative', 'Calculation', 'Total', 'Note',
                  'Warning Text', 'Explanatory Text'
              ],
              [
                  'Text styles', 'Title', 'Headline 1', 'Headline 2',
                  'Headline 3', 'Headline 4', 'Hyperlink',
                  'Followed Hyperlink', 'Linked Cell'
              ],
              [
                  'Comparisons', 'Input', 'Output', 'Check Cell', 'Good',
                  'Bad', 'Neutral'
              ],
              [
                  'Highlights', 'Accent1', '20 % - Accent1', '40 % - Accent1',
                  '60 % - Accent1', 'Accent2', 'Accent3', 'Accent4', 'Accent5',
                  'Accent6', 'Pandas'
              ]]
    for i, name in enumerate(styles, start=4):
        ws.cell(i, 1, name[0])
        for j, v in enumerate(name[1:], start=2):
            ws.cell(i, j, v)
            ws.cell(i, j).style = v

    # II. try out some helper features
    dprint(ws.search('模块二').coordinate)  # D1
    dprint(ws.search(['模块二', '属性1']).coordinate)  # D3

    dprint(ws.findcol(['模块一', '属性1'], direction=1))  # 0

    wb.save("demo_openpyxl.xlsx")
Beispiel #18
0
def demo_spellchecker():
    """Demo of the spellchecker library.

    Official docs: pyspellchecker · PyPI: https://pypi.org/project/pyspellchecker/
    190909 Mon 15:58, from 陈坤泽
    """
    # 1 create the object
    # the language, case sensitivity and max edit distance are configurable;
    #   defaults: 'en' English, case-insensitive
    spell = SpellChecker()
    # for English, SpellChecker auto-loads site-packages\spellchecker\resources\en.json.gz — about 120k words, with frequency weights
    d = spell.word_frequency  # d is a WordFrequency object, backed by a Counter
    dprint(d.unique_words, d.total_words)  # number of words, total weight

    # 2 editing the frequency table spell.word_frequency
    dprint(d['ckz'])  # an unknown word simply yields 0
    d.add('ckz')  # adds one occurrence of the word 'ckz'
    d.load_words(['ckz', 'ckz', 'lyb'])  # words can also be added in bulk
    dprint(d['ckz'], d['lyb'])  # d['ckz']=3  d['lyb']=1
    d.load_words(['ckz'] * 100 + ['lyb'] * 500)  # a trick for adding large weights
    dprint(d['ckz'], d['lyb'])  # d['ckz']=103  d['lyb']=501

    # likewise, removal comes as both remove and remove_words
    d.remove('ckz')
    # d.remove_words(['ckz', 'lyb'])  # note: removing a key that no longer exists ('ckz') raises KeyError
    dprint(d['ckz'], d['lyb'])  # d['ckz']=0  d['lyb']=501
    # remove deletes the word entirely; to merely lower a weight, touch the underlying _dictionary
    d._dictionary['lyb'] -= 100  # of course, poking underscore members is not really recommended~~
    dprint(d['lyb'])  # ['lyb']=401

    # words whose frequency does not exceed a threshold can also be dropped
    d.remove_by_threshold(5)

    # 3 spell's core features
    # (1) unknown finds probable misspellings; correction then gives the best suggestion
    misspelled = spell.unknown(['something', 'is', 'hapenning', 'here'])
    dprint(misspelled)  # misspelled<set>={'hapenning'}

    for word in misspelled:
        # Get the one `most likely` answer
        dprint(spell.correction(word))  # <str>='happening'
        # Get a list of `likely` options
        dprint(spell.candidates(
            word))  # <set>={'henning', 'happening', 'penning'}

    # note spell is case-insensitive by default: with 'ckz' stored 100 times,
    #   any case combination such as 'CKZ' is returned unchanged
    #   e.g. spell.correction('ckZ') => 'ckZ'

    # (2) modifying spell.word_frequency changes what correction computes
    dprint(d['henning'], d['happening'], d['penning'])
    # d['henning']<int>=53    d['happening']<int>=4538    d['penning']<int>=23
    d._dictionary['henning'] += 10000
    dprint(spell.correction('hapenning'))  # <str>='henning'

    # (3) a word's share of the whole dictionary's weight
    dprint(spell.word_probability('henning'))  # <float>=0.0001040741914298211