Python analysis Exemples, core.analysis Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : Main.py Projet : qwerttqq95/376_Special

    def start_to_connect(self):
        """Serve the connected client socket until it closes or an error occurs.

        Polls ``self.tctimeClient`` with ``select``; every received frame is
        hex-encoded, formatted with ``Comm.makestr`` and handed to
        ``core.analysis``.  The first element of the analysis result selects
        the reaction:

        * ``0`` -- login/heartbeat: the reply in ``message[1]`` is sent back
          to the client and the UI is switched to the "online" state.
        * ``1`` / ``3`` -- the payload is forwarded through ``_signal_warm``
          together with that code.

        Returns:
            ``0`` when the push button already shows the online label (the
            loop is not started); ``None`` once the receive loop has ended.
        """
        buffsize = 2048
        if self.ui.pushButton.text() == '已上线':
            return 0
        try:
            print("Connection from :", self.add)
            self._signal_text.emit("Connection from :" + self.add[0])
            while True:
                try:
                    # A short blocking timeout instead of 0 keeps the loop
                    # from spinning at full CPU while the socket is idle;
                    # select still returns immediately when data arrives.
                    readable, _, exceptional = select.select(
                        [self.tctimeClient], [], [self.tctimeClient], 0.1)
                    if self.tctimeClient in readable:
                        raw = self.tctimeClient.recv(buffsize)
                        if not raw:
                            # recv() returning b'' means the peer closed the
                            # connection; without this check the socket stays
                            # "readable" forever and the loop never ends.
                            break
                        data = Comm.makestr(
                            binascii.b2a_hex(raw).decode('ascii'))
                        if data is not None and data != '':
                            print('Received message:', data)
                            self._signal_text.emit('Received message:' + data)
                            message = core.analysis(data.replace(' ', ''))
                            print('adssss', message)
                            if message is None:
                                continue
                            code = message[0]
                            if code == 0:
                                reply = Comm.makestr(message[1])
                                print('Send message:', reply)
                                self._signal_text.emit('Send message:' + reply)
                                self._signal_warm.emit((1, '登录/心跳'))
                                self.ui.pushButton.setText('已上线')
                                self.ui.menubar.setDisabled(0)
                                self.ui.lineEdit_2.setDisabled(1)
                                self.tctimeClient.send(
                                    binascii.a2b_hex(message[1]))
                            elif code == 1:
                                self._signal_warm.emit((1, message[1]))
                            elif code == 3:
                                self._signal_warm.emit((3, message[1]))
                    if self.tctimeClient in exceptional:
                        break
                except Exception:
                    # Best effort: record the traceback and stop serving.
                    # The context manager closes the log file handle, which
                    # the previous inline open() leaked on every failure.
                    with open('bug.txt', 'a+') as log_file:
                        traceback.print_exc(file=log_file)
                    break
        except Exception:
            self._signal_warm.emit((0, '端口被占用'))
            with open('bug.txt', 'a+') as log_file:
                traceback.print_exc(file=log_file)

Exemple #2

0

Afficher le fichier

def on_loop(project_id):
    """Run one analysis pass over the newest pending document tasks.

    Tasks come from ``get_new_doc_task_db(project_id, 'docx')``; only the
    last ``config.n_for_project_in_loop`` rows that have both ``fileUrl`` and
    ``step`` are processed.  Each task goes through these stages, and a
    failure in any stage logs the stage name and moves on to the next task:

    1. download the file into ``config.root_dir``
    2. size check and ``core.transform``
    3. ``core.analysis`` (keywords, frequencies, phrases, new words,
       summaries, embedded images)
    4. upload the images and register them as attachments
    5. write the extracted fields into the document record
    6. create / link tags for the leading keywords

    Step values written through ``change_step`` by this function:
    ``2`` finished, ``4`` file larger than 300 MB, ``5`` skipped file type,
    ``6`` ``fileUrl`` is not an http(s) URL, ``7`` analysis failed.

    Args:
        project_id: Identifier forwarded as ``projid`` to every data-access
            helper.

    Returns:
        None.
    """
    docdata = get_new_doc_task_db(project_id, 'docx')
    if len(docdata) == 0:
        return

    docdata = docdata.tail(config.n_for_project_in_loop)
    # Normalise column names so that the first character is lower case.
    docdata.columns = [s[0].lower() + s[1:] for s in docdata.columns]
    docdata = docdata.dropna(subset=['fileUrl', 'step']).reset_index()

    basepath = config.root_dir
    imgdir = os.path.join(config.root_dir, 'images')
    for _, dt in docdata.iterrows():
        # Stringify the timestamp before the row is passed on via to_dict().
        dt['createTime'] = str(dt['createTime'].asm8)
        print(datetime.now())
        info_log_obj = {'id': dt['fileId'], 'name': dt['name']}

        if not dt['fileUrl'].startswith('http'):
            dt['step'] = 6
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            analysis_log('无文件', info_log_obj)
            continue

        # Some file types are deliberately not analysed; a task without a
        # file type is skipped too (as long as any skip type is configured).
        if any(not dt['fileType'] or tp in dt['fileType']
               for tp in config.skip_file_types):
            dt['step'] = 5
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            info_log_obj['type'] = dt['fileType']
            analysis_log('跳过类型', info_log_obj)
            continue

        # Download the file into the local working folder.
        try:
            curpath = os.path.join(basepath, dt['name'])
            download_doc(dt['fileUrl'], curpath)
        except Exception as e:
            analysis_log('下载文件', info_log_obj)
            print(e)
            continue

        # Convert the file.
        try:
            # Very large files are marked and not analysed.
            if os.path.getsize(curpath) > 300 * 1000 * 1000:
                analysis_log('文件过大', info_log_obj)
                dt['step'] = 4
                change_step(dt['id'], dt.to_dict(), projid=project_id)
                analysis_log('完成', info_log_obj)
                continue

            extname = os.path.splitext(dt['name'])[1]
            core.transform(curpath, basepath, extname)
        except Exception as e:
            analysis_log('转换文件', info_log_obj)
            print(e)
            continue

        # Analyse the document into fields.
        try:
            kwords, kwfreq, pharr, nwarr, sumarr, attaimges, *_ = core.analysis(
                curpath, extname, imgdir=imgdir, do_drawings=True)

            real_kwords = [kw for kw in kwords.split(',') if is_real_kw(kw)]
            # Keywords after the first five go into the record's keyWord
            # field; the leading ones become tags further down.
            low_kw = real_kwords[5:]
        except Exception as e:
            dt['step'] = 7
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            analysis_log('分析成字段', info_log_obj)
            print(e)
            continue

        # Image attachments.
        try:
            # Upload to object storage.
            upload_result = core.upload_images(attaimges)

            # Write one attachment row per uploaded image.
            for atta in upload_result:
                atta_obj = {
                    "name": atta['name'],
                    "remark": "",
                    "keyword": "",
                    "abstract": utils.remove_blank(atta['abstract']),
                    "url": atta['url'],
                    "fileSize": atta['fileSize'],
                    "fileType": atta['fileType'],
                    "newWords": "",
                    "wordFrequency": "",
                    "phrases": "",
                    "linkType": "文件关联图片",
                    "fileId": dt['fileId']
                }
                add_attachment(atta_obj, projid=project_id)
        except Exception as e:
            print(e)
            analysis_log('图片附件', info_log_obj)
            continue

        # Write the extracted fields into the file table.
        file_table_write_success = False
        try:
            doc_record = get_docs_byid(dt['fileId'], projid=project_id)

            # Keep at most the three longest usable summaries.
            real_summary = [su for su in sumarr if is_real_summary(su)]
            summarylimit = 3
            if len(real_summary) > summarylimit:
                real_summary = sorted(real_summary,
                                      key=len,
                                      reverse=True)[:summarylimit]

            nwlimit = 900
            nwarr = utils.remove_blank(nwarr)[:nwlimit]
            updated = {
                "keyWord": ','.join(low_kw),
                "abstract": ','.join(real_summary),
                "newWords": nwarr,
                "wordFrequency": kwfreq,
                "phrases": pharr
            }

            doc_record.update(updated)
            fill_docinfo(doc_record['id'], doc_record, projid=project_id)
            file_table_write_success = True
        except Exception as e:
            analysis_log('文件表填入', info_log_obj)
            print(e)
            continue

        # Create missing tags and link them to the file.
        try:
            if not real_kwords:
                analysis_log('无内容', info_log_obj)
            else:
                alltags = get_doctag(projid=project_id)
                dtrels = []
                for curtag in real_kwords[:config.web_keywords_num]:
                    wanted = str(curtag).upper()
                    # Case-insensitive lookup; the first matching tag wins.
                    existing = next(
                        (t for t in alltags
                         if str(t['name']).upper() == wanted),
                        None)
                    if existing is not None:
                        tagid = existing['id']
                    else:
                        tagid = create_doctag(curtag, projid=project_id)
                    dtrels.append((dt['fileId'], tagid))
                # Persist the file <-> tag relations.
                create_doctagrel(dtrels, projid=project_id)
        except Exception as e:
            analysis_log('标签', info_log_obj)
            print(e)
            continue

        # Mark the task as finished.
        if file_table_write_success:
            dt['step'] = 2
            change_step(dt['id'], dt.to_dict(), projid=project_id)

        # NOTE: the downloaded local file is not removed here.
        analysis_log('完成', info_log_obj)

    print('end proj')

Exemple #3

0

Afficher le fichier

            if not transformed:
                shutil.copy(fpath, filedir)

# Manual switch: the analysis pass below only runs when this is set to True.
reanalysis = False
if reanalysis:
    print('analysis')
    # Accumulators for the per-file analysis output.
    result = []
    imgresult = []
    # NOTE(review): nothing is appended to drawingresult in the lines visible
    # here -- the excerpt may be truncated; confirm against the full script.
    drawingresult = []
    # fname_arr, filedir, imgdir and username are defined outside this excerpt.
    for indx, fullname in enumerate(fname_arr):
        print(fullname)
        # Split "name.ext" into the bare name and the extension (with dot).
        ext_tuple = os.path.splitext(fullname)
        fname = ext_tuple[0]
        extname = ext_tuple[1]
        fpath = os.path.join(filedir, fullname)
        # core.analysis is unpacked into exactly seven values here.
        kwords, kwfreq, pharr, nwarr, sumarr, curimg, curdrawing = core.analysis(
            fpath, extname, imgdir)
        # Synthetic record id: position in fname_arr offset by 100.
        fid = indx + 100
        result.append({
            'id': fid,
            'fname': fname,
            'extname': extname,
            'username': username,
            'keywords': kwords,
            'kwfreq': kwfreq,
            'phrase': pharr,
            'newwords': nwarr,
            'summary': sumarr
        })
        imgresult += curimg
        # Tag every drawing entry with the id of the file it came from.
        for d in curdrawing:
            d['drawing_id'] = fid

Exemple #4

0

Afficher le fichier

Fichier : analysislocal.py Projet : pengyang486868/PY-read-Document

def on_loop(project_id, basepath=r'E:\file-local-analysis'):
    """Run one analysis pass over locally stored documents of a project.

    Tasks come from ``get_documenttask``; only rows with ``step == 1`` are
    considered, limited to the last ``config.n_for_project_in_loop`` rows
    that have both ``fileUrl`` and ``step``.  In this variant ``fileUrl`` is
    used directly as a local file path.

    For every task the file is size-checked, converted or copied into
    ``basepath`` when its extension requires it, analysed with
    ``core.analysis``, written into the document record and linked to tags.
    A failure in any stage logs the stage name and moves on to the next task.

    Step value written through ``change_step``: ``2`` -- used when the task
    finished, when the file exceeds 100 MB, and when analysis failed (so the
    task is not picked up and failed again on every pass).

    Args:
        project_id: Identifier forwarded as ``projid`` to every data-access
            helper.
        basepath: Working directory that receives converted/copied files.
            Defaults to the previously hard-coded location.

    Returns:
        None.
    """
    docresponse = get_documenttask(projid=project_id)
    docdata = pd.DataFrame(docresponse)

    if len(docdata) == 0:
        return

    docdata = docdata[docdata['step'] == 1]
    docdata = docdata.tail(config.n_for_project_in_loop)
    docdata = docdata.dropna(subset=['fileUrl', 'step']).reset_index()

    for _, dt in docdata.iterrows():
        info_log_obj = {'id': dt['fileId'], 'name': dt['name']}
        print()
        analysis_log('开始', info_log_obj)

        try:
            curpath = dt['fileUrl']
            extname = os.path.splitext(dt['name'])[1]

            # Very large files are marked as done and not analysed.  The
            # check runs before any conversion/copy so that oversized files
            # are not duplicated into basepath just to be skipped.
            if os.path.getsize(dt['fileUrl']) > 100 * 1000 * 1000:
                analysis_log('文件过大', info_log_obj)
                dt['step'] = 2
                change_step(dt['id'], dt.to_dict(), projid=project_id)
                continue

            staged = True
            if extname == '.doc':
                transdoc.doc2docx(curpath, basepath, remove=False)
            elif extname == '.ppt':
                transppt.ppt2pptx(curpath, basepath, remove=False)
            elif extname in ('.dwg', '.rar', '.zip'):
                # dwg / archives are copied locally; the online analysis
                # variant does not need this.
                shutil.copy(curpath, basepath)
            else:
                staged = False
            if staged:
                # Continue with the copy that now lives in basepath.
                curpath = os.path.join(basepath, dt['name'])
        except Exception as e:
            analysis_log('下载和转换文件', info_log_obj)
            print(e)
            continue

        # Analyse the document into fields.
        try:
            kwords, kwfreq, pharr, nwarr, sumarr, *_ = core.analysis(
                curpath, extname, imgdir=None, do_drawings=True)

            real_kwords = [kw for kw in kwords.split(',') if is_real_kw(kw)]
            # Keywords after the first five go into the record's keyWord
            # field; the leading ones become tags further down.
            low_kw = real_kwords[5:]
        except Exception as e:
            analysis_log('分析成字段', info_log_obj)
            print(e)

            # Mark the task as done so it does not fail again on every pass.
            dt['step'] = 2
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            continue

        # Write the extracted fields into the file table.
        file_table_write_success = False
        try:
            doc_record = get_docs_byid(dt['fileId'], projid=project_id)

            # Keep at most the three longest usable summaries.
            real_summary = [su for su in sumarr if is_real_summary(su)]
            summarylimit = 3
            if len(real_summary) > summarylimit:
                real_summary = sorted(real_summary,
                                      key=len,
                                      reverse=True)[:summarylimit]

            nwlimit = 900
            nwarr = utils.remove_blank(nwarr)[:nwlimit]
            updated = {
                "keyWord": ','.join(low_kw),
                "abstract": ','.join(real_summary),
                "newWords": nwarr,
                "wordFrequency": kwfreq,
                "phrases": pharr
            }

            doc_record.update(updated)
            fill_docinfo(doc_record['id'], doc_record, projid=project_id)
            file_table_write_success = True
        except Exception as e:
            analysis_log('文件表填入', info_log_obj)
            print(e)
            continue

        # Create missing tags and link them to the file.
        try:
            if not real_kwords:
                analysis_log('无内容', info_log_obj)
            else:
                alltags = get_doctag(projid=project_id)
                dtrels = []
                for curtag in real_kwords[:config.web_keywords_num]:
                    wanted = str(curtag).upper()
                    # Case-insensitive lookup; the first matching tag wins.
                    existing = next(
                        (t for t in alltags
                         if str(t['name']).upper() == wanted),
                        None)
                    if existing is not None:
                        tagid = existing['id']
                    else:
                        tagid = create_doctag(curtag, projid=project_id)
                    dtrels.append((dt['fileId'], tagid))
                # Persist the file <-> tag relations.
                create_doctagrel(dtrels, projid=project_id)
        except Exception as e:
            analysis_log('标签', info_log_obj)
            print(e)
            continue

        # Mark the task as finished.
        if file_table_write_success:
            dt['step'] = 2
            change_step(dt['id'], dt.to_dict(), projid=project_id)

        # NOTE: files staged into basepath are not removed here.
        analysis_log('完成', info_log_obj)

    print('end proj')