def on_loop(project_id):
    """Run one analysis pass over the pending document tasks of a project.

    The task table is read with ``get_new_doc_task_db(project_id, 'docx')``
    and trimmed to its last ``config.n_for_project_in_loop`` rows.  Each row
    then goes through these stages, in order:

    1. ``fileUrl`` does not start with ``http``: step is set to 6.
    2. ``fileType`` is falsy or contains an entry of
       ``config.skip_file_types``: step is set to 5.
    3. The file is downloaded into ``config.root_dir``.
    4. Files above 300 * 1000 * 1000 bytes get step 4; others are passed to
       ``core.transform``.
    5. ``core.analysis`` extracts keywords, frequencies, phrases, new words,
       summaries and images; on failure step is set to 7.
    6. Extracted images are uploaded and stored as attachments.
    7. The file record is updated with the extracted fields.
    8. The file is linked to tags named after its keywords; missing tags are
       created.
    9. Step is set to 2.

    A failure in stages 3, 4, 6, 7 or 8 is logged and the row is skipped
    WITHOUT changing its step, so the same row is picked up again by a later
    pass.

    NOTE(review): this file appears to define ``on_loop`` a second time
    further down; if both are module-level, the later definition rebinds the
    name and this one becomes unreachable - confirm which is intended.

    Args:
        project_id: Project identifier, forwarded as ``projid`` to the
            data-access helpers.

    Returns:
        None.
    """
    docdata = get_new_doc_task_db(project_id, 'docx')
    if len(docdata) == 0:
        return

    docdata = docdata.tail(config.n_for_project_in_loop)
    # Lower-case the first character of every column name so the rest of the
    # function can use keys such as 'fileUrl' and 'fileId'.
    docdata.columns = [s[0].lower() + s[1:] for s in docdata.columns]
    docdata = docdata.dropna(subset=['fileUrl', 'step']).reset_index()

    basepath = config.root_dir
    imgdir = os.path.join(config.root_dir, 'images')

    for _, dt in docdata.iterrows():
        # Replace the timestamp by its string form before the row is sent
        # anywhere through dt.to_dict().
        dt['createTime'] = str(dt['createTime'].asm8)
        print(datetime.now())
        info_log_obj = {'id': dt['fileId'], 'name': dt['name']}

        # No downloadable file behind this task.
        if not dt['fileUrl'].startswith('http'):
            dt['step'] = 6
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            analysis_log('无文件', info_log_obj)
            continue

        # Some file types are not analysed.  An empty fileType only triggers
        # the skip when skip_file_types has at least one entry, exactly as
        # before.
        file_type = dt['fileType']
        if any(not file_type or tp in file_type
               for tp in config.skip_file_types):
            dt['step'] = 5
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            info_log_obj['type'] = file_type
            analysis_log('跳过类型', info_log_obj)
            continue

        # Download the file into the local working directory.
        try:
            curpath = os.path.join(basepath, dt['name'])
            download_doc(dt['fileUrl'], curpath)
        except Exception as e:
            # 'Exception' rather than a bare except, so Ctrl-C / SystemExit
            # still stop the worker.
            analysis_log('下载文件', info_log_obj)
            print(e)
            continue

        # Convert the file.
        try:
            # Very large files are not analysed at all.
            if os.path.getsize(curpath) > 300 * 1000 * 1000:
                analysis_log('文件过大', info_log_obj)
                dt['step'] = 4
                change_step(dt['id'], dt.to_dict(), projid=project_id)
                analysis_log('完成', info_log_obj)
                continue

            extname = os.path.splitext(dt['name'])[1]
            core.transform(curpath, basepath, extname)
        except Exception as e:
            analysis_log('转换文件', info_log_obj)
            print(e)
            continue

        # Analyse the file into fields.
        try:
            kwords, kwfreq, pharr, nwarr, sumarr, attaimges, *_ = \
                core.analysis(curpath, extname, imgdir=imgdir,
                              do_drawings=True)
            real_kwords = [kw for kw in kwords.split(',') if is_real_kw(kw)]
            # Keywords after the first five are stored on the file record.
            low_kw = real_kwords[5:]
        except Exception as e:
            dt['step'] = 7
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            analysis_log('分析成字段', info_log_obj)
            print(e)
            continue

        # Image attachments.
        try:
            # Upload to object storage.
            upload_result = core.upload_images(attaimges)
            # Write one attachment row per uploaded image.
            for atta in upload_result:
                atta_obj = {
                    "name": atta['name'],
                    "remark": "",
                    "keyword": "",
                    "abstract": utils.remove_blank(atta['abstract']),
                    "url": atta['url'],
                    "fileSize": atta['fileSize'],
                    "fileType": atta['fileType'],
                    "newWords": "",
                    "wordFrequency": "",
                    "phrases": "",
                    "linkType": "文件关联图片",
                    "fileId": dt['fileId'],
                }
                add_attachment(atta_obj, projid=project_id)
        except Exception as e:
            print(e)
            analysis_log('图片附件', info_log_obj)
            continue

        # Write the extracted fields to the file table.
        try:
            doc_record = get_docs_byid(dt['fileId'], projid=project_id)

            # Keep at most the three longest accepted summaries.
            real_summary = [su for su in sumarr if is_real_summary(su)]
            summarylimit = 3
            if len(real_summary) > summarylimit:
                real_summary = sorted(
                    real_summary, key=len, reverse=True)[:summarylimit]

            nwlimit = 900
            nwarr = utils.remove_blank(nwarr)
            if len(nwarr) > nwlimit:
                nwarr = nwarr[:nwlimit]

            doc_record.update({
                "keyWord": ','.join(low_kw),
                "abstract": ','.join(real_summary),
                "newWords": nwarr,
                "wordFrequency": kwfreq,
                "phrases": pharr,
            })
            fill_docinfo(doc_record['id'], doc_record, projid=project_id)
        except Exception as e:
            analysis_log('文件表填入', info_log_obj)
            print(e)
            continue

        # Create missing tags and link them to the file.
        try:
            if not real_kwords:
                analysis_log('无内容', info_log_obj)
            else:
                alltags = get_doctag(projid=project_id)
                curtags = real_kwords[:config.web_keywords_num]

                # Case-insensitive name -> id index, built once per file.
                # setdefault keeps the FIRST tag when names differ only by
                # case, matching the former linear scan.
                tag_ids = {}
                for t in alltags:
                    tag_ids.setdefault(str(t['name']).upper(), t['id'])

                # NOTE(review): a tag created here is not added to the index,
                # so two keywords differing only by case each create a tag -
                # unchanged from the previous behaviour; confirm intent.
                dtrels = []
                for curtag in curtags:
                    key = str(curtag).upper()
                    if key in tag_ids:
                        tagid = tag_ids[key]
                    else:
                        tagid = create_doctag(curtag, projid=project_id)
                    dtrels.append((dt['fileId'], tagid))

                # Write the file <-> tag relations.
                create_doctagrel(dtrels, projid=project_id)
        except Exception as e:
            analysis_log('标签', info_log_obj)
            print(e)
            continue

        # Mark the task as finished.  Reaching this point implies the file
        # table write succeeded (its failure path continues above).
        dt['step'] = 2
        change_step(dt['id'], dt.to_dict(), projid=project_id)

        # TODO: the locally downloaded file is not deleted.
        analysis_log('完成', info_log_obj)

    print('end proj')
def on_loop(project_id, basepath=r'E:\file-local-analysis'):
    """Run one analysis pass over locally stored documents of a project.

    Tasks come from ``get_documenttask(projid=project_id)``; only rows with
    ``step == 1`` are kept, trimmed to the last
    ``config.n_for_project_in_loop``.  ``fileUrl`` is used directly as a
    filesystem path (nothing is downloaded).  For each row:

    1. Files above 100 * 1000 * 1000 bytes are marked step 2 and skipped.
       This check now runs BEFORE conversion/copying so oversized files are
       no longer staged only to be discarded.
    2. ``.doc`` / ``.ppt`` files are converted into ``basepath`` and
       ``.dwg`` / ``.rar`` / ``.zip`` files are copied there; other
       extensions are analysed in place.
    3. ``core.analysis`` extracts keywords, frequencies, phrases, new words
       and summaries; on failure the row is still marked step 2 so it is not
       retried on every pass.
    4. The file record is updated with the extracted fields.
    5. The file is linked to tags named after its keywords; missing tags are
       created.
    6. Step is set to 2.

    A failure in stages 2, 4 or 5 is logged and the row is skipped without
    changing its step.

    NOTE(review): this file appears to contain an earlier definition that is
    also named ``on_loop``; if both are module-level, this later one rebinds
    the name - confirm which is intended.

    Args:
        project_id: Project identifier, forwarded as ``projid`` to the
            data-access helpers.
        basepath: Local staging directory for converted/copied files.
            Defaults to the previously hard-coded location.

    Returns:
        None.
    """
    docresponse = get_documenttask(projid=project_id)
    docdata = pd.DataFrame(docresponse)
    if len(docdata) == 0:
        return

    docdata = docdata[docdata['step'] == 1]
    docdata = docdata.tail(config.n_for_project_in_loop)
    docdata = docdata.dropna(subset=['fileUrl', 'step']).reset_index()

    for _, dt in docdata.iterrows():
        info_log_obj = {'id': dt['fileId'], 'name': dt['name']}
        print()
        analysis_log('开始', info_log_obj)

        # Locate and, where needed, stage the file.
        try:
            curpath = dt['fileUrl']
            extname = os.path.splitext(dt['name'])[1]

            # Very large files are skipped and marked done.
            if os.path.getsize(curpath) > 100 * 1000 * 1000:
                analysis_log('文件过大', info_log_obj)
                dt['step'] = 2
                change_step(dt['id'], dt.to_dict(), projid=project_id)
                continue

            staged_path = os.path.join(basepath, dt['name'])
            if extname == '.doc':
                transdoc.doc2docx(curpath, basepath, remove=False)
                curpath = staged_path
            elif extname == '.ppt':
                transppt.ppt2pptx(curpath, basepath, remove=False)
                curpath = staged_path
            elif extname in ('.dwg', '.rar', '.zip'):
                # dwg / archive files are moved locally; not needed for the
                # online analysis.
                shutil.copy(curpath, basepath)
                curpath = staged_path
        except Exception as e:
            analysis_log('下载和转换文件', info_log_obj)
            print(e)
            continue

        # Analyse the file into fields.
        try:
            kwords, kwfreq, pharr, nwarr, sumarr, *_ = core.analysis(
                curpath, extname, imgdir=None, do_drawings=True)
            real_kwords = [kw for kw in kwords.split(',') if is_real_kw(kw)]
            # Keywords after the first five are stored on the file record.
            low_kw = real_kwords[5:]
        except Exception as e:
            analysis_log('分析成字段', info_log_obj)
            print(e)
            # Mark the task done anyway so a permanently failing file is not
            # retried forever.
            dt['step'] = 2
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            continue

        # Write the extracted fields to the file table.
        try:
            doc_record = get_docs_byid(dt['fileId'], projid=project_id)

            # Keep at most the three longest accepted summaries.
            real_summary = [su for su in sumarr if is_real_summary(su)]
            summarylimit = 3
            if len(real_summary) > summarylimit:
                real_summary = sorted(
                    real_summary, key=len, reverse=True)[:summarylimit]

            nwlimit = 900
            nwarr = utils.remove_blank(nwarr)
            if len(nwarr) > nwlimit:
                nwarr = nwarr[:nwlimit]

            doc_record.update({
                "keyWord": ','.join(low_kw),
                "abstract": ','.join(real_summary),
                "newWords": nwarr,
                "wordFrequency": kwfreq,
                "phrases": pharr,
            })
            fill_docinfo(doc_record['id'], doc_record, projid=project_id)
        except Exception as e:
            analysis_log('文件表填入', info_log_obj)
            print(e)
            continue

        # Create missing tags and link them to the file.
        try:
            if not real_kwords:
                analysis_log('无内容', info_log_obj)
            else:
                alltags = get_doctag(projid=project_id)
                curtags = real_kwords[:config.web_keywords_num]

                # Case-insensitive name -> id index, built once per file.
                # setdefault keeps the FIRST tag when names differ only by
                # case, matching the former linear scan.
                tag_ids = {}
                for t in alltags:
                    tag_ids.setdefault(str(t['name']).upper(), t['id'])

                dtrels = []
                for curtag in curtags:
                    key = str(curtag).upper()
                    if key in tag_ids:
                        tagid = tag_ids[key]
                    else:
                        tagid = create_doctag(curtag, projid=project_id)
                    dtrels.append((dt['fileId'], tagid))

                # Write the file <-> tag relations.
                create_doctagrel(dtrels, projid=project_id)
        except Exception as e:
            # 'Exception' rather than a bare except, so Ctrl-C / SystemExit
            # still stop the worker.
            analysis_log('标签', info_log_obj)
            print(e)
            continue

        # Mark the task as finished.  Reaching this point implies the file
        # table write succeeded (its failure path continues above).
        dt['step'] = 2
        change_step(dt['id'], dt.to_dict(), projid=project_id)

        # TODO: locally staged files are not deleted.
        analysis_log('完成', info_log_obj)

    print('end proj')