def read_number_curr_result(doc_id, page_no):
    """Return the 'data' entry of a page's number_curr_result record.

    The record is read from ``<opath>/<doc_id>/<dir>/<page_no>.sh`` where
    ``<dir>`` is the 'number_curr_result' value of the 'PageAnalysis' config
    section.  An empty dict is returned when the record has no 'data' key.
    """
    result_dir = cfgObj.get_config('PageAnalysis', 'number_curr_result')
    page_file = '%s.sh' % page_no
    record_path = os.path.join(opath, str(doc_id), result_dir, page_file)
    record = datastore.read_data_fname(record_path, isdb, isenc)
    return record.get('data', {})
def get_fp_curr_result(doc_id, page_no):
    """Return the 'data' entry of a page's fp_curr_result record.

    The directory name comes from the 'fp_curr_result' value of the
    'PageAnalysis' config section; the file is ``<page_no>.sh`` under
    ``<opath>/<doc_id>/``.  Defaults to an empty dict if 'data' is absent.
    """
    fp_dir = cfgObj.get_config('PageAnalysis', 'fp_curr_result')
    page_file = '%s.sh' % page_no
    record_path = os.path.join(opath, str(doc_id), fp_dir, page_file)
    stored = datastore.read_data_fname(record_path, isdb, isenc)
    return stored.get('data', {})
def get_metadata_dict(docid):
    """Return the 'data' entry of the document-level entity output record.

    Reads ``<opath>/<docid>/<dir>/<docid>.sh`` where ``<dir>`` is the
    'EntityOutput_odir' value of the 'ExtractEntity' config section.
    Defaults to an empty dict if the record has no 'data' key.
    """
    entity_dir = cfgObj.get_config('ExtractEntity', 'EntityOutput_odir')
    doc_file = '%s.sh' % docid
    record_path = os.path.join(opath, str(docid), entity_dir, doc_file)
    stored = datastore.read_data_fname(record_path, isdb, isenc)
    return stored.get('data', {})
def get_number_curr_result(self, doc_id, page_no):
    """Return the 'data' entry of a page's number_curr_result record.

    Instance variant: the root directory and the two flags passed to
    ``datastore.read_data_fname`` are taken from ``self.opath``,
    ``self.isdb`` and ``self.isenc``.  Defaults to an empty dict when the
    record has no 'data' key.
    """
    result_dir = cfgObj.get_config('PageAnalysis', 'number_curr_result')
    page_file = '%s.sh' % page_no
    record_path = os.path.join(self.opath, str(doc_id), result_dir, page_file)
    stored = datastore.read_data_fname(record_path, self.isdb, self.isenc)
    return stored.get('data', {})
def get_basic_hrvr_grps(doc_id, page_no):
    """Return ``(hrvr_bboxs, hrvr_cells, font)`` from a page's HRVR record.

    The record is read from ``<opath>/<doc_id>/MDB/<page_no>_HRVR.sh``.
    Each element of the returned tuple defaults to an empty dict when its
    key is missing from the record.
    """
    hrvr_file = '%s_HRVR.sh' % page_no
    hrvr_path = os.path.join(opath, str(doc_id), "MDB", hrvr_file)
    # The third argument is the literal 0 here rather than the module-level
    # ``isenc`` flag.
    stored = datastore.read_data_fname(hrvr_path, isdb, 0)
    return (
        stored.get('hrvr_bboxs', {}),
        stored.get('hrvr_cells', {}),
        stored.get('font', {}),
    )
def read_projected_rm_linll(doc_id, page_no, level):
    """Return the 'data' entry of the projected-rm-linll record for a page/level.

    The record is read from ``<opath>/<doc_id>/<dir>/<page_no>_<level>.sh``
    where ``<dir>`` is the 'projectedrmlnill_odir' value of the 'PageAnalysis'
    config section.  Defaults to an empty dict when 'data' is absent.

    Side effect: ``datastore.make_dirs`` is called on the containing
    directory before the read (kept from the original implementation).
    """
    prm_odir = cfgObj.get_config('PageAnalysis', 'projectedrmlnill_odir')
    # The trailing '' makes the joined path end with a separator.
    fpath = os.path.join(opath, str(doc_id), prm_odir, '')
    datastore.make_dirs(fpath)
    filename = '%s_%s.sh' % (str(page_no), level)
    fname = os.path.join(fpath, filename)
    # The original passed ``d`` as the fourth argument, a name never assigned
    # in this function (NameError unless a module-level ``d`` happens to
    # exist).  An empty dict is used instead, matching the fourth argument
    # get_media_box passes to the same reader.
    data = datastore.read_data_fname(fname, isdb, isenc, {})
    return data.get('data', {})
def get_font_rm(doc_id, page_no):
    """Return the 'data' entry of ``<opath>/<doc_id>/MDB/<page_no>.sh``.

    Defaults to an empty dict when the record has no 'data' key.
    """
    page_file = '%s.sh' % page_no
    record_path = os.path.join(opath, str(doc_id), "MDB", page_file)
    # Third argument is hard-coded to 0 instead of the module-level ``isenc``.
    stored = datastore.read_data_fname(record_path, isdb, 0)
    return stored.get('data', {})
def get_base_igs(doc_id, page_no):
    """Return the 'data' entry of ``<opath>/<doc_id>/MDD/<page_no>_TOK_HLPN.sh``.

    Defaults to an empty dict when the record has no 'data' key.
    """
    tok_file = '%s_TOK_HLPN.sh' % page_no
    record_path = os.path.join(opath, str(doc_id), "MDD", tok_file)
    # Third argument is hard-coded to 0 instead of the module-level ``isenc``.
    stored = datastore.read_data_fname(record_path, isdb, 0)
    return stored.get('data', {})
def get_font_dict(self, doc_id, page_no):
    """Return the 'font_dict' entry of a page's font-dict record.

    The directory name is the 'fontdict' value of the 'MOD_DIRNAME' config
    section, rooted at ``self.opath``.  When the file does not exist a
    message (with the path) is written to stderr and an empty dict is
    returned; an empty dict is also the default if 'font_dict' is absent.
    """
    font_dir = cfgObj.get_config('MOD_DIRNAME', 'fontdict')
    page_file = '%s.sh' % page_no
    font_path = os.path.join(self.opath, str(doc_id), font_dir, page_file)
    if os.path.exists(font_path):
        stored = datastore.read_data_fname(font_path, self.isdb, self.isenc)
        return stored.get('font_dict', {})
    print >> sys.stderr, 'Font dict not found! ', font_path
    return {}
def get_num_grid(doc_id, page_no):
    """Return the 'data' entry of a page's num-grid record.

    The directory name is the 'numgrid' value of the 'MOD_DIRNAME' config
    section.  When the file does not exist a message (with the path) is
    written to stderr and an empty dict is returned; an empty dict is also
    the default if 'data' is absent.
    """
    grid_dir = cfgObj.get_config('MOD_DIRNAME', 'numgrid')
    page_file = '%s.sh' % page_no
    grid_path = os.path.join(opath, str(doc_id), grid_dir, page_file)
    if os.path.exists(grid_path):
        stored = datastore.read_data_fname(grid_path, isdb, isenc)
        return stored.get('data', {})
    print >> sys.stderr, 'NUM GRID dict not found! ', grid_path
    return {}
def get_hrvr_grps2(doc_id, page_no):
    """Return ``(bbox_unordered, cell_unordered, font)`` from a page's HRVR record.

    The record is read from ``<opath>/<doc_id>/MDB/<page_no>_HRVR.sh``.
    Each element of the returned tuple defaults to an empty dict when its
    key is missing from the record.
    """
    hrvr_file = '%s_HRVR.sh' % page_no
    hrvr_path = os.path.join(opath, str(doc_id), "MDB", hrvr_file)
    # Third argument is hard-coded to 0 instead of the module-level ``isenc``.
    stored = datastore.read_data_fname(hrvr_path, isdb, 0)
    return (
        stored.get('bbox_unordered', {}),
        stored.get('cell_unordered', {}),
        stored.get('font', {}),
    )
def is_cell_info_dict_exists(doc_id, page_no):
    """Return 1 if the page's record holds a non-empty 'cell_info_dict', else 0.

    0 is returned both when the file is missing and when the stored
    'cell_info_dict' entry is absent or falsy.
    """
    ci_dir = cfgObj.get_config('PageAnalysis', 'cell_info_dict_odir')
    page_file = '%s.sh' % page_no
    record_path = os.path.join(opath, str(doc_id), ci_dir, page_file)
    if not os.path.exists(record_path):
        return 0
    stored = datastore.read_data_fname(record_path, isdb, isenc)
    return 1 if stored.get('cell_info_dict', {}) else 0
def get_tok_indexing(doc_id, page_no, inkey, level): fname = '%s_%s_%s.sh' % (page_no, inkey, level) #shname = os.path.join(ipath, str(doc_id), ci_odir, fname) shname = os.path.join(opath, str(doc_id), "MDD", fname) print shname #print isdb, isenc #d = datastore.read_data_fname(shname, isdb, isenc) d = datastore.read_data_fname(shname, isdb, 0) if type(d) == type({}): cellinfodict = d.get('data', {}) return cellinfodict return {}
def get_visual_group_dict(doc_id, page_no):
    """Return the 'vis_dict' entry of a page's visual-group record.

    The directory name is the 'visdict' value of the 'MOD_DIRNAME' config
    section.  When the file does not exist a message is written to stderr
    and an empty dict is returned; an empty dict is also the default if
    'vis_dict' is absent.
    """
    vis_dir = cfgObj.get_config('MOD_DIRNAME', 'visdict')
    page_file = '%s.sh' % page_no
    vis_path = os.path.join(opath, str(doc_id), vis_dir, page_file)
    if os.path.exists(vis_path):
        stored = datastore.read_data_fname(vis_path, isdb, isenc)
        return stored.get('vis_dict', {})
    print >> sys.stderr, 'Visual Group dict not found! '
    return {}
def get_relation_dict(doc_id, page_no):
    """Return the 'data' entry of a page's relation-results record.

    The directory name is the 'RelationResults_odir' value of the
    'PageAnalysis' config section.  When the file does not exist a message
    is written to stderr and an empty dict is returned; an empty dict is
    also the default if 'data' is absent.
    """
    relation_dir = cfgObj.get_config('PageAnalysis', 'RelationResults_odir')
    page_file = '%s.sh' % str(page_no)
    relation_path = os.path.join(opath, str(doc_id), relation_dir, page_file)
    if os.path.exists(relation_path):
        stored = datastore.read_data_fname(relation_path, isdb, isenc)
        return stored.get('data', {})
    print >> sys.stderr, 'relation dict not found! '
    return {}
def get_num_behave_shelve(doc_id, page_no):
    """Return the 'data' entry of a page's num-behave record.

    The directory name is the 'num_behave_odir' value of the 'PageAnalysis'
    config section.  When the file does not exist a message is written to
    stderr and an empty *list* is returned; when the file exists but has no
    'data' key the default is an empty *dict*.
    NOTE(review): the two fallbacks differ in type; confirm against callers
    before unifying them.
    """
    behave_dir = cfgObj.get_config('PageAnalysis', 'num_behave_odir')
    page_file = '%s.sh' % str(page_no)
    behave_path = os.path.join(opath, str(doc_id), behave_dir, page_file)
    if os.path.exists(behave_path):
        stored = datastore.read_data_fname(behave_path, isdb, isenc)
        return stored.get('data', {})
    print >> sys.stderr, 'Number Behavior Shelve not found! '
    return []
def get_visual_group_proj_dict(doc_id, page_no):
    """Return the 'vis_proj_dict' entry of a page's projected visual-group record.

    The directory name is the 'visprojdict' value of the 'MOD_DIRNAME'
    config section, and the path is rooted at ``ipath`` (not ``opath``).
    A missing file yields an empty dict without any message; an empty dict
    is also the default if 'vis_proj_dict' is absent.
    """
    proj_dir = cfgObj.get_config('MOD_DIRNAME', 'visprojdict')
    page_file = '%s.sh' % page_no
    proj_path = os.path.join(ipath, str(doc_id), proj_dir, page_file)
    if os.path.exists(proj_path):
        stored = datastore.read_data_fname(proj_path, isdb, isenc)
        return stored.get('vis_proj_dict', {})
    return {}
def get_nonG_shelve(doc_id):
    """Return the 'nong_data' entry of the document-level NonG record.

    Reads ``<opath>/<doc_id>/<dir>/<doc_id>.sh`` where ``<dir>`` is the
    'TAS_Topic_Mapped_NonG' value of the 'applicator' config section.
    When the file does not exist a message is written to stderr and an empty
    *list* is returned; when the file exists but has no 'nong_data' key the
    default is an empty *dict*.
    NOTE(review): the two fallbacks differ in type; left unchanged because
    callers are not visible here.
    """
    drm_odir = cfgObj.get_config('applicator', 'TAS_Topic_Mapped_NonG')
    fname = '%s.sh' % (str(doc_id))
    rm_path = os.path.join(opath, str(doc_id), drm_odir, fname)
    if not os.path.exists(rm_path):
        # The original message said 'Number Behavior Shelve not found! ',
        # copied from get_num_behave_shelve; it now names the right record.
        print >> sys.stderr, 'NonG Shelve not found! '
        return []
    dd = datastore.read_data_fname(rm_path, isdb, isenc)
    return dd.get('nong_data', {})
def get_semantic_ph(doc_id):
    """Return the 'data' entry of the document-level semantic PH record.

    Reads ``<opath>/<doc_id>/<dir>/<doc_id>.sh`` where ``<dir>`` is the
    'oldPH_odir' value of the 'SemanticModule' config section.  When the
    file does not exist a message is written to stderr and an empty dict is
    returned; an empty dict is also the default if 'data' is absent.
    """
    ci_odir = cfgObj.get_config('SemanticModule', 'oldPH_odir')
    fname = '%s.sh' % (doc_id)
    shname = os.path.join(opath, str(doc_id), ci_odir, fname)
    if not os.path.exists(shname):
        # The original message said 'Visual Group projected dict not found! ',
        # copied from another reader; it now names the right record.
        print >> sys.stderr, 'Semantic PH dict not found! '
        return {}
    shv = datastore.read_data_fname(shname, isdb, isenc)
    return shv.get('data', {})
def get_proj_rm(doc_id, page_no, level): drm_odir = cfgObj.get_config('PageAnalysis', 'projectedrm_odir') fname = '%s_%s.sh' % (str(page_no), level) #rm_path = os.path.join(ipath, str(doc_id), drm_odir, fname) rm_path = os.path.join(opath, str(doc_id), drm_odir, fname) print 'ppp', rm_path if not os.path.exists(rm_path): print >> sys.stderr, 'Projected RM not found! ' return [] dd = datastore.read_data_fname(rm_path, isdb, 0) data_dict = dd.get('data', {}) return data_dict
def get_cell_info_dict_1(self, doc_id, page_no): ci_odir = cfgObj.get_config('PageAnalysis', 'cell_info_dict_odir') fname = '%s_HRA.sh' % (page_no) #shname = os.path.join(ipath, str(doc_id), ci_odir, fname) shname = os.path.join(self.opath, str(doc_id), ci_odir, fname) print 'sssss : ', shname if not os.path.exists(shname): # print >> sys.stderr, 'Cell INFO dict not found! ' return {} shv = datastore.read_data_fname(shname, self.isdb, self.isenc) cell_info_dict = shv.get('cell_info_dict', {}) return cell_info_dict
def get_synthe_dict(doc_id, page_no, level):
    """Return the 'data' entry of the cover-page-synthesizer record for a page/level.

    The directory name is the 'coverpagesynthesizer_odir' value of the
    'PageAnalysis' config section; the file is ``<page_no>_<level>.sh``.
    When the file does not exist a message is written to stderr and an empty
    dict is returned; an empty dict is also the default if 'data' is absent.
    """
    slt_odir = cfgObj.get_config('PageAnalysis', 'coverpagesynthesizer_odir')
    fname = '%s_%s.sh' % (page_no, level)
    CellDictpath = os.path.join(opath, str(doc_id), slt_odir, fname)
    if not os.path.exists(CellDictpath):
        # The original message said 'SLT not found! ', copied from
        # get_slt_dict; it now names the record this function reads.
        print >> sys.stderr, 'Synthe dict not found! '
        return {}
    # Third argument is the literal 0, not the module-level ``isenc``.
    dd = datastore.read_data_fname(CellDictpath, isdb, 0)
    return dd.get('data', {})
def get_cell_info_dict(doc_id, page_no): ci_odir = "CID" #cfgObj.get_config('PageAnalysis', 'cell_info_dict_odir') fname = '%s.sh' % (page_no) #print fname #shname = os.path.join(ipath, str(doc_id), ci_odir, fname) shname = os.path.join(opath, str(doc_id), ci_odir, fname) if not os.path.exists(shname): print >> sys.stderr, 'Cell INFO dict not found! ' return {} print shname, isdb, isenc shv = datastore.read_data_fname(shname, isdb, isenc) cell_info_dict = shv.get('cell_info_dict', {}) return cell_info_dict
def get_cell_info_dict_level(doc_id, page_no, level):
    """Return the 'cell_info_dict' entry of a page/level cell-info record.

    The directory name is the 'cell_info_dict_odir' value of the
    'PageAnalysis' config section; the file is ``<page_no>_<level>.sh``.
    When the file does not exist a message (with the path) is written to
    stderr and an empty dict is returned; an empty dict is also the default
    if 'cell_info_dict' is absent.
    """
    ci_dir = cfgObj.get_config('PageAnalysis', 'cell_info_dict_odir')
    level_file = '%s_%s.sh' % (page_no, level)
    level_path = os.path.join(opath, str(doc_id), ci_dir, level_file)
    if os.path.exists(level_path):
        # Third argument is the literal 0, not the module-level ``isenc``.
        stored = datastore.read_data_fname(level_path, isdb, 0)
        return stored.get('cell_info_dict', {})
    print >> sys.stderr, 'Cell INFO dict not found! ', level_path
    return {}
def get_fc_grps(doc_id, page_no):
    """Return the fc / sfc cell and signature lists from a page's HRVR record.

    The record is read from ``<opath>/<doc_id>/MDB/<page_no>_HRVR.sh``.
    The result is the tuple ``(fc_cells, fc_sig_dict_list, sfc_cells,
    sfc_sig_dict_list)``; each element defaults to an empty list when its
    key is missing from the record.
    """
    hrvr_file = '%s_HRVR.sh' % page_no
    hrvr_path = os.path.join(opath, str(doc_id), "MDB", hrvr_file)
    # Third argument is hard-coded to 0 instead of the module-level ``isenc``.
    stored = datastore.read_data_fname(hrvr_path, isdb, 0)
    return (
        stored.get('fc_cells', []),
        stored.get('fc_sig_dict_list', []),
        stored.get('sfc_cells', []),
        stored.get('sfc_sig_dict_list', []),
    )
def get_slt_dict(doc_id, page_no, level):
    """Return the whole SLT record for a page/level.

    The directory name is the 'slt_data_odir' value of the 'PageAnalysis'
    config section; the file is ``<page_no>_<level>.sh``.  Unlike the other
    readers in this file, the object returned by
    ``datastore.read_data_fname`` is handed back as-is, without extracting a
    key.  When the file does not exist a message is written to stderr and
    an empty dict is returned.
    """
    slt_dir = cfgObj.get_config('PageAnalysis', 'slt_data_odir')
    level_file = '%s_%s.sh' % (page_no, level)
    slt_path = os.path.join(opath, str(doc_id), slt_dir, level_file)
    if os.path.exists(slt_path):
        return datastore.read_data_fname(slt_path, isdb, isenc)
    print >> sys.stderr, 'SLT not found! '
    return {}
def get_cell_dict(doc_id, page_no):
    """Return the 'cell_dict' entry of a page's cell-dict record.

    The directory name is the 'celldict' value of the 'MOD_DIRNAME' config
    section, and the path is rooted at ``ipath`` (not ``opath``).  When the
    file does not exist a message (with the path) is written to stderr and
    an empty dict is returned; an empty dict is also the default if
    'cell_dict' is absent.
    """
    cell_dir = cfgObj.get_config('MOD_DIRNAME', 'celldict')
    page_file = '%s.sh' % page_no
    cell_path = os.path.join(ipath, str(doc_id), cell_dir, page_file)
    if os.path.exists(cell_path):
        stored = datastore.read_data_fname(cell_path, isdb, isenc)
        return stored.get('cell_dict', {})
    print >> sys.stderr, 'Cell dict not found! ', cell_path
    return {}
def get_media_box(doc_id, pno):
    """Return the first page_master bbox as the string ``"x0_y0_w_h"``.

    The record is read from ``<ipath>/<doc_id>/db/<pno>/pdfdata.db`` with
    two extra reader arguments (``{}`` and ``'pdfdata'``).

    Returns:
        * ``[]`` (after a stderr message) when the file does not exist;
        * ``""`` when 'page_master' is empty/absent or its first entry has
          no truthy 'bbox';
        * otherwise the 'x0', 'y0', 'w', 'h' values joined with underscores.
    NOTE(review): the missing-file fallback is a list while every other
    path returns a string; confirm against callers before unifying.
    A bbox lacking any of the four keys raises KeyError.
    """
    pdfdata_path = os.path.join(ipath, str(doc_id), "db", str(pno), 'pdfdata.db')
    if not os.path.exists(pdfdata_path):
        print >> sys.stderr, ' pdfdata not found'
        return []
    stored = datastore.read_data_fname(pdfdata_path, isdb, isenc, {}, 'pdfdata')
    page_master = stored.get('page_master', [])
    if not page_master:
        return ""
    bbox = page_master[0].get('bbox', {})
    if not bbox:
        return ""
    return "%s_%s_%s_%s" % (bbox['x0'], bbox['y0'], bbox['w'], bbox['h'])
def return_data(data_path):
    """Read ``data_path`` via ``datastore.read_data_fname`` and return its 'data' entry.

    The module-level ``isdb`` and ``isenc`` flags are passed to the reader.
    Defaults to an empty dict when the record has no 'data' key.
    """
    stored = datastore.read_data_fname(data_path, isdb, isenc)
    return stored.get('data', {})