def get_data(self, company_id): table_match_dict = robj.processdelta(company_id) deal_id = tuple(company_id.split('_')) import read_slt_info as read_slt_info upObj = read_slt_info.update(deal_id) docname_docid_lst = upObj.get_documentName_id() ph_doc_info_dict = {} doc_ph_info = {} for (doc_name, doc_id) in docname_docid_lst: doc_sp_lst = doc_name.split('_') ph = '' if len(doc_sp_lst) == 2: ph = doc_sp_lst[-1] if len(doc_sp_lst) == 4: ph = doc_sp_lst[-2] + doc_sp_lst[-1] ph = ph.replace('AR', 'FY') ph_doc_info_dict[ph] = (ph, doc_name, doc_id) doc_ph_info[doc_id] = ph sorted_ph_lst = report_year_sort.year_sort(ph_doc_info_dict.keys()) len_phs = len(sorted_ph_lst) doc_sorted_lst = [ph_doc_info_dict[ph][-1] for ph in sorted_ph_lst] #print sorted_ph_lst #print doc_sorted_lst #sys.exit() new_table_match_dict = {} for doc_pair, pair_ars in table_match_dict.items(): i1 = doc_sorted_lst.index(doc_pair[0]) i2 = doc_sorted_lst.index(doc_pair[1]) print 'doc_pair: ', doc_pair if (i1 < i2): new_table_match_dict[doc_pair] = pair_ars[:] else: print 'rev: ', doc_pair, ' == ', i1, i2 new_pair_ars = [] for pair in pair_ars: new_pair_ars.append((pair[1], pair[0])) new_table_match_dict[(doc_pair[1], doc_pair[0])] = new_pair_ars[:] fname = os.path.join(self.output_path, company_id + '_table_delta.slv') sh = shelve.open(fname, 'n') sh['data'] = new_table_match_dict sh.close() return
def get_doc_pairs(self, company_id):
    """Return doc-id pairs linking each document to the next report of the
    same kind.

    For every reporting phase (sorted chronologically by
    ``report_year_sort``), scans forward for the next phase whose first two
    characters match (e.g. 'Q1'->'Q1', 'FY'->'FY') and records the
    (earlier_doc_id, later_doc_id) pair once.  Also ensures the per-company
    output folder ``<output_path>/<company_id>`` exists (side effect kept
    from the original).

    Returns a list of (doc_id, doc_id) tuples (``dict.keys()`` in Py2).

    Fixes vs. original: removed the unused ``doc_ph_info`` dict and the
    redundant import alias.
    """
    deal_id = tuple(company_id.split('_'))
    import read_slt_info
    upObj = read_slt_info.update(deal_id)
    docname_docid_lst = upObj.get_documentName_id()

    # Phase string (e.g. 'Q1FY12', 'FY12') -> doc id; 'AR' normalized to 'FY'.
    ph_doc_info_dict = {}
    for (doc_name, doc_id) in docname_docid_lst:
        doc_sp_lst = doc_name.split('_')
        ph = ''
        if len(doc_sp_lst) == 2:
            ph = doc_sp_lst[-1]
        if len(doc_sp_lst) == 4:
            ph = doc_sp_lst[-2] + doc_sp_lst[-1]
        ph = ph.replace('AR', 'FY')
        ph_doc_info_dict[ph] = doc_id

    # Make sure the per-company output folder exists.
    lmdb_folder = os.path.join(self.output_path, company_id)
    if not os.path.exists(lmdb_folder):
        os.mkdir(lmdb_folder)

    sorted_ph_lst = report_year_sort.year_sort(ph_doc_info_dict.keys())

    # Pair each phase with the next later phase of the same kind
    # (same 2-char prefix: 'Q1', 'Q2', 'H1', 'FY', ...), first match wins.
    doc_pair_dict = {}
    for i, ph in enumerate(sorted_ph_lst):
        qh_flg1 = ph[:2]
        doc_id1 = ph_doc_info_dict[ph]
        for kph in sorted_ph_lst[i + 1:]:
            if qh_flg1 == kph[:2]:
                pair = (doc_id1, ph_doc_info_dict[kph])
                if pair not in doc_pair_dict:
                    doc_pair_dict[pair] = 1
                break
    return doc_pair_dict.keys()
def get_sorted_doc_list(self, company_id):
    """Build chronological document pairs and table 'chains' across reports.

    Two artifacts are produced and written via ``self.lmdb_obj``:
      * ``doc_ph_info``: doc_id -> reporting-phase string.
      * ``doc_table_final_chain_pair``: a dict holding
        - 'sorted_comb_list': (doc, next_doc) pairs in report order, and
        - 'doc_table_chain_pair_list': chains of (doc_id, norm_table_id)
          links derived from the delta matches in
          ``self.read_delta_data(company_id)``.

    NOTE(review): exact semantics of ``self.lmdb_obj.write_to_lmdb`` and of
    the delta-match data shape are not visible here — documented from usage.
    """
    delta_table_match_dict = self.read_delta_data(company_id)
    deal_id = tuple(company_id.split('_'))
    import read_slt_info as read_slt_info
    upObj = read_slt_info.update(deal_id)
    docname_docid_lst = upObj.get_documentName_id()
    # Phase (e.g. 'Q1FY12'/'FY12', 'AR' mapped to 'FY') -> info tuple,
    # and the reverse doc_id -> phase map.
    ph_doc_info_dict = {}
    doc_ph_info = {}
    for (doc_name, doc_id) in docname_docid_lst:
        doc_sp_lst = doc_name.split('_')
        ph = ''
        if len(doc_sp_lst) == 2:
            ph = doc_sp_lst[-1]
        if len(doc_sp_lst) == 4:
            ph = doc_sp_lst[-2] + doc_sp_lst[-1]
        ph = ph.replace('AR', 'FY')
        ph_doc_info_dict[ph] = (ph, doc_name, doc_id)
        doc_ph_info[doc_id] = ph
    # Per-company output folder; persist the doc_id -> phase map.
    lmdb_folder = os.path.join(self.output_path, company_id)
    if not os.path.exists(lmdb_folder):
        os.mkdir(lmdb_folder)
    dfname = os.path.join(lmdb_folder, 'doc_ph_info')
    self.lmdb_obj.write_to_lmdb(dfname, doc_ph_info, doc_ph_info.keys())
    sorted_ph_lst = report_year_sort.year_sort(ph_doc_info_dict.keys())
    len_phs = len(sorted_ph_lst)
    doc_sorted_lst = [ph_doc_info_dict[ph][-1] for ph in sorted_ph_lst]
    # Build (doc, next_doc) combinations in chronological order.
    # Quarterly/half-year reports pair with the immediately following phase
    # (unless the next one is a fiscal-year report); FY reports pair with
    # the next fiscal year (FY<n+1>) if that phase exists.
    sorted_combination_doc_lst = []
    for i, ph in enumerate(sorted_ph_lst):
        map_doc_tup = doc_sorted_lst[i]
        if (ph[:2] != 'FY') and (i < len(sorted_ph_lst) - 2):
            next_ph = sorted_ph_lst[i + 1]
            if ph[:1] in ['Q', 'H'] and next_ph[:1] in ['F']:
                continue  # do not pair a quarter/half with an FY report
            next_doc_tup = doc_sorted_lst[i + 1]
            sorted_combination_doc_lst.append((map_doc_tup, next_doc_tup))
        elif (ph[:2] == 'FY') and (i < len(sorted_ph_lst) - 2):
            next_fy = ph[:2] + str(int(ph[2:]) + 1)
            if next_fy not in sorted_ph_lst:
                continue
            next_fy_indx = sorted_ph_lst.index(next_fy)
            next_doc_tup = doc_sorted_lst[next_fy_indx]
            sorted_combination_doc_lst.append((map_doc_tup, next_doc_tup))
    ######################################################################
    # Index normalized table ids by document and page:
    # doc_id -> page_number -> [norm_table_id, ...]
    project_id, url_id = company_id.split('_')
    norm_res_list = sObj.slt_normresids(project_id, url_id)
    doc_page_dict = {}
    for doc_tup in norm_res_list:
        doc_id, page_number, norm_table_id = doc_tup
        if doc_id not in doc_page_dict:
            doc_page_dict[doc_id] = {}
        if page_number not in doc_page_dict[doc_id]:
            doc_page_dict[doc_id][page_number] = []
        doc_page_dict[doc_id][page_number].append(norm_table_id)
    # For each matched table pair across a document pair, resolve which
    # normalized table on each page the match refers to, using xml-id
    # intersection when a page holds more than one table.
    cache_xml_ids = {}  # (project, url, norm_table_id) -> xml ids (memoized)
    val_cons_dict = {}  # (doc, table) keys that appear as a chain *target*
    doc_pair_table_pair_dict = {}  # (doc, table) -> [(next_doc, next_table)]
    doc_id_pairs = delta_table_match_dict.keys()
    for doc_pair in doc_id_pairs:
        hyp_doc, ref_doc = doc_pair
        hyf_ref_lst = delta_table_match_dict[doc_pair]
        for (hyp_list, ref_list) in hyf_ref_lst:
            # Page numbers are encoded as the last '_' segment of the first id.
            pg1 = hyp_list[0].split('_')[-1]
            pg2 = ref_list[0].split('_')[-1]
            if (pg2 not in doc_page_dict[doc_pair[1]]) or (
                    pg1 not in doc_page_dict[doc_pair[0]]):
                continue  # no normalized tables recorded for that page
            norm_table_id_hyps = doc_page_dict[doc_pair[0]][pg1]
            norm_table_id_refs = doc_page_dict[doc_pair[1]][pg2]
            # Pick the hypothesis-side table: trivial if the page has one
            # table, else the first whose xml ids intersect the match ids
            # ('#'-joined in hyp_list entries).
            selected_hyp = ''
            if (len(norm_table_id_hyps) == 1):
                selected_hyp = norm_table_id_hyps[0]
            else:
                n_hyp_list = []
                for r in hyp_list:
                    n_hyp_list += r.split('#')
                for norm_table_id_hyp in norm_table_id_hyps:
                    if (project_id, url_id,
                            norm_table_id_hyp) in cache_xml_ids:
                        xmlids1 = cache_xml_ids[(project_id, url_id,
                                                 norm_table_id_hyp)]
                    else:
                        xmlids1 = self.get_xml_id(project_id, url_id,
                                                  norm_table_id_hyp)
                        cache_xml_ids[(project_id, url_id,
                                       norm_table_id_hyp)] = xmlids1
                    s1 = sets.Set(xmlids1).intersection(
                        sets.Set(n_hyp_list))
                    if list(s1):
                        selected_hyp = norm_table_id_hyp
                        break
            # Same selection for the reference-side table.
            selected_ref = ''
            if (len(norm_table_id_refs) == 1):
                selected_ref = norm_table_id_refs[0]
            else:
                n_ref_list = []
                for r in ref_list:
                    n_ref_list += r.split('#')
                for norm_table_id_ref in norm_table_id_refs:
                    if (project_id, url_id,
                            norm_table_id_ref) in cache_xml_ids:
                        xmlids1 = cache_xml_ids[(project_id, url_id,
                                                 norm_table_id_ref)]
                    else:
                        xmlids1 = self.get_xml_id(project_id, url_id,
                                                  norm_table_id_ref)
                        cache_xml_ids[(project_id, url_id,
                                       norm_table_id_ref)] = xmlids1
                    s1 = sets.Set(xmlids1).intersection(
                        sets.Set(n_ref_list))
                    if list(s1):
                        selected_ref = norm_table_id_ref
                        break
            if (not selected_hyp) or (not selected_ref):
                # Could not resolve one side of the pair; skip it.
                print 'mmmm Learning Error....'
                continue
            if selected_hyp and selected_ref:
                # Record the link (earlier doc table) -> (later doc table);
                # targets are also flagged so chain heads can be found below.
                if (doc_pair[0],
                        selected_hyp) not in doc_pair_table_pair_dict:
                    doc_pair_table_pair_dict[(doc_pair[0],
                                              selected_hyp)] = []
                if (doc_pair[1], selected_ref) not in doc_pair_table_pair_dict[
                        (doc_pair[0], selected_hyp)]:
                    doc_pair_table_pair_dict[(doc_pair[0],
                                              selected_hyp)].append(
                        (doc_pair[1], selected_ref))
                    val_cons_dict[(doc_pair[1], selected_ref)] = 1
    # Chain heads: keys that never appear as a link target.
    init_values = []
    for k, vs in doc_pair_table_pair_dict.items():
        if k not in val_cons_dict:
            init_values.append([k])
    # Fixpoint: repeatedly extend each chain by every successor of its last
    # element until no chain can grow further (branches fork into copies).
    flg = 1
    while flg:
        flg = 0
        new_init_values = []
        for init_val in init_values:
            last_key = init_val[-1]
            if last_key in doc_pair_table_pair_dict:
                flg = 1
                extended_pos = doc_pair_table_pair_dict[last_key]
                print ' extended_pos: ', extended_pos, len(extended_pos)
                new_init_val = init_val[:]
                for e in extended_pos:
                    new_init_values.append(init_val[:] + [e])
                    print len(init_val[:]) + 1, ' === ALL: ', len_phs
            else:
                new_init_values.append(init_val[:])
        init_values = new_init_values[:]
    # NOTE(review): disabled subset-dedup variant kept as an inert string
    # literal from the original source.
    '''
    new_init_values = []
    for init_value in init_values:
        l1 = map(lambda x:x[0]+'#'+x[1], init_value[:])
        s1 = sets.Set(l1)
        flg = 0
        for init_value1 in init_values:
            l2 = map(lambda x:x[0]+'#'+x[1], init_value1[:])
            s2 = sets.Set(l2)
            if (s1 == s2):
                continue
            if s1.issubset(s2):
                flg = 1
                break
        if (flg == 0):
            new_init_values.append(init_value)
    '''
    # Diagnostic dump: chains sorted longest-first.
    ar = []
    for init_value in init_values:
        ar.append((len(init_value), init_value))
    ar.sort()
    ar.reverse()
    print 'Total ar: ', len(ar)
    for ar_elm in ar:
        print 'Len: ', ar_elm[0], 'ELMS: ', ar_elm[1]
    # Persist both artifacts.  NOTE(review): writes ``new_init_values`` (the
    # final fixpoint result, equal to ``init_values`` here) — presumably
    # intentional; confirm ``init_values`` was not meant instead.
    ofname = os.path.join(lmdb_folder, 'doc_table_final_chain_pair')
    final_pair_dict = {
        'sorted_comb_list': sorted_combination_doc_lst,
        'doc_table_chain_pair_list': new_init_values
    }
    self.lmdb_obj.write_to_lmdb(ofname, final_pair_dict,
                                final_pair_dict.keys())