def cartesian2cylindical(grid, z=0):
    """Converts and interpolates a 3D cartesian grid to a cylindrical one.

    z is the number of the z axis, defaults to 0 (first).
    """
    if not (len(grid.shape) == 3):
        raise IndexError
    out = np.zeros(grid.shape)
    if z == 0:
        return np.array([x for x in pp.pmap(cartesian2polar,
                                            [grid[i, ...] for i in xrange(grid.shape[0])],
                                            limit=6)])
    if z == 1:
        return np.array([x for x in pp.pmap(cartesian2polar,
                                            [grid[:, i, :] for i in xrange(grid.shape[1])],
                                            limit=6)])
    if z == 2:
        return np.array([x for x in pp.pmap(cartesian2polar,
                                            [grid[..., i] for i in xrange(grid.shape[2])],
                                            limit=6)])
def azimuthalAverage(grid, z=0):
    """Averages an x,y,z cube in phi about z.

    z = axis number to average around.
    """
    if not (len(grid.shape) == 3):
        raise IndexError
    out = np.zeros(grid.shape)
    f = lambda x: polar2cartesian(Average(cartesian2polar(x)))
    if z == 0:
        return np.array([x for x in pp.pmap(f, [grid[i, ...] for i in xrange(grid.shape[0])], limit=6)])
    if z == 1:
        return np.array([x for x in pp.pmap(f, [grid[:, i, :] for i in xrange(grid.shape[1])], limit=6)])
    if z == 2:
        return np.array([x for x in pp.pmap(f, [grid[..., i] for i in xrange(grid.shape[2])], limit=6)])
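
# Minimal, self-contained sketch of the slice-then-pmap pattern used by the two
# functions above. slice_demean and the 8x16x16 cube are illustrative stand-ins,
# not part of the original module; the real code applies cartesian2polar /
# Average / polar2cartesian per slice, which are assumed to be defined elsewhere.
import numpy as np
import pprocess as pp

def slice_demean(plane):
    # stand-in for the per-slice transform (e.g. cartesian2polar)
    return plane - plane.mean()

cube = np.random.rand(8, 16, 16)
planes = [cube[i, ...] for i in xrange(cube.shape[0])]
stacked = np.array([x for x in pp.pmap(slice_demean, planes, limit=6)])
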
def main(resultfilename):
    start_1 = datetime.datetime.now()
    print str(start_1)
    resultfile = open(resultfilename, 'w')
    # used for computing tf
    sequence_tf = getSequence(trainfile)
    # used for computing idf
    global user_itempairs_idf_list, user_itempairs_tf_list
    user_user_sim_matrix = [[0 for i in range(user_num)] for j in range(user_num)]

    start_2 = datetime.datetime.now()
    print str(start_2) + ' 2 used ' + str(start_2 - start_1) + ' total used ' + str(start_2 - start_1)
    results = pprocess.pmap(calculate_tf, sequence_tf, limit)
    for result in results:
        user_id, item_pairs_list = result
        user_itempairs_tf_list.append((user_id, item_pairs_list))
        for i in range(len(item_pairs_list)):
            item_i, item_j, tf, score = item_pairs_list[i]
            if score > 0:
                user_itempairs_idf_list[int(item_i) - 1][int(item_j) - 1] += 1
            else:
                user_itempairs_idf_list[int(item_j) - 1][int(item_i) - 1] += 1
        #print 'user_id for tf ' + user_id

    start_3 = datetime.datetime.now()
    print str(start_3) + ' 3 used ' + str(start_3 - start_2) + ' total used ' + str(start_3 - start_1)
    global user_itempairs_tf_idf_list
    results = pprocess.pmap(calculate_tf_idf, user_itempairs_tf_list, limit)
    for result in results:
        user_itempairs_tf_idf_list.append(result)
    del user_itempairs_tf_list
    del user_itempairs_idf_list

    start_4 = datetime.datetime.now()
    print str(start_4) + ' 4 used ' + str(start_4 - start_3) + ' total used ' + str(start_4 - start_1)
    results = pprocess.pmap(calculate_sim, user_itempairs_tf_idf_list, limit)
    for result in results:
        for i in range(len(result)):
            u_i = int(result[i][0]) - 1
            u_j = int(result[i][1]) - 1
            sim = float(result[i][2])
            user_user_sim_matrix[u_i][u_j] = sim
            #print str(u_i) + ' is computed ok'

    start_5 = datetime.datetime.now()
    del user_itempairs_tf_idf_list
    print str(start_5) + ' 5 used ' + str(start_5 - start_4) + ' total used ' + str(start_5 - start_1)
    for i in range(user_num):
        for j in range(user_num):
            resultfile.write(str(user_user_sim_matrix[i][j]))
            resultfile.write(' ')
        resultfile.write('\n')

    start_6 = datetime.datetime.now()
    print str(start_6) + ' 6 used ' + str(start_6 - start_5) + ' total used ' + str(start_6 - start_1)
    resultfile.close()
    trainfile.close()

    start_7 = datetime.datetime.now()
    print str(start_7) + ' 7 used ' + str(start_7 - start_6) + ' total used ' + str(start_7 - start_1)
    print 'sim end'
    print '********************************'
def parse_gpo_hearings(self, n_cores=4):
    """
    Primary parser function. Wraps and parallelizes methods described
    elsewhere in this file.
    """
    import pprocess

    def parse(data):
        """
        Wrapper for the parser function, intended for parallel processing.
        Takes a data object with an initialized connection and a set of IDs
        to query and parse.
        """
        cur = data['con'].cursor(cursor_factory=psycopg2.extras.DictCursor)
        output = []
        for j in data['id_inds']:
            id_to_parse = self.id_values[j]
            cur.execute('select * from hearings where id = %s', (id_to_parse, ))
            entry = cur.fetchone()
            if entry is not None:
                parsed = ParseHearing(entry,
                                      committee_data=self.committee_data,
                                      member_table=self.member_table).parsed
                output.append(parsed)
            else:
                print((' Warning: id {} not found!'.format(id_to_parse)))

        # Returned value records whether the file was actually parsed.
        return output

    n_ids = len(self.id_values)

    # If n_ids is reasonably large (say >100), parallelize; if not, just do it in serial.
    if n_ids > 100:
        to_analyze = [{'con': psycopg2.connect(**self.credentials),
                       'id_inds': list(range(int(i * n_ids / n_cores),
                                             int((i + 1) * n_ids / n_cores)))}
                      for i in range(n_cores)]

        self.results = [r for r in pprocess.pmap(parse, to_analyze, limit=n_cores)]
        self.results = list(chain(*self.results))
    else:
        con = psycopg2.connect(**self.credentials)
        self.results = parse({'con': con,
                              'id_inds': list(range(len(self.id_values)))})
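
# Sketch of the index-chunking used above (sizes here are hypothetical): each of
# the n_cores work items gets its own contiguous slice of indices, and in the
# real method also its own psycopg2 connection, since a single connection should
# not be shared across forked pprocess worker processes.
n_ids, n_cores = 1000, 4
chunks = [list(range(int(i * n_ids / n_cores), int((i + 1) * n_ids / n_cores)))
          for i in range(n_cores)]
assert sum(len(c) for c in chunks) == n_ids  # every id index is covered exactly once
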
def generate(company_id, tables_list=None):
    project_id, url_id = company_id.split('_')
    all_doc_table_to_process = []
    norm_res_list = sObj.slt_normresids(project_id, url_id)
    for doc_tup in norm_res_list:
        doc_id, page_number, norm_table_id = map(lambda x: str(x), doc_tup)
        if tables_list and norm_table_id not in tables_list:
            continue  # Only selected tables
        ktup = (doc_id, norm_table_id)
        all_doc_table_to_process.append(ktup)
    lmdb_folder = os.path.join(output_path, company_id)
    if not os.path.exists(lmdb_folder):
        os.mkdir(lmdb_folder)
    doc_table_html_dict = {}
    #print company_id, "TOTAL TABLES", len(norm_res_list)
    #print company_id, "SELECTED TABLES", len(all_doc_table_to_process)
    # Debug-only serial run, kept commented out so the parallel path below is reachable.
    #for x in all_doc_table_to_process:
    #    f = generate_map_ds(company_id, x)
    #    print f
    #    sys.exit()
    #sys.exit()
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id, all_doc_table_to_process[x]),
        range(0, len(all_doc_table_to_process)), 4)
    table_id_xml_bbox_dict = {}
    #######################################
    for (ktup, rdict) in res:
        doc_id, table_id = ktup
        #print table_id, rdict
        #sys.exit()
        if rdict:
            table_id_xml_bbox_dict[table_id] = str(rdict)
    #sys.exit()
    if table_id_xml_bbox_dict:
        fname = os.path.join(lmdb_folder, 'xml_bbox_map')
        if not tables_list:
            # Running for all tables, hence remove old data and create new
            cmd = 'rm -rf %s' % (fname)
            os.system(cmd)
        env = lmdb.open(fname, map_size=10 * 1000 * 1000 * 1000)
        with env.begin(write=True) as txn:
            for k, v in table_id_xml_bbox_dict.items():
                txn.put('RST:' + k, v)
    print "done"
def predict_rankbased(testfilename, resultfilename):
    testfile = open(testfilename, 'r')
    resultfile = open(resultfilename, 'w')
    sequence = getSequence(testfile)
    results = pprocess.pmap(calculate, sequence, limit)
    for i in range(len(sequence)):
        userid, rank_list = results[i]
        for j in range(len(rank_list)):
            resultfile.write(str(userid))
            resultfile.write(' ')
            resultfile.write(str(rank_list[j][1]))
            resultfile.write(' ')
            resultfile.write(str(rank_list[j][0]))
            resultfile.write('\n')
        print str(userid) + ' done'
    testfile.close()
    resultfile.close()
def render(self, filename, limit):
    """
    Render the image with many processes, saving it to 'filename', using the
    given process 'limit' to constrain the number of processes used.
    """
    image = Image.new("RGB", (self.width, self.height))
    draw = ImageDraw.Draw(image)

    total = self.width * self.height
    count = 0

    for sy, row in enumerate(pprocess.pmap(self.render_row, xrange(0, self.height), limit)):
        for sx, col in enumerate(row):
            draw.point((sx, sy), fill=(col.x * 255, col.y * 255, col.z * 255))
            count = count + 1
            percent = int((count / float(total)) * 100)
            sys.stdout.write(("\010" * 9) + "%3d%% %3d" % (percent, sy))
            sys.stdout.flush()

    image.save(filename)
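
# Illustrative check, not part of the original renderer: pprocess.pmap behaves
# like the built-in map and yields results in input order, which is what lets
# the enumerate() above map each returned row back to its scanline index sy.
# double() and the tiny input range are assumptions for the demo only.
import pprocess

def double(n):
    return n * 2

assert [x for x in pprocess.pmap(double, range(5), limit=2)] == [0, 2, 4, 6, 8]
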
def generate_ret(company_id, tables_list=None):
    project_id, url_id = company_id.split('_')
    all_doc_table_to_process = []
    norm_res_list = sObj.slt_normresids(project_id, url_id)
    for doc_tup in norm_res_list:
        doc_id, page_number, norm_table_id = map(lambda x: str(x), doc_tup)
        if tables_list and norm_table_id not in tables_list:
            continue  # Only selected tables
        ktup = (doc_id, norm_table_id)
        all_doc_table_to_process.append(ktup)
    lmdb_folder = os.path.join(output_path, company_id)
    if not os.path.exists(lmdb_folder):
        os.mkdir(lmdb_folder)
    doc_table_html_dict = {}
    #print company_id, "TOTAL TABLES", len(norm_res_list)
    #print company_id, "SELECTED TABLES", len(all_doc_table_to_process)
    #for x in all_doc_table_to_process:
    #    generate_map_ds(company_id, x)
    #sys.exit()
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id, all_doc_table_to_process[x]),
        range(0, len(all_doc_table_to_process)), 4)
    table_id_xml_bbox_dict = {}
    #######################################
    for (ktup, rdict) in res:
        doc_id, table_id = ktup
        if rdict:
            table_id_xml_bbox_dict[table_id] = rdict
    return table_id_xml_bbox_dict
def ruleGenerationPP(argWrapper, orDescriptors=None, consistent=1):
    """
    # Example of `orDescriptors`:
    # ['temperature', ['normal', 'high']]
    # ['AttName2', [0, 1, 2], [8, 9, 10]]
    # orDescriptors=[[u'temperature', [u'normal', u'high']]]
    # dataWrapper=ruleGeneration(argWrapper, orDescriptors=orDescriptors)
    """
    if not orDescriptors:
        orDescriptors = []
    dataWrapper = argWrapper
    localsList = ['numOB', 'numAT', 'support', 'accuracy', 'decision', 'infoMsg',
                  'attNameList', 'AT', 'OB']
    numOB, numAT, support, accuracy, decision, infoMsg, attNameList, AT, OB = \
        tuple([dataWrapper[k] for k in localsList])

    # Todo: modify AT based on orDescriptors
    # print str(orDescriptors)  >>> [[u'temperature', [u'normal', u'high']]]
    for od in orDescriptors:
        idx_att = attNameList.index(od[0])
        at_index_att = AT[idx_att]
        for cb in od[1:]:
            for av in cb:
                at_index_att.remove(at_index_att[at_index_att.index([av])])
            AT[idx_att].append(cb)
        # print AT[idx_att]

    D = decision - 1
    conIndexes = filter(lambda x: x != D, xrange(0, numAT))
    CF = []
    DCStep1 = []
    gStep = 2
    NumOfParticles = 6

    # init CF & DCStep1
    for d in xrange(0, len(AT[D])):
        CF.append([[] for _ in itertools.repeat(None, len(conIndexes))])
        lDCAll = [[] for _ in itertools.repeat(None, numAT)]  # place holder for lDCAll[C].append(thisDC)
        DCStep1.append(lDCAll)

    # parallelize list of (d, C) tuples to form an RDD
    # materialised as a list: the debug print below would otherwise exhaust the
    # itertools.product iterator before it ever reaches pprocess.pmap
    dctps = list(itertools.product(xrange(0, len(AT[D])), conIndexes))
    for x in dctps:
        print x

    def kPerDC(dctp):
        d, C = dctp
        CFd0 = []
        DCStep1dC = []
        #print dctp
        for k in xrange(0, len(AT[C])):
            thisDC = [[] for _ in itertools.repeat(None, NumOfParticles)]
            sATV = set(AT[C][k])
            sDECV = set(AT[D][d])
            for ob in xrange(0, numOB):
                # todo!!! init OB as set
                sCON = set(OB[ob][C])
                sDEC = set(OB[ob][D])
                if sCON == sATV:  # inf(p)
                    if sDEC == sDECV:
                        thisDC[0].append(ob + 1)
                    elif sDECV.issubset(sDEC):
                        thisDC[1].append(ob + 1)
                    else:
                        thisDC[2].append(ob + 1)
                elif sATV.issubset(sCON):  # sup(p)-inf(p)
                    if sDEC == sDECV:
                        thisDC[3].append(ob + 1)
                    elif sDECV.issubset(sDEC):
                        thisDC[4].append(ob + 1)
                    else:
                        thisDC[5].append(ob + 1)
                else:  # p'
                    #thisDC[6].append(ob+1)
                    pass
            # compute criteria values
            minsupp, minacc, maxsupp, maxacc = calCriteriaFromDC(thisDC, numOB)
            if minsupp >= support and minacc >= accuracy:
                CFd0.append([[C, k, d, minsupp, minacc, maxsupp, maxacc, 'Lower']])
            elif maxsupp >= support and maxacc >= accuracy:
                CFd0.append([[C, k, d, minsupp, minacc, maxsupp, maxacc, 'Upper']])
            elif maxsupp >= support:
                CFd0.append([[C, k, d, minsupp, minacc, maxsupp, maxacc, 'flag4merge']])
            else:
                # only append to CF the promising DC better than 'flag4merge'
                pass
            # append all thisDC to lDCAll
            DCStep1dC.append([d, C, thisDC])
        # Spark operations (such as map or reduce) work on separate copies of all
        # the variables used in the function, so closures didn't work there.
        return (DCStep1dC, CFd0)

    #results_pp=pprocess.Queue(limit=4)
    #kPerDC_wrapper=results_pp.manage(pprocess.MakeParallel(kPerDC))
    #for dctp in dctps:
    #    kPerDC_wrapper(dctp)
    #results_pp=map(kPerDC,dctps)
    results_pp = pprocess.pmap(kPerDC, dctps, limit=8)
    for r in results_pp:
        dcws, cfs = r
        for dcw in dcws:
            d, C, thisDC = dcw
            DCStep1[d][C].append(thisDC)
        for cf in cfs:
            CF[d][0].append(cf)

    if gStep == 2:
        # current step
        cf = 1
        DCCurrent = [[] for _ in itertools.repeat(None, len(AT[D]))]
        # certain rules in step1
        rs = []
        for d in xrange(0, len(AT[D])):
            lenCFm1 = len(CF[d][cf - 1])
            for k in xrange(0, lenCFm1):
                if CF[d][cf - 1][k][0][7] != 'flag4merge':
                    rs.append(CF[d][cf - 1][k][0][0:2])
        for d in xrange(0, len(AT[D])):
            lenCFm1 = len(CF[d][cf - 1])
            for k in xrange(0, lenCFm1):
                # flag4merge rules CAN be used in multiple complex rules
                if CF[d][cf - 1][k][0][7] != 'flag4merge':
                    continue
                ## merge strategy consistent==1
                if consistent == 1 and CF[d][cf - 1][k][0][0:2] in rs:
                    continue
                iC, iK, iD = CF[d][cf - 1][k][0][0:3]
                # for: k to merge
                for km in xrange(k + 1, lenCFm1):
                    if CF[d][cf - 1][km][0][7] != 'flag4merge':
                        continue
                    ## merge strategy consistent==1
                    if consistent == 1 and CF[d][cf - 1][km][0][0:2] in rs:
                        continue
                    iCm, iKm, iDm = CF[d][cf - 1][km][0][0:3]
                    # not appropriate to merge
                    if iC == iCm or iD != iDm:
                        continue
                    DCK = DCStep1[iD][iC][iK]
                    DCKm = DCStep1[iDm][iCm][iKm]
                    mDC = [[] for _ in itertools.repeat(None, NumOfParticles)]
                    mDC[0] = list(set(DCK[0]) & set(DCKm[0]))
                    mDC[1] = list(set(DCK[1]) & set(DCKm[1]))
                    mDC[2] = list(set(DCK[2]) & set(DCKm[2]))
                    mDC[3] = list((set(DCK[0]) & set(DCKm[3])) | (set(DCK[3]) & set(DCKm[0])) | (set(DCK[3]) & set(DCKm[3])))
                    mDC[4] = list((set(DCK[1]) & set(DCKm[4])) | (set(DCK[4]) & set(DCKm[1])) | (set(DCK[4]) & set(DCKm[4])))
                    mDC[5] = list((set(DCK[2]) & set(DCKm[5])) | (set(DCK[5]) & set(DCKm[2])) | (set(DCK[5]) & set(DCKm[5])))
                    minsupp, minacc, maxsupp, maxacc = calCriteriaFromDC(mDC, numOB)
                    if minsupp >= support and minacc >= accuracy:
                        # CF[d][cf-1][km][0][7] = 'flag4mergeDone'
                        CF[d][cf].append([[iC, iK, iD, minsupp, minacc, maxsupp, maxacc, 'Lower'],
                                          [iCm, iKm, iDm, minsupp, minacc, maxsupp, maxacc, 'Lower']])
                        # todo: mark DCStep2 unused
                        # DCCurrent[d].append([[iC,iK,iD],[iCm,iKm,iDm],mDC])
                    elif maxsupp >= support and maxacc >= accuracy:
                        # CF[d][cf-1][km][0][7] = 'flag4mergeDone'
                        CF[d][cf].append([[iC, iK, iD, minsupp, minacc, maxsupp, maxacc, 'Upper'],
                                          [iCm, iKm, iDm, minsupp, minacc, maxsupp, maxacc, 'Upper']])
                        # DCCurrent[d].append([[iC,iK,iD],[iCm,iKm,iDm],mDC])
                    else:
                        # the merged DC still doesn't satisfy the thresholds
                        pass
                # eof: for km
            # eof: for k
        # eof: for d
        # DCStep2
        DCStep2 = DCCurrent
    # eof: if gStep==2

    for k in ['gStep', 'D', 'CF', 'DCStep1']:
        dataWrapper[k] = locals()[k]
    return dataWrapper
def generate(company_id, given_doc_ids=None, src_lan=None):
    company_name = get_comp_model(company_id)
    model_number = '50'
    project_id, url_id = company_id.split('_')
    all_doc_table_to_process = []
    norm_res_list = get_norm_table_ids(company_name, model_number)
    print len(norm_res_list)
    #sys.exit()
    table_cell_dict = {}
    for doc_tup in norm_res_list:
        doc_id, page_number, norm_table_id, cell_dict, table_type = doc_tup
        #if norm_table_id != '100': continue
        #if int(norm_table_id) < 360: continue
        if table_type != 'Sentence':
            table_type = 'Table'
        ktup = (doc_id, norm_table_id, page_number, table_type)
        table_cell_dict[ktup] = {0: cell_dict}
        if given_doc_ids:
            if doc_id in given_doc_ids:
                all_doc_table_to_process.append(ktup)
            continue
        else:
            all_doc_table_to_process.append(ktup)
    lmdb_folder = os.path.join(output_path, company_id)
    if not os.path.exists(lmdb_folder):
        os.mkdir(lmdb_folder)
    doc_html_path = os.path.join(lmdb_folder, 'Doc_Htmls')
    if not os.path.exists(doc_html_path):
        os.mkdir(doc_html_path)
    table_html_path = os.path.join(lmdb_folder, 'Table_Htmls')
    if not os.path.exists(table_html_path):
        os.mkdir(table_html_path)
    ######################################################
    #all_doc_table_to_process = [('164', '70286')]
    total = len(all_doc_table_to_process)
    translated_dict = {}
    res = []
    '''
    for i, x in enumerate(all_doc_table_to_process):
        print [x, i+1, '/', total]
        #if (x[1] != '8896'): continue
        f = generate_map_ds(company_id, x, table_cell_dict[x], src_lan, translated_dict)
        #print table_cell_dict[x]
        #sys.exit()
        each_html_str = f[2]
        table_id = x[1]
        tab_path = os.path.join(table_html_path, str(table_id) + '.html')
        ftab = open(tab_path, 'w')
        ftab.write('<html><body>' + str(each_html_str) + '</body></html>')
        ftab.close()
        res.append(f)
        #sys.exit()
    '''
    print len(all_doc_table_to_process)
    doc_table_html_dict = {}
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id, all_doc_table_to_process[x],
                                  table_cell_dict[all_doc_table_to_process[x]],
                                  src_lan, translated_dict),
        range(0, len(all_doc_table_to_process)), 6)
    project_id, url_id = company_id.split('_')
    table_id_xml_bbox_dict = {}
    #######################################
    #print res
    error_list = []
    for rf in res:
        if len(rf) != 6:
            continue
        doc_id, table_id, each_html_str, flg, sflg, er_str = rf
        print doc_id, table_id
        if not flg:
            st = '\t'.join([doc_id, table_id, sflg, er_str])
            error_list.append(st)
            continue
        #print [each_html_str]
        #print '*' * 100
        tab_path = os.path.join(table_html_path, str(table_id) + '.html')
        ftab = open(tab_path, 'w')
        ftab.write('<html><body>' + each_html_str + '</body></html>')
        ftab.close()
        if doc_id not in doc_table_html_dict:
            doc_table_html_dict[doc_id] = []
        #try:
        #    Xml_bb_dict = Xml_Cell_Obj.get_cell_bbox_data(project_id, url_id, doc_id, table_id)
        #    table_id_xml_bbox_dict[table_id] = str(Xml_bb_dict)
        #except Exception as e:
        #    st = '\t'.join([doc_id, table_id, 'BBOX', str(e)])
        #    error_list.append(st)
        doc_table_html_dict[doc_id].append((table_id, each_html_str))
    #####################################
    for doc_id, table_html_str_list in doc_table_html_dict.items():
        doc_path = os.path.join(doc_html_path, str(doc_id))
        if not os.path.exists(doc_path):
            os.mkdir(doc_path)
        html_str = '<html><body>'
        t_html_str = '<html><body>'
        for (table_id, each_html_str) in table_html_str_list:
            print ' +++', table_id
            html_str += '<hr>' + str(each_html_str) + '<hr>'
            t_html_str += '<hr><div id="table-' + str(table_id) + '" class="table-container">' + str(each_html_str) + '</div><hr>'
            tab_path = os.path.join(table_html_path, str(table_id) + '.html')
            ftab = open(tab_path, 'w')
            ftab.write('<html><body>' + str(each_html_str) + '</body></html>')
            ftab.close()
        html_str += '</body></html>'
        t_html_str += '</body></html>'
        html_fname = os.path.join(doc_path, '1.html')
        fout = open(html_fname, 'w')
        fout.write(html_str)
        fout.close()
        html_fname = os.path.join(doc_path, '2.html')
        fout = open(html_fname, 'w')
        fout.write(t_html_str)
        fout.close()
    msg = insert_update_table_report(company_id, all_doc_table_to_process, company_name, model_number)
    fname = os.path.join(lmdb_folder, 'errors.txt')
    fout = open(fname, 'w')
    for st in error_list:
        st = st + '\n'
        fout.write(st)
    fout.close()
    if error_list:
        print 'please look at this error log', fname
    print "done"
def generate(company_id, given_doc_ids=None):
    project_id, url_id = company_id.split('_')
    all_doc_table_to_process = []
    norm_res_list = sObj.slt_normresids(project_id, url_id)
    for doc_tup in norm_res_list:
        doc_id, page_number, norm_table_id = map(lambda x: str(x), doc_tup)
        if norm_table_id not in ['452', '449', '446', '462']:
            continue
        ktup = (doc_id, norm_table_id)
        #hf = os.path.join(output_path, company_id, 'Table_Htmls', '%s.html' % (norm_table_id))
        #if os.path.exists(hf): continue
        if given_doc_ids:
            if doc_id in given_doc_ids:
                #if norm_table_id != '4194': continue
                all_doc_table_to_process.append(ktup)
            continue
        else:
            all_doc_table_to_process.append(ktup)
    lmdb_folder = os.path.join(output_path, company_id)
    if not os.path.exists(lmdb_folder):
        os.mkdir(lmdb_folder)
    doc_html_path = os.path.join(lmdb_folder, 'Doc_Htmls')
    if not os.path.exists(doc_html_path):
        os.mkdir(doc_html_path)
    table_html_path = os.path.join(lmdb_folder, 'Table_Htmls')
    if not os.path.exists(table_html_path):
        os.mkdir(table_html_path)
    ######################################################
    #all_doc_table_to_process = [('164', '70286')]
    doc_table_html_dict = {}
    #for x in all_doc_table_to_process:
    #    #print x
    #    f = generate_map_ds(company_id, x)
    #sys.exit()
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id, all_doc_table_to_process[x]),
        range(0, len(all_doc_table_to_process)), 4)
    project_id, url_id = company_id.split('_')
    table_id_xml_bbox_dict = {}
    #######################################
    error_list = []
    #for [doc_id, table_id, each_html_str, flg] in res:
    for rf in res:
        if len(rf) != 6:
            continue
        doc_id, table_id, each_html_str, flg, sflg, er_str = rf
        if not flg:
            st = '\t'.join([doc_id, table_id, sflg, er_str])
            error_list.append(st)
            continue
        if doc_id not in doc_table_html_dict:
            doc_table_html_dict[doc_id] = []
        try:
            Xml_bb_dict = Xml_Cell_Obj.get_cell_bbox_data(project_id, url_id, doc_id, table_id)
            table_id_xml_bbox_dict[table_id] = str(Xml_bb_dict)
        except Exception as e:
            st = '\t'.join([doc_id, table_id, 'BBOX', str(e)])
            error_list.append(st)
        doc_table_html_dict[doc_id].append((table_id, each_html_str))
    #####################################
    for doc_id, table_html_str_list in doc_table_html_dict.items():
        doc_path = os.path.join(doc_html_path, str(doc_id))
        if not os.path.exists(doc_path):
            os.mkdir(doc_path)
        html_str = '<html><body>'
        t_html_str = '<html><body>'
        for (table_id, each_html_str) in table_html_str_list:
            html_str += '<hr>' + each_html_str + '<hr>'
            t_html_str += '<hr><div id="table-' + str(table_id) + '" class="table-container">' + each_html_str + '</div><hr>'
            tab_path = os.path.join(table_html_path, str(table_id) + '.html')
            ftab = open(tab_path, 'w')
            ftab.write('<html><body>' + each_html_str + '</body></html>')
            ftab.close()
        html_str += '</body></html>'
        t_html_str += '</body></html>'
        html_fname = os.path.join(doc_path, '1.html')
        fout = open(html_fname, 'w')
        fout.write(html_str)
        fout.close()
        html_fname = os.path.join(doc_path, '2.html')
        fout = open(html_fname, 'w')
        fout.write(t_html_str)
        fout.close()
    #print table_id_xml_bbox_dict
    if not given_doc_ids:
        fname = os.path.join(lmdb_folder, 'xml_bbox_map')
        cmd = 'rm -rf %s' % (fname)
        os.system(cmd)
        env = lmdb.open(fname, map_size=10 * 1000 * 1000 * 1000)
        with env.begin(write=True) as txn:
            for k, v in table_id_xml_bbox_dict.items():
                txn.put('RST:' + k, v)
    fname = os.path.join(lmdb_folder, 'errors.txt')
    fout = open(fname, 'w')
    for st in error_list:
        st = st + '\n'
        fout.write(st)
    fout.close()
    if error_list:
        print 'please look at this error log', fname
    print "done"
    df = pd.concat(results)
    print df

def printIt(s):
    print "original, sampled"
    for n in s:
        if n != -1:
            print '{0}, {1}'.format(n[0], n[1])

if sys.argv[1] == 'sub':
    func = task_sub
    report = report_sub
elif sys.argv[1] == 'eq':
    func = task_eq_mut
    report = report_eq
elif sys.argv[1] == 'muscore':
    func = task_sample_muscore
    report = printIt

if len(sys.argv) > 3:
    results = pprocess.pmap(func, glob.glob(sys.argv[2]), limit=8)
else:
    results = map(func, glob.glob(sys.argv[2]))

report(results)
    time.sleep(delay)
    return i * N + j

# Main program.

if __name__ == "__main__":

    t = time.time()

    # Initialise an array.

    sequence = []
    for i in range(0, N):
        for j in range(0, N):
            sequence.append((i, j))

    # Perform the work.

    results = pprocess.pmap(calculate, sequence, limit=limit)

    # Show the results.

    for i, result in enumerate(results):
        print result,
        if i % N == N - 1:
            print

    print "Time taken:", time.time() - t

# vim: tabstop=4 expandtab shiftwidth=4
import pprocess
import scipy.stats as st
import time
import numpy

def pointless_function(x):
    rv = st.norm.rvs(loc=x, scale=1, size=2e8)
    m = numpy.mean(rv)
    return m

if __name__ == "__main__":
    list_of_args = [1, 3, 10]

    # Serial computation:
    start = time.time()
    serial = map(pointless_function, list_of_args)
    # Print results.
    for i in serial:
        print i
    print "%f s for serial computation." % (time.time() - start)

    # Parallel computation:
    start = time.time()
    nproc = 3  # maximum number of simultaneous processes desired
    parallel = pprocess.pmap(pointless_function, list_of_args, limit=nproc)
    # Print results.
    for i in parallel:
        print i
    print "%f s for parallel computation." % (time.time() - start)
def generate(company_id):
    doc_page_cord_dict = cobj.get_adjustment_coordinates1(company_id)
    from getCompanyName_machineId import getCN_MID
    getCompanyName_machineId = getCN_MID()
    company_name, machine_id = getCompanyName_machineId[company_id]
    project_id, url_id = company_id.split('_')
    all_doc_table_to_process = []
    norm_res_list = sObj.slt_normresids(project_id, url_id)
    doc_page_grid_dict = {}
    doc_table_page_dict = {}
    for doc_tup in norm_res_list:
        doc_id, page_number, norm_table_id = map(lambda x: str(x), doc_tup)
        #if doc_id != '44': continue
        #if norm_table_id != '6334': continue
        ktup = (doc_id, norm_table_id)
        doc_table_page_dict[ktup] = page_number
        all_doc_table_to_process.append(ktup)
        if doc_id not in doc_page_grid_dict:
            doc_page_grid_dict[doc_id] = {}
        if page_number not in doc_page_grid_dict[doc_id]:
            doc_page_grid_dict[doc_id][page_number] = []
        doc_page_grid_dict[doc_id][page_number].append(norm_table_id)
    #print doc_page_grid_dict['28'].keys()
    #sys.exit()
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id, all_doc_table_to_process[x]),
        range(0, len(all_doc_table_to_process)), 8)
    doc_id_page_number_bbox_dict = {}
    #######################################
    total = len(all_doc_table_to_process)
    cnt = 1
    for (ktup, rdict, celldata) in res:
        doc_id, table_id = ktup
        #page_number = doc_table_page_dict[ktup]
        xml_sec_type_dict = get_cell_mdict(celldata)
        print [ktup, cnt, '/', total]
        for xml_id, c_ar in rdict.items():
            if not xml_id.strip():
                continue
            #sys.exit()
            page_number = xml_id.split('#')[0].split('_')[-1].strip()
            dk = (doc_id + '.pdf', page_number)
            r, c, txt, sec_type = xml_sec_type_dict[xml_id]
            b_ar, page_n = c_ar
            if str(page_n) == page_number:
                if dk not in doc_id_page_number_bbox_dict:
                    doc_id_page_number_bbox_dict[dk] = {}
                if sec_type not in doc_id_page_number_bbox_dict[dk]:
                    doc_id_page_number_bbox_dict[dk][sec_type] = []
                n_ar = []
                for ar in b_ar:
                    st = '_'.join(map(str, ar))
                    n_ar.append(st)
                bb = '$'.join(n_ar)
                pc = doc_page_cord_dict.get(doc_id, {}).get(page_number, '')
                #print [doc_id, table_id, page_number, xml_id, txt, pc]
                if not pc:
                    print [doc_id, table_id, page_number, xml_id, txt, pc]
                    print 'page cord error'
                    sys.exit()
                dd = (table_id, r, c, txt, bb, pc)
                if dd not in doc_id_page_number_bbox_dict[dk][sec_type]:
                    doc_id_page_number_bbox_dict[dk][sec_type].append(dd)
        cnt += 1
        #sys.exit()
    ######################################
    ff = '/var/www/html/company_bbox/'
    if not os.path.exists(ff):
        cmd = 'mkdir -p %s' % (ff)
        os.system(cmd)
    fname = os.path.join(ff, company_name + '.txt')
    fout = open(fname, 'w')
    st = '\t'.join(['DOC_PDF', 'TABLE_ID', 'PAGE_NUMBER', 'SECTION_TYPE', 'ROW', 'COL',
                    'TXT', 'BBOX(split by $ then split by _ )', 'PAGE_CORDS'])
    st += '\n'
    fout.write(st)
    for dk, sec_dict in doc_id_page_number_bbox_dict.items():
        for sec_type, bbox_ar in sec_dict.items():
            for (table_id, r, c, txt, bb, pc) in bbox_ar:
                st = '\t'.join([dk[0], table_id, dk[1], sec_type, r, c, txt, bb, pc])
                st += '\n'
                fout.write(st)
    fout.close()