Example #1
def cartesian2cylindical(grid, z=0):
    """Converts and interpolates a 3D Cartesian grid to a cylindrical one.

    z is the index of the z axis; defaults to 0 (first).
    """
    if len(grid.shape) != 3:
        raise IndexError("expected a 3D grid")
    # Convert each slice perpendicular to the chosen axis in a worker process.
    if z == 0: return np.array([x for x in pp.pmap(cartesian2polar, [grid[i,...] for i in xrange(grid.shape[0])], limit=6)])
    if z == 1: return np.array([x for x in pp.pmap(cartesian2polar, [grid[:,i,:] for i in xrange(grid.shape[1])], limit=6)])
    if z == 2: return np.array([x for x in pp.pmap(cartesian2polar, [grid[...,i] for i in xrange(grid.shape[2])], limit=6)])
    raise ValueError("z must be 0, 1 or 2")
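A minimal usage sketch for the function above, assuming numpy as np, pprocess as pp and a cartesian2polar helper are in scope as the snippet implies; the grid here is random data purely for illustration:

import numpy as np

grid = np.random.random((16, 64, 64))   # toy 3D Cartesian grid
cyl = cartesian2cylindical(grid, z=0)   # each z-slice is converted in a worker process
print cyl.shape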
Example #2
def azimuthalAverage(grid, z=0):
    """Averages an x, y, z cube in phi about the z axis.

    z is the index of the axis to average around.
    """
    if len(grid.shape) != 3:
        raise IndexError("expected a 3D grid")
    # For each slice: convert to polar, average over phi, convert back.
    f = lambda x: polar2cartesian(Average(cartesian2polar(x)))
    if z == 0: return np.array([x for x in pp.pmap(f, [grid[i,...] for i in xrange(grid.shape[0])], limit=6)])
    if z == 1: return np.array([x for x in pp.pmap(f, [grid[:,i,:] for i in xrange(grid.shape[1])], limit=6)])
    if z == 2: return np.array([x for x in pp.pmap(f, [grid[...,i] for i in xrange(grid.shape[2])], limit=6)])
    raise ValueError("z must be 0, 1 or 2")
Example #3
def main(resultfilename):
    start_1=datetime.datetime.now()
    print str(start_1)
    resultfile=open(resultfilename,'w')
    #used for computing tf
    sequence_tf=getSequence(trainfile)
    #used for computing idf
    global user_itempairs_idf_list,user_itempairs_tf_list
    user_user_sim_matrix=[[0 for i in range(user_num)]for j in range(user_num)]
    start_2=datetime.datetime.now()
    print str(start_2)+' 2 used '+str(start_2-start_1)+' total used '+str(start_2-start_1)
    results=pprocess.pmap(calculate_tf,sequence_tf,limit)
    for result in results:
        user_id,item_pairs_list=result
        user_itempairs_tf_list.append((user_id,item_pairs_list))
        for i in range(len(item_pairs_list)):
            item_i,item_j,tf,score=item_pairs_list[i]
            if score>0:
                user_itempairs_idf_list[int(item_i)-1][int(item_j)-1]+=1
            else:
                user_itempairs_idf_list[int(item_j)-1][int(item_i)-1]+=1
        #print 'user_id for tf '+user_id
    start_3=datetime.datetime.now()
    print str(start_3)+' 3 used '+str(start_3-start_2)+' total used '+str(start_3-start_1)
    global user_itempairs_tf_idf_list
    results=pprocess.pmap(calculate_tf_idf,user_itempairs_tf_list,limit)
    for result in results:
        user_itempairs_tf_idf_list.append(result)
    del user_itempairs_tf_list
    del user_itempairs_idf_list
    start_4=datetime.datetime.now()
    print str(start_4)+' 4 used '+str(start_4-start_3)+' total used '+str(start_4-start_1)
    results=pprocess.pmap(calculate_sim,user_itempairs_tf_idf_list,limit)
    for result in results:
        for i in range(len(result)):
            u_i=int(result[i][0])-1
            u_j=int(result[i][1])-1
            sim=float(result[i][2])
            user_user_sim_matrix[u_i][u_j]=sim
        #print str(u_i)+' is computed ok'
    start_5=datetime.datetime.now()
    del user_itempairs_tf_idf_list
    print str(start_5)+' 5 used '+str(start_5-start_4)+' total used '+str(start_5-start_1)
    for i in range(user_num):
        for j in range(user_num):
            resultfile.write(str(user_user_sim_matrix[i][j]))
            resultfile.write(' ')
        resultfile.write('\n')
    start_6=datetime.datetime.now()
    print str(start_6)+' 6 used '+str(start_6-start_5)+' total used '+str(start_6-start_1)
    resultfile.close()
    trainfile.close()
    start_7=datetime.datetime.now()
    print str(start_7)+' 7 used '+str(start_7-start_6)+' total used '+str(start_7-start_1)
    print 'sim end'
    print '********************************'
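The six timing checkpoints in main() repeat one pattern. A small helper along these lines (hypothetical, not part of the original) would express it once:

import datetime

def checkpoint(stage, prev, first):
    # Print this stage's elapsed time and the running total; return now.
    now = datetime.datetime.now()
    print str(now) + ' ' + str(stage) + ' used ' + str(now - prev) + ' total used ' + str(now - first)
    return now

# e.g. start_2 = checkpoint(2, start_1, start_1)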
Example #4
    def parse_gpo_hearings(self, n_cores=4):
        """ Primary parser function. Wraps and parallelizes methods described elsewhere in this file. """
        import pprocess

        def parse(data):
            """

            Wrapper for parser function, intended for parallel processing. Takes a data object with an initialized
            connection and a set of IDs to query and parse.

            """

            cur = data['con'].cursor(cursor_factory=psycopg2.extras.DictCursor)

            output = []

            for j in data['id_inds']:
                id_to_parse = self.id_values[j]

                cur.execute('select * from hearings where id = %s',
                            (id_to_parse, ))
                entry = cur.fetchone()
                if entry is not None:
                    parsed = ParseHearing(
                        entry,
                        committee_data=self.committee_data,
                        member_table=self.member_table).parsed
                    output.append(parsed)
                else:
                    print((' Warning: id {} not found!'.format(id_to_parse)))

            # Return the list of parsed hearings for this chunk of IDs.
            return output

        n_ids = len(self.id_values)

        # if n_ids is reasonably large (say >100), parallelize; if not, just do in serial
        if n_ids > 100:
            to_analyze = [{'con': psycopg2.connect(**self.credentials),
                           'id_inds': list(range(int(i * n_ids / n_cores),
                                                 int((i + 1) * n_ids / n_cores)))}
                          for i in range(n_cores)]

            self.results = [
                r for r in pprocess.pmap(parse, to_analyze, limit=n_cores)
            ]
            self.results = list(chain(*self.results))
        else:
            con = psycopg2.connect(**self.credentials)
            self.results = parse({
                'con': con,
                'id_inds': list(range(len(self.id_values)))
            })
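Two details worth noting in parse_gpo_hearings: each worker dict carries its own psycopg2 connection, since a connection opened in the parent cannot safely be shared across the forked worker processes pprocess creates, and the index arithmetic splits the ID range into n_cores contiguous chunks. A standalone sketch of that chunking (chunk_indices is a hypothetical helper, not part of the original class):

def chunk_indices(n_items, n_chunks):
    # Mirror the int(i * n / k) arithmetic used above.
    return [list(range(int(i * n_items / n_chunks),
                       int((i + 1) * n_items / n_chunks)))
            for i in range(n_chunks)]

# chunk_indices(10, 4) -> [[0, 1], [2, 3, 4], [5, 6], [7, 8, 9]]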
Example #5
def generate(company_id, tables_list=None):

    project_id, url_id = company_id.split('_')

    all_doc_table_to_process = []
    norm_res_list = sObj.slt_normresids(project_id, url_id)

    for doc_tup in norm_res_list:

        doc_id, page_number, norm_table_id = map(lambda x: str(x), doc_tup)

        if tables_list and norm_table_id not in tables_list:
            continue  # Only selected tables

        ktup = (doc_id, norm_table_id)

        all_doc_table_to_process.append(ktup)
    lmdb_folder = os.path.join(output_path, company_id)
    if not os.path.exists(lmdb_folder):
        os.mkdir(lmdb_folder)
    doc_table_html_dict = {}
    #print company_id, "TOTAL TABLES", len(norm_res_list)
    #print company_id, "SELECTED TABLES", len(all_doc_table_to_process)
    # Debug loop kept for reference; its stray sys.exit() calls would stop
    # the function before the parallel pass below ever ran.
    #for x in all_doc_table_to_process:
    #    f = generate_map_ds(company_id, x)
    #    print f
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id, all_doc_table_to_process[x]),
        range(0, len(all_doc_table_to_process)), 4)
    table_id_xml_bbox_dict = {}
    #######################################
    for (ktup, rdict) in res:
        doc_id, table_id = ktup
        if rdict:
            table_id_xml_bbox_dict[table_id] = str(rdict)
    if table_id_xml_bbox_dict:
        fname = os.path.join(lmdb_folder, 'xml_bbox_map')
        if not tables_list:  # Running for all tables, hence remove old data and create new
            cmd = 'rm -rf %s' % (fname)
            os.system(cmd)
        env = lmdb.open(fname, map_size=10 * 1000 * 1000 * 1000)
        with env.begin(write=True) as txn:
            for k, v in table_id_xml_bbox_dict.items():
                txn.put('RST:' + k, v)
        print "done"
Example #6
def predict_rankbased(testfilename, resultfilename):
    testfile=open(testfilename,'r')
    resultfile=open(resultfilename,'w')
    sequence=getSequence(testfile)  
    results=pprocess.pmap(calculate, sequence, limit)
    for i in range(len(sequence)):
        userid,rank_list=results[i]
        for j in range(len(rank_list)):
            resultfile.write(str(userid))
            resultfile.write(' ')
            resultfile.write(str(rank_list[j][1]))
            resultfile.write(' ')
            resultfile.write(str(rank_list[j][0]))
            resultfile.write('\n') 
        print str(userid)+' done'
    testfile.close()
    resultfile.close()
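pprocess.pmap returns a list-like object whose entries appear in input order, and indexing results[i] waits for that entry if it is not ready yet, so the loop above can equally consume the results directly. A sketch using the same names:

for userid, rank_list in pprocess.pmap(calculate, sequence, limit):
    for entry in rank_list:
        resultfile.write('%s %s %s\n' % (userid, entry[1], entry[0]))
    print str(userid) + ' done'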
Example #7
    def render(self, filename, limit):

        """
        Render the image with many processes, saving it to 'filename', using the
        given process 'limit' to constrain the number of processes used.
        """

        image = Image.new("RGB", (self.width, self.height))
        draw = ImageDraw.Draw(image)
        total = self.width * self.height
        count = 0

        for sy, row in enumerate(pprocess.pmap(self.render_row, xrange(0, self.height), limit)):
            for sx, col in enumerate(row):
                draw.point((sx, sy), fill=(int(col.x * 255), int(col.y * 255), int(col.z * 255)))
                count = count + 1
            percent = int((count / float(total)) * 100)
            # "\010" is backspace: rewind over the 9-character status field.
            sys.stdout.write(("\010" * 9) + "%3d%% %3d" % (percent, sy))
            sys.stdout.flush()

        image.save(filename)
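The row-by-row drawing above works because pmap hands back results in input order even when the worker processes finish at different times. A tiny self-contained demonstration of that ordering:

import time
import random
import pprocess

def slow_square(n):
    time.sleep(random.random())   # workers finish in a scrambled order
    return n * n

if __name__ == "__main__":
    # Prints [0, 1, 4, 9, 16, 25, 36, 49] regardless of completion order.
    print list(pprocess.pmap(slow_square, range(8), limit=4))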
Example #8
def generate_ret(company_id, tables_list=None):

    project_id, url_id = company_id.split('_')

    all_doc_table_to_process = []
    norm_res_list = sObj.slt_normresids(project_id, url_id)

    for doc_tup in norm_res_list:

        doc_id, page_number, norm_table_id = map(lambda x: str(x), doc_tup)

        if tables_list and norm_table_id not in tables_list:
            continue  # Only selected tables

        ktup = (doc_id, norm_table_id)

        all_doc_table_to_process.append(ktup)
    lmdb_folder = os.path.join(output_path, company_id)
    if not os.path.exists(lmdb_folder):
        os.mkdir(lmdb_folder)
    doc_table_html_dict = {}
    #print company_id, "TOTAL TABLES", len(norm_res_list)
    #print company_id, "SELECTED TABLES", len(all_doc_table_to_process)
    #for x in all_doc_table_to_process:
    #    generate_map_ds(company_id, x)
    #sys.exit()
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id, all_doc_table_to_process[x]),
        range(0, len(all_doc_table_to_process)), 4)
    table_id_xml_bbox_dict = {}
    #######################################
    for (ktup, rdict) in res:
        doc_id, table_id = ktup
        if rdict:
            table_id_xml_bbox_dict[table_id] = rdict
    return table_id_xml_bbox_dict
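Unlike the generate() in Example #5, generate_ret leaves persistence to the caller. A sketch of how the returned mapping could be stored, mirroring the LMDB write in Example #5 (persist_bbox_map is a hypothetical helper; the os and lmdb imports are assumed from the surrounding module):

def persist_bbox_map(lmdb_folder, table_id_xml_bbox_dict):
    fname = os.path.join(lmdb_folder, 'xml_bbox_map')
    env = lmdb.open(fname, map_size=10 * 1000 * 1000 * 1000)
    with env.begin(write=True) as txn:
        for k, v in table_id_xml_bbox_dict.items():
            txn.put('RST:' + k, str(v))   # same key scheme as Example #5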
Example #9
def ruleGenerationPP(argWrapper, orDescriptors=None,consistent=1):
    """
    # Example of `orDescriptors`:
    # ['temperature',['normal','high']]
    # ['AttName2',[0,1,2],[8,9,10]]
    # orDescriptors=[[u'temperature', [u'normal', u'high']]]
    # dataWrapper=ruleGeneration(argWrapper,orDescriptors=orDescriptors)
    """

    if not orDescriptors:
        orDescriptors = []

    dataWrapper=argWrapper
    localsList=['numOB', 'numAT', 'support', 'accuracy', 'decision',
                'infoMsg', 'attNameList','AT', 'OB']
    numOB,numAT,support,accuracy,decision,infoMsg,attNameList,AT,OB=\
        tuple([dataWrapper[k] for k in localsList])


    # Todo: modify AT based on orDescriptors
    # print str(orDescriptors) >>>[[u'temperature', [u'normal', u'high']]]
    for od in orDescriptors:
        idx_att = attNameList.index(od[0])
        at_index_att = AT[idx_att]
        for cb in od[1:]:
            for av in cb:
                at_index_att.remove([av])   # drop the single-value descriptor
            AT[idx_att].append(cb)          # add the merged (OR) descriptor


    D = decision - 1
    conIndexes=filter(lambda x:x !=D,xrange(0,numAT))
    CF=[]
    DCStep1=[]
    gStep=2
    NumOfParticles=6

    #init CF & DCStep1
    for d in xrange(0, len(AT[D])):
        CF.append([[] for _ in itertools.repeat(None, len(conIndexes))])
        lDCAll = [[] for _ in itertools.repeat(None, numAT)]
        #place holder for lDCAll[C].append(thisDC)
        DCStep1.append(lDCAll)


    # Distribute the (d, C) tuples across worker processes.  Materialise the
    # product as a list: iterating it here (e.g. for debugging) would exhaust
    # the iterator and leave pmap with no work to do.
    dctps = list(itertools.product(xrange(0, len(AT[D])), conIndexes))


    def kPerDC(dctp):
        d, C = dctp
        CFd0 = []
        DCStep1dC = []

        #print dctp

        for k in xrange(0, len(AT[C])):
            thisDC = [[] for _ in itertools.repeat(None, NumOfParticles)]
            sATV = set(AT[C][k])
            sDECV = set(AT[D][d])
            for ob in xrange(0, numOB):
                #todo!!!init OB as set
                sCON = set(OB[ob][C])
                sDEC = set(OB[ob][D])
                if sCON == sATV:#inf(p)
                    if sDEC == sDECV:
                        thisDC[0].append(ob + 1)
                    elif sDECV.issubset(sDEC):
                        thisDC[1].append(ob + 1)
                    else:
                        thisDC[2].append(ob + 1)
                elif sATV.issubset(sCON):#sup(p)-inf(p)
                    if sDEC == sDECV:
                        thisDC[3].append(ob + 1)
                    elif sDECV.issubset(sDEC):
                        thisDC[4].append(ob + 1)
                    else:
                        thisDC[5].append(ob + 1)
                else:#p'
                    #thisDC[6].append(ob+1)
                    pass
                    # compute criteria values
            minsupp, minacc, maxsupp, maxacc = calCriteriaFromDC(thisDC, numOB)

            if minsupp >= support and minacc >= accuracy:
                CFd0.append([[C, k, d, minsupp, minacc, maxsupp, maxacc, 'Lower']])
            elif maxsupp >= support and maxacc >= accuracy:
                CFd0.append([[C, k, d, minsupp, minacc, maxsupp, maxacc, 'Upper']])
            elif maxsupp >= support:
                CFd0.append([[C, k, d, minsupp, minacc, maxsupp, maxacc, 'flag4merge']])
            else:
                #only append to CF the promising DC better than 'flag4merge'
                pass
                #append all thisDC to lDCAll
            DCStep1dC.append([d, C, thisDC])

        # pprocess runs kPerDC in separate worker processes, so it sees copies
        # of the enclosing variables; mutating DCStep1 or CF here would not be
        # visible to the parent, hence the partial results are returned.
        return (DCStep1dC, CFd0)

    #results_pp=pprocess.Queue(limit=4)
    #kPerDC_wrapper=results_pp.manage(pprocess.MakeParallel(kPerDC))
    #for dctp in dctps:
    #     kPerDC_wrapper(dctp)

    #results_pp=map(kPerDC,dctps)
    results_pp=pprocess.pmap(kPerDC,dctps,limit=8)

    for r in results_pp:
        dcws, cfs = r
        for dcw in dcws:
            d, C, thisDC = dcw
            DCStep1[d][C].append(thisDC)
        # each result covers a single (d, C) pair, so the d unpacked above
        # also applies to the cfs entries of the same result
        for cf in cfs:
            CF[d][0].append(cf)


    if gStep==2:
        #current step
        cf=1
        DCCurrent=[ [] for _ in itertools.repeat(None,len(AT[D]))]

        # certain rules in step1
        rs=[]
        for d in xrange(0,len(AT[D])):
            lenCFm1=len(CF[d][cf-1])
            for k in xrange(0,lenCFm1):
                if CF[d][cf-1][k][0][7] != 'flag4merge':
                    rs.append(CF[d][cf-1][k][0][0:2])

        for d in xrange(0,len(AT[D])):
            lenCFm1=len(CF[d][cf-1])

            for k in xrange(0,lenCFm1):
                # flag4merge rules CAN be used in multiple complex rules
                if CF[d][cf-1][k][0][7] != 'flag4merge':
                    continue

                ##merge strategy consistent==1
                if consistent == 1 and CF[d][cf-1][k][0][0:2] in rs:
                    continue

                iC, iK, iD = CF[d][cf-1][k][0][0:3]

                # for: k to merge
                for km in xrange(k+1,lenCFm1):
                    if CF[d][cf-1][km][0][7] != 'flag4merge':
                        continue

                    ##merge strategy consistent==1
                    if consistent == 1 and CF[d][cf-1][km][0][0:2] in rs:
                        continue

                    iCm,iKm,iDm=CF[d][cf-1][km][0][0:3]
                    # not appropriate to merge
                    if iC==iCm or iD!=iDm:
                        continue
                    DCK=DCStep1[iD][iC][iK]
                    DCKm=DCStep1[iDm][iCm][iKm]
                    mDC=[ [] for _ in itertools.repeat(None,NumOfParticles)]
                    mDC[0]=list(set(DCK[0])& set(DCKm[0]))
                    mDC[1]=list(set(DCK[1])& set(DCKm[1]))
                    mDC[2]=list(set(DCK[2])& set(DCKm[2]))
                    mDC[3]=list((set(DCK[0])& set(DCKm[3]))|(set(DCK[3])& set(DCKm[0]))|(set(DCK[3])& set(DCKm[3])))
                    mDC[4]=list((set(DCK[1])& set(DCKm[4]))|(set(DCK[4])& set(DCKm[1]))|(set(DCK[4])& set(DCKm[4])))
                    mDC[5]=list((set(DCK[2])& set(DCKm[5]))|(set(DCK[5])& set(DCKm[2]))|(set(DCK[5])& set(DCKm[5])))
                    minsupp,minacc,maxsupp,maxacc = calCriteriaFromDC(mDC,numOB)
                    if minsupp >=support and minacc>=accuracy:
                        # CF[d][cf-1][km][0][7] = 'flag4mergeDone'
                        CF[d][cf].append([[iC,iK,iD,minsupp,minacc,maxsupp,maxacc,'Lower'],
                                          [iCm,iKm,iDm,minsupp,minacc,maxsupp,maxacc,'Lower']])
                        #todo:mark DCStep2 unused
                        # DCCurrent[d].append([[iC,iK,iD],[iCm,iKm,iDm],mDC])
                    elif maxsupp >=support and maxacc>=accuracy:
                        # CF[d][cf-1][km][0][7] = 'flag4mergeDone'
                        CF[d][cf].append([[iC,iK,iD,minsupp,minacc,maxsupp,maxacc,'Upper'],
                                          [iCm,iKm,iDm,minsupp,minacc,maxsupp,maxacc,'Upper']])
                        # DCCurrent[d].append([[iC,iK,iD],[iCm,iKm,iDm],mDC])
                    else:
                        #the merged DC still don't satisfy
                        pass
                    # eof: for km
                #eof: for k
            #eof: for d
        # #DCStep2
        DCStep2=DCCurrent
        #eof: if gStep==2
    for k in ['gStep','D','CF','DCStep1']:
        dataWrapper[k]=locals()[k]
    return dataWrapper
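kPerDC returns its partial results rather than mutating DCStep1 and CF in place because, as the comment in it notes, each pprocess worker operates on its own copy of the parent's variables. A minimal self-contained illustration of that return-and-merge pattern:

import pprocess

def worker(task):
    # Compute and return partial results; mutating shared state here
    # would only affect this worker's copy.
    return [task, task * 2]

if __name__ == "__main__":
    merged = []
    for partial in pprocess.pmap(worker, range(4), limit=2):
        merged.extend(partial)   # merge in the parent process
    print merged   # [0, 0, 1, 2, 2, 4, 3, 6]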
Example #10
def generate(company_id, given_doc_ids=None, src_lan=None):
    company_name = get_comp_model(company_id)
    model_number = '50'
    project_id, url_id = company_id.split('_')
    all_doc_table_to_process = []
    norm_res_list = get_norm_table_ids(company_name, model_number)
    print len(norm_res_list)
    #sys.exit()
    table_cell_dict = {}
    for doc_tup in norm_res_list:
        doc_id, page_number, norm_table_id, cell_dict, table_type = doc_tup
        #if norm_table_id != '100':continue
        #if int(norm_table_id) < 360:continue
        if table_type != 'Sentence':
            table_type = 'Table'
        ktup = (doc_id, norm_table_id, page_number, table_type)
        table_cell_dict[ktup] = {0: cell_dict}
        if given_doc_ids:
            if doc_id in given_doc_ids:
                all_doc_table_to_process.append(ktup)
            continue
        else:
            all_doc_table_to_process.append(ktup)
    lmdb_folder = os.path.join(output_path, company_id)
    if not os.path.exists(lmdb_folder):
        os.mkdir(lmdb_folder)
    doc_html_path = os.path.join(lmdb_folder, 'Doc_Htmls')
    if not os.path.exists(doc_html_path):
        os.mkdir(doc_html_path)
    table_html_path = os.path.join(lmdb_folder, 'Table_Htmls')
    if not os.path.exists(table_html_path):
        os.mkdir(table_html_path)
    ######################################################
    #all_doc_table_to_process = [('164', '70286')]
    total = len(all_doc_table_to_process)
    translated_dict = {}
    res = []
    '''
    for i, x in enumerate(all_doc_table_to_process):
        print [x, i+1, '/', total]
        #if (x[1] != '8896'): continue
        f = generate_map_ds(company_id, x, table_cell_dict[x], src_lan, translated_dict)
        #print table_cell_dict[x]
        #sys.exit()
        each_html_str = f[2]
        table_id = x[1]
        tab_path = os.path.join(table_html_path, str(table_id)+'.html')
        ftab = open(tab_path, 'w')
        ftab.write('<html><body>'+str(each_html_str)+'</body></html>')
        ftab.close()
        res.append(f)
    #sys.exit()
    '''
    print len(all_doc_table_to_process)
    doc_table_html_dict = {}
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id,
                                  all_doc_table_to_process[x],
                                  table_cell_dict[all_doc_table_to_process[x]],
                                  src_lan, translated_dict),
        range(0, len(all_doc_table_to_process)), 6)
    project_id, url_id = company_id.split('_')
    table_id_xml_bbox_dict = {}
    #######################################
    #print res
    error_list = []
    for rf in res:
        if len(rf) != 6: continue
        doc_id, table_id, each_html_str, flg, sflg, er_str = rf
        print doc_id, table_id
        if not flg:
            st = '\t'.join([doc_id, table_id, sflg, er_str])
            error_list.append(st)
            continue
        #print [each_html_str]
        #print '*'*100
        tab_path = os.path.join(table_html_path, str(table_id) + '.html')
        ftab = open(tab_path, 'w')
        ftab.write('<html><body>' + each_html_str + '</body></html>')
        ftab.close()
        if doc_id not in doc_table_html_dict:
            doc_table_html_dict[doc_id] = []
        #try:
        #    Xml_bb_dict = Xml_Cell_Obj.get_cell_bbox_data(project_id, url_id, doc_id, table_id)
        #    table_id_xml_bbox_dict[table_id] = str(Xml_bb_dict)
        #except Exception as e:
        #    st = '\t'.join([doc_id, table_id, 'BBOX', str(e)])
        #    error_list.append(st)
        doc_table_html_dict[doc_id].append((table_id, each_html_str))

    #####################################
    for doc_id, table_html_str_list in doc_table_html_dict.items():
        doc_path = os.path.join(doc_html_path, str(doc_id))
        if not os.path.exists(doc_path):
            os.mkdir(doc_path)
        html_str = '<html><body>'
        t_html_str = '<html><body>'
        for (table_id, each_html_str) in table_html_str_list:
            print '   +++', table_id
            html_str += '<hr>' + str(each_html_str) + '<hr>'
            t_html_str += ('<hr><div id="table-' + str(table_id) +
                           '" class="table-container">' + str(each_html_str) +
                           '</div><hr>')
            tab_path = os.path.join(table_html_path, str(table_id) + '.html')
            ftab = open(tab_path, 'w')
            ftab.write('<html><body>' + str(each_html_str) + '</body></html>')
            ftab.close()
        html_str += '</body></html>'
        t_html_str += '</body></html>'
        html_fname = os.path.join(doc_path, '1.html')
        fout = open(html_fname, 'w')
        fout.write(html_str)
        fout.close()

        html_fname = os.path.join(doc_path, '2.html')
        fout = open(html_fname, 'w')
        fout.write(t_html_str)
        fout.close()
    msg = insert_update_table_report(company_id, all_doc_table_to_process,
                                     company_name, model_number)
    fname = os.path.join(lmdb_folder, 'errors.txt')
    fout = open(fname, 'w')
    for st in error_list:
        st = st + '\n'
        fout.write(st)
    fout.close()
    if error_list:
        print 'please look at this error log', fname
    print "done"
Example #11
def generate(company_id, given_doc_ids=None):
    project_id, url_id = company_id.split('_')
    all_doc_table_to_process = []
    norm_res_list = sObj.slt_normresids(project_id, url_id)
    for doc_tup in norm_res_list:
        doc_id, page_number, norm_table_id = map(lambda x: str(x), doc_tup)
        #if norm_table_id not in ['452', '449', '446', '462']: continue  # debug filter from a one-off run
        ktup = (doc_id, norm_table_id)
        #hf = os.path.join(output_path, company_id, 'Table_Htmls', '%s.html'%(norm_table_id))
        #if os.path.exists(hf):continue
        if given_doc_ids:
            if doc_id in given_doc_ids:
                #if norm_table_id != '4194':continue
                all_doc_table_to_process.append(ktup)
            continue
        else:
            all_doc_table_to_process.append(ktup)
    lmdb_folder = os.path.join(output_path, company_id)
    if not os.path.exists(lmdb_folder):
        os.mkdir(lmdb_folder)
    doc_html_path = os.path.join(lmdb_folder, 'Doc_Htmls')
    if not os.path.exists(doc_html_path):
        os.mkdir(doc_html_path)
    table_html_path = os.path.join(lmdb_folder, 'Table_Htmls')
    if not os.path.exists(table_html_path):
        os.mkdir(table_html_path)
    ######################################################
    #all_doc_table_to_process = [('164', '70286')]
    doc_table_html_dict = {}

    #for x in all_doc_table_to_process:
    #    #print x
    #    f = generate_map_ds(company_id, x)
    #sys.exit()
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id, all_doc_table_to_process[x]),
        range(0, len(all_doc_table_to_process)), 4)
    project_id, url_id = company_id.split('_')
    table_id_xml_bbox_dict = {}
    #######################################
    error_list = []
    #for [doc_id, table_id, each_html_str, flg] in res:
    for rf in res:
        if len(rf) != 6: continue
        doc_id, table_id, each_html_str, flg, sflg, er_str = rf
        if not flg:
            st = '\t'.join([doc_id, table_id, sflg, er_str])
            error_list.append(st)
            continue
        if doc_id not in doc_table_html_dict:
            doc_table_html_dict[doc_id] = []
        try:
            Xml_bb_dict = Xml_Cell_Obj.get_cell_bbox_data(
                project_id, url_id, doc_id, table_id)
            table_id_xml_bbox_dict[table_id] = str(Xml_bb_dict)
        except Exception as e:
            st = '\t'.join([doc_id, table_id, 'BBOX', str(e)])
            error_list.append(st)
        doc_table_html_dict[doc_id].append((table_id, each_html_str))
    #####################################
    for doc_id, table_html_str_list in doc_table_html_dict.items():
        doc_path = os.path.join(doc_html_path, str(doc_id))
        if not os.path.exists(doc_path):
            os.mkdir(doc_path)
        html_str = '<html><body>'
        t_html_str = '<html><body>'
        for (table_id, each_html_str) in table_html_str_list:
            html_str += '<hr>' + each_html_str + '<hr>'
            t_html_str += ('<hr><div id="table-' + str(table_id) +
                           '" class="table-container">' + each_html_str +
                           '</div><hr>')
            tab_path = os.path.join(table_html_path, str(table_id) + '.html')
            ftab = open(tab_path, 'w')
            ftab.write('<html><body>' + each_html_str + '</body></html>')
            ftab.close()
        html_str += '</body></html>'
        t_html_str += '</body></html>'
        html_fname = os.path.join(doc_path, '1.html')
        fout = open(html_fname, 'w')
        fout.write(html_str)
        fout.close()

        html_fname = os.path.join(doc_path, '2.html')
        fout = open(html_fname, 'w')
        fout.write(t_html_str)
        fout.close()

    #print table_id_xml_bbox_dict
    if not given_doc_ids:
        fname = os.path.join(lmdb_folder, 'xml_bbox_map')
        cmd = 'rm -rf %s' % (fname)
        os.system(cmd)
        env = lmdb.open(fname, map_size=10 * 1000 * 1000 * 1000)
        with env.begin(write=True) as txn:
            for k, v in table_id_xml_bbox_dict.items():
                txn.put('RST:' + k, v)
    fname = os.path.join(lmdb_folder, 'errors.txt')
    fout = open(fname, 'w')
    for st in error_list:
        st = st + '\n'
        fout.write(st)
    fout.close()
    if error_list:
        print 'please look at this error log', fname
    print "done"
Example #13
    df = pd.concat(results)
    print df


def printIt(s):
    print "original, sampled"
    for n in s:
        if n != -1:
            print '{0}, {1}'.format(n[0], n[1])





if sys.argv[1] == 'sub':
    func = task_sub
    report = report_sub 
elif sys.argv[1] == 'eq':
    func = task_eq_mut
    report = report_eq
elif sys.argv[1] == 'muscore':
    func = task_sample_muscore
    report = printIt

if len(sys.argv) > 3:
    results = pprocess.pmap(func, glob.glob(sys.argv[2]), limit=8)
else:
    results = map(func, glob.glob(sys.argv[2]))

report(results)
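One caveat in the dispatch above: if sys.argv[1] matches none of the branches, func and report stay undefined and the failure only surfaces later as a NameError. A dispatch table with an explicit usage message is one defensive alternative (a sketch using the same names):

DISPATCH = {
    'sub': (task_sub, report_sub),
    'eq': (task_eq_mut, report_eq),
    'muscore': (task_sample_muscore, printIt),
}
try:
    func, report = DISPATCH[sys.argv[1]]
except KeyError:
    sys.exit('usage: %s sub|eq|muscore <glob> [parallel]' % sys.argv[0])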
Example #14
# The snippet begins mid-function; reconstructed header for the work function
# (N, delay and limit are defined earlier in the original file):
def calculate(t):
    i, j = t
    time.sleep(delay)
    return i * N + j

# Main program.

if __name__ == "__main__":

    t = time.time()

    # Initialise an array.

    sequence = []
    for i in range(0, N):
        for j in range(0, N):
            sequence.append((i, j))

    # Perform the work.

    results = pprocess.pmap(calculate, sequence, limit=limit)

    # Show the results.

    for i, result in enumerate(results):
        print result,
        if i % N == N - 1:
            print

    print "Time taken:", time.time() - t

# vim: tabstop=4 expandtab shiftwidth=4
Example #15
import pprocess
import scipy.stats as st
import time
import numpy

def pointless_function(x):
    rv = st.norm.rvs(loc=x, scale=1, size=int(2e8))  # size must be an integer
    m = numpy.mean(rv)
    return m

if __name__ == "__main__":
    list_of_args = [1, 3, 10]

    # Serial computation:
    start = time.time()
    serial = map(pointless_function, list_of_args)
    # Print results.
    for i in serial:
        print i
    print "%f s for serial computation." % (time.time() - start)

    # Parallel computation:
    start = time.time()
    nproc = 3  # maximum number of simultaneous processes desired
    parallel = pprocess.pmap(pointless_function, list_of_args, limit=nproc)
    # Print results.
    for i in parallel:
        print i
    print "%f s for parallel computation." % (time.time() - start)
Example #16
def generate(company_id):
    doc_page_cord_dict = cobj.get_adjustment_coordinates1(company_id)
    from getCompanyName_machineId import getCN_MID
    getCompanyName_machineId = getCN_MID()
    company_name, machine_id = getCompanyName_machineId[company_id]
    project_id, url_id = company_id.split('_')
    all_doc_table_to_process = []
    norm_res_list = sObj.slt_normresids(project_id, url_id)
    doc_page_grid_dict = {}
    doc_table_page_dict = {}
    for doc_tup in norm_res_list:
        doc_id, page_number, norm_table_id = map(lambda x: str(x), doc_tup)
        #if doc_id != '44':continue
        #if norm_table_id != '6334':continue
        ktup = (doc_id, norm_table_id)
        doc_table_page_dict[ktup] = page_number
        all_doc_table_to_process.append(ktup)
        if doc_id not in doc_page_grid_dict:
            doc_page_grid_dict[doc_id] = {}
        if page_number not in doc_page_grid_dict[doc_id]:
            doc_page_grid_dict[doc_id][page_number] = []
        doc_page_grid_dict[doc_id][page_number].append(norm_table_id)
    #print doc_page_grid_dict['28'].keys()
    #sys.exit()
    res = pprocess.pmap(
        lambda x: generate_map_ds(company_id, all_doc_table_to_process[x]),
        range(0, len(all_doc_table_to_process)), 8)
    doc_id_page_number_bbox_dict = {}
    #######################################
    total = len(all_doc_table_to_process)
    cnt = 1
    for (ktup, rdict, celldata) in res:
        doc_id, table_id = ktup
        #page_number = doc_table_page_dict[ktup]
        xml_sec_type_dict = get_cell_mdict(celldata)
        print [ktup, cnt, '/', total]
        for xml_id, c_ar in rdict.items():
            if not xml_id.strip(): continue
            #sys.exit()
            page_number = xml_id.split('#')[0].split('_')[-1].strip()
            dk = (doc_id + '.pdf', page_number)
            r, c, txt, sec_type = xml_sec_type_dict[xml_id]
            b_ar, page_n = c_ar
            if str(page_n) == page_number:
                if dk not in doc_id_page_number_bbox_dict:
                    doc_id_page_number_bbox_dict[dk] = {}
                if sec_type not in doc_id_page_number_bbox_dict[dk]:
                    doc_id_page_number_bbox_dict[dk][sec_type] = []
                n_ar = []
                for ar in b_ar:
                    st = '_'.join(map(str, ar))
                    n_ar.append(st)
                bb = '$'.join(n_ar)
                pc = doc_page_cord_dict.get(doc_id, {}).get(page_number, '')
                #print [doc_id, table_id, page_number, xml_id, txt, pc]
                if not pc:
                    print [doc_id, table_id, page_number, xml_id, txt, pc]
                    print 'page cord error'
                    sys.exit()
                dd = (table_id, r, c, txt, bb, pc)
                if dd not in doc_id_page_number_bbox_dict[dk][sec_type]:
                    doc_id_page_number_bbox_dict[dk][sec_type].append(dd)
        cnt += 1
    #sys.exit()
    ######################################
    ff = '/var/www/html/company_bbox/'
    if not os.path.exists(ff):
        cmd = 'mkdir -p %s' % (ff)
        os.system(cmd)
    fname = os.path.join(ff, company_name + '.txt')
    fout = open(fname, 'w')
    st = '\t'.join([
        'DOC_PDF', 'TABLE_ID', 'PAGE_NUMBER', 'SECTION_TYPE', 'ROW', 'COL',
        'TXT', 'BBOX(split by $ then split by _ )', 'PAGE_CORDS'
    ])
    st += '\n'
    fout.write(st)
    for dk, sec_dict in doc_id_page_number_bbox_dict.items():
        for sec_type, bbox_ar in sec_dict.items():
            for (table_id, r, c, txt, bb, pc) in bbox_ar:
                st = '\t'.join(
                    [dk[0], table_id, dk[1], sec_type, r, c, txt, bb, pc])
                st += '\n'
                fout.write(st)
    fout.close()