コード例 #1
0
ファイル: CFA.py プロジェクト: exedre/e4t
    def execute(self):
        """Download ``self._url`` and extract one table of arrangements.

        The page is fetched with ``urllib2`` (through the proxy returned by
        ``get_proxy()`` when there is one) and parsed with BeautifulSoup.
        The first table whose first cell starts with
        ``"Table <self._table>"`` (case-insensitive) is selected.  Its rows
        are cleaned up, grouped under the section titles listed in
        ``titles`` below and written column by column into a new
        ``DataSet``.

        Returns:
            A ``DataSet`` holding one list per column and section, keyed
            ``"<COLUMN>_<section index>"`` (for instance ``MEMBER_0``), or
            ``None`` when no matching table is found (an error is logged
            in that case).

        Exceptions raised by ``urllib2.urlopen`` are not caught here.
        """
        logger.debug('begin')

        # Route urllib2 traffic through the configured proxy, if any.
        # Note that install_opener() replaces the process-wide default
        # opener, so this affects later urllib2.urlopen() calls as well.
        proxy_info = get_proxy()
        if proxy_info:
            proxies = {
                'http': proxy_info['proxy'],
                'https': proxy_info['proxy'],
            }
            opener = urllib2.build_opener(urllib2.ProxyHandler(proxies))
            urllib2.install_opener(opener)

        page = urllib2.urlopen(self._url)
        try:
            logger.debug('Got %s', self._url)
            # BeautifulSoup consumes the response while building the tree,
            # so the connection can be released right afterwards.
            soup = BeautifulSoup(page)
        finally:
            page.close()

        # This block looks for the requested table (e.g. tab2a): the one
        # whose first cell reads "Table <self._table>...".
        title_re = re.compile('^Table %s' % self._table, re.I)
        table = None
        for candidate in soup.findAll('table'):
            # A table without a leading tr/td cannot carry the title; skip
            # it instead of failing on a missing attribute.
            first_row = candidate.tr
            first_cell = first_row.td if first_row is not None else None
            if first_cell is None:
                continue
            # Match the cell text as it is: forcing it through str() raises
            # UnicodeEncodeError on non-ASCII text under Python 2.
            if title_re.match(first_cell.text):
                table = candidate
                break

        if table is None:
            logger.error('Table %s not in page', self._table)
            return None

        # Trailing digit, optionally followed by '/', at the end of a
        # first-column cell -- presumably a footnote marker such as "1/".
        footnote_re = re.compile(r'^.+([0-9]\/?)$')

        # Turn every table row into a list of cleaned-up cell texts.
        rows = []
        for tr in table.findAll('tr'):
            cells = []
            for col, td in enumerate(tr.findAll('td')):
                txt = td.text
                if txt == ' ':
                    # Single-space cells are dropped, so they do not count
                    # towards the row length tested further down.
                    continue
                if re.match('^--$', txt):
                    # "--" cells are stored as empty strings.
                    txt = ''
                elif txt.startswith(','):
                    # NOTE(review): commas are removed only from cells that
                    # *start* with one, so a value like "1,234" keeps its
                    # separator.  This may or may not be intended; the
                    # existing behaviour is kept -- confirm before changing.
                    txt = txt.replace(',', '')
                elif col == 0:
                    marker = footnote_re.match(txt)
                    if marker:
                        txt = txt[:-len(marker.group(1))]
                # Leading '*' characters are left in place here; they are
                # stripped when the columns are written out below (except
                # for non-empty POQ values).
                cells.append(txt)
            rows.append(cells)

        # Group the 7-column data rows by the section they appear under.
        titles = ['Stand-by Arrangements',
                  'Extended Arrangements',
                  'Flexible Credit Line',
                  'Precautionary']
        expected = titles.pop(0)
        section = -1
        by_section = {}
        for row in rows:
            if len(row) == 1 and re.match("^%s" % expected, row[0], re.I):
                # Section heading: start a new section and wait for the
                # next title (the last title stays current once reached).
                section += 1
                if titles:
                    expected = titles.pop(0)
                    logger.debug('Next Title %s', expected)
            elif len(row) == 7:
                by_section.setdefault(section, []).append(row)
            elif len(row) == 4 and re.search('Arrangements?$', row[0]):
                # Summary line "<n> Arrangement(s)": only used to check the
                # number of rows collected for the current section.  A
                # section with no rows yet counts as zero instead of
                # raising KeyError.
                summary = re.match('^([0-9]+) Arrangements?$', row[0])
                if summary:
                    collected = len(by_section.get(section, []))
                    if collected == int(summary.group(1)):
                        logger.debug("Wow right arrangement found")
            elif row:
                logger.debug('LINE NOT INSERTED %d > %s', len(row), row)

        # Make the result vectors: one list per column and section.
        columns = ('MEMBER',
                   'EFDATE',
                   'EXDATE',
                   'AGREED',
                   'UNDRAWN',
                   'OUTSTANDING',
                   'POQ')
        _ds = DataSet()
        for section, section_rows in sorted(by_section.items()):
            for col, column in enumerate(columns):
                name = "%s_%d" % (column, section)
                if not _ds.has_key(name):
                    _ds[name] = []
                for row in section_rows:
                    value = row[col]
                    if column == 'POQ' and len(value) > 0:
                        # Non-empty POQ values get an escaped percent sign.
                        _ds[name].append(value + "\\%")
                    else:
                        _ds[name].append(value.lstrip('*'))

        return _ds