def execute(self):
    """Scrape the arrangements table at ``self._url`` into a ``DataSet``.

    The page is downloaded (through the proxy returned by ``get_proxy()``
    when there is one) and the first ``<table>`` whose first cell starts
    with ``"Table <self._table>"`` (case-insensitive) is parsed.  Its rows
    are grouped into sections introduced by single-cell title rows, which
    are expected in this order: 'Stand-by Arrangements',
    'Extended Arrangements', 'Flexible Credit Line', 'Precautionary'.
    Only rows with exactly seven non-blank cells are kept as data.

    Returns:
        A ``DataSet`` holding one list per column and section under the
        key ``"<COLUMN>_<section>"`` where COLUMN is one of MEMBER, EFDATE,
        EXDATE, AGREED, UNDRAWN, OUTSTANDING, POQ and ``section`` is the
        zero-based index of the section title (``-1`` for data rows seen
        before any title).  Non-empty POQ values get a backslash and a
        percent sign appended; all other values have leading ``*``
        characters removed.  ``None`` is returned, after logging an error,
        when the table is not present in the page.
    """
    logger.debug('begin')

    # Route urllib2 through the configured proxy, if any.  install_opener()
    # replaces the default opener used by every later urllib2.urlopen() call.
    proxy_info = get_proxy()
    if proxy_info:
        proxy_handler = urllib2.ProxyHandler({
            'http': proxy_info['proxy'],
            'https': proxy_info['proxy'],
        })
        urllib2.install_opener(urllib2.build_opener(proxy_handler))

    page = urllib2.urlopen(self._url)
    try:
        logger.debug('Got %s', self._url)
        soup = BeautifulSoup(page)
    finally:
        # The response was previously left open.
        page.close()

    # Look for the table whose first cell reads "Table <name>".  Tables
    # without a first row/cell are skipped instead of raising, and the cell
    # text is matched as-is (wrapping it in str() fails on non-ASCII text
    # under Python 2).
    table_title_re = re.compile('^Table %s' % self._table, re.I)
    table = None
    for candidate in soup.findAll('table'):
        first_row = candidate.tr
        first_cell = first_row.td if first_row is not None else None
        if first_cell is not None and table_title_re.match(first_cell.text):
            table = candidate
            break
    if table is None:
        logger.error('Table %s not in page', self._table)
        return None

    placeholder_re = re.compile(r'^--$')
    # Comma-grouped numbers only ("1,234", "12,345.67"); dates and names
    # that merely contain a comma must keep it.
    grouped_number_re = re.compile(r'^-?[0-9]{1,3}(,[0-9]{3})+(\.[0-9]+)?$')
    footnote_re = re.compile(r'^.+([0-9]/?)$')

    def _clean_cell(column, text):
        # Normalize the text of one table cell.
        if placeholder_re.match(text):
            return ''
        if grouped_number_re.match(text):
            # The original test was re.match(',', text), which only fires
            # when the text *starts* with a comma, so separators were never
            # actually removed.
            return text.replace(',', '')
        if column == 0:
            # Drop a trailing footnote marker such as "1" or "1/" from the
            # first column.
            marker = footnote_re.match(text)
            if marker:
                return text[:-len(marker.group(1))]
        # Leading '*' markers are left alone here; they are stripped when
        # the result columns are built below.
        return text

    # Collect the non-blank cells of every row.  ``column`` is the position
    # of the <td> within the row, counting blank cells too.
    rows = []
    for tr in table.findAll('tr'):
        cells = []
        for column, td in enumerate(tr.findAll('td')):
            text = td.text
            if text != ' ':
                cells.append(_clean_cell(column, text))
        rows.append(cells)

    # Split the rows into sections.  A single-cell row matching the next
    # expected title opens a new section; seven-cell rows are data.
    pending_titles = [
        'Stand-by Arrangements',
        'Extended Arrangements',
        'Flexible Credit Line',
        'Precautionary',
    ]
    expected_title = pending_titles.pop(0)
    section = -1
    sections = {}
    total_re = re.compile(r'^([0-9]+) Arrangements?$')
    for row in rows:
        if len(row) == 1 and re.match("^%s" % expected_title, row[0], re.I):
            section += 1
            if pending_titles:
                expected_title = pending_titles.pop(0)
                logger.debug('Next Title %s', expected_title)
        elif len(row) == 7:
            sections.setdefault(section, []).append(row)
        elif len(row) == 4 and re.search(r'Arrangements?$', row[0]):
            # Totals row ("<n> Arrangement(s)"): only used to cross-check
            # the number of data rows collected for the current section.
            # .get() avoids a KeyError when the section has no data rows.
            total = total_re.match(row[0])
            if total and len(sections.get(section, [])) == int(total.group(1)):
                logger.debug("Wow right arrangement found")
        elif row:
            logger.debug('LINE NOT INSERTED %d > %s', len(row), row)

    # Build the result vectors, one per column and section.
    column_names = (
        'MEMBER',
        'EFDATE',
        'EXDATE',
        'AGREED',
        'UNDRAWN',
        'OUTSTANDING',
        'POQ',
    )
    _ds = DataSet()
    for index, section_rows in sorted(sections.items()):
        for position, column_name in enumerate(column_names):
            name = "%s_%d" % (column_name, index)
            # NOTE(review): DataSet is defined elsewhere; its has_key() /
            # item-access interface is used exactly as before because its
            # support for ``in`` cannot be confirmed from here.
            if not _ds.has_key(name):
                _ds[name] = []
            for row in section_rows:
                value = row[position]
                if column_name == 'POQ' and len(value) > 0:
                    _ds[name].append(value + "\\%")
                else:
                    _ds[name].append(value.lstrip('*'))
    return _ds