def getItrInfos(edp):
    logging.info('Getting ITR Infos for %s', edp)
    # Keep only the newest version of each document
    itrs = dict([(i['date'], i) for i in sorted(getITRs(edp), cmp=mycmp) if i['version'] > 0.0])
    for dt, itr in itrs.iteritems():
        if dt.year < 2015:
            logging.info('Ignoring ITR of %s', dt.year)
            continue
        tm = '{:%Y%m}'.format(dt)
        logging.info('Getting ITR of %s for %s', edp, tm)
        # Get NSD and NSR from the redirect location
        url_fdf = RE_LOC.search(openUrl(itr['url'], True)).group(1)
        params = urlparse.parse_qs(urlparse.urlparse(url_fdf).query)
        nsd, nsr = params['NumeroSequencialDocumento'][0], params['NumeroSequencialRegistroCvm'][0]
        infos = dict()
        infos['id'] = edp
        infos['period'] = tm
        infos['ptype'] = 'ITR'
        # Get infos from DFs Consolidadas - Balanco Patrimonial Ativo
        getDfpConBPA(nsd, nsr, infos)
        # Get infos from DFs Consolidadas - Balanco Patrimonial Passivo
        getDfpConBPP(nsd, nsr, infos)
        # Get infos from DFs Consolidadas - Demonstracao do Resultado
        getDftDemRes(nsd, nsr, infos, PARMS_GERAL_ITR)
        # Get infos from Dados da Empresa - Composicao do Capital
        getDaeCc(nsd, nsr, infos)
        yield infos
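# A minimal usage sketch (not in the source): getItrInfos is a generator, so
# quarterly records stream one at a time; '9512' is a placeholder CVM code.
def demoItrInfos():
    for infos in getItrInfos('9512'):
        print infos['period'], infos['RL'], infos['LL']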
def getDetails(enterprise):
    logging.info('Getting Details of Company %s', enterprise[0])
    edp, name = enterprise[0], enterprise[1]
    d = dict()
    d['id'] = edp
    d['name'] = name
    d['register_date'] = datetime.now().isoformat()
    page = openUrl(buildUrl(URL_EDETAIL, 'CodCVM=' + edp))
    # Company Data
    att = getTables(page, 'panel1a')
    for table in att[:3]:
        for row in getRows(table):
            rowToDict(row, d)
    att = HtmlElement(page).get_element_by_id('panel1a')
    d['trading_codes'] = [l.text for l in HtmlElement(att).find_class('LinkCodNeg')]
    att = d['industry_classification'].split('/')
    d['sector'] = att[0].strip()
    d['subsector'] = att[1].strip()
    d['segment'] = att[2].strip()
    d['detailwebsite'] = buildUrl(URL_RESUMO, 'codigoCvm=' + edp, 'idioma=pt-BR')
    if 'website' in d:
        d['website'] = urlparse.urlsplit(d['website'], 'http').geturl()
    return d
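# Hedged usage sketch: getDetails expects a (code, name) pair; the values
# below are illustrative only.
def demoDetails():
    d = getDetails(('9512', 'PETROBRAS'))
    print d['sector'], '/', d['subsector'], '/', d['segment']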
def getDfpInfos(edp):
    logging.info('Getting DFP Infos for %s', edp)
    # Keep only the newest version of each document
    dfps = dict([(i['date'], i) for i in sorted(getDFPs(edp), cmp=mycmp) if i['version'] > 0.0])
    for year, dfp in dfps.iteritems():
        logging.info('Getting DFP of %s for %s', edp, year.year)
        # Get NSD and NSR from the redirect location
        url_fdf = RE_LOC.search(openUrl(dfp['url'], True)).group(1)
        params = urlparse.parse_qs(urlparse.urlparse(url_fdf).query)
        nsd, nsr = params['NumeroSequencialDocumento'][0], params['NumeroSequencialRegistroCvm'][0]
        infos = dict()
        infos['id'] = edp
        infos['period'] = str(year.year)
        infos['ptype'] = 'DFP'
        # Get infos from DFs Consolidadas - Balanco Patrimonial Ativo
        getDfpConBPA(nsd, nsr, infos)
        # Get infos from DFs Consolidadas - Balanco Patrimonial Passivo
        getDfpConBPP(nsd, nsr, infos)
        # Get infos from DFs Consolidadas - Demonstracao do Resultado
        getDftDemRes(nsd, nsr, infos)
        # Get infos from Dados da Empresa - Composicao do Capital
        getDaeCc(nsd, nsr, infos)
        yield infos
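# Illustrative follow-up (assumption, not in the source): a return-on-equity
# series from the 'LL' (Lucro Liquido) and 'PL' (Patrimonio Liquido) keys
# that each yielded record carries.
def demoDfpRoe(edp):
    for infos in getDfpInfos(edp):
        if infos['PL']:
            print infos['period'], float(infos['LL']) / infos['PL']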
def updateEntries(cls, feed):
    xml = utils.openUrl(feed.url)
    if not xml:
        return
    dom = utils.parseXmlString(xml)
    parser = parsers.FeedParserFactory.create(dom)
    pagingKey = 0
    for entryDict in parser.entries():
        key = entryDict['key']
        entry = EntryModel.get_by_key_name(key, parent=feed)
        if not entry:
            # First time we see this entry: create it and count it
            entry = EntryModel(parent=feed, key_name=key)
            entry.feed = feed
            feed.total += 1
        if entry.fromDict(entryDict):
            # Entry is new or changed: mark unread and persist
            entry.read = False
            entry.setPagingKey(pagingKey)
            entry.put()
            pagingKey += 1
            feed.unread += 1
    if pagingKey > 0:
        feed.put()
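# Hedged driver sketch: FeedModel is an assumed App Engine model exposing the
# url/total/unread attributes used above; note updateEntries never reads its
# cls argument, so the class passed through is immaterial.
def demoRefreshFeeds():
    for feed in FeedModel.all():  # FeedModel is hypothetical
        updateEntries(FeedModel, feed)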
def downloadBdiFile(dt):
    fileurl = '{0}bdi{1}.zip'.format(URL_BDI, dt.strftime('%m%d'))
    logging.info('Downloading file %s', fileurl)
    # Wrap the downloaded bytes in an in-memory file so zipfile can read them
    fp = StringIO(openUrl(fileurl, True))
    z = zipfile.ZipFile(fp)
    for name in z.namelist():
        yield z.open(name)
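# Usage sketch, assuming a BDI archive exists for the requested date: each
# yielded object is an open file handle for one member of the zip.
def demoBdiDownload():
    from datetime import date
    for member in downloadBdiFile(date.today()):
        for line in member:
            pass  # parse the fixed-width BDI records here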
def msnMoneyTenYearSummary(symbol, local=False):
    import bs4
    import utils
    if not len(symbol):
        return None
    url = "http://investing.money.msn.com/investments/financial-statements?symbol=" + symbol
    url, page = utils.openUrl(url, local)
    print url
    soup = bs4.BeautifulSoup(page, "lxml")
    # Extract tables
    tables = soup.find_all("table", {"class": "mnytbl"})

    def parseTable(table):
        # First cell of each row is a %m/%y period date (or a label);
        # the remaining cells are parsed as data values
        parsed = {}
        for row in table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) == 0:
                continue
            values = ()
            for icol, col in enumerate(cols):
                entry = col.find_all(text=True)[1].strip()
                if icol == 0:
                    try:
                        secs = utils.makeEpochTime(str(entry), "%m/%y")
                        values = values + (secs,)
                    except ValueError:
                        values = values + (str(entry),)
                else:
                    values = utils.extractData(entry, values)
            parsed[values[0]] = values[1:]
        return parsed

    # Parse income statement and balance sheet tables
    income = parseTable(tables[0])
    balance = parseTable(tables[1])
    return income, balance
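# Hedged usage sketch; 'MSFT' is a placeholder ticker. Row keys are the epoch
# seconds produced by utils.makeEpochTime (or the raw label for header rows).
def demoTenYearSummary():
    result = msnMoneyTenYearSummary('MSFT')
    if result:
        income, balance = result
        print len(income), 'income rows,', len(balance), 'balance rows'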
def getDaeCc(nsd, nsr, infos):
    logging.info('Getting Dados da Empresa - Composicao do Capital - %s %s', nsd, nsr)
    page = HtmlElement(openUrl(buildUrl(URL_DCC, PARMS_GERAL, PARMS_BPP,
                                        'NumeroSequencialDocumento=' + nsd,
                                        'NumeroSequencialRegistroCvm=' + nsr)))
    # Multiplier: values are in thousands when the header contains '(Mil)'
    em = page.xpath('.//div[@id="UltimaTabela"]/table/tr/td/b/text()')
    m = 1000 if len(em) > 0 and em[0].find('(Mil)') != -1 else 1
    # Share counts: ordinary/preferred/total, issued and in treasury
    for i in ['QtdAordCapiItgz', 'QtdAprfCapiItgz', 'QtdTotAcaoCapiItgz',
              'QtdAordTeso', 'QtdAprfTeso', 'QtdTotAcaoTeso']:
        qnt = page.get_element_by_id('ctl00_cphPopUp_{0}_1'.format(i))
        infos[i] = 0 if qnt is None else toInt(qnt.text) * m
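# Derived-value sketch (assumption, not in the source): shares outstanding as
# the integralized total minus treasury shares, using keys getDaeCc populates.
def sharesOutstanding(infos):
    return infos['QtdTotAcaoCapiItgz'] - infos['QtdTotAcaoTeso']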
def getDftDemRes(url, infos):
    logging.info('Getting DFs Consolidadas - Demonstracao do Resultado')
    t = dict()
    page = openUrl(url)
    table = HtmlElement(page).get_element_by_id(TABLE_BPP)
    for row in getRows(table):
        rowToDict(row, t)
    # Multiplier
    m = getMultiplicador(page)
    # Receita Liquida (net revenue)
    infos['RL'] = toInt(t['3.01'][1]) * m if '3.01' in t else 0
    # Lucro Liquido (net income): taken from the row labelled 'Lucro ... do Periodo'
    infos['LL'] = next((toInt(v[1]) * m for k, v in t.iteritems() if re.match('^Lucro.+odo$', v[0])), 0)
def getDfpConBPA(url, infos):
    logging.info('Getting DFs Consolidadas - Balanco Patrimonial Ativo')
    t = dict()
    page = HtmlElement(openUrl(url))
    table = page.get_element_by_id(TABLE_BPP)
    for row in getRows(table):
        rowToDict(row, t)
    # Multiplier
    m = getMultiplicador(page)
    # Caixa (cash): 1.01.01 plus financial investments 1.01.02
    cxa = toInt(t['1.01.01'][1]) * m if '1.01.01' in t else 0
    apf = toInt(t['1.01.02'][1]) * m if '1.01.02' in t else 0
    infos['CAIXA'] = cxa + apf
def getDfpConBPA(nsd, nsr, infos):
    logging.info('Getting DFs Consolidadas - Balanco Patrimonial Ativo - %s %s', nsd, nsr)
    t = dict()
    page = HtmlElement(openUrl(buildUrl(URL_FDF, PARMS_GERAL, PARMS_BPP,
                                        'NumeroSequencialDocumento=' + nsd,
                                        'NumeroSequencialRegistroCvm=' + nsr,
                                        'Informacao=2', 'Demonstracao=2')))
    table = page.get_element_by_id(TABLE_BPP)
    for row in getRows(table):
        rowToDict(row, t)
    # Multiplier
    m = getMultiplicador(page)
    # Caixa (cash): 1.01.01 plus financial investments 1.01.02
    cxa = toInt(t['1.01.01'][1]) * m if '1.01.01' in t else 0
    apf = toInt(t['1.01.02'][1]) * m if '1.01.02' in t else 0
    infos['CAIXA'] = cxa + apf
def getDftDemRes(nsd, nsr, infos, params_geral=PARMS_GERAL):
    logging.info('Getting DFs Consolidadas - Demonstracao do Resultado - %s %s', nsd, nsr)
    t = dict()
    page = openUrl(buildUrl(URL_FDF, params_geral, PARMS_BPP,
                            'NumeroSequencialDocumento=' + nsd,
                            'NumeroSequencialRegistroCvm=' + nsr,
                            'Informacao=2', 'Demonstracao=4'))
    table = HtmlElement(page).get_element_by_id(TABLE_BPP)
    for row in getRows(table):
        rowToDict(row, t)
    # Multiplier
    m = getMultiplicador(page)
    # Receita Liquida (net revenue)
    infos['RL'] = toInt(t['3.01'][1]) * m if '3.01' in t else 0
    # Lucro Liquido (net income)
    infos['LL'] = toInt(t['3.11'][1]) * m if '3.11' in t else 0
def msnMoneyQuote(symbol, local=False):
    import bs4
    import utils
    if not len(symbol):
        return None
    url = 'http://investing.money.msn.com/investments/stock-price?symbol=' + symbol
    url, page = utils.openUrl(url, local)
    print url
    soup = bs4.BeautifulSoup(page, "lxml")
    # Extract date stamp from below "details" table
    footers = soup.find_all("span", {"class": "foot"})
    string = footers[0].find_all(text=True)[0].strip().split(' ')[2]
    date = utils.makeEpochTime(string, '%m/%d/%Y')
    # Extract tables
    tables = soup.find_all("table", {"class": "mnytbl"})
    # Parse "details" table
    details = {}
    values = ()
    for row in tables[0].find_all("tr"):
        cells = row.find_all("td")
        if len(cells) == 0:
            continue
        data = cells[1].find_all(text=True)[1].strip()
        values = utils.extractData(data, values)
    details[date] = values
    # Parse "financial highlights" table
    highlights = {}
    values = ()
    cntr = 0
    for row in tables[1].find_all("tr"):
        cells = row.find_all("td")
        if len(cells) == 0:
            continue
        # Rows 2 and 3 nest their value one text node deeper
        index = 2 if (cntr == 2 or cntr == 3) else 1
        data = cells[1].find_all(text=True)[index].strip()
        values = utils.extractData(data, values)
        cntr = cntr + 1
    highlights[date] = values
    return details, highlights
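# Hedged usage sketch; 'MSFT' is a placeholder ticker. Both dicts are keyed
# by the single date stamp scraped from the page footer.
def demoQuote():
    result = msnMoneyQuote('MSFT')
    if result:
        details, highlights = result
        for date, values in details.iteritems():
            print date, values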
def getDfpConBPP(nsd, nsr, infos):
    logging.info('Getting DFs Consolidadas - Balanco Patrimonial Passivo - %s %s', nsd, nsr)
    t = dict()
    page = openUrl(buildUrl(URL_FDF, PARMS_GERAL, PARMS_BPP,
                            'NumeroSequencialDocumento=' + nsd,
                            'NumeroSequencialRegistroCvm=' + nsr,
                            'Informacao=2', 'Demonstracao=3'))
    table = HtmlElement(page).get_element_by_id(TABLE_BPP)
    for row in getRows(table):
        rowToDict(row, t)
    # Multiplier
    m = getMultiplicador(page)
    # Patrimonio Liquido (shareholders' equity)
    infos['PL'] = toInt(t['2.03'][1]) * m if '2.03' in t else 0
    # Divida Bruta (gross debt): short-term 2.01.04 plus long-term 2.02.01
    CP = toInt(t['2.01.04'][1]) * m if '2.01.04' in t else 0
    LP = toInt(t['2.02.01'][1]) * m if '2.02.01' in t else 0
    infos['DB'] = CP + LP
def getDfpConBPP(url, infos):
    logging.info('Getting DFs Consolidadas - Balanco Patrimonial Passivo')
    t = dict()
    page = openUrl(url)
    table = HtmlElement(page).get_element_by_id(TABLE_BPP)
    for row in getRows(table):
        rowToDict(row, t)
    # Multiplier
    m = getMultiplicador(page)
    # Patrimonio Liquido (shareholders' equity)
    infos['PL'] = toInt(t['2.03'][1]) * m if '2.03' in t else 0
    # Divida Bruta (gross debt): short-term 2.01.04 plus long-term 2.02.01
    CP = toInt(t['2.01.04'][1]) * m if '2.01.04' in t else 0
    LP = toInt(t['2.02.01'][1]) * m if '2.02.01' in t else 0
    infos['DB'] = CP + LP
    # Dividendos (dividends payable)
    Div = toInt(t['2.01.05.02.01'][1]) * m if '2.01.05.02.01' in t else 0
    infos['DIV'] = Div
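# Derived-value sketch (assumption): net debt as gross debt minus cash,
# combining the 'DB' key set here with 'CAIXA' from getDfpConBPA.
def netDebt(infos):
    return infos['DB'] - infos['CAIXA']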
def msnMoneyBalanceSheet(symbol, local=False):
    import bs4
    import utils
    if not len(symbol):
        return None
    url = 'http://investing.money.msn.com/investments/stock-balance-sheet/?symbol=' + symbol + '&stmtView=Ann'
    url, page = utils.openUrl(url, local)
    print url
    soup = bs4.BeautifulSoup(page, "lxml")
    rows = soup.find_all("tr")
    ncols = len(rows[-1].find_all("td")) - 1
    titles = []
    tuples = [() for x in range(ncols)]
    for irow, row in enumerate(rows):
        for icol, col in enumerate(row.find_all("td")):
            entries = col.find_all(text=True)
            # Cells nest their text at different depths depending on markup
            index = None
            if len(entries) == 1:
                index = 0
            elif len(entries) == 3:
                index = 1
            elif len(entries) == 7:
                index = 4
            else:
                continue
            entry = entries[index].strip().encode("utf-8")
            # Skip two page-specific rows
            if irow == 7 or irow == 30:
                continue
            if len(entry):
                if icol == 0:
                    titles.append(str(entry))
                else:
                    # Header rows hold dates; everything else is data
                    dates = {1: '%Y', 2: '%m/%d/%Y', 4: '%m/%d/%Y'}
                    try:
                        date = dates[irow]
                        secs = utils.makeEpochTime(str(entry), date)
                        tuples[icol - 1] = tuples[icol - 1] + (secs,)
                    except KeyError:
                        tuples[icol - 1] = utils.extractData(entry, tuples[icol - 1])
    sheet = {}
    for col in range(len(tuples)):
        sheet[tuples[col][0]] = tuples[col][1:]
    return titles, sheet
def getLinks(url):
    resp = dict()
    trp = openUrl(url, True)
    url_fdf = RE_LOC.search(trp).group(1)
    params = urlparse.parse_qs(urlparse.urlparse(url_fdf).query)
    resp['ctd'] = params['CodTipoDocumento'][0]
    resp['nsd'] = params['NumeroSequencialDocumento'][0]
    resp['nsr'] = params['NumeroSequencialRegistroCvm'][0]
    resp['cti'] = params['CodigoTipoInstituicao'][0]
    fparams = '&CodTipoDocumento={0}&NumeroSequencialDocumento={1}&'\
              'NumeroSequencialRegistroCvm={2}&CodigoTipoInstituicao={3}'.\
              format(resp['ctd'], resp['nsd'], resp['nsr'], resp['cti'])
    # Map each statement name embedded in the page to its URL
    for i in re.findall('"Text":"([^"]+)","Value":"([^"]+)"', trp):
        if re.match('^Balan.+Ativo$', i[0]):
            resp['dfp_bpa'] = escapeUrl(URL_RAD + i[1] + fparams)
        if re.match('^Balan.+Passivo$', i[0]):
            resp['dfp_bpp'] = escapeUrl(URL_RAD + i[1] + fparams)
        if re.match('^Demonstra.+Resultado$', i[0]):
            resp['dfp_dr'] = escapeUrl(URL_RAD + i[1] + fparams)
    return resp
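# Illustrative consumer (not in the source): getLinks resolves one filing URL
# into per-statement URLs keyed dfp_bpa / dfp_bpp / dfp_dr.
def demoLinks(url):
    links = getLinks(url)
    for key in ('dfp_bpa', 'dfp_bpp', 'dfp_dr'):
        if key in links:
            print key, links[key]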
def msnMoneyCompanyProfile(symbol, local=False):
    import bs4
    import utils
    if not len(symbol):
        return None
    url = 'http://investing.money.msn.com/investments/company-report?symbol=' + symbol
    url, page = utils.openUrl(url, local)
    print url
    soup = bs4.BeautifulSoup(page, "lxml")
    tables = soup.find_all("table", {"class": "mnytbl"})
    # Profile text is the first span of the second table
    text = tables[1].find_all("span")[0].find_all(text=True)[0].strip().encode("utf-8")
    length = {0: len(text)}
    profile = {0: text}
    return length, profile
def getFdFiles(edp, dftype):
    logging.info('Getting {0} Infos for {1}'.format(dftype, edp))
    # Keep only the newest version of each document
    fps = dict([(i['date'], i) for i in sorted(getFPs(edp, dftype), cmp=mycmp) if i['version'] > 0.0])
    for year, fp in fps.iteritems():
        logging.info('Getting {0} of {1} for {2}'.format(dftype, edp, year.year))
        params = urlparse.parse_qs(urlparse.urlparse(fp['url']).query)
        query = 'NumeroSequencialDocumento={0}&CodigoInstituicao={1}'.format(
            params['NumeroSequencialDocumento'][0], params['CodigoTipoInstituicao'][0])
        filepath = path.join(getPath(dftype), str(edp), str(year.year))
        filename = '{0}/{1}_{2}-{3}.zip'.format(filepath, params['NumeroSequencialDocumento'][0],
                                                params['CodigoTipoInstituicao'][0], fp['version'])
        if path.isfile(filename):
            logging.info('File {0} exists, skipping...'.format(filename))
            yield filename
            continue
        if not path.isdir(filepath):
            makedirs(filepath)
        logging.info('Downloading file {0}'.format(filename))
        with open(filename, 'wb+') as f:
            f.write(openUrl(buildUrl(URL_DD, query), True))
        yield filename
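# Hedged driver (not in the source): fetch every DFP archive for a placeholder
# CVM code, relying on getFdFiles' skip-if-already-downloaded behaviour.
def demoFdDownload():
    for filename in getFdFiles('9512', 'DFP'):
        logging.info('Ready: %s', filename)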
def getFPs(edp, dftype):
    page = openUrl(buildUrl(URL_DF, 'codigoCVM=' + edp, 'idioma=pt-br', 'tipo=' + dftype.lower()))
    return [parseFd(dfp) for dfp in page.findall('.//div[@id="' + DIV_DFPS + '"]/div/div/div/div/p/a')]
def msnMoneyHistoricalPrices(symbol, local=False):
    import bs4
    import utils
    if not len(symbol):
        return None
    url = (
        "http://investing.money.msn.com/investments/equity-historical-price/?PT=7&D4=1&DD=1&D5=0&DCS=2&MA0=0&MA1=0&CF=0&nocookie=1&SZ=0&symbol="
        + symbol
    )
    url, page = utils.openUrl(url, local)
    print url
    soup = bs4.BeautifulSoup(page, "lxml")
    rows = soup.find_all("tr")
    titles = []
    prices = {}
    dividends = {}
    for irow, row in enumerate(rows):
        cols = row.find_all("td")
        # Extract titles from table header
        headers = row.find_all("th")
        for header in headers:
            entries = header.find_all(text=True)
            entry = entries[1].strip()
            if not len(entry):
                continue
            titles.append(str(entry))
        # Extract ex-dividend dates, dividends paid, and share price
        if len(cols) == 3:
            date = 0
            div = 0.0
            price = 0.0
            try:
                entries = cols[0].find_all(text=True)
                entry = entries[1].strip()
                if len(entry):
                    date = utils.makeEpochTime(str(entry), "%m/%d/%Y")
            except ValueError:
                date = 0
            try:
                entries = cols[1].find_all(text=True)
                entry = entries[1].strip().split(" ")[0]
                if len(entry):
                    div = float(entry)
            except ValueError:
                div = 0.0
            try:
                # Closing price comes from the fifth column of the next row
                if irow + 1 < len(rows):
                    entries = rows[irow + 1].find_all("td")[4].find_all(text=True)
                    entry = entries[1].strip()
                    price = float(entry)
                else:
                    price = 0.0
            except ValueError:
                price = 0.0
            if date != 0:
                dividends[date] = (div, price)
        # Loop through rows and extract share prices
        else:
            values = ()
            if len(cols) != 6:
                continue
            for icol, col in enumerate(cols):
                entries = col.find_all(text=True)
                entry = entries[1].strip()
                if not len(entry):
                    continue
                try:
                    secs = utils.makeEpochTime(str(entry), "%m/%d/%Y")
                    values = values + (secs,)
                except ValueError:
                    values = utils.extractData(entry, values)
            prices[values[0]] = values[1:]
    return titles, prices, dividends
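# Usage sketch: dividends maps each ex-dividend epoch date to a (dividend,
# price) pair, so a per-payment yield is simply div / price.
def demoDividendYield(symbol):
    result = msnMoneyHistoricalPrices(symbol)
    if result:
        titles, prices, dividends = result
        for date, (div, price) in dividends.iteritems():
            if price:
                print date, div / price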
def getITRs(edp):
    page = openUrl(buildUrl(URL_DF, 'codigoCVM=' + edp, 'idioma=pt-br', 'tipo=itr'))
    return [parseFd(itr) for itr in page.findall('.//div[@id="' + DIV_DFPS + '"]/div/div/div/div/p/a')]