def download(state, year, cache=True):
    """
    Downloads SINASC data directly from the Datasus ftp server.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :param cache: if True, store the result as parquet and reuse it later
    :return: pandas dataframe
    :raises ValueError: if ``year`` is before 1994
    """
    state = state.upper()
    if year < 1994:
        raise ValueError("SINASC does not contain data before 1994")
    if year >= 1996:
        ftp_dir = '/dissemin/publicos/SINASC/NOV/DNRES'
        fname = 'DN{}{}.DBC'.format(state, year)
    else:
        ftp_dir = '/dissemin/publicos/SINASC/ANT/DNRES'
        fname = 'DNR{}{}.DBC'.format(state, str(year)[-2:])
    cachefile = os.path.join(CACHEPATH, 'SINASC_' + fname.split('.')[0] + '_.parquet')
    # Check the cache before opening any network connection.
    if os.path.exists(cachefile):
        return pd.read_parquet(cachefile)
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    ftp.cwd(ftp_dir)
    try:
        # Context manager so the local file handle is not leaked
        # (the original passed `open(fname, 'wb').write` directly).
        with open(fname, 'wb') as fobj:
            ftp.retrbinary('RETR {}'.format(fname), fobj.write)
    finally:
        # Always release the FTP connection, even on transfer failure.
        ftp.close()
    df = read_dbc(fname, encoding='iso-8859-1')
    if cache:
        df.to_parquet(cachefile)
    os.unlink(fname)
    return df
def download(state, year, cache=True):
    """
    Downloads SINASC data directly from the Datasus ftp server.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :param cache: if True, store the result as parquet and reuse it later
    :return: pandas dataframe
    :raises ValueError: if ``year`` is not a 4 digit integer or is before 1994
    """
    # `assert` is stripped under `python -O`; validate with an exception instead.
    if len(str(year)) != 4:
        raise ValueError("year must be a 4 digit integer")
    state = state.upper()
    if year < 1994:
        raise ValueError("SINASC does not contain data before 1994")
    if year >= 1996:
        ftp_dir = "/dissemin/publicos/SINASC/NOV/DNRES"
        fname = "DN{}{}.DBC".format(state, year)
    else:
        ftp_dir = "/dissemin/publicos/SINASC/ANT/DNRES"
        fname = "DNR{}{}.DBC".format(state, str(year)[-2:])
    cachefile = os.path.join(CACHEPATH, "SINASC_" + fname.split(".")[0] + "_.parquet")
    # Check the cache before opening any network connection.
    if os.path.exists(cachefile):
        return pd.read_parquet(cachefile)
    ftp = FTP("ftp.datasus.gov.br")
    ftp.login()
    ftp.cwd(ftp_dir)
    try:
        # Context manager so the local file handle is not leaked.
        with open(fname, "wb") as fobj:
            ftp.retrbinary("RETR {}".format(fname), fobj.write)
    finally:
        # Always release the FTP connection, even on transfer failure.
        ftp.close()
    df = read_dbc(fname, encoding="iso-8859-1")
    if cache:
        df.to_parquet(cachefile)
    os.unlink(fname)
    return df
def read_data_from_state(state, year=2017):
    """
    Reads data from the state passed in and formats it into a pandas data frame.

    :param state: the Brazilian state code (e.g. "MG") to read into memory
        in a dataframe format
    :param year: 4 digit year of the local .dbc file; defaults to 2017,
        preserving the previously hard-coded behaviour
    :return: pandas dataframe
    """
    print(f"reading {state} dbc file")
    # Year was hard-coded to 2017; it is now a backward-compatible parameter.
    fname = f'./data/DN{state}{year}.dbc'
    df = read_dbc(fname, encoding='iso-8859-1')
    return df
def _fetch_file(fname, ftp, ftype):
    """
    Fetch ``fname`` over an already-open FTP connection and load it as a dataframe.

    :param fname: remote file name
    :param ftp: connected ftplib.FTP object, positioned at the right directory
    :param ftype: file type: 'DBC' or 'DBF'
    :return: pandas dataframe
    :raises Exception: when the file is not available on the server
    :raises ValueError: when ``ftype`` is not a supported type
    """
    try:
        # Context manager so the local file handle is not leaked.
        with open(fname, 'wb') as fobj:
            ftp.retrbinary('RETR {}'.format(fname), fobj.write)
    except Exception as exc:
        # Narrowed from a bare `except:`; keep the original cause chained.
        raise Exception("File {} not available".format(fname)) from exc
    if ftype == 'DBC':
        df = read_dbc(fname, encoding='iso-8859-1')
    elif ftype == 'DBF':
        dbf = DBF(fname, encoding='iso-8859-1')
        df = pd.DataFrame(list(dbf))
    else:
        # Previously an unknown ftype crashed with NameError on `df`.
        raise ValueError("Unknown file type: {}".format(ftype))
    os.unlink(fname)
    return df
def _fetch_file(fname, ftp, ftype):
    """
    Fetch ``fname`` over an already-open FTP connection and load it as a dataframe.

    :param fname: remote file name
    :param ftp: connected ftplib.FTP object, positioned at the right directory
    :param ftype: file type: "DBC" or "DBF"
    :return: pandas dataframe
    :raises Exception: when the server reports the file as unavailable
    :raises ValueError: when ``ftype`` is not a supported type
    """
    try:
        # Context manager so the local file handle is not leaked.
        with open(fname, "wb") as fobj:
            ftp.retrbinary("RETR {}".format(fname), fobj.write)
    except error_perm as exc:
        # Chain the FTP error so the original cause is not lost.
        raise Exception("File {} not available".format(fname)) from exc
    if ftype == "DBC":
        df = read_dbc(fname, encoding="iso-8859-1")
    elif ftype == "DBF":
        dbf = DBF(fname, encoding="iso-8859-1")
        df = pd.DataFrame(list(dbf))
    else:
        # Previously an unknown ftype crashed with NameError on `df`.
        raise ValueError("Unknown file type: {}".format(ftype))
    os.unlink(fname)
    return df
def download(state, year, cache=True, folder=None):
    """
    Downloads SIM data directly from the Datasus ftp server.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :param cache: if True, store the result as parquet and reuse it later
    :param folder: if given, read the .DBC file from this local folder
        instead of downloading it
    :return: pandas dataframe
    :raises ValueError: if ``year`` is before 1979
    """
    year2 = str(year)[-2:].zfill(2)
    state = state.upper()
    if year < 1979:
        raise ValueError("SIM does not contain data before 1979")
    elif year >= 1996:
        ftp_dir = '/dissemin/publicos/SIM/CID10/DORES'
        fname = 'DO{}{}.DBC'.format(state, year)
    else:
        ftp_dir = '/dissemin/publicos/SIM/CID9/DORES'
        # Fixed the duplicated `fname = fname =` assignment.
        fname = 'DOR{}{}.DBC'.format(state, year2)
    cachefile = os.path.join(CACHEPATH, 'SIM_' + fname.split('.')[0] + '_.parquet')
    downloaded = False
    if folder:
        # Local file supplied by the caller: skip network and cache lookup.
        fname = "{}/{}".format(folder, fname)
    else:
        if cache and os.path.exists(cachefile):
            return pd.read_parquet(cachefile)
        ftp = FTP('ftp.datasus.gov.br')
        ftp.login()
        ftp.cwd(ftp_dir)
        try:
            try:
                # Context manager so the local file handle is not leaked.
                with open(fname, 'wb') as fobj:
                    ftp.retrbinary('RETR {}'.format(fname), fobj.write)
            except Exception:
                # Retry with an upper-cased name before giving up.
                try:
                    with open(fname, 'wb') as fobj:
                        ftp.retrbinary('RETR {}'.format(fname.upper()), fobj.write)
                except Exception as exc:
                    raise Exception("File {} not available".format(fname)) from exc
        finally:
            # Always release the FTP connection.
            ftp.close()
        downloaded = True
    df = read_dbc(fname, encoding='iso-8859-1')
    if cache:
        # Honour the `cache` flag (the original wrote the parquet unconditionally).
        df.to_parquet(cachefile)
    if downloaded:
        # Only remove the temporary download, never a user-provided local file.
        os.unlink(fname)
    return df
def _fetch_file(fname, path, ftype):
    """
    Download ``fname`` from the Datasus FTP server and load it as a dataframe.

    :param fname: remote file name
    :param path: remote directory on ftp.datasus.gov.br
    :param ftype: file type: "DBC" or "DBF"
    :return: pandas dataframe
    :raises Exception: when the file is not available on the server
    :raises ValueError: when ``ftype`` is not a supported type
    """
    ftp = FTP("ftp.datasus.gov.br")
    ftp.login()
    ftp.cwd(path)
    try:
        # Context manager so the local file handle is not leaked.
        with open(fname, "wb") as fobj:
            ftp.retrbinary("RETR {}".format(fname), fobj.write)
    except Exception as exc:
        # Narrowed from a bare `except:`; keep the original cause chained.
        raise Exception("File {} not available".format(fname)) from exc
    finally:
        # Always release the FTP connection.
        ftp.close()
    if ftype == "DBC":
        df = read_dbc(fname, encoding="iso-8859-1")
    elif ftype == "DBF":
        dbf = DBF(fname, encoding="iso-8859-1")
        df = pd.DataFrame(list(dbf))
    else:
        # Previously an unknown ftype crashed with NameError on `df`.
        raise ValueError("Unknown file type: {}".format(ftype))
    os.unlink(fname)
    return df
def download(state, year, disease, cache=True):
    """
    Downloads SINAN data directly from the Datasus ftp server.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :param disease: disease name, as listed by ``list_diseases()``
    :param cache: if True, store the result as parquet and reuse it later
    :return: pandas dataframe
    :raises ValueError: if the disease is unknown or ``year`` is before 2007
    """
    # The original only printed a warning and then crashed later with a
    # KeyError on the agravos lookup; fail fast with an informative error.
    if disease.title() not in agravos:
        raise ValueError(
            f'Disease {disease} is not available in SINAN.\nAvailable diseases: {list_diseases()}'
        )
    year2 = str(year)[-2:].zfill(2)
    state = state.upper()
    if year < 2007:
        raise ValueError("SINAN does not contain data before 2007")
    dis_code = agravos[disease.title()]
    fname = f'{dis_code}{state}{year2}.DBC'
    cachefile = os.path.join(CACHEPATH, 'SINAN_' + fname.split('.')[0] + '_.parquet')
    # Check the cache before opening any network connection.
    if os.path.exists(cachefile):
        return pd.read_parquet(cachefile)
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    ftp.cwd("/dissemin/publicos/SINAN/DADOS/FINAIS")
    try:
        try:
            # Context manager so the local file handle is not leaked.
            with open(fname, 'wb') as fobj:
                ftp.retrbinary('RETR {}'.format(fname), fobj.write)
        except Exception:
            # Retry with an upper-cased name before giving up.
            try:
                with open(fname, 'wb') as fobj:
                    ftp.retrbinary('RETR {}'.format(fname.upper()), fobj.write)
            except Exception as e:
                raise Exception("{}\nFile {} not available".format(e, fname)) from e
    finally:
        # Always release the FTP connection.
        ftp.close()
    df = read_dbc(fname, encoding='iso-8859-1')
    if cache:
        df.to_parquet(cachefile)
    os.unlink(fname)
    return df
def _fetch_file(fname: str, path: str, ftype: str) -> pd.DataFrame:
    """
    Fetch a single file from the Datasus FTP server.

    :param fname: remote file name
    :param path: remote directory on ftp.datasus.gov.br
    :param ftype: file type: "DBC" or "DBF"
    :return: Pandas Dataframe
    :raises Exception: when the file is not available on the server
    :raises ValueError: when ``ftype`` is not a supported type
    """
    ftp = FTP("ftp.datasus.gov.br")
    ftp.login()
    ftp.cwd(path)
    try:
        # Context manager so the local file handle is not leaked.
        with open(fname, "wb") as fobj:
            ftp.retrbinary("RETR {}".format(fname), fobj.write)
    except Exception as exc:
        # Narrowed from a bare `except:`; keep the original cause chained.
        raise Exception("File {} not available".format(fname)) from exc
    finally:
        # Always release the FTP connection.
        ftp.close()
    if ftype == "DBC":
        df = read_dbc(fname, encoding="iso-8859-1")
    elif ftype == "DBF":
        dbf = DBF(fname, encoding="iso-8859-1")
        df = pd.DataFrame(list(dbf))
    else:
        # Previously an unknown ftype crashed with NameError on `df`.
        raise ValueError("Unknown file type: {}".format(ftype))
    if os.path.exists(fname):
        os.unlink(fname)
    return df
def download(state, year):
    """
    Downloads SINASC data directly from the Datasus ftp server.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :return: pandas dataframe
    :raises ValueError: if ``year`` is before 1994
    """
    if year < 1994:
        raise ValueError("SINASC does not contain data before 1994")
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    if year >= 1996:
        ftp.cwd('/dissemin/publicos/SINASC/NOV/DNRES')
        fname = 'DN{}{}.DBC'.format(state, year)
    else:
        ftp.cwd('/dissemin/publicos/SINASC/ANT/DNRES')
        fname = 'DNR{}{}.DBC'.format(state, str(year)[-2:])
    try:
        # Context manager so the local file handle is not leaked
        # (the original passed `open(fname, 'wb').write` directly).
        with open(fname, 'wb') as fobj:
            ftp.retrbinary('RETR {}'.format(fname), fobj.write)
    finally:
        # Always release the FTP connection, even on transfer failure.
        ftp.close()
    df = read_dbc(fname, encoding='iso-8859-1')
    os.unlink(fname)
    return df
def _fetch_file(fname, ftp, ftype):
    """
    Does the FTP fetching.

    :param fname: file name
    :param ftp: ftp connection object
    :param ftype: file type: DBF|DBC
    :return: pandas dataframe
    :raises Exception: when the file is not available on the server
    :raises ValueError: when ``ftype`` is not a supported type
    """
    print("Downloading {}...".format(fname))
    try:
        # Context manager so the local file handle is not leaked.
        with open(fname, 'wb') as fobj:
            ftp.retrbinary('RETR {}'.format(fname), fobj.write)
    except Exception:
        # Retry with a lower-cased name before giving up.
        try:
            with open(fname, 'wb') as fobj:
                ftp.retrbinary('RETR {}'.format(fname.lower()), fobj.write)
        except Exception as exc:
            # Narrowed from bare `except:`; keep the original cause chained.
            raise Exception("File {} not available".format(fname)) from exc
    if ftype == 'DBC':
        df = read_dbc(fname, encoding='iso-8859-1')
    elif ftype == 'DBF':
        dbf = DBF(fname, encoding='iso-8859-1')
        df = pd.DataFrame(list(dbf))
    else:
        # Previously an unknown ftype crashed with NameError on `df`.
        raise ValueError("Unknown file type: {}".format(ftype))
    os.unlink(fname)
    return df
def _fetch_file(fname, ftp, ftype):
    """
    Does the FTP fetching.

    :param fname: file name
    :param ftp: ftp connection object
    :param ftype: file type: DBF|DBC
    :return: pandas dataframe
    :raises Exception: when the file is not available on the server
    :raises ValueError: when ``ftype`` is not a supported type
    """
    print(f'Downloading {fname}...')
    try:
        # Context manager so the local file handle is not leaked.
        with open(fname, 'wb') as fobj:
            ftp.retrbinary(f'RETR {fname}', fobj.write)
    except Exception:
        # Retry with a lower-cased name before giving up.
        try:
            with open(fname, 'wb') as fobj:
                ftp.retrbinary(f'RETR {fname.lower()}', fobj.write)
        except Exception as exc:
            # Narrowed from bare `except:`; keep the original cause chained.
            raise Exception(f'File {fname} not available') from exc
    if ftype == 'DBC':
        df = read_dbc(fname, encoding='iso-8859-1')
    elif ftype == 'DBF':
        dbf = DBF(fname, encoding='iso-8859-1')
        df = pd.DataFrame(list(dbf))
    else:
        # Previously an unknown ftype crashed with NameError on `df`.
        raise ValueError(f'Unknown file type: {ftype}')
    os.unlink(fname)
    return df
def download(state, year, cache=True):
    """
    Downloads SIM data directly from the Datasus ftp server.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :param cache: if True, store the result as parquet and reuse it later
    :return: pandas dataframe
    :raises ValueError: if ``year`` is before 1979
    """
    year2 = str(year)[-2:].zfill(2)
    state = state.upper()
    if year < 1979:
        raise ValueError("SIM does not contain data before 1979")
    if year >= 1996:
        ftp_dir = '/dissemin/publicos/SIM/CID10/DORES'
        fname = 'DO{}{}.DBC'.format(state, year)
    else:
        ftp_dir = '/dissemin/publicos/SIM/CID9/DORES'
        fname = 'DOR{}{}.DBC'.format(state, year2)
    cachefile = os.path.join(CACHEPATH, 'SIM_' + fname.split('.')[0] + '_.parquet')
    # Check the cache before opening any network connection.
    if os.path.exists(cachefile):
        return pd.read_parquet(cachefile)
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    ftp.cwd(ftp_dir)
    try:
        try:
            # Context manager so the local file handle is not leaked.
            with open(fname, 'wb') as fobj:
                ftp.retrbinary('RETR {}'.format(fname), fobj.write)
        except Exception:
            # Retry with an upper-cased name before giving up.
            try:
                with open(fname, 'wb') as fobj:
                    ftp.retrbinary('RETR {}'.format(fname.upper()), fobj.write)
            except Exception as exc:
                raise Exception("File {} not available".format(fname)) from exc
    finally:
        # Always release the FTP connection.
        ftp.close()
    df = read_dbc(fname, encoding='iso-8859-1')
    if cache:
        df.to_parquet(cachefile)
    os.unlink(fname)
    return df
def test_read_dbc(self):
    # Reading the sample .dbc file must yield a non-empty DataFrame.
    frame = read_dbc(b"test_data/sids.dbc")
    self.assertIsInstance(frame, pd.DataFrame)
    self.assertGreater(frame.size, 0)
def test_read_dbc(self):
    # The sample file should decode into a DataFrame with at least one cell.
    result = read_dbc(b'test_data/sids.dbc')
    self.assertIsInstance(result, pd.DataFrame)
    self.assertGreater(result.size, 0)