def openBL():
    """Load the BL file for 01/2013 and return the three columns used downstream.

    Reads ``dados/nfe/bl/012013.csv`` (``;``-delimited) through the project
    helper ``read_from_csv`` with the five column names below and ``header=1``,
    then returns the frame restricted to ``concatenar``, ``empresas`` and
    ``corte`` (in that order).
    """
    all_columns = ['cod_mun', 'CNAE 2d', 'concatenar', 'empresas', 'corte']
    kept_columns = ['concatenar', 'empresas', 'corte']
    frame = read_from_csv(
        "dados/nfe/bl/012013.csv",
        delimiter=';',
        cols=all_columns,
        header=1,
    )
    return frame[kept_columns]
def test_HS(self,year):
    """Check yearly export totals per HS id in the database against the MDIC CSV.

    For every ``id`` returned by the ``secex_ymp`` query (``hs_id_len=6``,
    ``month=0``, the given year) the id is zero-padded to six characters,
    characters 2-5 are converted with ``to_int`` and used as the key into the
    CSV grouped by its column at index 10.  The sum of the CSV column at
    index 9 for that group is compared with the database ``val``.

    Every id that has no group in the CSV, and every id whose two totals
    differ, is printed and counted; the test fails when the count is not zero.

    Args:
        year: year to check; it selects both the SQL rows and the
            ``MDIC_<year>.csv`` file.
    """
    print("Entering in checkHS")
    dfDV = sql_to_df(
        "SELECT s.hs_id as id,sum(export_val) as val,sum(import_val) as valimport FROM secex_ymp s where s.hs_id_len=6 and s.month=0 and s.year="+str(year)+" group by 1;",
        db)
    # Raw string: same runtime value as the original literal, without relying
    # on "\e", "\s" and "\M" happening not to be escape sequences.
    dfSent = read_from_csv(r"dados\exportacao\sent\MDIC_"+str(year)+".csv", delimiter="|")
    value_column = dfSent.columns[9]
    dfGroup = dfSent.groupby(dfSent.columns[10])

    total = 0
    for hs in dfDV['id']:
        hsid = str(hs).zfill(6)
        hsshort = hsid[2:6]
        hsint = to_int(hsshort)
        if not hsint:
            continue

        valDV = dfDV[(dfDV['id'] == hsid)]['val'].values[0]
        if not valDV or str(valDV) == 'nan':
            # No database value to compare.  Skipping here keeps the loop from
            # reading a CSV total left over from a previous id (or one that
            # was never assigned at all).
            continue

        try:
            valCSV = sum(dfGroup.get_group(hsint)[value_column])
        except KeyError:
            # get_group raises KeyError when the key has no group in the CSV.
            total += 1
            print("Not found in CSV a value for "+str(hsint)+" / "+str(hsshort)+" (original hs "+str(hs)+") - Exports of value "+ str(valDV)+ " in the year "+str(year))
            continue

        valCSV = to_int(valCSV)
        valDV = to_int(valDV)
        if valCSV and valDV and valDV != valCSV:
            total += 1
            print("ERROR in HS ("+str(year)+"): "+str(hsint)+" / "+str(hs)+" - Value in CSV "+ str(valCSV)+ " <> Value in DV "+str(valDV) + " - Difference: "+str(valCSV - valDV))

    self.assertEqual(total, 0)
def test_Municipality(self,year,size,column):
    """Check yearly export totals per location in the database against the MDIC CSV.

    For every ``id`` (``attrs_bra.id_mdic``) returned by the ``secex_ymb``
    query, the id is converted with ``to_int`` and used as the key into the
    CSV grouped by the column at index ``column``.  The sum of the CSV column
    at index 9 for that group is compared with the database ``val``.

    Every id that has no group in the CSV, and every id whose two totals
    differ, is printed and counted; the test fails when the count is not zero.

    Args:
        year: year to check; it selects both the SQL rows and the
            ``MDIC_<year>.csv`` file.
        size: value matched against ``bra_id_len`` in the query.
        column: positional index of the CSV column to group by.
    """
    print("Entering in checkBRA")
    dfDV = sql_to_df(
        "SELECT a.id_mdic as id,sum(export_val) as val,sum(import_val) as valimport FROM secex_ymb s,attrs_bra a where s.bra_id_len="+str(size)+" and a.id=s.bra_id and s.month=0 and s.year="+str(year)+" group by 1;",
        db)
    # Raw string: same runtime value as the original literal, without relying
    # on "\e", "\s" and "\M" happening not to be escape sequences.
    dfSent = read_from_csv(r"dados\exportacao\sent\MDIC_"+str(year)+".csv", delimiter="|")
    value_column = dfSent.columns[9]
    dfGroup = dfSent.groupby(dfSent.columns[column])

    total = 0
    for mdic in dfDV['id']:
        mdicid = to_int(mdic)
        if not mdicid:
            continue

        valDV = dfDV[(dfDV['id'] == mdicid)]['val'].values[0]
        if not valDV or str(valDV) == 'nan':
            # No database value to compare.  Skipping here keeps the loop from
            # reading a CSV total left over from a previous id (or one that
            # was never assigned at all).
            continue

        try:
            valCSV = dfGroup.get_group(mdicid)[value_column].sum()
        except KeyError:
            # get_group raises KeyError when the key has no group in the CSV.
            total += 1
            print("Not found in CSV a value for "+str(mdic)+" - (original bra "+str(mdic)+") Exports of value "+ str(valDV)+ " in the year "+str(year))
            continue

        valCSV = to_int(valCSV)
        valDV = to_int(valDV)
        if valCSV and valDV and valDV != valCSV:
            total += 1
            print("ERROR in BRA ("+str(year)+"): "+str(mdic)+" / "+str(mdicid)+" - Value in CSV "+ str(valCSV)+ " <> Value in DV "+str(valDV) + " - Difference: "+str(valCSV - valDV))

    self.assertEqual(total, 0)
def test_main(self):
    """Compare higher-education enrolment in the DB with the 2009 synopsis CSV.

    The CSV is read with four named columns followed by columns labelled
    5..39, the two ``bra_sub*`` columns are dropped, rows with fewer than two
    non-null values are removed, ``bra`` is passed through ``city_fix`` and
    ``value`` through ``to_number``.  The result is checked against the
    ``hedu_ybucd`` query with ``run_check``; the test fails when ``run_check``
    returns anything other than 0.
    """
    # Sample CSV row: "Brasil;;;2.314;"
    column_names = ['bra', 'bra_sub1', 'bra_sub2', 'value'] + list(range(5, 40))
    dfSent = read_from_csv(
        "docs\\check\\educacaosuperior\\sinopse_da_educacao_superior_2009-1.1.csv",
        delimiter=";",
        cols=column_names,
        usecols=column_names,
    )
    dfSent = dfSent.drop(['bra_sub1', 'bra_sub2'], axis=1)
    dfSent = dfSent.dropna(thresh=2)

    mapStates = getMapStates()

    def fix_city(name):
        return city_fix(name, mapStates)

    def row_value(row):
        return to_number(row['value'])

    dfSent['bra'] = dfSent["bra"].map(fix_city)
    dfSent['value'] = dfSent.apply(row_value, axis=1)
    dfSent['value'] = dfSent['value'].astype(np.float64)

    sql = "SELECT bra_id as id,sum(enrolled) as value FROM hedu_ybucd where bra_id='4mg' and bra_id_len=3 AND d_id in ('A','B') and course_id_len=6 group by 1"
    dfDV = sql_to_df(sql, db)
    dfDV['value'] = dfDV['value'].astype(np.float64)

    mismatches = run_check(dfDV, dfSent, 'bra', 0, 'value')
    self.assertEqual(mismatches, 0)
def test_Municipality(self,year,size):
    """Check RAIS wage totals per location in the DB against the sent CSV.

    Runs the ``rais_yb`` query for the given year and ``bra_id_len`` and loads
    ``Rais<year>.csv``, then delegates the comparison to ``run_check`` with
    the ``'munic'`` key.  The test fails when ``run_check`` returns anything
    other than 0.

    Args:
        year: year used in the SQL filter and in the CSV file name.
        size: value matched against ``bra_id_len`` in the query.
    """
    print("Entering in checkBRA")
    query = "SELECT left(a.id_ibge,6) as id,sum(wage) as val FROM rais_yb s,attrs_bra a where s.bra_id_len="+str(size)+" and a.id=s.bra_id and year="+str(year)+" group by 1;"
    dfDV = sql_to_df(query, db)
    # `cols` is not defined in this function; it must come from the enclosing scope.
    csv_path = "dados\emprego\sent\Rais"+str(year)+".csv"
    dfSent = read_from_csv(csv_path, delimiter=";", cols=cols)
    mismatches = run_check(dfDV, dfSent, 'munic', year)
    self.assertEqual(mismatches, 0)
def test_CNAE(self,year):
    """Check RAIS wage totals per CNAE id in the DB against the sent CSV.

    Runs the ``rais_yi`` query (``cnae_id_len=6``, last four characters of
    ``cnae_id`` as the id) for the given year and loads ``Rais<year>.csv``,
    then delegates the comparison to ``run_check`` with the ``'cnae'`` key.
    The test fails when ``run_check`` returns anything other than 0.

    Args:
        year: year used in the SQL filter and in the CSV file name.
    """
    print("Entering in checkCNAE")
    # Note kept from the original author: "before right 4 and len 5"
    # (presumably an earlier form of this query -- confirm before relying on it).
    query = "SELECT right(s.cnae_id,4) as id,sum(wage) as val FROM rais_yi s where s.cnae_id_len=6 and year="+str(year)+" group by 1;"
    dfDV = sql_to_df(query, db)
    # `cols` is not defined in this function; it must come from the enclosing scope.
    csv_path = "dados\emprego\sent\Rais"+str(year)+".csv"
    dfSent = read_from_csv(csv_path, delimiter=";", cols=cols)
    mismatches = run_check(dfDV, dfSent, 'cnae', year)
    self.assertEqual(mismatches, 0)
def main():
    """Demo pipeline: fixed-width file -> CSV -> derived columns -> merge -> CSV.

    Converts ``teste.txt`` to ``teste.txt.csv`` with ``fixed_to_csv`` using the
    field offsets below, loads the CSV, adds three derived columns, merges the
    result with a fresh copy of the CSV, prints the intermediate frames and
    writes the merged frame back over ``teste.txt.csv``.
    """
    dirFile = 'teste.txt'
    dirCSV = 'teste.txt.csv'
    headers = (
        'ANO_CENSO', 'PK_COD_MATRICULA', 'FK_COD_ALUNO', 'NUM_IDADE', 'TP_SEXO',
        'TP_COR_RACA', 'FK_COD_ESTADO_END', 'SIGLA_END', 'FK_COD_MUNICIPIO_END', 'ID_ZONA_RESIDENCIAL',
        'FK_COD_MOD_ENSINO', 'FK_COD_ETAPA_ENSINO', 'PK_COD_TURMA', 'FK_COD_CURSO_PROF', 'PK_COD_ENTIDADE',
        'FK_COD_ESTADO_ESCOLA', 'SIGLA_ESCOLA', 'COD_MUNICIPIO_ESCOLA', 'ID_LOCALIZACAO_ESC', 'ID_DEPENDENCIA_ADM_ESC',
    )
    # Column offsets start at 0; one (start, end) pair per header, same order.
    columns = (
        (0, 5), (5, 18), (18, 32), (43, 47), (65, 66),
        (66, 67), (86, 89), (89, 91), (91, 100), (100, 101),
        (129, 132), (132, 136), (136, 147), (147, 156), (161, 170),
        (170, 173), (173, 175), (175, 184), (184, 193), (193, 194),
    )

    fixed_to_csv(dirFile, columns, dirCSV, headers)
    df = read_from_csv(dirCSV)

    # Simple computed column.
    df["SIMPLE_COMPUTED_COLUMNS"] = df["ANO_CENSO"] + df["NUM_IDADE"]

    # Complex computed column (row-wise helper).
    df["COMPLEX_COMPUTED_COLUMNS"] = df.apply(sexo_to_number, axis=1)

    # Replace part of a column's text.
    # NOTE(review): `.str` needs SIMPLE_COMPUTED_COLUMNS to hold strings, which
    # depends on how read_from_csv types the two source columns -- confirm.
    df['REPLACE_PART_COLUMN'] = df['SIMPLE_COMPUTED_COLUMNS'].str.replace('20', 'XX')

    # Second, untouched copy of the CSV for the merge.  The original read the
    # file twice in a row into the same variable (around an empty
    # "MAP COMPUTED COLUMNS" placeholder); one read is enough.
    df2 = read_from_csv(dirCSV)

    print(df)
    print(df2)

    # Merge on the columns the two frames have in common.
    df3 = df.merge(df2)
    print(df3)

    df_to_csv(df3, dirCSV)
def transform(year):
    """Load the import extract for ``year`` and the HS2007 reference table.

    Prints the source path, reads ``dados/importacao/sent/<year>_extract.csv``
    and ``HS_Todos_Anos_IMP.csv``, applies an HS2007 comparison over the
    extract and prints the extract.

    Args:
        year: year used to build the extract file name.
    """
    cols = (
        'ANO', 'MES', 'PAIS', 'ESTADO', 'PORTO', 'MUNICIPIO',
        'UNIDADE', 'QUANTIDADE', 'KGLIQUIDO', 'VALORFOB', 'HS2007',
    )
    source_file = 'dados/importacao/sent/' + str(year) + '_extract.csv'
    print(source_file)
    df = read_from_csv(source_file, 1, None, cols, None)

    reference_columns = ['HS2007']
    rd = read_from_csv(
        'docs/classificacao/HS/anos/HS_Todos_Anos_IMP.csv',
        1,
        ';',
        reference_columns,
        converters={"HS2007": str},
    )

    def match_hs2007(x):
        # One-element list holding the boolean comparison against the
        # reference HS2007 column.
        return [rd.HS2007 == str(x)]

    # NOTE(review): the result of apply() is discarded, so the frame printed
    # below is the extract exactly as loaded -- confirm what was intended.
    df.apply(match_hs2007)
    print(df)
def openNFe(entrada,delimiter):
    """Load an NFe file and return the four columns used downstream.

    The file is read with these 26 column names, in order:

        TransactedProduct_ID_NCM, TransactedProduct_ID_HS,
        EconomicAtivity_ID_CNAE_Receiver, EconomicAtivity_ID_CNAE_Sender,
        CFOP_ID, CFOP_Reclassification, CFOP_Flow, Receiver_Type,
        Sender_Type, Municipality_ID_Receiver, Municipality_ID_Sender,
        Year, Monthly, Receiver_Situation, Sender_Situation, Cost_Value,
        ICMS_ST_Value, ICMS_ST_RET_Value, ICMS_Value, IPI_Value, PIS_Value,
        COFINS_Value, II_Value, Product_Value, ISSQN_Value, Origin

    Args:
        entrada: path of the file, passed to ``read_from_csv``.
        delimiter: field delimiter, passed to ``read_from_csv``.

    Returns:
        The loaded frame restricted to ``EconomicAtivity_ID_CNAE_Sender``,
        ``TransactedProduct_ID_HS``, ``Municipality_ID_Sender`` and
        ``Product_Value`` (in that order).
    """
    # The original list named positions 8 and 14 'Sender_Type' and
    # 'Sender_Situation' a second time, producing duplicate column names;
    # per the layout above they are the Receiver_* columns.
    orig_cols = [
        'TransactedProduct_ID_NCM', 'TransactedProduct_ID_HS',
        'EconomicAtivity_ID_CNAE_Receiver', 'EconomicAtivity_ID_CNAE_Sender',
        'CFOP_ID', 'CFOP_Reclassification', 'CFOP_Flow',
        'Receiver_Type', 'Sender_Type',
        'Municipality_ID_Receiver', 'Municipality_ID_Sender',
        'Year', 'Monthly',
        'Receiver_Situation', 'Sender_Situation',
        'Cost_Value', 'ICMS_ST_Value', 'ICMS_ST_RET_Value', 'ICMS_Value',
        'IPI_Value', 'PIS_Value', 'COFINS_Value', 'II_Value',
        'Product_Value', 'ISSQN_Value', 'Origin',
    ]
    converters = {
        "EconomicAtivity_ID_CNAE_Sender": str,
        "Municipality_ID_Sender": str,
        "Product_Value": floatvert,
    }
    dados = read_from_csv(entrada, delimiter=delimiter, cols=orig_cols, converters=converters, header=1)
    return dados[['EconomicAtivity_ID_CNAE_Sender', 'TransactedProduct_ID_HS', 'Municipality_ID_Sender', 'Product_Value']]
def _sum_into_vagas_curso(df, source_columns):
    """Store the sum of ``source_columns`` in ``QT_VAGAS_CURSO`` and drop the sources."""
    total = df[source_columns[0]]
    for name in source_columns[1:]:
        total = total + df[name]
    df['QT_VAGAS_CURSO'] = total
    # drop() takes the labels as ONE list and returns a new frame.  The
    # original passed each label as its own positional argument and ignored
    # the return value.
    return df.drop(list(source_columns), axis=1)


def transform(year):
    """Load the course extract for ``year`` and collapse the vacancy columns.

    Prints the source path, reads the extract, and (for 2009 onwards) replaces
    the per-shift vacancy columns with a single ``QT_VAGAS_CURSO`` total, then
    prints the frame.  Years before 2009 are printed as loaded.

    Args:
        year: year used to build the extract file name and to pick which
            vacancy columns are summed.
    """
    # NOTE(review): colsCurso is never used; the read below passes `cols`,
    # which is not defined in this function and must come from the enclosing
    # scope.  colsCurso only lists the *_PRES / EAD vacancy names (the
    # year > 2009 layout), so swapping it in is not obviously right for 2009
    # -- confirm which list is intended.
    colsCurso = (
        'CO_IES', 'CO_CATEGORIA_ADMINISTRATIVA', 'CO_ORGANIZACAO_ACADEMICA',
        'CO_NIVEL_ACADEMICO', 'CO_MODALIDADE_ENSINO', 'CO_GRAU_ACADEMICO',
        'CO_CURSO', 'NO_CURSO', 'CO_MUNICIPIO_CURSO',
        'CO_OCDE_AREA_GERAL', 'NO_OCDE_AREA_GERAL',
        'CO_OCDE_AREA_ESPECIFICA', 'NO_OCDE_AREA_ESPECIFICA',
        'CO_OCDE_AREA_DETALHADA', 'NO_OCDE_AREA_DETALHADA',
        'CO_OCDE', 'NO_OCDE',
        'IN_MATUTINO_CURSO', 'IN_VESPERTINO_CURSO', 'IN_NOTURNO_CURSO', 'IN_INTEGRAL_CURSO',
        'QT_MATRICULA_CURSO', 'QT_CONCLUINTE_CURSO',
        'QT_VAGAS_INTEGRAL_PRES', 'QT_VAGAS_MATUTINO_PRES', 'QT_VAGAS_NOTURNO_PRES',
        'QT_VAGAS_VESPERTINO_PRES', 'QT_VAGAS_ANUAL_EAD',
    )
    # NOTE(review): the path points at the *importacao* folder although the
    # columns are course data -- looks copied from the import transform; confirm.
    source_file = 'dados/importacao/sent/' + str(year) + '_extract.csv'
    print(source_file)
    df = read_from_csv(source_file, 1, None, cols, None)

    if year == 2009:
        df = _sum_into_vagas_curso(df, (
            'QT_VAGAS_INTEGRAL', 'QT_VAGAS_MATUTINO',
            'QT_VAGAS_NOTURNO', 'QT_VAGAS_VESPERTINO',
        ))
    elif year > 2009:
        df = _sum_into_vagas_curso(df, (
            'QT_VAGAS_INTEGRAL_PRES', 'QT_VAGAS_MATUTINO_PRES',
            'QT_VAGAS_NOTURNO_PRES', 'QT_VAGAS_VESPERTINO_PRES',
            'QT_VAGAS_ANUAL_EAD',
        ))
    print(df)
def transform(year):
    """Load the export extract for ``year`` and apply the HS revision lookups.

    Reads ``dados/exportacao/sent/<year>_extract.csv``, passes it through
    ``left_df(df, 'HS', 4)`` and then, depending on the year range, loads one
    or two HS correspondence tables and applies lookup functions over the
    frame before printing it.  Years outside 2000-2014 are loaded but neither
    transformed nor printed.

    Args:
        year: year used to build the extract file name and to choose the
            correspondence tables.
    """
    cols = ['ANO', 'MES', 'HS', 'PAIS', 'UF', 'PORTO', 'MUNICIPIO', 'UNIDADE', 'QUANTIDADE', 'KGLIQUIDO', 'VALORFOB']
    source_file = 'dados/exportacao/sent/' + str(year) + '_extract.csv'
    df = read_from_csv(source_file, 1, ",", cols)
    df = left_df(df, 'HS', 4)

    if year in (2000, 2001):
        # 1996 x 2002 correspondence.
        rdCols = ['HS96', 'HS02']
        rd = read_from_csv('docs/classificacao/HS/anos/1996x2002.csv', 2, ';', rdCols, converters={"HS96": str, "HS02": str})

        def hs96_to_hs02(x):
            return rd['HS02'][rd.HS96 == str(x)]

        df = df.apply(hs96_to_hs02)

        # Convert to 2007.
        rdCols2 = ['HS2007']
        rd2 = read_from_csv('docs/classificacao/HS/anos/2007.csv', 2, '|', rdCols2, converters={"HS2007": str})

        def match_hs2007(x):
            # rd2 is loaded with the single column HS2007, so the original
            # `rd2.HS96` could not resolve; use the column that exists, as
            # the 2002-2006 branch does for the same file.
            return [rd2.HS2007 == str(x)]

        # NOTE(review): this branch reads 2007.csv with (2, '|') while the
        # 2002-2006 branch reads the same file with (1, ';') -- confirm.
        df = df.apply(match_hs2007)
        print(df)

    elif 2001 < year <= 2006:
        rdCols = ['HS02', 'HS07']
        rd = read_from_csv('docs/classificacao/HS/anos/2002x2007.csv', 1, ';', rdCols, converters={"HS07": str})

        def hs07_to_hs02(x):
            return rd['HS02'][rd.HS07 == str(x)]

        df = df.apply(hs07_to_hs02)

        rdCols2 = ['HS2007']
        rd2 = read_from_csv('docs/classificacao/HS/anos/2007.csv', 1, ';', rdCols2, converters={"HS2007": str})

        def match_hs2007(x):
            return [rd2.HS2007 == str(x)]

        df = df.apply(match_hs2007)
        print(df)

    elif 2007 <= year <= 2011:
        rdCols = ['HS2012', 'HS2007']
        rd = read_from_csv('docs/classificacao/HS/anos/2012x2007.csv', 1, ';', rdCols, converters={"HS2007": str})

        def hs2007_to_hs2012(x):
            return rd['HS2012'][rd.HS2007 == str(x)]

        df = df.apply(hs2007_to_hs2012)
        print(df)

    elif 2012 <= year <= 2014:
        rdCols = ['HS2012', 'HS2007']
        rd = read_from_csv('docs/classificacao/HS/anos/2012x2007.csv', 1, ';', rdCols, converters={"HS2007": str})

        def hs2007_to_hs2012(x):
            return rd['HS2012'][rd.HS2007 == str(x)]

        # NOTE(review): unlike the 2007-2011 branch, the apply() result is
        # discarded here, so the frame printed is the untransformed one.
        # Left as in the original -- confirm whether that is deliberate.
        df.apply(hs2007_to_hs2012)
        print(df)
def getMapStates():
    """Load the state lookup table from ``docs\\check\\codestados.csv``.

    Returns:
        Whatever ``read_from_csv`` produces for the ``;``-delimited file when
        given the column names ``bra``, ``bra_accent`` and ``value``.
    """
    state_columns = ['bra', 'bra_accent', 'value']
    return read_from_csv("docs\\check\\codestados.csv", delimiter=";", cols=state_columns)
def test_CBO(self,year):
    """Check RAIS wage totals per CBO id in the DB against the sent CSV.

    Runs the ``rais_yo`` query (``cbo_id_len=4``) for the given year and loads
    ``Rais<year>.csv``, then delegates the comparison to ``run_check`` with
    the ``'cbo'`` key.  The test fails when ``run_check`` returns anything
    other than 0.

    Args:
        year: year used in the SQL filter and in the CSV file name.
    """
    print("Entering in checkCBO")
    query = "SELECT s.cbo_id as id,sum(wage) as val FROM rais_yo s where s.cbo_id_len=4 and s.year="+str(year)+" group by 1;"
    dfDV = sql_to_df(query, db)
    # `cols` is not defined in this function; it must come from the enclosing scope.
    csv_path = "dados\emprego\sent\Rais"+str(year)+".csv"
    dfSent = read_from_csv(csv_path, delimiter=";", cols=cols)
    mismatches = run_check(dfDV, dfSent, 'cbo', year)
    self.assertEqual(mismatches, 0)