コード例 #1
0
def openBL():
    """Load ``dados/nfe/bl/012013.csv`` and keep only the columns used downstream.

    The file is read through ``read_from_csv`` with ``;`` as delimiter,
    ``header=1`` and five explicit column names; the result is then
    narrowed to ``concatenar``, ``empresas`` and ``corte``.

    :return: the object produced by indexing the loaded frame with those
        three column names.
    """
    source_columns = ['cod_mun', 'CNAE 2d', 'concatenar', 'empresas', 'corte']
    kept_columns = ['concatenar', 'empresas', 'corte']
    frame = read_from_csv(
        "dados/nfe/bl/012013.csv",
        delimiter=';',
        cols=source_columns,
        header=1
    )
    return frame[kept_columns]
コード例 #2
0
ファイル: step_4_sent.py プロジェクト: radoraykov/datavivaetl
 def test_HS(self,year):
     """Check that per-HS export totals in the database match the sent MDIC CSV.

     For every ``id`` returned by the ``secex_ymp`` query, characters 2-5 of
     the id zero-padded to six characters are converted with ``to_int`` and
     used as the group key in the CSV (grouped by its column at index 10).
     The sum of the CSV column at index 9 for that group is compared with
     the database ``val``. Every missing group or differing total is printed
     and counted, and the test asserts that the count is zero.

     :param year: year used in the SQL filter and in the CSV file name.
     """
     print("Entering in checkHS")

     dfDV = sql_to_df("SELECT s.hs_id as id,sum(export_val) as val,sum(import_val) as valimport FROM secex_ymp s where s.hs_id_len=6 and s.month=0 and s.year="+str(year)+" group by 1;",db)

     # Raw string: same path as before, without relying on unrecognised
     # backslash escapes being passed through.
     dfSent = read_from_csv(r"dados\exportacao\sent\MDIC_"+str(year)+".csv",delimiter="|")
     # NOTE(review): columns[10] is presumably the HS code and columns[9]
     # the export value -- confirm against the MDIC file layout.
     dfGroup = dfSent.groupby(dfSent.columns[10])
     value_column = dfSent.columns[9]

     total=0
     for hs in dfDV['id']:
         hsid= str(hs).zfill(6)
         hsshort=hsid[2:6]
         hsint=to_int(hsshort)
         if not hsint:
             continue

         valDV = dfDV[(dfDV['id']==hsid)]['val'].values[0]
         # Nothing to compare when the database has no value; skipping here
         # also avoids reusing valCSV from a previous iteration.
         if not valDV or str(valDV)=='nan':
             continue

         try:
             valCSV= sum(dfGroup.get_group(hsint)[value_column])
         except KeyError:
             # get_group raises KeyError when the key has no rows in the CSV.
             total=total+1
             print("Not found in CSV a value for "+str(hsint)+" / "+str(hsshort)+" (original hs "+str(hs)+") - Exports of value  "+ str(valDV)+ " in the year "+str(year))
             continue

         valCSV=to_int(valCSV)
         valDV=to_int(valDV)
         if valCSV and valDV and valDV!=valCSV:
             total=total+1
             txt= "ERROR in HS ("+str(year)+"): "+str(hsint)+" / "+str(hs)+" - Value in CSV "+ str(valCSV)+ " <> Value in DV "+str(valDV) + " - Difference: "+str(valCSV - valDV)
             print(txt)
     self.assertEqual(total, 0)
コード例 #3
0
ファイル: step_4_sent.py プロジェクト: radoraykov/datavivaetl
 def test_Municipality(self,year,size,column):
     """Check that per-location export totals in the database match the sent MDIC CSV.

     For every ``id`` (``attrs_bra.id_mdic``) returned by the ``secex_ymb``
     query, the id converted with ``to_int`` is used as the group key in the
     CSV, which is grouped by the column at index ``column``. The sum of the
     CSV column at index 9 for that group is compared with the database
     ``val``. Every missing group or differing total is printed and counted,
     and the test asserts that the count is zero.

     :param year: year used in the SQL filter and in the CSV file name.
     :param size: value compared against ``bra_id_len`` in the SQL filter.
     :param column: positional index of the CSV column to group by.
     """
     print("Entering in checkBRA")

     dfDV = sql_to_df("SELECT a.id_mdic as id,sum(export_val) as val,sum(import_val) as valimport FROM secex_ymb s,attrs_bra a where s.bra_id_len="+str(size)+" and  a.id=s.bra_id and s.month=0 and s.year="+str(year)+" group by 1;",db)

     # Raw string: same path as before, without relying on unrecognised
     # backslash escapes being passed through.
     dfSent = read_from_csv(r"dados\exportacao\sent\MDIC_"+str(year)+".csv",delimiter="|")
     dfGroup = dfSent.groupby(dfSent.columns[column])
     # NOTE(review): columns[9] is presumably the export value -- confirm.
     value_column = dfSent.columns[9]

     total=0
     for mdic in dfDV['id']:
         mdicid=to_int(mdic)
         if not mdicid:
             continue

         valDV = dfDV[(dfDV['id']==mdicid)]['val'].values[0]
         # Nothing to compare when the database has no value; skipping here
         # also avoids reusing valCSV from a previous iteration.
         if not valDV or str(valDV)=='nan':
             continue

         try:
             valCSV= dfGroup.get_group(mdicid)[value_column].sum()
         except KeyError:
             # get_group raises KeyError when the key has no rows in the CSV.
             total=total+1
             print("Not found in CSV a value for "+str(mdic)+" - (original bra "+str(mdic)+")  Exports of value  "+ str(valDV)+ " in the year "+str(year))
             continue

         valCSV=to_int(valCSV)
         valDV=to_int(valDV)
         if valCSV and valDV and  valDV!=valCSV:
             total=total+1
             txt= "ERROR in BRA ("+str(year)+"): "+str(mdic)+" / "+str(mdicid)+" - Value in CSV "+ str(valCSV)+ " <> Value in DV "+str(valDV) + " - Difference: "+str(valCSV - valDV)
             print(txt)

     self.assertEqual(total, 0)
コード例 #4
0
ファイル: step_4_sent.py プロジェクト: radoraykov/datavivaetl
    def test_main(self):
        """Compare enrolment values from the 2009 synopsis CSV with ``hedu_ybucd``.

        The CSV is loaded with explicit column names, the two sub-level
        columns are dropped, rows with fewer than two non-null values are
        removed, ``bra`` is passed through ``city_fix`` and ``value`` through
        ``to_number`` before being cast to ``float64``. The database side is
        the summed ``enrolled`` for ``bra_id='4mg'``. ``run_check`` returns
        the count that is asserted to be zero.
        """
        # Sample source row: "Brasil;;;2.314;"
        cols = ['bra', 'bra_sub1', 'bra_sub2', 'value'] + list(range(5, 40))
        # Kept from the original; not referenced anywhere below.
        idsites = ['bra', 'bra_sub1', 'bra_sub2', 'value']

        csv_path = "docs\\check\\educacaosuperior\\sinopse_da_educacao_superior_2009-1.1.csv"
        dfSent = read_from_csv(csv_path, delimiter=";", cols=cols, usecols=cols)
        dfSent = dfSent.drop(['bra_sub1', 'bra_sub2'], axis=1)
        dfSent = dfSent.dropna(thresh=2)

        mapStates = getMapStates()

        def normalise_city(name):
            return city_fix(name, mapStates)

        def row_value(row):
            return to_number(row['value'])

        dfSent['bra'] = dfSent["bra"].map(normalise_city)
        dfSent['value'] = dfSent.apply(row_value, axis=1)
        dfSent['value'] = dfSent['value'].astype(np.float64)

        sql = "SELECT bra_id as id,sum(enrolled) as value FROM hedu_ybucd where bra_id='4mg' and bra_id_len=3 AND d_id in ('A','B') and course_id_len=6 group by 1"
        dfDV = sql_to_df(sql, db)
        dfDV['value'] = dfDV['value'].astype(np.float64)

        mismatches = run_check(dfDV, dfSent, 'bra', 0, 'value')

        self.assertEqual(mismatches, 0)
コード例 #5
0
ファイル: step_4_sent.py プロジェクト: radoraykov/datavivaetl
 def test_Municipality(self,year,size):
     """Check summed RAIS wages per location against the sent RAIS CSV.

     The database side is ``sum(wage)`` from ``rais_yb`` grouped by the first
     six characters of ``attrs_bra.id_ibge``; the CSV side is the yearly
     ``Rais<year>.csv`` file. Both are handed to ``run_check`` with the
     ``'munic'`` key, and the returned count is asserted to be zero.

     :param year: year used in the SQL filter and in the CSV file name.
     :param size: value compared against ``bra_id_len`` in the SQL filter.
     """
     print("Entering in checkBRA")
     query = "SELECT left(a.id_ibge,6) as id,sum(wage) as val FROM rais_yb s,attrs_bra a where s.bra_id_len="+str(size)+" and  a.id=s.bra_id and year="+str(year)+" group by 1;"
     dfDV = sql_to_df(query, db)
     csv_path = "dados\emprego\sent\Rais"+str(year)+".csv"
     dfSent = read_from_csv(csv_path, delimiter=";", cols=cols)
     mismatches = run_check(dfDV, dfSent, 'munic', year)
     self.assertEqual(mismatches, 0)
コード例 #6
0
ファイル: step_4_sent.py プロジェクト: radoraykov/datavivaetl
 def test_CNAE(self,year):
     """Check summed RAIS wages per CNAE code against the sent RAIS CSV.

     The database side is ``sum(wage)`` from ``rais_yi`` (rows with
     ``cnae_id_len=6``) grouped by the last four characters of ``cnae_id``;
     the CSV side is the yearly ``Rais<year>.csv`` file. Both are handed to
     ``run_check`` with the ``'cnae'`` key, and the returned count is
     asserted to be zero.

     :param year: year used in the SQL filter and in the CSV file name.
     """
     print("Entering in checkCNAE")
     # Earlier revision of the query used right(...,4) with cnae_id_len=5.
     query = "SELECT right(s.cnae_id,4) as id,sum(wage) as val FROM rais_yi s where s.cnae_id_len=6 and year="+str(year)+" group by 1;"
     dfDV = sql_to_df(query, db)
     csv_path = "dados\emprego\sent\Rais"+str(year)+".csv"
     dfSent = read_from_csv(csv_path, delimiter=";", cols=cols)
     mismatches = run_check(dfDV, dfSent, 'cnae', year)
     self.assertEqual(mismatches, 0)
コード例 #7
0
ファイル: test.py プロジェクト: radoraykov/datavivaetl
def main():
    """Exercise the CSV helpers end to end on ``teste.txt``.

    Steps, in order:

    1. convert the fixed-width ``teste.txt`` into ``teste.txt.csv`` with
       ``fixed_to_csv`` using the header names and column ranges below;
    2. load the CSV and add three derived columns;
    3. load the CSV a second time, merge it with the enriched frame and
       print all three frames;
    4. write the merged frame back to ``teste.txt.csv`` (overwriting the
       intermediate CSV produced in step 1).
    """
    dirFile = 'teste.txt'
    dirCSV = 'teste.txt.csv'

    headers = (
        'ANO_CENSO', 'PK_COD_MATRICULA', 'FK_COD_ALUNO', 'NUM_IDADE', 'TP_SEXO',
        'TP_COR_RACA', 'FK_COD_ESTADO_END', 'SIGLA_END', 'FK_COD_MUNICIPIO_END', 'ID_ZONA_RESIDENCIAL',
        'FK_COD_MOD_ENSINO', 'FK_COD_ETAPA_ENSINO', 'PK_COD_TURMA', 'FK_COD_CURSO_PROF', 'PK_COD_ENTIDADE',
        'FK_COD_ESTADO_ESCOLA', 'SIGLA_ESCOLA', 'COD_MUNICIPIO_ESCOLA', 'ID_LOCALIZACAO_ESC', 'ID_DEPENDENCIA_ADM_ESC',
    )

    # Column positions start at 0; one pair per header above.
    # NOTE(review): pairs are presumably (start, end) character offsets --
    # confirm against fixed_to_csv.
    columns = (
        (0, 5), (5, 18), (18, 32), (43, 47), (65, 66),
        (66, 67), (86, 89), (89, 91), (91, 100), (100, 101),
        (129, 132), (132, 136), (136, 147), (147, 156), (161, 170),
        (170, 173), (173, 175), (175, 184), (184, 193), (193, 194),
    )

    fixed_to_csv(dirFile, columns, dirCSV, headers)

    df = read_from_csv(dirCSV)

    # Simple computed column: element-wise "+" of two existing columns.
    df["SIMPLE_COMPUTED_COLUMNS"] = df["ANO_CENSO"] + df["NUM_IDADE"]

    # Complex computed column: row-wise function.
    df["COMPLEX_COMPUTED_COLUMNS"] = df.apply(sexo_to_number, axis=1)

    # Replace part of a column's text.
    # NOTE(review): the .str accessor needs string values, so this assumes
    # read_from_csv loads ANO_CENSO / NUM_IDADE as strings -- confirm.
    df['REPLACE_PART_COLUMN'] = df['SIMPLE_COMPUTED_COLUMNS'].str.replace('20', 'XX')

    # Second, untouched copy of the CSV to merge against. The original read
    # the same file twice in a row; one read is enough.
    df2 = read_from_csv(dirCSV)

    print(df)
    print(df2)

    # Merge on the columns the two frames have in common (DataFrame.merge
    # default when no key is given).
    df3 = df.merge(df2)
    print(df3)

    df_to_csv(df3, dirCSV)
コード例 #8
0
ファイル: transform.py プロジェクト: radoraykov/datavivaetl
def transform(year):
    """Load one year's import extract and run an HS2007 comparison over it.

    Reads ``dados/importacao/sent/<year>_extract.csv`` and the
    ``HS_Todos_Anos_IMP.csv`` classification table (``HS2007`` read as
    ``str``), applies a comparison against ``HS2007`` to the extract and
    prints the extract. Nothing is returned.

    :param year: year used to build the extract file name.
    """
    import_columns = ('ANO', 'MES', 'PAIS', 'ESTADO', 'PORTO', 'MUNICIPIO', 'UNIDADE', 'QUANTIDADE', 'KGLIQUIDO', 'VALORFOB', 'HS2007')

    source_file = 'dados/importacao/sent/' + str(year) + '_extract.csv'
    print(source_file)

    df = read_from_csv(source_file, 1, None, import_columns, None)

    hs_table = read_from_csv(
        'docs/classificacao/HS/anos/HS_Todos_Anos_IMP.csv',
        1,
        ';',
        ['HS2007'],
        converters={"HS2007": str}
    )

    def matches_hs2007(x):
        # One-element list wrapping the boolean comparison result.
        return [hs_table.HS2007 == str(x)]

    # NOTE(review): the result of apply is discarded, so the printed frame
    # is the extract exactly as loaded.
    df.apply(matches_hs2007)

    print(df)
コード例 #9
0
def openNFe(entrada,delimiter):
    '''
    Load an NFe CSV and return only the four columns used downstream.

    The file is expected to carry these columns, in this order:

    TransactedProduct_ID_NCM,TransactedProduct_ID_HS,EconomicAtivity_ID_CNAE_Receiver,
    EconomicAtivity_ID_CNAE_Sender,CFOP_ID,CFOP_Reclassification,CFOP_Flow,Receiver_Type,
    Sender_Type,Municipality_ID_Receiver,Municipality_ID_Sender,Year,Monthly,
    Receiver_Situation,Sender_Situation,Cost_Value,ICMS_ST_Value,ICMS_ST_RET_Value,ICMS_Value,
    IPI_Value,PIS_Value,COFINS_Value,II_Value,Product_Value,ISSQN_Value,Origin

    :param entrada: path of the CSV file, passed to ``read_from_csv``.
    :param delimiter: field delimiter, passed to ``read_from_csv``.
    :return: the loaded data restricted to ``EconomicAtivity_ID_CNAE_Sender``,
        ``TransactedProduct_ID_HS``, ``Municipality_ID_Sender`` and
        ``Product_Value``.
    '''

    # Names follow the layout documented above. The original list repeated
    # 'Sender_Type' and 'Sender_Situation' in the Receiver_* positions,
    # producing duplicate column names.
    orig_cols = [
        'TransactedProduct_ID_NCM', 'TransactedProduct_ID_HS',
        'EconomicAtivity_ID_CNAE_Receiver', 'EconomicAtivity_ID_CNAE_Sender',
        'CFOP_ID', 'CFOP_Reclassification', 'CFOP_Flow',
        'Receiver_Type', 'Sender_Type',
        'Municipality_ID_Receiver', 'Municipality_ID_Sender',
        'Year', 'Monthly',
        'Receiver_Situation', 'Sender_Situation',
        'Cost_Value', 'ICMS_ST_Value', 'ICMS_ST_RET_Value', 'ICMS_Value',
        'IPI_Value', 'PIS_Value', 'COFINS_Value', 'II_Value',
        'Product_Value', 'ISSQN_Value', 'Origin',
    ]
    # Identifier columns are read as str; Product_Value goes through floatvert.
    converters = {"EconomicAtivity_ID_CNAE_Sender": str, "Municipality_ID_Sender": str, "Product_Value": floatvert}

    dados = read_from_csv(entrada,delimiter=delimiter,cols=orig_cols,converters=converters,header=1)
    dados=dados[['EconomicAtivity_ID_CNAE_Sender','TransactedProduct_ID_HS','Municipality_ID_Sender','Product_Value']]

    return dados
コード例 #10
0
ファイル: transform.py プロジェクト: radoraykov/datavivaetl
def transform(year):
    """Load one year's course extract and collapse the seat counts into one column.

    Reads ``dados/importacao/sent/<year>_extract.csv`` with the course
    column names below. For 2009 and later years the per-shift seat columns
    are added together into ``QT_VAGAS_CURSO`` and then dropped. The
    resulting frame is printed; nothing is returned.

    :param year: year of the extract; selects the file and which seat
        columns are summed.
    """
    colsCurso = ('CO_IES', 'CO_CATEGORIA_ADMINISTRATIVA', 'CO_ORGANIZACAO_ACADEMICA', 'CO_NIVEL_ACADEMICO', 'CO_MODALIDADE_ENSINO', 'CO_GRAU_ACADEMICO', 'CO_CURSO', 'NO_CURSO', 'CO_MUNICIPIO_CURSO', 'CO_OCDE_AREA_GERAL', 'NO_OCDE_AREA_GERAL', 'CO_OCDE_AREA_ESPECIFICA', 'NO_OCDE_AREA_ESPECIFICA', 'CO_OCDE_AREA_DETALHADA', 'NO_OCDE_AREA_DETALHADA', 'CO_OCDE', 'NO_OCDE', 'IN_MATUTINO_CURSO', 'IN_VESPERTINO_CURSO', 'IN_NOTURNO_CURSO', 'IN_INTEGRAL_CURSO', 'QT_MATRICULA_CURSO', 'QT_CONCLUINTE_CURSO', 'QT_VAGAS_INTEGRAL_PRES', 'QT_VAGAS_MATUTINO_PRES', 'QT_VAGAS_NOTURNO_PRES', 'QT_VAGAS_VESPERTINO_PRES', 'QT_VAGAS_ANUAL_EAD')

    # NOTE(review): the path points at the import extracts although the
    # columns describe courses -- confirm the intended directory.
    source_file = 'dados/importacao/sent/' + str(year) + '_extract.csv'

    print(source_file)

    # The original passed an undefined name `cols` here; colsCurso is the
    # column list defined (and otherwise unused) in this function.
    df = read_from_csv(source_file, 1, None, colsCurso, None)

    if year == 2009:
        # NOTE(review): colsCurso only lists the *_PRES names; this branch
        # expects the un-suffixed ones -- confirm the 2009 file layout.
        seat_columns = ['QT_VAGAS_INTEGRAL', 'QT_VAGAS_MATUTINO', 'QT_VAGAS_NOTURNO', 'QT_VAGAS_VESPERTINO']
    elif year > 2009:
        seat_columns = ['QT_VAGAS_INTEGRAL_PRES', 'QT_VAGAS_MATUTINO_PRES', 'QT_VAGAS_NOTURNO_PRES', 'QT_VAGAS_VESPERTINO_PRES', 'QT_VAGAS_ANUAL_EAD']
    else:
        seat_columns = []

    if seat_columns:
        # Plain "+" chain, as in the original, so a missing value in any
        # addend still propagates to the total.
        seats = df[seat_columns[0]]
        for name in seat_columns[1:]:
            seats = seats + df[name]
        df['QT_VAGAS_CURSO'] = seats

        # drop() takes the labels as one list and returns a new frame; the
        # original passed each label as a separate positional argument and
        # discarded the result.
        df = df.drop(seat_columns, axis=1)

    print(df)
コード例 #11
0
ファイル: transform.py プロジェクト: radoraykov/datavivaetl
def transform(year):
    """Load one year's export extract and apply HS-revision lookups to it.

    Reads ``dados/exportacao/sent/<year>_extract.csv``, passes it through
    ``left_df(df, 'HS', 4)`` and then, depending on ``year``, loads one or
    two HS correspondence tables and applies lookup functions built on them.
    The frame is printed in every handled branch; nothing is returned, and
    years outside 2000-2014 only load the extract.

    :param year: year of the extract; selects the file and the branch.
    """
    export_columns = ['ANO','MES','HS','PAIS','UF','PORTO','MUNICIPIO','UNIDADE','QUANTIDADE','KGLIQUIDO','VALORFOB']

    source_file = 'dados/exportacao/sent/' + str(year) + '_extract.csv'

    df = read_from_csv(source_file, 1, ",", export_columns)
    df = left_df(df, 'HS', 4)

    if year in (2000, 2001):

        # 1996 -> 2002 correspondence
        table_96_02 = read_from_csv('docs/classificacao/HS/anos/1996x2002.csv', 2, ';', ['HS96', 'HS02'], converters={"HS96": str, "HS02": str})

        def hs96_to_hs02(x):
            return table_96_02['HS02'][table_96_02.HS96 == str(x)]

        df = df.apply(hs96_to_hs02)

        # Convert to 2007
        table_2007 = read_from_csv('docs/classificacao/HS/anos/2007.csv', 2, '|', ['HS2007'], converters={"HS2007": str})

        def compare_2007(x):
            # NOTE(review): only the HS2007 column is requested for this
            # table, yet HS96 is accessed here -- confirm the column name.
            return [table_2007.HS96 == str(x)]

        df = df.apply(compare_2007)

        print(df)

    elif 2001 < year <= 2006:

        table_02_07 = read_from_csv('docs/classificacao/HS/anos/2002x2007.csv', 1, ';', ['HS02', 'HS07'], converters={"HS07": str})

        def hs07_to_hs02(x):
            return table_02_07['HS02'][table_02_07.HS07 == str(x)]

        df = df.apply(hs07_to_hs02)

        table_2007 = read_from_csv('docs/classificacao/HS/anos/2007.csv', 1, ';', ['HS2007'], converters={"HS2007": str})

        def compare_2007(x):
            return [table_2007.HS2007 == str(x)]

        df = df.apply(compare_2007)

        print(df)

    elif 2007 <= year <= 2011:

        table_12_07 = read_from_csv('docs/classificacao/HS/anos/2012x2007.csv', 1, ';', ['HS2012', 'HS2007'], converters={"HS2007": str})

        def hs2007_to_hs2012(x):
            return table_12_07['HS2012'][table_12_07.HS2007 == str(x)]

        df = df.apply(hs2007_to_hs2012)

        print(df)

    elif 2012 <= year <= 2014:

        table_12_07 = read_from_csv('docs/classificacao/HS/anos/2012x2007.csv', 1, ';', ['HS2012', 'HS2007'], converters={"HS2007": str})

        def hs2007_to_hs2012(x):
            return table_12_07['HS2012'][table_12_07.HS2007 == str(x)]

        # NOTE(review): unlike the 2007-2011 branch the result is not
        # assigned, so the frame printed below is unchanged -- confirm intent.
        df.apply(hs2007_to_hs2012)

        print(df)
コード例 #12
0
ファイル: step_4_sent.py プロジェクト: radoraykov/datavivaetl
def getMapStates():
    """Load the state-code table from ``docs\\check\\codestados.csv``.

    The file is read through ``read_from_csv`` with ``;`` as delimiter and
    the column names ``bra``, ``bra_accent`` and ``value``.

    :return: whatever ``read_from_csv`` returns for that file.
    """
    return read_from_csv(
        "docs\\check\\codestados.csv",
        delimiter=";",
        cols=['bra', 'bra_accent', 'value']
    )
コード例 #13
0
ファイル: step_4_sent.py プロジェクト: radoraykov/datavivaetl
 def test_CBO(self,year):
     """Check summed RAIS wages per CBO code against the sent RAIS CSV.

     The database side is ``sum(wage)`` from ``rais_yo`` (rows with
     ``cbo_id_len=4``) grouped by ``cbo_id``; the CSV side is the yearly
     ``Rais<year>.csv`` file. Both are handed to ``run_check`` with the
     ``'cbo'`` key, and the returned count is asserted to be zero.

     :param year: year used in the SQL filter and in the CSV file name.
     """
     print("Entering in checkCBO")
     query = "SELECT s.cbo_id as id,sum(wage) as val FROM rais_yo s where s.cbo_id_len=4 and s.year="+str(year)+" group by 1;"
     dfDV = sql_to_df(query, db)
     csv_path = "dados\emprego\sent\Rais"+str(year)+".csv"
     dfSent = read_from_csv(csv_path, delimiter=";", cols=cols)
     mismatches = run_check(dfDV, dfSent, 'cbo', year)
     self.assertEqual(mismatches, 0)