from pandas import DataFrame, ExcelWriter


def put_to_excel(result):
    # `excel` is the output path, defined elsewhere in the surrounding module.
    data = DataFrame(result)
    writer = ExcelWriter(excel, engine='xlsxwriter')
    data.to_excel(writer, sheet_name='Sheet1')
    writer.save()
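# --- Aside: a context-manager variant (a minimal sketch, assuming pandas >= 1.2;
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0, so the
# `with` form below is the forward-compatible way to write the same file).
import pandas as pd


def put_to_excel_cm(result, path):
    # `path` is a hypothetical parameter standing in for the module-level `excel`.
    # Exiting the `with` block closes the writer and saves the workbook.
    with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
        pd.DataFrame(result).to_excel(writer, sheet_name='Sheet1')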
import pandas as pd
from pandas import ExcelWriter


def balance(debet, kredit):
    # Running balance; the later call balance(debet, kredit) implies this
    # function header and that `sel` starts empty.
    sel = []
    for idx in range(0, len(debet)):
        if idx == 0:
            # idx is guaranteed to be 0 here
            awal = debet[idx] - kredit[idx]
            sel.append(awal)
        else:
            # idx is guaranteed to be greater than 0 here
            idx_before = idx - 1
            lanjutan = debet[idx] - kredit[idx] + sel[idx_before]
            sel.append(lanjutan)
    return sel


# Read the data from data.csv and turn it into a pandas DataFrame
df = pd.read_csv('pertemuan6/data.csv')
print(df.head())

# Take the array of values from each column (debet and kredit)
# and add a `selisih` (difference) column to the DataFrame
debet = df.debet.values
kredit = df.kredit.values
df['selisih'] = balance(debet, kredit)

# Write the modified DataFrame to an Excel file named data-olahan.xlsx
writer = ExcelWriter('pertemuan6/data-olahan.xlsx')
df.to_excel(writer, 'Sheet1', index=False)
writer.save()
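# --- Aside: the running balance above is just a cumulative sum, so pandas can
# compute it without an explicit loop (a minimal sketch over the same columns).
import pandas as pd

df = pd.read_csv('pertemuan6/data.csv')
# cumsum of (debet - kredit) reproduces balance(debet, kredit) row by row
df['selisih'] = (df['debet'] - df['kredit']).cumsum()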
from pandas import ExcelWriter


def write_xlsx(df, name_file):
    writer = ExcelWriter(f'{name_file}.xlsx')
    df.to_excel(writer, 'Sheet1')
    writer.save()
    return 'FILE SAVED'
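# --- Aside: if several frames should land in one workbook, a single writer can
# host them all (a minimal sketch; `write_many_xlsx` and `frames` are
# hypothetical names, not part of the original).
import pandas as pd


def write_many_xlsx(frames, name_file):
    # `frames` maps sheet names to DataFrames
    with pd.ExcelWriter(f'{name_file}.xlsx') as writer:
        for sheet, frame in frames.items():
            frame.to_excel(writer, sheet_name=sheet)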
        # (inside the per-stock for loop's try block; the opening of that
        # loop and the earlier condition checks are elided above)
        condition_7 = False

        # Condition 8: IBD RS rating > 70, and the higher the better
        if RS_Rating > 70:
            condition_8 = True
        else:
            condition_8 = False

        if (condition_1 and condition_2 and condition_3 and condition_4
                and condition_5 and condition_6 and condition_7
                and condition_8):
            final.append(stock)
            index.append(n)

            dataframe = pd.DataFrame(list(zip(final, index)),
                                     columns=['Company', 'Index'])
            dataframe.to_csv('stocks.csv')

            exportList = exportList.append(
                {'Stock': stock,
                 "RS_Rating": RS_Rating,
                 "50 Day MA": moving_average_50,
                 "150 Day Ma": moving_average_150,
                 "200 Day MA": moving_average_200,
                 "52 Week Low": low_of_52week,
                 "52 week High": high_of_52week},
                ignore_index=True)
            print(stock + " made the requirements")
    except Exception as e:
        print(e)
        print("No data on " + stock)

print(exportList)

writer = ExcelWriter("ScreenOutput.xlsx")
exportList.to_excel(writer, "Sheet1")
writer.save()
        # (inside the per-stock for loop's try block; the opening of that
        # loop is elided above)
        for value in false:
            print(value)

        otherList = otherList.append(
            {'Stock': stock,
             "RS_Rating": RS_Rating,
             "50 Day MA": moving_average_50,
             "150 Day Ma": moving_average_150,
             "200 Day MA": moving_average_200,
             "52 Week Low": low_of_52week,
             "52 week High": high_of_52week},
            ignore_index=True)
    except Exception as e:
        print(e)
        print("No data on " + stock)

print(exportList)

writer = ExcelWriter(
    '/Users/shashank/Documents/Code/Python/Outputs/mm-screener-output/Export-Output_{}.xlsx'
    .format(today))
exportList.to_excel(writer, "Sheet1")
writer.save()

writer = ExcelWriter(
    '/Users/shashank/Documents/Code/Python/Outputs/mm-screener-output/Other-Output_{}.xlsx'
    .format(today))
otherList.to_excel(writer, "Sheet1")
writer.save()
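# --- Aside: DataFrame.append(dict, ignore_index=True), used by both screener
# snippets above, was removed in pandas 2.0. A minimal sketch of the usual
# replacement: collect plain dicts and build the frame once (the row values
# here are illustrative only).
import pandas as pd

rows = []
rows.append({'Stock': 'AAPL', 'RS_Rating': 85})  # one dict per stock, inside the loop
exportList = pd.DataFrame(rows)  # single construction after the loop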
import pandas as pd
import xlrd
import openpyxl
from pandas import ExcelWriter

writer = ExcelWriter(r'/home/zach/Documents/Excel/export.xlsx')

LOD_List = (0.011, 0.0107, 0.0067, 0.0073, 0.0048, 0.0048, 0.0235, 0.0114,
            0.0035, 0.0092, 0.0117, 0.0012, 0.0062, 0.0082, 0.0020, 0.0052,
            0.0012, 0.0070, 0.0073, 0.0094)
Compound_List = [
    "PFMOAA", "R-EVE", "Byproduct 5", "Byproduct 4", "PMPA", "PFO2HxA",
    "PEPA", "NVHOS", "PFECA_B", "PFO3OA", "HFPO-DA", "PES", "PFECA_G",
    "PFO4DA", "Hydro-EVE Acid", "EVE-Acid", "Byproduct 6", "PFO5DA",
    "Byproduct 2", "Byproduct 1"
]

data = pd.read_excel(r'/home/zach/Documents/Excel/LOD_correction.xlsx',
                     'Sheet4')

Compound_Counter = 0
Append = pd.DataFrame({'File Name': [], 'Compound': [], 'Concentration': []})


def parse_compound(T3Compound, T3Compound_LOD):
    T3Compound_parse = data.loc[data['Compound'] == T3Compound]
    T3Compound_report = T3Compound_parse.loc[
        T3Compound_parse['Concentration'] > T3Compound_LOD,
        ['File Name', 'Compound', 'Concentration']]
    return T3Compound_report


for Compounds in Compound_List:
    Append = Append.append(
        parse_compound(Compound_List[Compound_Counter],
                       LOD_List[Compound_Counter]))
    # Advance the counter; as originally written it stayed at 0, so every
    # pass re-parsed the first compound.
    Compound_Counter += 1
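# --- Aside: iterating the compound and LOD lists in lockstep removes the
# manual counter entirely (a minimal sketch reusing parse_compound from above).
reports = [parse_compound(compound, lod)
           for compound, lod in zip(Compound_List, LOD_List)]
Append = pd.concat(reports, ignore_index=True)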
"""... where filename is something like "mysheet.xlsx" (no quotes).

It will return a file called transcribed.xlsx"""

import sys

import nltk
import numpy as np
import pandas as pd
from pandas import ExcelWriter

try:
    cmu = nltk.corpus.cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    cmu = nltk.corpus.cmudict.dict()

words = np.array(pd.read_excel(sys.argv[1]).fillna(''))
bad = '-undefined-'


def transcribe(word):
    try:
        return ' '.join(cmu[word.lower()][0])
    except KeyError:
        if word == '':
            return ''
        else:
            return bad


f = np.vectorize(transcribe, otypes=['object'])
transcribed = pd.DataFrame(f(words))
writer = ExcelWriter('transcribed.xlsx')
transcribed.to_excel(writer, 'Sheet1', index=False)
writer.save()
def single_acct_analysis(self, accid, label=None, save=False, savepath=''):
    '''
    Analyze a single account.

    Parameters:
        accid: account id number; this method converts it into a regular
            expression, though a ready-made regular expression is also
            supported.
        label: defaults to self.accid_col if None is passed.
    '''
    accid_item = self.__trans_accid_regex(accid)
    acct_data = self.getAcct(accid_item, accid_label=label, over_write=False,
                             pure=False, side='all')
    # acct_data = acct_data.set_index('glid', inplace=True)
    acct_sum = acct_data[self.drcrdesc].sum(axis=0)
    print('---start analyzing %s---' % str(accid))
    print(get_time_str())
    print('---Account Data---')
    print('account data shape:', acct_data.shape)
    print('account sum:', acct_sum)
    print('theAcct %s:\n' % str(accid), acct_data)
    if acct_sum[0] != 0:
        dr_acct_data = self.getAcct(accid_item, accid_label=label,
                                    over_write=False, pure=False, side='dr')
        # dr_acct_data = dr_acct_data.set_index('glid', inplace=True)
        print('---Debit Data---')
        print('debit data shape:', dr_acct_data.shape)
        print('debit_side %s:\n' % str(accid), dr_acct_data)
    else:
        dr_acct_data = None
    if acct_sum[1] != 0:
        cr_acct_data = self.getAcct(accid_item, accid_label=label,
                                    over_write=False, pure=False, side='cr')
        # cr_acct_data = cr_acct_data.set_index('glid', inplace=True)
        print('---Credit Data---')
        print('credit data shape:', cr_acct_data.shape)
        print('credit_side %s:\n' % str(accid), cr_acct_data)
    else:
        cr_acct_data = None
    if save:
        import os
        # from autk import get_time_str
        from openpyxl import Workbook, load_workbook
        from pandas import ExcelWriter
        if savepath == '':
            savename = ''.join(['theAcct', str(accid), '-', get_time_str(),
                                '.xlsx'])
            savepath = os.path.join(os.path.abspath(os.curdir), savename)
            wb = Workbook()
            wb.save(savepath)
            print('new workbook created at current directory.')
        elif os.path.isdir(savepath):
            savename = ''.join(['theAcct', str(accid), '-', get_time_str(),
                                '.xlsx'])
            savepath = os.path.join(os.path.abspath(savepath), savename)
            wb = Workbook()
            wb.save(savepath)
            print('new workbook created at %s' % savepath)
        elif os.path.isfile(savepath):
            wb = load_workbook(savepath)
            print('workbook loaded at %s' % savepath)
        else:
            print('unexpected savepath: file does not exist?')
            wb = Workbook()
            wb.save(savepath)
        wter = ExcelWriter(savepath, engine='openpyxl')
        wter.book = wb
        acct_data.to_excel(wter, sheet_name=''.join(['acct_', str(accid)]))
        if dr_acct_data is not None:
            dr_acct_data.to_excel(wter, sheet_name=''.join(['dr_', str(accid)]))
            wter.save()
        if cr_acct_data is not None:
            cr_acct_data.to_excel(wter, sheet_name=''.join(['cr_', str(accid)]))
            wter.save()
        wter.save()
        wb.close()
        print('%s data saved.' % str(accid))
    else:
        print('analysis result not saved.')
    print('---end %s analysis---' % str(accid))
    print(get_time_str())
    return [acct_data, dr_acct_data, cr_acct_data]
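# --- Aside: newer pandas can append sheets to an existing workbook directly,
# without assigning to writer.book (a minimal sketch, assuming pandas >= 0.24
# and openpyxl installed; the frame and filename are illustrative).
import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
df.to_excel('theAcct.xlsx', sheet_name='acct')  # create the workbook first

# mode='a' reopens the same workbook and adds a sheet next to 'acct'
with pd.ExcelWriter('theAcct.xlsx', engine='openpyxl', mode='a') as writer:
    df.to_excel(writer, sheet_name='dr')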
    part2_sample2 = f2.sample(frac=0.0333, random_state=3)
    part2_sample3 = f3.sample(frac=0.0333, random_state=4)
    part2_sample4 = f4.sample(frac=0.0333, random_state=5)

    # Concatenate every part of the sample with the randomly generated
    # sample from the group that it was not a part of.
    sample1 = pd.concat([part1_sample1, part2_sample1])
    sample2 = pd.concat([part1_sample2, part2_sample2])
    sample3 = pd.concat([part1_sample3, part2_sample3])
    sample4 = pd.concat([part1_sample4, part2_sample4])
    return sample1, sample2, sample3, sample4

#%%
samples = random_sample(4000)

#%%
from pandas import ExcelWriter

# Save the samples as Excel files
writer1 = ExcelWriter('Niels_1.xlsx')
samples[0].to_excel(writer1, 'Sheet1')
writer1.save()

writer2 = ExcelWriter('Simon_2.xlsx')
samples[1].to_excel(writer2, 'Sheet1')
writer2.save()

writer3 = ExcelWriter('Elias_3.xlsx')
samples[2].to_excel(writer3, 'Sheet1')
writer3.save()

writer4 = ExcelWriter('Julius_4.xlsx')
samples[3].to_excel(writer4, 'Sheet1')
writer4.save()
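# --- Aside: the four writer blocks differ only in filename and sample, so a
# zip loop expresses the same exports (a minimal sketch over the same names).
from pandas import ExcelWriter

for name, sample in zip(['Niels_1', 'Simon_2', 'Elias_3', 'Julius_4'], samples):
    writer = ExcelWriter(f'{name}.xlsx')
    sample.to_excel(writer, 'Sheet1')
    writer.save()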
        # (inside the GUI event loop; the try paired with this handler is elided)
        except:
            Popup('Your path, sheet, or indicator is incorrect')

        toExport = list()
        sameResult = Same(dataINframes, dataINlist)
        diffResult = Different(dataINframes, sameResult)
        for df in sameResult:
            df.reset_index(drop=True, inplace=True)
        for df in diffResult:
            df.reset_index(drop=True, inplace=True)
        toExport.append(sameResult)
        toExport.append(diffResult)
        window.Close()
        window = Window('CompyrisonTool').Layout(layout3)

    if (event == 'export'):
        writer = ExcelWriter(values['exportPath'] + "/OUTPUT.xlsx",
                             engine='xlsxwriter')
        for same in range(0, len(toExport[0])):
            header = (fileList[same][0].rsplit('/', 1)[-1][:-5]
                      + fileList[same][1] + fileList[same][2])
            # Excel caps sheet names at 31 chars; keep room for the
            # 'SAME'/'DIFF' suffix
            if len(header) > 27:
                header = header[(len(header) - 27):]
            sameHeader = header + 'SAME'
            toExport[0][same].to_excel(writer, sheet_name=sameHeader)
        for diff in range(0, len(toExport[1])):
            header = (fileList[diff][0].rsplit('/', 1)[-1][:-5]
                      + fileList[diff][1] + fileList[diff][2])
            if len(header) > 27:
                header = header[(len(header) - 27):]
            diffHeader = header + 'DIFF'
            toExport[1][diff].to_excel(writer, sheet_name=diffHeader)
        try:
def mtm_curva_debenture():
    import datetime, time
    import pandas as pd
    import pymysql as db
    import numpy as np
    import logging
    from findt import FinDt
    from pandas import ExcelWriter
    from dependencias.Metodos.funcoes_auxiliares import get_data_ultimo_dia_util_mes_anterior
    from dependencias.Metodos.funcoes_auxiliares import full_path_from_database

    logger = logging.getLogger(__name__)

    # Get the date of the last business day of the previous month, in the
    # specific format the function expects
    dtbase = get_data_ultimo_dia_util_mes_anterior()
    dtbase_concat = dtbase[0] + dtbase[1] + dtbase[2]

    # Directory where the spreadsheets are saved
    save_path_puposicao_final = full_path_from_database(
        'get_output_quadro419') + 'puposicao_final_deb.xlsx'
    save_path_teste_dif_deb = full_path_from_database(
        'get_output_quadro419') + 'teste_dif_deb.xlsx'
    feriados_sheet = full_path_from_database(
        'feriados_nacionais') + 'feriados_nacionais.csv'

    tol = 0.20

    writer = ExcelWriter(save_path_puposicao_final)

    dt_base_rel = datetime.date(int(dtbase[0]), int(dtbase[1]), int(dtbase[2]))

    # 1 - Reading and table creation

    # MTM calculation information
    logger.info("Connecting to the database")
    connection = db.connect('localhost', user='******', passwd='root',
                            db='projeto_inv', use_unicode=True, charset="utf8")
    logger.info("Database connection established successfully")

    query = "SELECT * FROM projeto_inv.mtm_titprivado WHERE tipo_ativo = 'DBS'"
    #query = "SELECT * FROM projeto_inv.mtm_titprivado"
    mtm_titprivado0 = pd.read_sql(query, con=connection)
    logger.info("Database read completed successfully")

    mtm_titprivado = mtm_titprivado0.copy()

    # Select debentures
    # Select the latest debenture load for the position date
    mtm_titprivado['dtrel'] = mtm_titprivado['id_papel'].str.split('_')
    mtm_titprivado['dtrel'] = mtm_titprivado['dtrel'].str[0]
    mtm_titprivado = mtm_titprivado[mtm_titprivado.dtrel == dtbase_concat].copy()
    mtm_titprivado = mtm_titprivado[mtm_titprivado.data_bd == max(
        mtm_titprivado.data_bd)]

    # Rename columns
    mtm_titprivado = mtm_titprivado.rename(columns={
        'data_fim': 'dt_ref',
        'dtoperacao': 'dtoperacao_mtm'
    })

    # Readjust securities indexed to DI
    mtm_titprivado['dt_ref'] = pd.to_datetime(mtm_titprivado['dt_ref'])
    mtm_titprivado['dt_ref'] = np.where(
        mtm_titprivado['indexador'] == 'DI1',
        mtm_titprivado['dt_ref'] + pd.DateOffset(months=0, days=1),
        mtm_titprivado['dt_ref'])
    mtm_titprivado['dt_ref'] = mtm_titprivado['dt_ref'].dt.date

    # Rewrite id_papel so it takes the flag into account
    mtm_titprivado['id_papel_old'] = mtm_titprivado['id_papel']
    mtm_titprivado['id_papel'] = mtm_titprivado['id_papel_old'].str.split('_')
    mtm_titprivado['id_papel'] = mtm_titprivado['id_papel'].str[0] + '_' + \
        mtm_titprivado['id_papel'].str[1] + '_' + \
        mtm_titprivado['id_papel'].str[2]

    del mtm_titprivado['data_bd']
    del mtm_titprivado['dtrel']

    query = 'SELECT * FROM projeto_inv.xml_debenture_org'
    xml_titprivado = pd.read_sql(query, con=connection)
    logger.info("Database read completed successfully")

    # Select the latest debenture load for the position date
    xml_titprivado['dtrel'] = xml_titprivado['id_papel'].str.split('_')
    xml_titprivado['dtrel'] = xml_titprivado['dtrel'].str[0]
    xml_titprivado = xml_titprivado[xml_titprivado.dtrel == dtbase_concat].copy()
    xml_titprivado = xml_titprivado[xml_titprivado.data_bd == max(
        xml_titprivado.data_bd)]

    original = xml_titprivado.copy()

    del xml_titprivado['data_bd']
    del xml_titprivado['indexador']
    del xml_titprivado['dtrel']

    # Pull the Anbima secondary-market trading information for debentures
    # -> dtspread row
    query = 'SELECT * FROM projeto_inv.anbima_debentures'
    anbima_deb = pd.read_sql(query, con=connection)
    logger.info("Database read completed successfully")

    # Close the connection
    connection.close()

    anbima_deb = anbima_deb[anbima_deb.data_referencia <= dt_base_rel]
    anbima_deb = anbima_deb.sort(['codigo', 'data_referencia', 'data_bd'],
                                 ascending=[True, True, True])
    anbima_deb = anbima_deb.drop_duplicates(subset=['codigo'], take_last=True)
    anbima_deb = anbima_deb[['codigo', 'data_referencia', 'pu']].copy()
    anbima_deb['codigo'] = anbima_deb['codigo'].astype(str)
    anbima_deb = anbima_deb.rename(columns={
        'codigo': 'coddeb',
        'data_referencia': 'data_spread'
    })

    anbima_deb.coddeb.unique().tolist()
    xml_titprivado.coddeb.unique().tolist()

    # Build the xml + anbima table
    xml_titprivado = xml_titprivado.merge(anbima_deb, on=['coddeb'], how='left')

    # For securities that were not traded, assume data_spread = report date
    xml_titprivado['data_spread'] = np.where(
        xml_titprivado['data_spread'].isnull(), dt_base_rel,
        xml_titprivado['data_spread'])

    # When the Anbima pu is missing, fill it from the xml puposicao
    # Take the mean of all position pu's
    x = xml_titprivado[['isin', 'puposicao']].groupby(['isin']).agg(['mean'])
    x = x.reset_index(level=None, drop=False, inplace=False, col_level=0,
                      col_fill='')
    x1 = pd.DataFrame(columns=['isin', 'pumedio'])
    x1['isin'] = x['isin']
    x1['pumedio'] = x['puposicao']
    xml_titprivado = xml_titprivado.merge(x1, on=['isin'], how='left')
    xml_titprivado['pu'] = np.where(xml_titprivado['pu'].isnull(),
                                    xml_titprivado['pumedio'],
                                    xml_titprivado['pu'])
    del xml_titprivado['pumedio']

    # Build the mtm + xml table
    titprivado = xml_titprivado.merge(mtm_titprivado, on=['id_papel'],
                                      how='left')

    # Create the position reference-date column
    titprivado['data_referencia'] = titprivado['id_papel'].str[0:8]
    titprivado['data_referencia'] = pd.to_datetime(
        titprivado['data_referencia']).dt.date

    titprivado.caracteristica.unique()

    logger.info("Calculating the mark-to-curve")

    # 2 - Mark-to-curve calculation  ###### IT ZEROES OUT HERE!! ###### check

    titprivado_curva = titprivado[titprivado.caracteristica == 'V'].copy()
    del titprivado_curva['vne']

    # Select the slice of the cash flow between purchase date and position date
    titprivado_curva = titprivado_curva[
        titprivado_curva.dt_ref >= titprivado_curva.dtoperacao_mtm].copy()
    titprivado_curva = titprivado_curva[
        titprivado_curva.dt_ref <= titprivado_curva.data_referencia].copy()

    # Fill in the VNE on the purchase date
    tp_curva_dtop = titprivado_curva[[
        'id_papel_old', 'saldo_dev_juros_perc', 'pucompra'
    ]][titprivado_curva.dt_ref == titprivado_curva.dtoperacao_mtm].copy()
    tp_curva_dtop['vne'] = tp_curva_dtop['pucompra'] * (
        1 - tp_curva_dtop['saldo_dev_juros_perc'])
    del tp_curva_dtop['saldo_dev_juros_perc']
    del tp_curva_dtop['pucompra']

    titprivado_curva = titprivado_curva.merge(tp_curva_dtop,
                                              on=['id_papel_old'], how='left')
    titprivado_curva['principal_perc_acum'] = 1 - titprivado_curva['principal_perc']
    titprivado_curva['principal_perc_acum'] = titprivado_curva[[
        'id_papel_old', 'principal_perc_acum'
    ]].groupby(['id_papel_old']).agg(['cumprod'])
    titprivado_curva['vne'] = titprivado_curva['vne'] * \
        titprivado_curva['principal_perc_acum']
    titprivado_curva['pagto_juros'] = titprivado_curva['vne'] * \
        titprivado_curva['pagto_juros_perc']
    titprivado_curva['vna'] = titprivado_curva['vne'] * \
        titprivado_curva['fator_index_per']
    titprivado_curva['vna'][titprivado_curva.indexador == 'DI1'] = \
        titprivado_curva['vne'][titprivado_curva.indexador == 'DI1']
    titprivado_curva['saldo_dev_juros'] = titprivado_curva['vna'] * \
        titprivado_curva['saldo_dev_juros_perc']
    titprivado_curva['pupar'] = titprivado_curva['vna'] + \
        titprivado_curva['saldo_dev_juros'] + titprivado_curva['pagto_juros']
    titprivado_curva['dif_curva'] = titprivado_curva['pupar'] / \
        titprivado_curva['puposicao'] - 1
    titprivado_curva['dif_curva'] = titprivado_curva['dif_curva'].abs()
    titprivado_curva = titprivado_curva[
        titprivado_curva.dt_ref == titprivado_curva.data_referencia].copy()
    titprivado_curva = titprivado_curva[[
        'id_papel_old', 'id_papel', 'codigo_isin', 'dif_curva', 'pupar'
    ]].copy()
    titprivado_curva = titprivado_curva.sort(['dif_curva'], ascending=[True])
    titprivado_curva = titprivado_curva.drop_duplicates(subset=['id_papel'],
                                                        take_last=False)

    titprivado = titprivado.merge(
        titprivado_curva, on=['id_papel_old', 'id_papel', 'codigo_isin'],
        how='left')
    titprivado = titprivado[((titprivado.caracteristica == 'V') &
                             (titprivado.dif_curva.notnull())) |
                            (titprivado.caracteristica == 'N')].copy()
    titprivado = titprivado.reset_index(level=None, drop=True, inplace=False,
                                        col_level=0, col_fill='')

    logger.info("MTM at data_spread; a) prazo_du_spread")

    # 3 - MTM at data_spread; a) prazo_du_spread

    # Check for securities missing the reference date
    titprivado[titprivado.dt_ref.isnull()].to_excel(writer, 'dt_ref_NaN')

    # Drop securities without a reference date
    titprivado = titprivado[titprivado.dt_ref.notnull()].copy()

    # Build the date vector
    dt_min = min(titprivado['dt_ref'])
    dt_min = dt_min.replace(day=1)
    dt_min = datetime.date(dt_min.year, dt_min.month, dt_min.day)

    dt_max = max(titprivado['dt_ref'])
    dt_max = dt_max.replace(day=1, month=dt_max.month)
    dt_max = dt_max + pd.DateOffset(months=1)
    dt_max = dt_max - pd.DateOffset(days=1)
    dt_max = datetime.date(dt_max.year, dt_max.month, dt_max.day)

    dt_ref = pd.date_range(start=dt_min, end=dt_max, freq='D').date
    serie_dias = pd.DataFrame(columns=['dt_ref', 'aux'])
    serie_dias['dt_ref'] = dt_ref

    # Build the business-day vector
    per = FinDt.DatasFinanceiras(dt_min, dt_max, path_arquivo=feriados_sheet)
    du = pd.DataFrame(columns=['dt_ref'])
    du['dt_ref'] = per.dias(3)
    du['du_1'] = 1

    serie_dias = serie_dias.merge(du, on=['dt_ref'], how='left')
    serie_dias['du_1'] = serie_dias['du_1'].fillna(0)
    serie_dias['indice_du'] = np.cumsum(serie_dias['du_1'])
    del serie_dias['aux']
    del serie_dias['du_1']

    fim = serie_dias.copy()
    fim = fim.rename(columns={'indice_du': 'indice_du_fim_spread'})
    inicio = serie_dias.copy()
    inicio = inicio.rename(columns={
        'dt_ref': 'data_spread',
        'indice_du': 'indice_du_inicio_spread'
    })

    # Join the day series onto titprivado
    titprivado = titprivado.merge(fim, on=['dt_ref'], how='left')
    titprivado = titprivado.merge(inicio, on=['data_spread'], how='left')

    # Compute prazo_du_spread
    titprivado['prazo_du_spread'] = titprivado['indice_du_fim_spread'] - \
        titprivado['indice_du_inicio_spread']

    logger.info("MTM at data_spread; b) taxa_spot")

    # 4 - MTM at data_spread; b) taxa_spot
    if len(titprivado[titprivado.indexador == 'PRE']) != 0:
        maximo_tp_PRE = max(
            titprivado['prazo_du_spread'][titprivado.indexador == 'PRE'])
    if len(titprivado[titprivado.indexador == 'IGP']) != 0:
        maximo_tp_IGPM = max(
            titprivado['prazo_du_spread'][titprivado.indexador == 'IGP'])
    if len(titprivado[titprivado.indexador == 'IPCA']) != 0:
        maximo_tp_IPCA = max(
            titprivado['prazo_du_spread'][titprivado.indexador == 'IPCA'])

    # ---- Interpolation base for the spread calculation
    dt_min_interpol = str(min(titprivado['data_spread']))

    logger.info("Connecting to the database")
    connection = db.connect('localhost', user='******', passwd='root',
                            db='projeto_inv', use_unicode=True, charset="utf8")
    logger.info("Database connection established successfully")

    # 2020 is the cut-off date for building the tables
    dt_ref = pd.date_range(start=datetime.date(int(dt_min_interpol[0:4]),
                                               int(dt_min_interpol[5:7]), 1),
                           end=datetime.date(int(2020), int(10), 31),
                           freq='M').date

    # Union of the monthly tables via UNION ALL
    query = ''
    for dt in dt_ref:
        month = '0' + str(dt.month) if len(str(dt.month)) == 1 else str(dt.month)
        year = '0' + str(dt.year) if len(str(dt.year)) == 1 else str(dt.year)
        query = query + 'SELECT * FROM projeto_inv.curva_ettj_interpol_' + \
            year + "_" + month + " UNION ALL "
    query = query[:-11]
    query = 'select * from (' + query + ') AS a where dt_ref<=' + '"' + \
        dtbase_concat + '" and dt_ref>=' + '"' + dt_min_interpol + \
        '" and indexador_cod in("PRE","DIM","DIC");'

    ettj = pd.read_sql(query, con=connection)
    logger.info("Database read completed successfully")

    # Close the connection
    connection.close()

    # Select the latest load
    ettj = ettj.sort(['indexador_cod', 'dt_ref', 'data_bd'],
                     ascending=[True, False, False])
    ettj = ettj.drop_duplicates(subset=[
        'prazo', 'tx_spot', 'tx_spot_ano', 'tx_termo_dia', 'indexador_cod'
    ], take_last=False)

    ettj['indexador'] = np.where(
        ettj['indexador_cod'] == 'DIC', 'IPCA',
        np.where(ettj['indexador_cod'] == 'DIM', 'IGP', 'PRE'))
    ettj = ettj.rename(columns={'prazo': 'prazo_du'})

    ettj_filtro = ettj[['prazo_du', 'tx_spot', 'tx_spot_ano', 'indexador']]
    ettj_filtro = ettj_filtro.rename(columns={'prazo_du': 'prazo_du_spread'})

    # PRE extrapolation, if needed
    if len(titprivado[titprivado.indexador == 'PRE']) != 0:
        maximo_ettj = max(
            ettj_filtro['prazo_du_spread'][ettj_filtro.indexador == 'PRE'])
        # del ettj_fluxo
        if maximo_ettj < max(
                titprivado['prazo_du_spread'][titprivado.indexador == 'PRE']):
            ettj_filtro_PRE = ettj_filtro[[
                'prazo_du_spread', 'tx_spot_ano', 'indexador'
            ]][ettj_filtro.indexador == 'PRE'].copy()
            ettj_filtro_PRE = ettj_filtro_PRE.reset_index(
                level=None, drop=True, inplace=False, col_level=0, col_fill='')
            ettj_filtro_PRE = ettj_filtro_PRE[0:maximo_ettj - 1].copy()
            tx_max = ettj_filtro_PRE['tx_spot_ano'].loc[len(ettj_filtro_PRE) - 1]
            ettj_aux = pd.DataFrame(columns=['prazo_du_spread', 'indexador'])
            ettj_aux['prazo_du_spread'] = np.linspace(1, maximo_tp_PRE,
                                                      maximo_tp_PRE)
            ettj_aux['indexador'] = 'PRE'
            ettj_aux = ettj_aux.merge(ettj_filtro_PRE,
                                      on=['prazo_du_spread', 'indexador'],
                                      how='left')
            ettj_aux['tx_spot_ano'] = ettj_aux['tx_spot_ano'].fillna(tx_max)
            ettj_aux['tx_spot'] = (1 + ettj_aux['tx_spot_ano'])**(
                ettj_aux['prazo_du_spread'] / 252) - 1
            ettj_fluxo = ettj_fluxo.append(ettj_aux)
        else:
            ettj_aux = ettj_filtro.copy()
            ettj_fluxo = ettj_aux.copy()
    else:
        ettj_fluxo = ettj_filtro.copy()

    # IGPM extrapolation, if needed
    if len(titprivado[titprivado.indexador == 'IGP']) != 0:
        maximo_ettj = max(
            ettj_filtro['prazo_du_spread'][ettj_filtro.indexador == 'IGP'])
        # del ettj_fluxo
        if maximo_ettj < max(
                titprivado['prazo_du_spread'][titprivado.indexador == 'IGP']):
            ettj_filtro_IGPM = ettj_filtro[[
                'prazo_du_spread', 'tx_spot_ano', 'indexador'
            ]][ettj_filtro.indexador == 'IGP'].copy()
            ettj_filtro_IGPM = ettj_filtro_IGPM.reset_index(
                level=None, drop=True, inplace=False, col_level=0, col_fill='')
            ettj_filtro_IGPM = ettj_filtro_IGPM[0:maximo_ettj - 1].copy()
            tx_max = ettj_filtro_IGPM['tx_spot_ano'].loc[len(ettj_filtro_IGPM) - 1]
            ettj_aux = pd.DataFrame(columns=['prazo_du_spread', 'indexador'])
            ettj_aux['prazo_du_spread'] = np.linspace(1, maximo_tp_IGPM,
                                                      maximo_tp_IGPM)
            ettj_aux['indexador'] = 'IGP'
            ettj_aux = ettj_aux.merge(ettj_filtro_IGPM,
                                      on=['prazo_du_spread', 'indexador'],
                                      how='left')
            ettj_aux['tx_spot_ano'] = ettj_aux['tx_spot_ano'].fillna(tx_max)
            ettj_aux['tx_spot'] = (1 + ettj_aux['tx_spot_ano'])**(
                ettj_aux['prazo_du_spread'] / 252) - 1
            ettj_fluxo = ettj_fluxo.append(ettj_aux)
        else:
            ettj_aux = ettj_filtro.copy()
            ettj_fluxo = ettj_aux.copy()
    else:
        ettj_fluxo = ettj_filtro.copy()

    # IPCA extrapolation, if needed
    if len(titprivado[titprivado.indexador == 'IPCA']) != 0:
        maximo_ettj = max(
            ettj_filtro['prazo_du_spread'][ettj_filtro.indexador == 'IPCA'])
        # del ettj_fluxo
        if maximo_ettj < max(
                titprivado['prazo_du_spread'][titprivado.indexador == 'IPCA']):
            ettj_filtro_IPCA = ettj_filtro[[
                'prazo_du_spread', 'tx_spot_ano', 'indexador'
            ]][ettj_filtro.indexador == 'IPCA'].copy()
            ettj_filtro_IPCA = ettj_filtro_IPCA.reset_index(
                level=None, drop=True, inplace=False, col_level=0, col_fill='')
            ettj_filtro_IPCA = ettj_filtro_IPCA[0:maximo_ettj - 1].copy()
            tx_max = ettj_filtro_IPCA['tx_spot_ano'].loc[len(ettj_filtro_IPCA) - 1]
            ettj_aux = pd.DataFrame(columns=['prazo_du_spread', 'indexador'])
            ettj_aux['prazo_du_spread'] = np.linspace(1, maximo_tp_IPCA,
                                                      maximo_tp_IPCA)
            ettj_aux['indexador'] = 'IPCA'
            ettj_aux = ettj_aux.merge(ettj_filtro_IPCA,
                                      on=['prazo_du_spread', 'indexador'],
                                      how='left')
            ettj_aux['tx_spot_ano'] = ettj_aux['tx_spot_ano'].fillna(tx_max)
            ettj_aux['tx_spot'] = (1 + ettj_aux['tx_spot_ano'])**(
                ettj_aux['prazo_du_spread'] / 252) - 1
            ettj_fluxo = ettj_fluxo.append(ettj_aux)
        else:
            ettj_aux = ettj_filtro.copy()
            ettj_fluxo = ettj_aux.copy()
    else:
        ettj_fluxo = ettj_filtro.copy()

    # Join the ETTJ onto titprivado
    ettj_fluxo = ettj_fluxo.rename(columns={
        'tx_spot': 'tx_spot_spread',
        'tx_spot_ano': 'tx_spot_ano_spread'
    })
    titprivado = titprivado.merge(ettj_fluxo,
                                  on=['prazo_du_spread', 'indexador'],
                                  how='left')

    # Fill with 0 where there is no spot rate
    # (prazo_du_spread < 0, indexador = DI1)
    titprivado['tx_spot_spread'] = titprivado['tx_spot_spread'].fillna(0)
    titprivado['tx_spot_ano_spread'] = titprivado['tx_spot_ano_spread'].fillna(0)

    logger.info("MTM at data_spread; c) present value and mtm")

    # 5 - MTM at data_spread; c) present value and mtm
    titprivado['fator_desconto_spread'] = 1 / (1 + titprivado['tx_spot_spread'])
    titprivado['pv_spread'] = titprivado['fv'] * \
        titprivado['fator_desconto_spread']
    titprivado['pv_spread'] = np.where(titprivado['prazo_du_spread'] < 0, 0,
                                       titprivado['pv_spread'])

    x = titprivado[['id_papel', 'pv_spread']].groupby(['id_papel']).agg(['sum'])
    x = x.reset_index(level=None, drop=False, inplace=False, col_level=0,
                      col_fill='')
    x1 = pd.DataFrame(columns=['id_papel', 'mtm_spread'])
    x1['id_papel'] = x['id_papel']
    x1['mtm_spread'] = x['pv_spread']
    titprivado = titprivado.merge(x1, on=['id_papel'], how='left')

    titprivado['dif'] = titprivado['mtm_spread'] / titprivado['pu'] - 1

    writer = ExcelWriter(save_path_puposicao_final)
    aux = titprivado.drop_duplicates(subset=['codigo_isin'])
    aux[[
        'id_papel_old', 'codigo_isin', 'flag', 'indexador', 'puemissao',
        'data_emissao', 'data_expiracao', 'puposicao', 'percentual_indexador',
        'taxa_juros', 'pu', 'mtm_spread', 'dif'
    ]].to_excel(writer, 'dif')

    logger.info("Calculating the credit spread")

    # 6 - Credit spread calculation
    titprivado_spread = \
        titprivado[['id_papel', 'codigo_isin', 'data_spread', 'pu', 'dt_ref',
                    'prazo_du_spread', 'pv_spread']][
            (titprivado.caracteristica == 'N') & (titprivado.dif != 1)]

    # Keep only cash flows with a positive prazo_du
    titprivado_spread = titprivado_spread[
        titprivado_spread.dt_ref >= titprivado_spread.data_spread]
    titprivado_spread = titprivado_spread.drop_duplicates(
        subset=['id_papel', 'prazo_du_spread'])
    titprivado_spread['pv_pv_fluxo'] = np.where(
        titprivado_spread['dt_ref'] == titprivado_spread['data_spread'],
        -titprivado_spread['pu'], titprivado_spread['pv_spread'])

    tp_spread = titprivado_spread[[
        'id_papel', 'dt_ref', 'prazo_du_spread', 'pv_pv_fluxo'
    ]].copy()
    tp_spread['prazo_du'] = tp_spread['prazo_du_spread'].astype(float)
    tp_spread = tp_spread.drop_duplicates(
        subset=['id_papel', 'prazo_du_spread'], take_last=True)

    id_papel = titprivado_spread['id_papel'].unique()
    spread = np.zeros((len(id_papel)))
    spread_aux = pd.DataFrame(columns=['id_papel', 'spread'])
    spread_aux['id_papel'] = id_papel

    start_time = time.time()
    for i in range(0, len(id_papel)):
        v = tp_spread['pv_pv_fluxo'][tp_spread.id_papel == id_papel[i]].values
        v = np.meshgrid(v, sparse=True)
        s = np.linspace(-0.9999, 0.9999, 10000)
        t = tp_spread['prazo_du_spread'][tp_spread.id_papel == id_papel[i]].values
        t, s = np.meshgrid(t, s, sparse=True)
        f_ts = 1. / (1 + s)**(t / 252)
        f_spread = v * f_ts
        f_sum = f_spread.sum(axis=1, dtype='float')
        min_index = abs(f_sum).argmin()
        spread[i] = s[min_index]
        print(time.time() - start_time, i, id_papel[i], spread[i])
        spread_aux['spread'].iloc[i] = spread[i]

    titprivado = titprivado.merge(spread_aux, on=['id_papel'], how='left')

    aux = titprivado.drop_duplicates(subset=['id_papel'])
    aux[[
        'id_papel_old', 'codigo_isin', 'valor_nominal', 'puposicao',
        'mtm_spread', 'pu', 'spread'
    ]].to_excel(save_path_teste_dif_deb)

    logger.info("Selecting securities whose valuation did not come out well")

    # 7 - Select the securities whose valuation did not come out well
    tp_bigdif = titprivado[[
        'data_spread', 'codigo_isin', 'id_papel', 'flag', 'indexador',
        'dtemissao', 'data_emissao', 'dtvencimento', 'data_expiracao',
        'valor_nominal', 'puemissao', 'juros_cada', 'coupom', 'taxa_juros',
        'percentual_indexador', 'percindex', 'perc_amortizacao', 'dt_ref',
        'vne', 'du_per', 'prazo_du', 'fator_index_per', 'fator_juros_per',
        'pv', 'fv', 'mtm', 'puposicao', 'pu', 'dif', 'spread'
    ]].copy()
    tp_bigdif['dif'] = tp_bigdif['mtm'] / tp_bigdif['pu'] - 1
    tp_bigdif[(tp_bigdif.dif > tol) | (tp_bigdif.dif < -tol) |
              (tp_bigdif.spread > tol) | (tp_bigdif.spread < -tol)].to_excel(
        writer, 'bigdif')

    logger.info("Updating the mtm percentage flow with the spread and "
                "loading the table")

    # 8 - Update the mtm percentage flow with the spread and load the table
    # used to fill quadro 419
    titprivado_perc = titprivado.copy()
    titprivado_perc = titprivado_perc.rename(columns={
        'mtm': 'mtm_old',
        'pv': 'pv_old',
        'pv_DV100': 'pv_DV100_old',
        'fator_desconto': 'fator_desconto_old',
        'fator_desconto_DV100': 'fator_desconto_DV100_old',
        'DV100': 'DV100_old'
    })

    # Choose the best spread - THIS MEANS THE BEST CASH FLOW
    titprivado_perc['spread'] = titprivado_perc['spread'].fillna(0)

    # Keep just one row to avoid problems
    x = titprivado_perc[['id_papel', 'codigo_isin', 'spread']][
        titprivado_perc.dt_ref == titprivado_perc.data_referencia].copy()
    x = x.sort(['codigo_isin', 'spread'], ascending=[True, True])
    x = x.drop_duplicates(subset=['codigo_isin'], take_last=False)
    x['marker'] = 1

    titprivado_perc = titprivado_perc.merge(
        x, on=['codigo_isin', 'id_papel', 'spread'], how='left')
    titprivado_perc = titprivado_perc[titprivado_perc.marker == 1].copy()
    del titprivado_perc['marker']

    titprivado_perc = titprivado_perc.drop_duplicates(
        subset=['codigo_isin', 'dt_ref'], take_last=True)

    # titprivado_perc['puposicao_final'] = np.where(titprivado_perc['caracteristica']=='N',titprivado_perc['pu'],titprivado_perc['mtm_old'])

    titprivado_perc = titprivado_perc[titprivado_perc.prazo_du >= 0]

    aux = titprivado_perc[['id_papel', 'codigo_isin']][
        titprivado_perc.dt_ref == titprivado_perc.data_referencia].copy()
    aux = aux.drop_duplicates(subset=['codigo_isin'], take_last=True)
    aux['marker'] = 1
    titprivado_perc = titprivado_perc.merge(aux,
                                            on=['id_papel', 'codigo_isin'],
                                            how='left')
    titprivado_perc = titprivado_perc[titprivado_perc.marker == 1].copy()
    del titprivado_perc['marker']

    # Recalculate only what is marked to market
    aux = titprivado_perc[titprivado_perc.caracteristica == 'V'].copy()
    aux['mtm_DV100_N'] = 0.0
    aux = aux.rename(columns={'mtm_old': 'mtm'})
    titprivado_perc = titprivado_perc[titprivado_perc.caracteristica == 'N'].copy()

    # Discount factor updated by the spread
    titprivado_perc['fator_desconto'] = titprivado_perc['fator_desconto_old'] / (
        1 + titprivado_perc['spread'])**(titprivado_perc['prazo_du'] / 252)
    titprivado_perc['fator_desconto_DV100'] = \
        titprivado_perc['fator_desconto_DV100_old'] / (
            1 + titprivado_perc['spread'])**(titprivado_perc['prazo_du'] / 252)

    # PV calculation
    titprivado_perc['pv'] = titprivado_perc['fv'] * \
        titprivado_perc['fator_desconto']
    titprivado_perc['pv_DV100'] = titprivado_perc['fv'] * \
        titprivado_perc['fator_desconto_DV100']

    # MTM calculation
    x = titprivado_perc[['codigo_isin', 'pv', 'pv_DV100']].groupby(
        ['codigo_isin']).agg(['sum'])
    x = x.reset_index(level=None, drop=False, inplace=False, col_level=0,
                      col_fill='')
    x1 = pd.DataFrame(columns=['codigo_isin', 'mtm', 'mtm_DV100_N'])
    x1['codigo_isin'] = x['codigo_isin']
    x1['mtm'] = x['pv']
    x1['mtm_DV100_N'] = x['pv_DV100']
    titprivado_perc = titprivado_perc.merge(x1, on=['codigo_isin'], how='left')
    titprivado_perc['mtm_DV100'] = 0.0

    # Choose the best cash flow
    titprivado_perc['dif_new'] = titprivado_perc['mtm'] - titprivado_perc['pu']
    titprivado_perc['dif_new'] = titprivado_perc['dif_new'].abs()
    titprivado_perc['dif_old'] = titprivado_perc['mtm_old'] - titprivado_perc['pu']
    titprivado_perc['dif_old'] = titprivado_perc['dif_old'].abs()
    titprivado_perc['mtm'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['mtm_old'], titprivado_perc['mtm'])
    titprivado_perc['mtm_DV100'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['mtm_DV100'], titprivado_perc['mtm_DV100_N'])
    titprivado_perc['pv'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['pv_old'], titprivado_perc['pv'])
    titprivado_perc['pv_DV100'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['pv_DV100_old'], titprivado_perc['pv_DV100'])
    titprivado_perc['fator_desconto'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['fator_desconto_old'],
        titprivado_perc['fator_desconto'])
    titprivado_perc['fator_desconto_DV100'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['fator_desconto_DV100_old'],
        titprivado_perc['fator_desconto_DV100'])

    titprivado_perc['dif_perc'] = titprivado_perc['dif_new'] / \
        titprivado_perc['pu']
    # titprivado_perc['mtm'] = np.where(titprivado_perc['dif_perc']>0.10,titprivado_perc['pu'],titprivado_perc['mtm'])

    # DV100 calculation
    titprivado_perc = titprivado_perc.append(aux)
    titprivado_perc['DV100'] = titprivado_perc['mtm'] - \
        titprivado_perc['mtm_DV100']

    # perc_mtm calculation
    titprivado_perc['perc_mtm'] = titprivado_perc['pv'] / \
        titprivado_perc['mtm']

    # Duration calculation
    titprivado_perc['duration'] = titprivado_perc['perc_mtm'] * \
        titprivado_perc['prazo_du']
    x = titprivado_perc[['codigo_isin', 'duration']].groupby(
        ['codigo_isin']).agg(['sum'])
    x = x.reset_index(level=None, drop=False, inplace=False, col_level=0,
                      col_fill='')
    x1 = pd.DataFrame(columns=['codigo_isin', 'duration'])
    x1['codigo_isin'] = x['codigo_isin']
    x1['duration'] = x['duration']
    del titprivado_perc['duration']
    titprivado_perc = titprivado_perc.merge(x1, on=['codigo_isin'], how='left')

    titprivado_perc[[
        'codigo_isin', 'pu', 'mtm', 'dt_ref', 'mtm_old', 'dif_new', 'dif_old',
        'duration', 'spread'
    ]].to_excel(writer, 'conclusao_fluxo')
    titprivado_perc[titprivado_perc.dt_ref == titprivado_perc.data_referencia][[
        'codigo_isin', 'pu', 'mtm', 'dt_ref', 'mtm_old', 'dif_new', 'dif_old',
        'duration', 'spread'
    ]].to_excel(writer, 'conclusao_resumido')

    finalizacao = titprivado_perc[['codigo_isin', 'mtm']][
        titprivado_perc.dt_ref == titprivado_perc.data_referencia].copy()

    del titprivado_perc['fator_desconto_DV100_old']
    del titprivado_perc['fator_desconto_old']
    del titprivado_perc['mtm_old']
    del titprivado_perc['mtm_DV100_N']
    del titprivado_perc['pv_DV100_old']
    del titprivado_perc['pv_old']
    del titprivado_perc['DV100_old']
    del titprivado_perc['dif_new']
    del titprivado_perc['dif_old']
    del titprivado_perc['dif_perc']

    writer.save()

    # Cast back to int the columns that are int but were read as float
    # (who knows why...)

    # id_mtm_titprivado
    titprivado_perc['id_mtm_titprivado'] = \
        titprivado_perc['id_mtm_titprivado'].astype(int)

    # id_bmf_numeraca
    aux = titprivado_perc[['id_papel', 'id_bmf_numeraca']][
        titprivado_perc.id_bmf_numeraca.notnull()].copy()
    aux['id_bmf_numeraca'] = aux['id_bmf_numeraca'].astype(int)
    del titprivado_perc['id_bmf_numeraca']
    aux = aux.drop_duplicates()
    titprivado_perc = titprivado_perc.merge(aux, on=['id_papel'], how='left')

    # pagto_amortizacao
    aux = titprivado_perc[['id_papel', 'indexador_dc_du']][
        titprivado_perc.indexador_dc_du.notnull()].copy()
    aux['indexador_dc_du'] = aux['indexador_dc_du'].astype(int)
    del titprivado_perc['indexador_dc_du']
    aux = aux.drop_duplicates()
    titprivado_perc = titprivado_perc.merge(aux, on=['id_papel'])

    # juros_cada
    aux = titprivado_perc[['id_papel', 'juros_cada']][
        titprivado_perc.juros_cada.notnull()].copy()
    aux['juros_cada'] = aux['juros_cada'].astype(int)
    del titprivado_perc['juros_cada']
    aux = aux.drop_duplicates()
    titprivado_perc = titprivado_perc.merge(aux, on=['id_papel'], how='left')

    # indexador_dc_du
    aux = titprivado_perc[['id_papel', 'indexador_dc_du']][
        titprivado_perc.indexador_dc_du.notnull()].copy()
    aux['indexador_dc_du'] = aux['indexador_dc_du'].astype(int)
    del titprivado_perc['indexador_dc_du']
    aux = aux.drop_duplicates()
    titprivado_perc = titprivado_perc.merge(aux, on=['id_papel'])

    # juros_dc_du
    aux = titprivado_perc[['id_papel', 'juros_dc_du']][
        titprivado_perc.juros_dc_du.notnull()].copy()
    aux['juros_dc_du'] = aux['juros_dc_du'].astype(int)
    del titprivado_perc['juros_dc_du']
    aux = aux.drop_duplicates()
    titprivado_perc = titprivado_perc.merge(aux, on=['id_papel'])

    # flag_inclusao
    titprivado_perc['flag_inclusao'] = \
        titprivado_perc['flag_inclusao'].astype(int)

    # du_per
    titprivado_perc['du_per'] = titprivado_perc['du_per'].astype(int)

    # dc_per
    titprivado_perc['dc_per'] = titprivado_perc['dc_per'].astype(int)

    # dt_ref -> data_fim
    titprivado_perc['dt_ref'] = pd.to_datetime(titprivado_perc['dt_ref'])
    titprivado_perc['data_fim'] = np.where(
        titprivado_perc['indexador'] == 'DI1',
        titprivado_perc['dt_ref'] - pd.DateOffset(months=0, days=1),
        titprivado_perc['dt_ref'])

    titprivado_perc['id_papel'] = titprivado_perc['id_papel_old']
    titprivado_perc['data_mtm'] = titprivado_perc['data_referencia']
    titprivado_perc['data_negociacao'] = titprivado_perc['data_referencia']

    # Columns no longer needed - MTM
    titprivado_perc = titprivado_perc.drop([
        'data_referencia', 'id_papel_old', 'indice_du_mtm', 'indice_dc_mtm',
        'ano_dt_ref2', 'mes_dt_ref2', 'dia_dt_ref2', 'vertices_positivo',
        'indice_dc_dt_ref2', 'indice_du_dt_ref2', 'prazo_dc', 'ano_inicio',
        'mes_inicio', 'dia_inicio', 'indice_du_inicio', 'indice_dc_inicio',
        'ano_fim', 'mes_fim', 'dia_fim', 'indice_du_fim', 'indice_dc_fim',
        'dt_ref', 'dtoperacao_mtm', 'dif', 'pu_mercado', 'pu_curva',
        'mtm_mercado', 'mtm_curva', 'pu_regra_xml', 'mtm_regra_xml',
        'data_spread', 'pu', 'dif_curva', 'pupar', 'indice_du_fim_spread',
        'indice_du_inicio_spread', 'prazo_du_spread', 'mtm_spread',
        'pv_spread', 'tx_spot_spread', 'tx_spot_ano_spread',
        'fator_desconto_spread'
    ], axis=1)

    # Columns no longer needed - XML
    titprivado_perc = titprivado_perc.drop([
        'id_xml_debenture', 'isin', 'coddeb', 'dtemissao', 'dtoperacao',
        'dtvencimento', 'cnpjemissor', 'qtdisponivel', 'qtgarantia',
        'pucompra', 'puvencimento', 'puposicao', 'puemissao', 'principal',
        'tributos', 'valorfindisp', 'valorfinemgar', 'coupom', 'percindex',
        'caracteristica', 'percprovcred', 'classeoperacao', 'idinternoativo',
        'nivelrsc', 'header_id', 'cusip', 'depgar', 'debconv',
        'debpartlucro', 'SPE', 'dtretorno', 'puretorno', 'indexadorcomp',
        'perindexcomp', 'txoperacao', 'classecomp'
    ], axis=1)

    # Remove isin duplicates
    titprivado_perc = titprivado_perc.sort(
        ['codigo_isin', 'id_papel', 'data_fim'], ascending=[True, True, True])
    titprivado_perc = titprivado_perc.drop_duplicates(
        subset=['codigo_isin', 'data_fim'])

    # titprivado_perc['data_bd'] = horario_bd
    titprivado_perc['data_bd'] = datetime.datetime.today()
    titprivado_perc = titprivado_perc.where((pd.notnull(titprivado_perc)), None)

    titprivado_perc['flag1'] = titprivado_perc['flag'].str[0:2]
    del titprivado_perc['flag']
    titprivado_perc = titprivado_perc.rename(columns={'flag1': 'flag'})

    logger.info("Connecting to the database")
    connection = db.connect('localhost', user='******', passwd='root',
                            db='projeto_inv', use_unicode=True, charset="utf8")
    logger.info("Database connection established successfully")

    logger.info("Saving database table - mtm_renda_fixa")
    pd.io.sql.to_sql(titprivado_perc, name='mtm_renda_fixa', con=connection,
                     if_exists='append', flavor='mysql', index=0)

    # Close the connection
    connection.close()

    logger.info("Filling the xml table")

    # 9 - Fill the xml table
    del original['id_xml_debenture']
    del original['pu_mercado']
    del original['mtm_mercado']
    del original['pu_curva']
    del original['mtm_curva']
    del original['pu_regra_xml']
    del original['mtm_regra_xml']
    del original['data_referencia']

    del titprivado['mtm']
    titprivado = titprivado.merge(finalizacao, on=['codigo_isin'], how='left')

    titprivado_xml = titprivado[
        titprivado.dt_ref == titprivado.data_referencia].copy()
    titprivado_xml = titprivado_xml.drop_duplicates(subset=['id_papel'],
                                                    take_last=True)
    titprivado_xml = titprivado_xml.rename(columns={'mtm': 'mtm_calculado'})

    anbima_deb = anbima_deb.rename(columns={'pu': 'pu_n'})
    titprivado_xml = titprivado_xml.merge(anbima_deb,
                                          on=['coddeb', 'data_spread'],
                                          how='left')
    titprivado_xml['pu'] = np.where(titprivado_xml['pu_n'].notnull(),
                                    titprivado_xml['pu_n'],
                                    titprivado_xml['mtm_calculado'])
    titprivado_xml['pu_mercado'] = np.where(
        titprivado_xml['caracteristica'] == 'N', titprivado_xml['pu'], 0)
    titprivado_xml['pu_curva'] = np.where(
        titprivado_xml['caracteristica'] == 'V', titprivado_xml['pupar'], 0)
    titprivado_xml = titprivado_xml[[
        'id_papel', 'pu_mercado', 'pu_curva', 'data_referencia'
    ]].copy()

    final = original.merge(titprivado_xml, on=['id_papel'], how='left')
    final['data_referencia'] = dt_base_rel
    final['pu_mercado'] = np.where(
        (final['pu_mercado'].isnull()) | (final['pu_mercado'] == 0),
        final['puposicao'], final['pu_mercado'])
    final['pu_mercado'] = np.where(final['dtretorno'].notnull(),
                                   final['puposicao'], final['pu_mercado'])
    final['mtm_mercado'] = final['pu_mercado'] * (final['qtdisponivel'] +
                                                  final['qtgarantia'])
    final['pu_curva'] = np.where(final['pu_curva'].isnull(),
                                 final['puposicao'], final['pu_curva'])
    final['pu_curva'] = np.where(final['dtretorno'].notnull(),
                                 final['puposicao'], final['pu_curva'])
    final['mtm_curva'] = final['pu_curva'] * (final['qtdisponivel'] +
                                              final['qtgarantia'])
    final['pu_regra_xml'] = np.where(final['caracteristica'] == 'N',
                                     final['pu_mercado'], final['pu_curva'])
    final['mtm_regra_xml'] = np.where(final['caracteristica'] == 'N',
                                      final['mtm_mercado'],
                                      final['mtm_curva'])
    final['data_bd'] = datetime.datetime.today()
    del final['dtrel']

    final['indexador'] = final['indexador'].str.replace('IGPM', 'IGP')
    final['indexador'] = final['indexador'].str.replace('IAP', 'IPC')
    final['indexador'] = final['indexador'].str.replace('SEM-ÍNDICE', 'PRE')
    final['indexador'] = final['indexador'].str.replace('ANB', 'DI1')
    final['indexador'] = final['indexador'].str.replace('ANBID', 'DI1')
    final['indexador'] = final['indexador'].str.replace('CDI', 'DI1')
    final['indexador'] = final['indexador'].str.replace('IPCA', 'IPC')

    logger.info("Connecting to the database")
    connection = db.connect('localhost', user='******', passwd='root',
                            db='projeto_inv', use_unicode=True, charset="utf8")
    logger.info("Database connection established successfully")

    logger.info("Saving database table - xml_debenture")
    pd.io.sql.to_sql(final, name='xml_debenture', con=connection,
                     if_exists='append', flavor='mysql', index=0)

    # Close the connection
    connection.close()
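# --- Aside: the `flavor` argument used above was removed from pandas
# (deprecated in 0.19, dropped in 0.23); for MySQL, to_sql now expects a
# SQLAlchemy connectable. A minimal sketch (the credentials are placeholders
# and `final` is the frame built above):
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://user:password@localhost/projeto_inv')
final.to_sql(name='xml_debenture', con=engine, if_exists='append', index=False)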
nullNum = list(data.isnull().sum())
ratioNull = dict()
parametersList = list(data)
# print(parametersList)
for i in range(len(parametersList)):
    ratioNull[parametersList[i]] = nullNum[i]
# print(ratioNull)

# with open('ratioNull.csv', 'w') as csv_file:
#     writer = csv.writer(csv_file)
#     for key, value in ratioNull.items():
#         writer.writerow([key, value])

tempData = pd.DataFrame({
    'Number of Null': nullNum,
    'Param': parametersList,
})

newData = data.iloc[:, :-4]
del newData['Chla_Picoeuk.']
del newData['NH4']
# newData.fillna(newData.mean(), inplace=True)
print(newData)

# sns.set(style="white")
# ax = sns.barplot(x="Param", y="Number of Null", data=tempData, order=parametersList)
# ax.set_xticklabels(labels=parametersList, rotation=90)
# fig = ax.get_figure()
# fig.savefig("NumberNull.png")
# plt.show()

writer = ExcelWriter('NumberNullWithNull.xlsx')
newData.to_excel(writer, 'Sheet1')
writer.save()
        # (inside the scraping loop; the try paired with this error handler
        # is elided above)
            print('error')
            if count > len(class_card):
                break
            continue
        p += 1
        n += 1

driver.close()

# Convert the lists to pandas DataFrames
df1 = pd.DataFrame(sr_no, columns=['sr_no'])
df2 = pd.DataFrame(description_list, columns=['Description'])
df3 = pd.DataFrame(price_list, columns=['Price'])
df4 = pd.DataFrame(address_list, columns=['Address'])
df5 = pd.DataFrame(category_list, columns=['Category'])
df6 = pd.DataFrame(area_list, columns=['Area'])
df7 = pd.DataFrame(NumBath_list, columns=['Bathrooms'])

# Join all the DataFrames
df12 = df1.join(df2)
df123 = df12.join(df3)
df1234 = df123.join(df4)
df12345 = df1234.join(df5)
df123456 = df12345.join(df6)
df1234567 = df123456.join(df7)

# Write the joined DataFrame to an Excel file
writer = ExcelWriter('PropertyFinder' + '.xlsx')
df1234567.to_excel(writer, 'Sheet1', index=False)
writer.save()
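# --- Aside: one DataFrame constructor replaces the seven single-column frames
# and the chained joins (a minimal sketch, assuming the lists are equal length).
import pandas as pd

df_all = pd.DataFrame({
    'sr_no': sr_no,
    'Description': description_list,
    'Price': price_list,
    'Address': address_list,
    'Category': category_list,
    'Area': area_list,
    'Bathrooms': NumBath_list,
})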
    mainDF.set_value(row, 'P_Skills_Number', skillsReqNo)
    mainDF.set_value(row, 'F_Skill_Score_1', skillScore1)
    mainDF.set_value(row, 'F_Skill_Score_2', skillScore2)

#print(mainDF[0:30])

## Geo Matching
mainDF['F_Country_Match'] = np.nan
row = -1
while True:
    row += 1
    try:
        empNat = mainDF['P_country'][row].lower().replace(' ', '')
    except:
        break
    freNat = mainDF['F_Location_Country'][row].lower().replace(' ', '')
    if freNat == empNat:
        mainDF.set_value(row, 'F_Country_Match', 1)
    else:
        mainDF.set_value(row, 'F_Country_Match', 0)

#print(mainDF[0:100])

## Output to Excel
writer = ExcelWriter(outFileLoc)
mainDF.to_excel(writer, 'data')
writer.save()
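# --- Aside: set_value() was deprecated in pandas 0.21 and removed in 1.0;
# the .at accessor is the scalar replacement (a minimal sketch with
# illustrative data).
import numpy as np
import pandas as pd

df = pd.DataFrame({'P_country': ['France', 'france ']})
df['F_Country_Match'] = np.nan
df.at[0, 'F_Country_Match'] = 1  # same effect as df.set_value(0, 'F_Country_Match', 1)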
height = int(input("Height : "))
gender = input("Gender : ")
d = {
    'name': name,
    'age': age,
    'height': height,
    'weight': weight,
    'gender': gender
}
l.append(d)

df = pd.DataFrame(l)

# Save data to file
writer = ExcelWriter('Student-data.xlsx')
df.to_excel(writer, 'Sheet1', index=False)
writer.save()

# Retrieve data
df = pd.read_excel("Student-data.xlsx")

# Sort based on height
print(df.sort_values(['height']))

# Calculate BMI
df['bmi'] = df['weight'] / df['height']**2
print(df)

# Grouping BMI
        'Cookie': jsessionid,
        'X-XSRF-TOKEN': token
    }
else:
    headers = {
        'Content-Type': "application/json",
        'Cookie': jsessionid
    }

base_url = "https://%s:%s/dataservice" % (vmanage_host, vmanage_port)

# Get security events
# Open the Excel file
filename = 'Security Events %s.xlsx' % time.strftime("%Y-%m-%d")
writer = ExcelWriter(filename)

api_url = "/event"
payload = {
    "query": {
        "condition": "AND",
        "rules": [{
            "value": [start_date + "T00:00:00 UTC",
                      end_date + "T00:00:00 UTC"],
            "field": "entry_time",
            "type": "date",
            "operator":
                # (tail of a data.append({...}) call for the 'Clang' branch;
                # its opening lines are elided above)
                'Line': line,
                'True Positive': positive
            },
            ignore_index=True)
    elif tool == 'gt-csonar':
        data = data.append(
            {
                'Severity': 'N/A',
                'CWE': 'N/A',
                'Clang Alert': 0,
                'CodeSonar Alert': 1,
                'Clang Rule': 'N/A',
                'CodeSonar Rule': rule,
                'Line': line,
                'True Positive': positive
            },
            ignore_index=True)
else:
    prev_entry = indexmap[alertpath][line]
    if tool == 'Clang':
        data.set_value(prev_entry, 'Clang Rule', rule)
        data.set_value(prev_entry, 'Clang Alert', 1)
        data.set_value(prev_entry, 'CWE', cwe)
        data.set_value(prev_entry, 'Severity', severity)
    elif tool == 'gt-csonar':
        data.set_value(prev_entry, 'CodeSonar Rule', rule)
        data.set_value(prev_entry, 'CodeSonar Alert', 1)

writer = ExcelWriter('combined_data_table.xlsx')
data.to_excel(writer, 'sheet 1', index=False)
writer.save()
def create_m5():
    """
    The CREATE_M5 operation builds the fifth data matrix of the analysis for
    each gene of interest, adding to the fourth matrix data about the
    expression of candidate regulatory genes of the genes of interest that
    belong to gene sets other than the model gene's.
    One data matrix per target gene is created and exported locally, in as
    many Excel files as there are genes considered, while the whole set of M5
    matrices is returned as a Python dictionary (dict_model_v5.p), where each
    target gene (the key) is associated with a Pandas dataframe containing
    its M5 data of interest (the value).

    :return: a Python dictionary

    Example::

        import genereg as gr
        m5_dict = gr.DataMatrixes.create_m5()
    """
    # Load input data:

    # Genes of interest
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheetname='Sheet1', header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    # Models_v4 of genes of interest
    dict_model_v4 = pickle.load(
        open('./4_Data_Matrix_Construction/Model4/dict_model_v4.p', 'rb'))

    # Distinct regulatory genes for each gene of interest
    dict_RegulGenes = pickle.load(
        open('./2_Regulatory_Genes/dict_RegulGenes.p', 'rb'))

    # Gene expression values for regulatory genes
    expr_regulatory_df = pd.read_excel(
        './3_TCGA_Data/Gene_Expression/Gene_Expression-RegulatoryGenes.xlsx',
        sheetname='Sheet1', header=0)

    # Create a list containing the Gene Symbols of the genes of interest
    gene_interest_SYMs = []
    for i, r in EntrezConversion_df.iterrows():
        sym = r['GENE_SYMBOL']
        if sym not in gene_interest_SYMs:
            gene_interest_SYMs.append(sym)

    # Get the TCGA aliquots
    aliquots = []
    for i, r in expr_regulatory_df.iterrows():
        if i != 'ENTREZ_GENE_ID':
            aliquots.append(i)

    # Create a dictionary where, for each gene of interest set as key (the
    # model gene), we have a dataframe representing the model (matrix of
    # data) of that gene.
    # This model contains all the information of the fourth model, plus
    # additional columns with the expression of the regulatory genes for
    # each gene of interest belonging to other gene sets with respect to the
    # model gene, while the different TCGA aliquots are the row indexes.
    dict_model_v5 = {}

    # Define the variables we need for the computation
    model_gene_pathways = []  # gene sets the model gene belongs to
    other_pathway_genes = []  # gene symbols of the genes in different gene sets
    other_pathway_genes_RegulGenes_SYM = []  # gene symbols of the regulatory genes of genes in other gene sets
    new_columns = []  # names of the new columns to add to the model

    # Execute the following code for each gene of interest
    for gene in gene_interest_SYMs:

        model_gene_SYM = gene  # Gene Symbol of the current gene

        # Get the gene sets of the model gene
        for i, r in EntrezConversion_df.iterrows():
            sym = r['GENE_SYMBOL']
            if sym == model_gene_SYM:
                p = r['GENE_SET']
                model_gene_pathways.append(p)

        # Get the genes of interest belonging to other gene sets
        for i, r in EntrezConversion_df.iterrows():
            path = r['GENE_SET']
            if (path not in model_gene_pathways) and (path != 'GLUCOSE_METABOLISM'):
                symbol = r['GENE_SYMBOL']
                if symbol not in other_pathway_genes:
                    # consider genes belonging to multiple gene sets only once
                    other_pathway_genes.append(symbol)

        # Get the list of regulatory genes for the genes in the other gene sets
        for elem in other_pathway_genes:
            elem_regulatory_genes = dict_RegulGenes[elem]
            other_pathway_genes_RegulGenes_SYM = \
                other_pathway_genes_RegulGenes_SYM + elem_regulatory_genes
        other_pathway_genes_RegulGenes_SYM = list(
            set(other_pathway_genes_RegulGenes_SYM))  # keep only distinct regulatory genes

        # Get the fourth model for the current gene (model_v4)
        model_4_df = dict_model_v4[model_gene_SYM]

        # Identify the new columns to be added to the matrix:
        # in this case they are the columns corresponding to regulatory genes
        # of genes in other gene sets (be careful not to duplicate columns,
        # so add only the symbols of the genes that are not already contained
        # in the previous model)
        old_columns = list(model_4_df.columns.values)
        for g in other_pathway_genes_RegulGenes_SYM:
            if g not in old_columns:
                new_columns.append(g)

        # Create the new part of the model to add
        new_df = pd.DataFrame(index=aliquots, columns=new_columns)

        # Add the expression values for all the new regulatory genes and for
        # each TCGA aliquot
        for index, row in new_df.iterrows():
            for column_name, values in new_df.iteritems():
                # iterate along the columns of the dataframe
                expr = expr_regulatory_df.get_value(index, column_name)
                new_df.set_value(index, column_name, expr)

        # Join the two dataframes and create the new model (model_v5)
        model_5_df = model_4_df.join(new_df)

        # Store the new model under the correct model-gene key in the new
        # dictionary
        dict_model_v5[model_gene_SYM] = model_5_df

        # Reset the variables for the iteration over the next gene of interest
        model_gene_pathways = []
        other_pathway_genes = []
        other_pathway_genes_RegulGenes_SYM = []
        new_columns = []

    # Remove duplicate columns of the model gene
    for gene in gene_interest_SYMs:
        data_matrix = dict_model_v5[gene]
        matrix_cols = list(data_matrix.columns.values)
        if gene in matrix_cols:
            data_matrix.drop(gene, axis=1, inplace=True)

    # Export the dictionary into a pickle file so it can be imported back and
    # used to progressively build the next models for the genes of interest,
    # adding further information
    pickle.dump(
        dict_model_v5,
        open('./4_Data_Matrix_Construction/Model5/dict_model_v5.p', 'wb'))

    # Export the models as .xlsx files
    for gene in gene_interest_SYMs:

        model_gene_SYM = gene
        pathway = EntrezConversion_df.loc[
            EntrezConversion_df['GENE_SYMBOL'] == model_gene_SYM,
            'GENE_SET'].iloc[0]
        gene_ID = EntrezConversion_df.loc[
            EntrezConversion_df['GENE_SYMBOL'] == model_gene_SYM,
            'ENTREZ_GENE_ID'].iloc[0]
        file_name = 'Gene_' + gene_ID + '_[' + model_gene_SYM + ']' + \
            '_(' + pathway + ')-Model_v5.xlsx'

        writer = ExcelWriter('./4_Data_Matrix_Construction/Model5/' + file_name)
        output_df = dict_model_v5[model_gene_SYM]
        output_df.to_excel(writer, 'Sheet1')
        writer.save()

    # Handle genes belonging to multiple gene sets
    multiple_pathway_genes = []
    n = EntrezConversion_df['GENE_SYMBOL'].value_counts()
    for i, v in n.items():
        if v > 1:
            multiple_pathway_genes.append(i)

    for g in multiple_pathway_genes:
        filtered_df = EntrezConversion_df.loc[
            EntrezConversion_df['GENE_SYMBOL'] == g]
        pathways = (filtered_df.GENE_SET.unique()).tolist()
        gene_ID = EntrezConversion_df.loc[
            EntrezConversion_df['GENE_SYMBOL'] == g, 'ENTREZ_GENE_ID'].iloc[0]

        for p in pathways:
            # Import the 'model_v4' matrix for the current gene
            current_pathway_model = pd.read_excel(
                './4_Data_Matrix_Construction/Model4/Gene_' + gene_ID + '_[' +
                g + ']_(' + p + ')-Model_v4.xlsx',
                sheetname='Sheet1', header=0)

            # Current gene-set model
            current_pathway_other_genes = []
            current_pathway_other_RegulGenes_SYM = []
            current_pathway_new_columns = []

            # Get the genes of interest belonging to other gene sets
            for i, r in EntrezConversion_df.iterrows():
                path = r['GENE_SET']
                if path != p:
                    sym = r['GENE_SYMBOL']
                    if sym != g:
                        current_pathway_other_genes.append(sym)

            # Get the list of regulatory genes for each of the genes
            # belonging to other gene sets
            for elem in current_pathway_other_genes:
                elem_regulatory_genes = dict_RegulGenes[elem]
                current_pathway_other_RegulGenes_SYM = \
                    current_pathway_other_RegulGenes_SYM + elem_regulatory_genes
            current_pathway_other_RegulGenes_SYM = list(
                set(current_pathway_other_RegulGenes_SYM))  # keep only distinct regulatory genes

            # Identify the new columns to be added to the matrix
            current_pathway_old_columns = list(
                current_pathway_model.columns.values)
            for gene in current_pathway_other_RegulGenes_SYM:
                if gene not in current_pathway_old_columns:
                    current_pathway_new_columns.append(gene)

            # Create the new part of the model to add
            current_pathway_new_df = pd.DataFrame(
                index=aliquots, columns=current_pathway_new_columns)

            # Add the expression values for all the new regulatory genes and
            # for each TCGA aliquot
            for index, row in current_pathway_new_df.iterrows():
                for column_name, values in current_pathway_new_df.iteritems():
                    expr = expr_regulatory_df.get_value(index, column_name)
                    current_pathway_new_df.set_value(index, column_name, expr)

            # Join the two dataframes and create the new model (model_v5)
            current_pathway_model_5_df = current_pathway_model.join(
                current_pathway_new_df)

            # Remove duplicate columns of the model gene
            current_pathway_matrix_cols = list(
                current_pathway_model_5_df.columns.values)
            if g in current_pathway_matrix_cols:
                current_pathway_model_5_df.drop(g, axis=1, inplace=True)

            writer = ExcelWriter('./4_Data_Matrix_Construction/Model5/Gene_' +
                                 gene_ID + '_[' + g + ']_(' + p +
                                 ')-Model_v5.xlsx')
            current_pathway_model_5_df.to_excel(writer, 'Sheet1')
            writer.save()

    return dict_model_v5
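# --- Aside: the nested get_value/set_value fill loop in create_m5 amounts to
# an aligned lookup, which reindex performs in one call (a minimal sketch,
# assuming the aliquots and new_columns labels exist in expr_regulatory_df).
new_df = expr_regulatory_df.reindex(index=aliquots, columns=new_columns)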
def extract_into_excel(layer_info_file='layer_info.txt', layer_shape_file='layer_shape.txt', debug=False): layer_info_reader = open(layer_info_file, 'rb') layer_info = pickle.load(layer_info_reader) net_name = layer_info['name'] print(net_name) layer_shape_reader = open(layer_shape_file, 'rb') layer_shape = pickle.load(layer_shape_reader) layer_name_time_reader = open(str(net_name) + '-each_layer_time.txt', 'rb') layer_type_time_reader = open(str(net_name) + '-layer_type_time.txt', 'rb') layer_name_record_dict = pickle.load(layer_name_time_reader) layer_type_record_dict = pickle.load(layer_type_time_reader) flops_membwd_reader = open(str(net_name) + '-flops_membwd.txt', 'rb') flops_membwd_dict = pickle.load(flops_membwd_reader) layer_name_list = [] layer_name_type_list = [] layer_name_time_list = [] layer_type_list = [] layer_type_time_list = [] flops_membwd_type_list = [] flops_membwd_values_list = [] max_bottoms_length = max_tops_length = 0 # multiple input and output shape info layer_shape_list_dict = {} kernel_size_list = [] stride_list = [] pad_list = [] try: for k, v in list(layer_shape.items()): if v['type'] == 'Input' or v['type'] == 'Accuracy': del layer_shape[k] continue max_bottoms_length = max(max_bottoms_length, len(v['bottoms'])) max_tops_length = max(max_tops_length, len(v['tops'])) # determine the input and output tuples length for i in range(0, max_bottoms_length): layer_shape_list_dict['Input' + str(i) + ' N'] = [] layer_shape_list_dict['Input' + str(i) + ' C'] = [] layer_shape_list_dict['Input' + str(i) + ' H'] = [] layer_shape_list_dict['Input' + str(i) + ' W'] = [] for i in range(0, max_tops_length): layer_shape_list_dict['Output' + str(i) + ' N'] = [] layer_shape_list_dict['Output' + str(i) + ' C'] = [] layer_shape_list_dict['Output' + str(i) + ' H'] = [] layer_shape_list_dict['Output' + str(i) + ' W'] = [] for k, v in layer_name_record_dict.items(): layer_name_list.append(str(k)) layer_name_type_list.append(str(layer_info[str(k)])) layer_name_time_list.append(float(v)) # 'kernel_size', 'stride', 'pad' if layer_info[str(k)] == 'Convolution' or layer_info[str( k)] == 'Pooling': kernel_size_list.append(int( layer_shape[str(k)]['kernel_size'])) stride_list.append(int(layer_shape[str(k)]['stride'])) pad_list.append(int(layer_shape[str(k)]['pad'])) else: kernel_size_list.append(float('nan')) stride_list.append(float('nan')) pad_list.append(float('nan')) # Input and output shape for i in range(0, len(layer_shape[str(k)]['bottoms'])): if debug: print('max tops len:', max_tops_length, 'max bottoms len:', max_bottoms_length) print('layer:', str(k), 'type:', layer_shape[str(k)]['type'], 'bottoms:', layer_shape[str(k)]['bottoms'][i]) if len(layer_shape[str(k)]['bottoms'][i]) == 1: layer_shape_list_dict['Input' + str(i) + ' N'].append( int(layer_shape[str(k)]['bottoms'][i][0])) layer_shape_list_dict['Input' + str(i) + ' C'].append( float('nan')) layer_shape_list_dict['Input' + str(i) + ' H'].append( float('nan')) layer_shape_list_dict['Input' + str(i) + ' W'].append( float('nan')) elif len(layer_shape[str(k)]['bottoms'][i]) == 2: layer_shape_list_dict['Input' + str(i) + ' N'].append( int(layer_shape[str(k)]['bottoms'][i][0])) layer_shape_list_dict['Input' + str(i) + ' C'].append( int(layer_shape[str(k)]['bottoms'][i][1])) layer_shape_list_dict['Input' + str(i) + ' H'].append( float('nan')) layer_shape_list_dict['Input' + str(i) + ' W'].append( float('nan')) elif len(layer_shape[str(k)]['bottoms'][i]) == 3: layer_shape_list_dict['Input' + str(i) + ' N'].append( 
int(layer_shape[str(k)]['bottoms'][i][0])) layer_shape_list_dict['Input' + str(i) + ' C'].append( int(layer_shape[str(k)]['bottoms'][i][1])) layer_shape_list_dict['Input' + str(i) + ' H'].append( int(layer_shape[str(k)]['bottoms'][i][2])) layer_shape_list_dict['Input' + str(i) + ' W'].append( float('nan')) elif len(layer_shape[str(k)]['bottoms'][i]) == 4: layer_shape_list_dict['Input' + str(i) + ' N'].append( int(layer_shape[str(k)]['bottoms'][i][0])) layer_shape_list_dict['Input' + str(i) + ' C'].append( int(layer_shape[str(k)]['bottoms'][i][1])) layer_shape_list_dict['Input' + str(i) + ' H'].append( int(layer_shape[str(k)]['bottoms'][i][2])) layer_shape_list_dict['Input' + str(i) + ' W'].append( int(layer_shape[str(k)]['bottoms'][i][3])) for i in range(len(layer_shape[str(k)]['bottoms']), max_bottoms_length): layer_shape_list_dict['Input' + str(i) + ' N'].append( float('nan')) layer_shape_list_dict['Input' + str(i) + ' C'].append( float('nan')) layer_shape_list_dict['Input' + str(i) + ' H'].append( float('nan')) layer_shape_list_dict['Input' + str(i) + ' W'].append( float('nan')) for i in range(0, len(layer_shape[str(k)]['tops'])): if len(layer_shape[str(k)]['tops'][i]) == 1: layer_shape_list_dict['Output' + str(i) + ' N'].append( int(layer_shape[str(k)]['tops'][i][0])) layer_shape_list_dict['Output' + str(i) + ' C'].append( float('nan')) layer_shape_list_dict['Output' + str(i) + ' H'].append( float('nan')) layer_shape_list_dict['Output' + str(i) + ' W'].append( float('nan')) elif len(layer_shape[str(k)]['tops'][i]) == 2: layer_shape_list_dict['Output' + str(i) + ' N'].append( int(layer_shape[str(k)]['tops'][i][0])) layer_shape_list_dict['Output' + str(i) + ' C'].append( int(layer_shape[str(k)]['tops'][i][1])) layer_shape_list_dict['Output' + str(i) + ' H'].append( float('nan')) layer_shape_list_dict['Output' + str(i) + ' W'].append( float('nan')) elif len(layer_shape[str(k)]['tops'][i]) == 3: layer_shape_list_dict['Output' + str(i) + ' N'].append( int(layer_shape[str(k)]['tops'][i][0])) layer_shape_list_dict['Output' + str(i) + ' C'].append( int(layer_shape[str(k)]['tops'][i][1])) layer_shape_list_dict['Output' + str(i) + ' H'].append( int(layer_shape[str(k)]['tops'][i][2])) layer_shape_list_dict['Output' + str(i) + ' W'].append( float('nan')) elif len(layer_shape[str(k)]['tops'][i]) == 4: layer_shape_list_dict['Output' + str(i) + ' N'].append( int(layer_shape[str(k)]['tops'][i][0])) layer_shape_list_dict['Output' + str(i) + ' C'].append( int(layer_shape[str(k)]['tops'][i][1])) layer_shape_list_dict['Output' + str(i) + ' H'].append( int(layer_shape[str(k)]['tops'][i][2])) layer_shape_list_dict['Output' + str(i) + ' W'].append( int(layer_shape[str(k)]['tops'][i][3])) for i in range(len(layer_shape[str(k)]['tops']), max_tops_length): if debug: print('max tops len:', max_tops_length, 'max bottoms len:', max_bottoms_length) print('layer:', str(k), 'type:', layer_shape[str(k)]['type'], 'tops:', layer_shape[str(k)]['tops']) layer_shape_list_dict['Output' + str(i) + ' N'].append( float('nan')) layer_shape_list_dict['Output' + str(i) + ' C'].append( float('nan')) layer_shape_list_dict['Output' + str(i) + ' H'].append( float('nan')) layer_shape_list_dict['Output' + str(i) + ' W'].append( float('nan')) for k, v in layer_type_record_dict.items(): layer_type_list.append(str(k)) layer_type_time_list.append(float(v)) for k, v in flops_membwd_dict.items(): flops_membwd_type_list.append(str(k)) flops_membwd_values_list.append(float(v)) finally: layer_info_reader.close() layer_shape_reader.close() 
layer_name_time_reader.close() layer_type_time_reader.close() flops_membwd_reader.close() assert len(layer_name_list) == len(layer_name_time_list) and \ len(layer_name_time_list) == len(kernel_size_list) and \ len(kernel_size_list) == len(stride_list) and \ len(stride_list) == len(pad_list) and \ len(layer_type_list) == len(layer_type_time_list) and \ len(flops_membwd_type_list) == len(flops_membwd_values_list), \ " Error! Must have same records length!" # calculate flops and memory accessing bytes ops_list = [] mem_bytes_list = [] for layer_name in layer_name_list: flops, mem_bytes = calculate_mlu_ops_byte(layer_name, layer_shape) ops_list.append(flops) mem_bytes_list.append(mem_bytes) gflops_list = [] intensity_list = [] total_model_ops = 0.0 total_model_mem_bytes = 0.0 for i, exe_time in enumerate(layer_name_time_list): gflops_list.append(ops_list[i] / 1e9 / (exe_time / 1e3)) intensity_list.append(float(ops_list[i] / mem_bytes_list[i])) total_model_ops += ops_list[i] total_model_mem_bytes += mem_bytes_list[i] avg_model_intensity = float(total_model_ops / total_model_mem_bytes) total_model_time = 0 for time in layer_type_time_list: total_model_time += time avg_model_gflops = total_model_ops / 1e9 / (total_model_time / 1e3) # for sheet4 columns value_list = [ total_model_ops, total_model_mem_bytes, total_model_time, avg_model_gflops, avg_model_intensity ] name_list = [ 'model ops', 'model bytes', 'model time(ms)', 'model GFLOPS', 'model intensity' ] sheet1_od = collections.OrderedDict() sheet1_od['layer name'] = layer_name_list sheet1_od['layer type'] = layer_name_type_list sheet1_od['time(ms)'] = layer_name_time_list sheet1_od['Ops'] = ops_list sheet1_od['Bytes'] = mem_bytes_list sheet1_od['GFLOPS'] = gflops_list sheet1_od['Intensity'] = intensity_list for i in range(0, max_bottoms_length): sheet1_od['Input' + str(i) + ' N'] = layer_shape_list_dict['Input' + str(i) + ' N'] sheet1_od['Input' + str(i) + ' C'] = layer_shape_list_dict['Input' + str(i) + ' C'] sheet1_od['Input' + str(i) + ' H'] = layer_shape_list_dict['Input' + str(i) + ' H'] sheet1_od['Input' + str(i) + ' W'] = layer_shape_list_dict['Input' + str(i) + ' W'] sheet1_od['kernel size'] = kernel_size_list sheet1_od['stride'] = stride_list sheet1_od['pad'] = pad_list for i in range(0, max_tops_length): sheet1_od['Output' + str(i) + ' N'] = layer_shape_list_dict['Output' + str(i) + ' N'] sheet1_od['Output' + str(i) + ' C'] = layer_shape_list_dict['Output' + str(i) + ' C'] sheet1_od['Output' + str(i) + ' H'] = layer_shape_list_dict['Output' + str(i) + ' H'] sheet1_od['Output' + str(i) + ' W'] = layer_shape_list_dict['Output' + str(i) + ' W'] sheet1_df = pd.DataFrame(sheet1_od) sheet2_od = collections.OrderedDict() sheet2_od['layer type'] = layer_type_list sheet2_od['time(ms)'] = layer_type_time_list sheet2_df = pd.DataFrame(sheet2_od) sheet3_od = collections.OrderedDict() sheet3_od['type'] = flops_membwd_type_list sheet3_od['values'] = flops_membwd_values_list sheet3_df = pd.DataFrame(sheet3_od) sheet4_od = collections.OrderedDict() sheet4_od['name'] = name_list sheet4_od['values'] = value_list sheet4_df = pd.DataFrame(sheet4_od) excel_file_name = str(net_name) + '.xlsx' writer = ExcelWriter(excel_file_name) sheet1_df.to_excel(writer, 'Sheet1', index=False) sheet2_df.to_excel(writer, 'Sheet2', index=False) sheet3_df.to_excel(writer, 'Sheet3', index=False) sheet4_df.to_excel(writer, 'Sheet4', index=False) writer.save()
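# The four to_excel() calls above share a single ExcelWriter so that each dataframe
# lands on its own sheet of one workbook. A minimal, self-contained sketch of the same
# pattern (hypothetical dataframes); on recent pandas the context manager closes the
# writer on exit, making an explicit save() call unnecessary:
import pandas as pd
from pandas import ExcelWriter

sheets = {'Sheet1': pd.DataFrame({'a': [1, 2]}),
          'Sheet2': pd.DataFrame({'b': [3, 4]})}
with ExcelWriter('demo_multi_sheet.xlsx') as demo_writer:
    for name, frame in sheets.items():
        frame.to_excel(demo_writer, sheet_name=name, index=False)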
def create_m1():
    """
    The CREATE_M1 operation builds the first data matrix for each gene of interest, collecting the current gene expression and methylation values,
    along with the expression values of all the genes in the same gene set. One data matrix is created for each target gene and exported locally,
    one Excel file per gene, while the whole set of M1 matrices is returned as a Python dictionary (dict_model_v1.p), where each target gene
    (set as key) is associated with a Pandas dataframe containing its M1 data of interest (set as value).

    :return: a Python dictionary

    Example::

        import genereg as gr
        m1_dict = gr.DataMatrixes.create_m1()
    """

    # Load input data:

    # Genes of interest
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheetname='Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    # Methylation values for genes of interest
    methyl_df = pd.read_excel(
        './3_TCGA_Data/Methylation/Methylation_Values.xlsx',
        sheetname='Sheet1',
        header=0)

    # Gene expression values for genes of interest
    expr_interest_df = pd.read_excel(
        './3_TCGA_Data/Gene_Expression/Gene_Expression-InterestGenes.xlsx',
        sheetname='Sheet1',
        header=0)

    # Create a list containing the Gene Symbols of the genes of interest
    gene_interest_SYMs = []
    for i, r in EntrezConversion_df.iterrows():
        sym = r['GENE_SYMBOL']
        if sym not in gene_interest_SYMs:
            gene_interest_SYMs.append(sym)

    # Get the TCGA aliquots
    aliquots = []
    for i, r in methyl_df.iterrows():
        if i != 'ENTREZ_GENE_ID':
            aliquots.append(i)

    # Create a dictionary where, for each gene of interest set as key (the model gene), we have a dataframe representing
    # the model (matrix of data) of that gene. This model contains the expression and methylation values of the model gene
    # in the first two columns, and the expression of all the genes that belong to the model gene set in the other columns,
    # while the different TCGA aliquots are the indexes of the rows.
    dict_model_v1 = {}

    # Define the variables we need for the computation
    model_gene_pathways = []  # list of the gene sets the model gene belongs to
    same_pathway_genes = []  # list of the symbols of the genes belonging to the same gene sets as the model gene
    df_columns = []  # list of the model column names

    # Execute the following code for each gene of interest
    for gene in gene_interest_SYMs:

        model_gene_SYM = gene  # get the Gene Symbol of the current gene

        # Get the gene sets of the model gene
        for i, r in EntrezConversion_df.iterrows():
            sym = r['GENE_SYMBOL']
            if sym == model_gene_SYM:
                p = r['GENE_SET']
                model_gene_pathways.append(p)

        # Get the genes of interest belonging to the model gene sets
        for i, r in EntrezConversion_df.iterrows():
            path = r['GENE_SET']
            if path in model_gene_pathways:
                symbol = r['GENE_SYMBOL']
                if symbol != model_gene_SYM:
                    same_pathway_genes.append(symbol)

        # Define the columns of the model gene matrix of data
        df_columns.append('EXPRESSION (' + model_gene_SYM + ')')  # the first column contains the expression of the model gene
        df_columns.append('METHYLATION (' + model_gene_SYM + ')')  # the second column contains the methylation of the model gene
        for g in same_pathway_genes:
            df_columns.append(g)  # one column for each gene in the same gene set as the model gene

        # Under the model gene key in the dictionary,
        # set its model as value, with the proper indexes and column names
        dict_model_v1[model_gene_SYM] = pd.DataFrame(index=aliquots, columns=df_columns)

        # Reset the variables for the next iteration on the next gene of interest
        model_gene_pathways = []
        same_pathway_genes = []
        df_columns = []

    # Fill the models for each gene of interest
    for gene, matrix in dict_model_v1.items():

        first_col = 'EXPRESSION (' + gene + ')'
        second_col = 'METHYLATION (' + gene + ')'

        # Add the expression and methylation values of each model gene and for each TCGA aliquot
        for index, row in matrix.iterrows():
            model_expr = expr_interest_df.get_value(index, gene)  # get the expression value
            model_methyl = methyl_df.get_value(index, gene)  # get the methylation value
            # set the two values in the correct cells of the matrix
            matrix.set_value(index, first_col, model_expr)
            matrix.set_value(index, second_col, model_methyl)

        # Add the expression values for all the other genes belonging to the same gene set as the model gene
        for index, row in matrix.iterrows():
            for column_name, values in matrix.iteritems():  # iterate along the columns of the dataframe
                # skip the first two columns and add the proper values
                if (column_name != first_col) and (column_name != second_col):
                    expr = expr_interest_df.get_value(index, column_name)
                    matrix.set_value(index, column_name, expr)

    # Export the dictionary into a pickle file in order to be able to import it back
    # and use it to progressively build the next models for the genes of interest, adding further information.
    pickle.dump(
        dict_model_v1,
        open('./4_Data_Matrix_Construction/Model1/dict_model_v1.p', 'wb'))

    # Export the models as .xlsx files
    for gene in gene_interest_SYMs:

        model_gene_SYM = gene
        pathway = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          model_gene_SYM, 'GENE_SET'].iloc[0]
        gene_ID = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          model_gene_SYM, 'ENTREZ_GENE_ID'].iloc[0]
        file_name = 'Gene_' + gene_ID + '_[' + model_gene_SYM + ']' + '_(' + pathway + ')-Model_v1.xlsx'

        writer = ExcelWriter('./4_Data_Matrix_Construction/Model1/' + file_name)
        output_df = dict_model_v1[model_gene_SYM]
        output_df.to_excel(writer, 'Sheet1')
        writer.save()

    # Handle genes belonging to multiple gene sets
    multiple_pathway_genes = []
    n = EntrezConversion_df['GENE_SYMBOL'].value_counts()
    for i, v in n.items():
        if v > 1:
            multiple_pathway_genes.append(i)

    for g in multiple_pathway_genes:
        filtered_df = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] == g]
        pathways = (filtered_df.GENE_SET.unique()).tolist()
        gene_ID = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          g, 'ENTREZ_GENE_ID'].iloc[0]

        for p in pathways:
            current_pathway_model = dict_model_v1[g].copy()

            # Extract the genes of interest in the current gene set
            current_pathway_genes = []
            for i, r in EntrezConversion_df.iterrows():
                sym = r['GENE_SYMBOL']
                path = r['GENE_SET']
                if path == p:
                    current_pathway_genes.append(sym)

            # Extract the list of columns in the full model
            all_columns = []
            for column_name, values in current_pathway_model.iteritems():
                if (column_name != 'EXPRESSION (' + g + ')') and (
                        column_name != 'METHYLATION (' + g + ')'):
                    all_columns.append(column_name)

            # Extract the columns to remove from the model
            other_pathway_genes = list(set(all_columns) - set(current_pathway_genes))
            for i in other_pathway_genes:
                if (i != g):
                    current_pathway_model.drop(i, axis=1, inplace=True)

            writer = ExcelWriter('./4_Data_Matrix_Construction/Model1/Gene_' +
                                 gene_ID + '_[' + g + ']_(' + p + ')-Model_v1.xlsx')
            current_pathway_model.to_excel(writer, 'Sheet1')
            writer.save()

    return dict_model_v1
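# The models dictionary is pickled so that later pipeline stages can resume from it.
# A minimal, self-contained sketch of that round trip (hypothetical path and contents):
import pickle
import pandas as pd

models_demo = {'GENE_X': pd.DataFrame({'EXPRESSION (GENE_X)': [0.5]}, index=['TCGA-A1'])}
with open('dict_model_demo.p', 'wb') as fh:
    pickle.dump(models_demo, fh)
with open('dict_model_demo.p', 'rb') as fh:
    models_back = pickle.load(fh)
assert models_back['GENE_X'].equals(models_demo['GENE_X'])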
def UpdateExcel(self):
    df1 = self.getArticles()
    writer = ExcelWriter(destFolder + '\\ElConfidencial.xlsx')
    # drop repeated headlines before writing, keeping the first occurrence
    df1 = df1.drop_duplicates(subset='title', keep='first')
    df1.to_excel(writer, sheet_name='ElConfidencial', index=False)
    writer.save()
# Strip the bytes-literal markers from the tweet text and normalize the timestamps.
# Note: mutating the rows yielded by iterrows() does not write back to the dataframe,
# so the cleaning is assigned directly to the columns instead.
tweets["Tweet_Content"] = tweets["Tweet_Content"].str.strip("b'")
tweets["Timestamp"] = pd.to_datetime(tweets["Timestamp"]).dt.date

# reverse the row order of the S&P 500 data
SP500 = SP500.reindex(index=SP500.index[::-1])

# setting the columns of the tweets dataframe
tweets.columns = ["Twitter_ID", "Tweet_ID", "Date", "Tweet_content"]

# changing the type of the SP500 Date column to datetime
SP500 = SP500.astype({"Date": "datetime64"})

# converting the Date column of the tweets dataframe to datetime
tweets = tweets.astype({"Date": "datetime64"})

# Writing the dataframe into Excel
output_filename = 'cleaned_twitter.xlsx'
writer = ExcelWriter(output_filename)
tweets.to_excel(writer)
writer.save()

# ---MERGING---
# dataframe that stores the merged S&P 500 and tweet data,
# joined on the common Date column
tweets_stock = tweets.merge(SP500, how='left', on='Date')
tweets_stock = tweets_stock[pd.notnull(tweets_stock["Open"])]
tweets_stock = tweets_stock.reset_index(drop=True)

# ---CLEANING AND MERGING AIRLINE DATA (with Twitter Data)---

# Southwest_Airlines
# reading the data scraped from the Southwest file
airline_filename = 'Southwest.xlsx'
southwest_airlines = pd.read_excel(airline_filename)
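# The merge above keeps every tweet and attaches market data where the dates line up;
# rows with no matching trading day are then dropped via the null check on "Open".
# A minimal, self-contained sketch of that pattern with hypothetical data:
import pandas as pd

tweets_toy = pd.DataFrame({'Date': pd.to_datetime(['2020-01-03', '2020-01-04']),
                           'Tweet_content': ['up', 'down']})
sp500_toy = pd.DataFrame({'Date': pd.to_datetime(['2020-01-03']),
                          'Open': [3245.0]})
merged = tweets_toy.merge(sp500_toy, how='left', on='Date')
merged = merged[pd.notnull(merged['Open'])].reset_index(drop=True)
print(merged)  # only the 2020-01-03 tweet survives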
#print (counter_b) counter_b +=1 prog_b1 += '#' per_b=(counter_b/16)*100 print('B1_Config Downloading ',str(stat[0]) + '[',prog_b1,'] ',str(round(per_b,0)) + '%', end='\r') if counter_b == 16: print('B1_Config Downloading ',str(stat[1]) + '[',prog_b1,'] ',str(round(per_b,0)) + '%', end='') break print('\n Building 1 Download Complete') print('################## WRITING ##################') with ExcelWriter('test.xlsx') as writer: for x in st_erdc: df_erdc[counter_c]=pd.read_csv(*PATH-CONFIDENTIAL*+str(st_erdc[counter_c]) +'.csv') df_erdc[counter_c].to_excel(writer, sheet_name=str(st_erdc[counter_c])) #print(counter_c) counter_c +=1 if counter_c==14: break print('ERDC Writing Complete') for x in st_b1:
        tagData.append(process_url(url, driver))
    except Exception as e1:
        print(e1)
        continue
    counter += 1
    if (counter % 1 == 0):  # modulo 1 is always 0, so progress is logged after every blog
        print('Done fetching tags for', counter, 'blogs')
driver.close()
return tagData

start = time.time()
tags = get_tags(urls[28000:29000])
done = time.time()
elapsed = done - start
print('Total Time Taken', elapsed, 'seconds')

tags_df = pd.DataFrame(tags)
tags_df.columns = ['url', 'tags']
tags_df  # notebook-style display of the scraped tags

df = df.drop_duplicates(subset='url')
final = pd.merge(df, tags_df, on='url')
final  # notebook-style display of the merged result

writer = ExcelWriter('TC_final29.xlsx')
final.to_excel(writer, 'Sheet1')
writer.save()
ignore_index=True)

# Rename the files in the folder whose names match the file names in the excel data
for i, f in enumerate(os.listdir(FolderPathToCheck)):
    src = os.path.join(FolderPathToCheck, f)
    for idx, row in excel_data.iterrows():
        if f == excel_data.loc[idx, 'FileName']:
            dst = os.path.join(FolderPathToCheck, excel_data.loc[idx, 'FileRenamed'])
            os.rename(src, dst)

# Split the "File_Path_Name" column into separate columns: file name, file path and file extension
for idx, row in filesInFolder_data.iterrows():
    filesInFolder_data.loc[idx, 'File_Path'], filesInFolder_data.loc[
        idx, 'File_Name'] = os.path.split(filesInFolder_data.loc[idx, 'File_Path_Name'])
    filesInFolder_data.loc[idx, 'File_Extention'] = Path(
        filesInFolder_data.loc[idx, 'File_Path_Name']).suffix

# Write the details of the files in the folder to an Excel sheet.
writer = ExcelWriter(FolderPathToSaveResult + "/All_File_Header_Data.xlsx")
filesInFolder_data.to_excel(writer, 'Sheet1', index=False)
writer.save()

# Write the updated excel data to a second Excel sheet.
writer = ExcelWriter(FolderPathToSaveResult + "/excel_data_new.xlsx")
excel_data.to_excel(writer, 'Sheet1', index=False)
writer.save()
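# os.path.split() separates the directory from the file name, while pathlib's .suffix
# extracts the extension, mirroring the column split above. A minimal illustration
# (hypothetical path):
import os
from pathlib import Path

full_path = '/tmp/reports/summary.xlsx'
directory, file_name = os.path.split(full_path)  # '/tmp/reports', 'summary.xlsx'
extension = Path(full_path).suffix               # '.xlsx'
print(directory, file_name, extension)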
        if i == j:
            final[i] = {}
            for idItemA in android_final[i]:
                final[i][idItemA] = android_final[i][idItemA]
            for idItemB in ios_final[j]:
                final[i][idItemB] = ios_final[j][idItemB]
    return final

final = mergDictionaries(android_final, ios_final)

from pandas import ExcelWriter

df = pandas.DataFrame.from_dict(final, orient='index')
df.to_csv('data.csv',
          header=True,
          index=True,
          index_label='datetime',
          encoding='utf-8')
writer = ExcelWriter('data.xlsx')
df.to_excel(writer,
            sheet_name='PokemonGoAnalysisData',
            header=True,
            index=True,
            index_label='datetime')
writer.save()

# dump the merged dictionary into the JSON file (assumes `outfile` was opened earlier)
json.dump(final, outfile)
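# DataFrame.from_dict(..., orient='index') turns the outer dictionary keys into row
# labels and the inner keys into columns, which is why the exports above can label
# the index 'datetime'. A minimal sketch with hypothetical data:
import pandas as pd

merged_toy = {'2016-07-10': {'android_count': 3, 'ios_count': 5}}
df_toy = pd.DataFrame.from_dict(merged_toy, orient='index')
print(df_toy)  # one row per datetime key, one column per inner key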
def ExcelHandler(FileName): ExcelInput = read_excel(FileName, converters={'Date': dateFix}) with ExcelWriter('Fixed_File.xlsx', engine='xlsxwriter') as writer: ExcelInput.to_excel(writer)
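# `dateFix` is referenced above but not defined in this snippet. A plausible sketch of
# such a converter (an assumption, not the original implementation): it normalizes any
# parseable date string to ISO format and passes everything else through unchanged.
from dateutil.parser import parse

def dateFix(value):
    # hypothetical converter for the 'Date' column
    try:
        return parse(str(value)).strftime('%Y-%m-%d')
    except (ValueError, OverflowError):
        return value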
def get_woe_iv(self, bins=5):
    bin_interval = 1.0 / bins
    x = self.x
    y = self.y
    event = self.event
    y_column = y.name
    xy_concat = pd.concat([x, y], axis=1).copy()
    list_q = np.arange(0, 1, bin_interval)
    xy_concat.loc[xy_concat[y_column] == event, y_column] = 1
    xy_concat.loc[xy_concat[y_column] != event, y_column] = 0
    total_count = xy_concat.shape[0]
    total_pos_count = xy_concat[y_column].sum()
    total_neg_count = total_count - total_pos_count
    pos_rt = (total_pos_count + 0.0) / total_count
    columns_bin_dict = self.columns_bin_dict
    woe_t = pd.DataFrame()
    for i in x.columns:
        nunique_i = x[i].nunique()
        type_i = str(x[i].dtype)
        if nunique_i <= 1:
            print(i, ": 0 or 1 level, no woe/iv calculated.")
            columns_bin_dict[i] = []
            continue
        elif nunique_i <= 10:
            # value_counts
            xy_concat[i] = xy_concat[i].fillna("nan")
            df_temp = xy_concat.groupby(i)[y_column].agg([pd.Series.sum, pd.Series.count]).reset_index() \
                .rename(columns={"sum": "pos_count", "count": "cat_total_count", i: "var_cat"})
            columns_bin_dict[i] = []
        else:
            if type_i == "object":
                print(i, ": too many values for discrete variables.")
                continue
            else:
                var_name = i
                var_name_bin = str(i) + "_bin"
                if i not in columns_bin_dict:
                    list_q2 = list(xy_concat[i].dropna().quantile(list_q).unique()) + [np.inf]
                    list_q2[0] = -np.inf
                    if len(list_q2) == 2:
                        list_q2 = list(pd.Series(xy_concat[i].dropna().unique())
                                       .quantile(list_q)) + [np.inf]
                        list_q2[0] = -np.inf
                else:
                    list_q2 = columns_bin_dict[i]
                xy_concat[var_name_bin] = pd.cut(x[var_name], list_q2).astype("string")
                df_temp = xy_concat.groupby(var_name_bin)[y_column].agg([pd.Series.sum, pd.Series.count]).reset_index() \
                    .rename(columns={"sum": "pos_count", "count": "cat_total_count", var_name_bin: "var_cat"})
                columns_bin_dict[i] = list_q2
        df_temp["var_name"] = i
        df_temp["neg_count"] = df_temp["cat_total_count"] - df_temp["pos_count"]
        df_temp["p_ni"] = df_temp["neg_count"] / total_neg_count
        df_temp["p_yi"] = df_temp["pos_count"] / total_pos_count
        df_temp["woe"] = np.log(df_temp["pos_count"] / total_pos_count / df_temp["p_ni"])
        woe_t = woe_t.append(df_temp)
    woe_t["odds"] = woe_t["pos_count"] / woe_t["neg_count"]
    woe_t.loc[woe_t["woe"] >= self.max_woe, "woe"] = self.max_woe
    woe_t.loc[woe_t["woe"] <= self.min_woe, "woe"] = self.min_woe
    woe_t["iv_i"] = (woe_t["p_yi"] - woe_t["p_ni"]) * woe_t["woe"]
    woe_t["p_y_total"] = pos_rt
    woe_show_cols = [
        'var_name', 'var_cat', 'cat_total_count', 'pos_count', 'neg_count',
        'p_ni', "p_y_total", "odds", 'p_yi', 'woe', 'iv_i'
    ]
    woe_t = woe_t[woe_show_cols]
    woe_t["var_cat"] = woe_t["var_cat"].astype("string")
    iv_t = woe_t.groupby("var_name")["iv_i"].sum().reset_index().rename(columns={"iv_i": "iv"})
    filePath = self.workpath + "woe_t.xlsx"
    excel_writer = ExcelWriter(filePath)
    woe_t.to_excel(excel_writer, sheet_name="woe", index=False)
    iv_t.to_excel(excel_writer, sheet_name="iv", index=False)
    excel_writer.save()
    self.woe_t = woe_t
    self.iv_t = iv_t
    print(filePath, "generated;\n <name>.woe_t, <name>.iv_t available")
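# For reference, the quantities computed above follow the standard definitions
# WOE_i = ln(p_yi / p_ni) and IV = sum_i (p_yi - p_ni) * WOE_i, where p_yi and p_ni
# are each bin's share of the positive and negative cases. A minimal worked example
# with hypothetical counts:
import numpy as np

pos = np.array([30.0, 70.0])   # positives per bin (100 total)
neg = np.array([60.0, 40.0])   # negatives per bin (100 total)
p_yi, p_ni = pos / pos.sum(), neg / neg.sum()
woe = np.log(p_yi / p_ni)            # [-0.693, 0.560]
iv = ((p_yi - p_ni) * woe).sum()     # ~0.376
print(woe, iv)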
#!/usr/bin/env python
# -*- coding=utf-8 -*-
__author__ = "Instructor Powen Ko, www.powenko.com"

import pandas as pd

#data = pd.read_csv('ExpensesRecord.csv')
df = pd.read_excel('ExpensesRecord.xls', 'sheet')
#data = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html')
print(df.head(5))

from pandas import ExcelWriter
writer = ExcelWriter('test.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='sheet2')
writer.save()
def save_xls(list_dfs, xls_path):
    # write each dataframe to its own sheet; `classifiers` is assumed to be a
    # global list holding one sheet name per dataframe
    writer = ExcelWriter(xls_path)
    for n, df in enumerate(list_dfs):
        sheet_name = str(classifiers[n])
        df.to_excel(writer, sheet_name)
    writer.save()
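# A minimal usage sketch (hypothetical dataframes and file name; the global
# `classifiers` list must provide one sheet name per dataframe, matching the
# enumeration inside save_xls):
import pandas as pd
from pandas import ExcelWriter

classifiers = ['knn', 'svm']
frames = [pd.DataFrame({'acc': [0.91]}), pd.DataFrame({'acc': [0.88]})]
save_xls(frames, 'classifier_results.xlsx')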