Code Example #1
from pandas import DataFrame, ExcelWriter

def put_to_excel(result):
    data = DataFrame(result)
    # `excel` is the output file path, defined elsewhere in the original module
    writer = ExcelWriter(excel, engine='xlsxwriter')
    data.to_excel(writer, sheet_name='Sheet1')
    writer.save()
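Note: writer.save() was deprecated in pandas 1.5 and removed in pandas 2.0. A minimal sketch of the same helper using the context-manager form, assuming excel_path is the desired output file (the parameter name is hypothetical):

import pandas as pd

def put_to_excel(result, excel_path='output.xlsx'):
    # the context manager saves and closes the workbook on exit
    with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:
        pd.DataFrame(result).to_excel(writer, sheet_name='Sheet1')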
Code Example #2
    for idx in range(0, len(debet)):
        if idx == 0: # idx is always 0 here
            awal = debet[idx] - kredit[idx]
            sel.append(awal)
        else: # idx is always greater than 0 here
            idx_before = idx - 1
            lanjutan =  debet[idx] - kredit[idx] + sel[idx_before]
            sel.append(lanjutan)
    return sel


# read data from data.csv and convert it
# into a pandas DataFrame
df = pd.read_csv('pertemuan6/data.csv')
print(df.head())

# take the array values of the debet and kredit
# columns and add a selisih (difference) column
# to the DataFrame

debet = df.debet.values
kredit = df.kredit.values
df['selisih'] = balance(debet,kredit)


# write the modified DataFrame to an Excel
# file named data-olahan.xlsx

writer = ExcelWriter('pertemuan6/data-olahan.xlsx')
df.to_excel(writer,'Sheet1',index=False)
writer.save()
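Note: the balance() loop above is a running total of debet minus kredit per row. A vectorized sketch of the same column, assuming df already holds the debet and kredit columns read above:

# each row adds (debet - kredit) to the previous row's balance
df['selisih'] = (df['debet'] - df['kredit']).cumsum()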
Code Example #3
from pandas import ExcelWriter

def write_xlsx(df, name_file):
    writer = ExcelWriter(f'{name_file}.xlsx')
    df.to_excel(writer, 'Sheet1')
    writer.save()
    return 'ФАЙЛ СОХРАНЕН'  # Russian: "FILE SAVED"
Code Example #4
        condition_7 = False
    # Condition 8: IBD RS rating >70 and the higher the better
    if (RS_Rating > 70):
        condition_8 = True
    else:
        condition_8 = False

    if (
            condition_1 and condition_2 and condition_3 and condition_4 and condition_5 and condition_6 and condition_7 and condition_8):
        final.append(stock)
        index.append(n)

        dataframe = pd.DataFrame(list(zip(final, index)), columns=['Company', 'Index'])

        dataframe.to_csv('stocks.csv')

        exportList = exportList.append(
            {'Stock': stock, "RS_Rating": RS_Rating, "50 Day MA": moving_average_50, "150 Day Ma": moving_average_150,
             "200 Day MA": moving_average_200, "52 Week Low": low_of_52week, "52 week High": high_of_52week},
            ignore_index=True)
        print(stock + " made the requirements")
except Exception as e:
    print(e)
    print("No data on " + stock)

print(exportList)

writer = ExcelWriter("ScreenOutput.xlsx")
exportList.to_excel(writer, "Sheet1")
writer.save()
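Note: DataFrame.append, used for exportList above, was deprecated in pandas 1.4 and removed in 2.0. A hedged equivalent with pd.concat, assuming exportList is the existing results DataFrame and reusing the fields built above:

row = {'Stock': stock, "RS_Rating": RS_Rating, "50 Day MA": moving_average_50,
       "150 Day Ma": moving_average_150, "200 Day MA": moving_average_200,
       "52 Week Low": low_of_52week, "52 week High": high_of_52week}
# wrap the dict in a one-row DataFrame and concatenate it onto the existing table
exportList = pd.concat([exportList, pd.DataFrame([row])], ignore_index=True)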
Code Example #5
File: mm_stock_screener.py Project: Gwill/Finance
            for value in false:
                print(value)

            otherList = otherList.append(
                {
                    'Stock': stock,
                    "RS_Rating": RS_Rating,
                    "50 Day MA": moving_average_50,
                    "150 Day Ma": moving_average_150,
                    "200 Day MA": moving_average_200,
                    "52 Week Low": low_of_52week,
                    "52 week High": high_of_52week
                },
                ignore_index=True)
    except Exception as e:
        print(e)
        print("No data on " + stock)

print(exportList)

writer = ExcelWriter(
    '/Users/shashank/Documents/Code/Python/Outputs/mm-screener-output/Export-Output_{}.xlsx'
    .format(today))
exportList.to_excel(writer, "Sheet1")
writer.save()

writer = ExcelWriter(
    '/Users/shashank/Documents/Code/Python/Outputs/mm-screener-output/Other-Output_{}.xlsx'
    .format(today))
otherList.to_excel(writer, "Sheet1")
writer.save()
Code Example #6
File: import.py Project: lindenzed/lcms
import pandas as pd
import xlrd
import openpyxl
from pandas import ExcelWriter

writer = ExcelWriter(r'/home/zach/Documents/Excel/export.xlsx')
LOD_List = 0.011, 0.0107, 0.0067, 0.0073, .0048, 0.0048, 0.0235, 0.0114, 0.0035, 0.0092, 0.0117, 0.0012, 0.0062, 0.0082, 0.0020, 0.0052, 0.0012, 0.0070, 0.0073, 0.0094
Compound_List = [
    "PFMOAA", "R-EVE", "Byproduct 5", "Byproduct 4", "PMPA", "PFO2HxA", "PEPA",
    "NVHOS", "PFECA_B", "PFO3OA", "HFPO-DA", "PES", "PFECA_G", "PFO4DA",
    "Hydro-EVE Acid", "EVE-Acid", "Byproduct 6", "PFO5DA", "Byproduct 2",
    "Byproduct 1"
]
data = pd.read_excel(r'/home/zach/Documents/Excel/LOD_correction.xlsx',
                     'Sheet4')
Compound_Counter = 0
Append = pd.DataFrame({'File Name': [], 'Compound': [], 'Concentration': []})


def parse_compound(T3Compound, T3Compound_LOD):
    T3Compound_parse = data.loc[data['Compound'] == T3Compound]
    T3Compound_report = T3Compound_parse.loc[
        T3Compound_parse['Concentration'] > T3Compound_LOD,
        ['File Name', 'Compound', 'Concentration']]
    return T3Compound_report


for Compounds in Compound_List:
    Append = Append.append(
        parse_compound(Compound_List[Compound_Counter],
                       LOD_List[Compound_Counter]))
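Note: the loop above indexes with Compound_Counter, which is presumably incremented further down in the original file. A sketch of the same accumulation that pairs each compound with its limit of detection directly and avoids the removed DataFrame.append:

# zip keeps compounds and LODs aligned without a manual counter
frames = [parse_compound(compound, lod)
          for compound, lod in zip(Compound_List, LOD_List)]
Append = pd.concat(frames, ignore_index=True)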
Code Example #7
File: cmu.py Project: SaraBlalockNg/misc
where filename is something like "mysheet.xlsx" (no quotes)
It will return a file called transcribed.xlsx"""

try:
    cmu = nltk.corpus.cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    cmu = nltk.corpus.cmudict.dict()

words = np.array(pd.read_excel(sys.argv[1]).fillna(''))

bad = '-undefined-'


def transcribe(word):
    try:
        return (' '.join(cmu[word.lower()][0]))
    except KeyError:
        if word == '':
            return ('')
        else:
            return (bad)


f = np.vectorize(transcribe, otypes=['object'])

transcribed = pd.DataFrame(f(words))

writer = ExcelWriter('transcribed.xlsx')
transcribed.to_excel(writer, 'Sheet1', index=False)
writer.save()
Code Example #8
File: mortalGL.py Project: zefuirusu/austudy
 def single_acct_analysis(self,accid,label=None,save=False,savepath=''):
     '''
     Analyze a single account.
     Parameters:
         accid: account id number; it is converted into a regular expression by this method, although a regular expression is also accepted.
         label: defaults to self.accid_col if None is passed.
     '''
     accid_item=self.__trans_accid_regex(accid)
     acct_data=self.getAcct(accid_item,accid_label=label,over_write=False,pure=False,side='all')
     # acct_data=acct_data.set_index('glid',inplace=True)
     acct_sum=acct_data[self.drcrdesc].sum(axis=0)
     print('---start analyzing %s---'%str(accid))
     print(get_time_str())
     print('---Account Data---')
     print('account data shape:',acct_data.shape)
     print('account sum:',acct_sum)
     print('theAcct %s:\n'%str(accid),acct_data)
     if acct_sum[0] != 0:
         dr_acct_data=self.getAcct(accid_item,accid_label=label,over_write=False,pure=False,side='dr')
         # dr_acct_data=dr_acct_data.set_index('glid',inplace=True)
         print('---Debit Data---')
         print('debit data shape:',dr_acct_data.shape)
         print('debit_side %s:\n'%str(accid),dr_acct_data)
         # print(dr_acct_data)
     else:
         dr_acct_data=None
         pass
     if acct_sum[1] != 0:
         cr_acct_data=self.getAcct(accid_item,accid_label=label,over_write=False,pure=False,side='cr')
         # cr_acct_data=cr_acct_data.set_index('glid',inplace=True)
         print('---Credit Data---')
         print('credit data shape:',cr_acct_data.shape)
         print('credit_side %s:\n'%str(accid),cr_acct_data)
         # print(cr_acct_data)
     else:
         cr_acct_data=None
         pass
     if save==True:
         import os
         # from autk import get_time_str
         from openpyxl import Workbook,load_workbook
         from pandas import ExcelWriter
         if savepath=='':
             savename=''.join(['theAcct',str(accid),'-',get_time_str(),'.xlsx'])
             savepath=os.path.join(os.path.abspath(os.curdir),savename)
             wb=Workbook()
             wb.save(savepath)
             print('new workbook created at current directory.')
         elif os.path.isdir(savepath):
             savename=''.join(['theAcct',str(accid),'-',get_time_str(),'.xlsx'])
             savepath=os.path.join(os.path.abspath(savepath),savename)
             wb=Workbook()
             wb.save(savepath)
             print('new workbook created at %s'%savepath)
         elif os.path.isfile(savepath):
             wb=load_workbook(savepath)
             print('workbook loaded at %s'%savepath)
         else:
             print('unexpected savepath; file does not exist?')
             wb=Workbook()
             wb.save(savepath)
         wter=ExcelWriter(savepath,engine='openpyxl')
         wter.book=wb
         acct_data.to_excel(wter,sheet_name=''.join(['acct_',str(accid)]))
         if dr_acct_data is not None:
             dr_acct_data.to_excel(wter,sheet_name=''.join(['dr_',str(accid)]))
             wter.save()
         else:
             pass
         if cr_acct_data is not None:
             cr_acct_data.to_excel(wter,sheet_name=''.join(['cr_',str(accid)]))
             wter.save()
         else:
             pass
         wter.save()
         wb.close()
         print('%s data saved.'%str(accid))
     else:
         print('analysis result not saved.')
     print('---end %s analysis---'%str(accid))
     print(get_time_str())
     return [acct_data,dr_acct_data,cr_acct_data]
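Note: assigning wter.book = wb only works on older pandas; newer versions make the book attribute of ExcelWriter read-only. A sketch of the save step using append mode instead, assuming savepath already points at an existing workbook:

from pandas import ExcelWriter

# mode='a' reuses the existing workbook instead of patching writer.book
with ExcelWriter(savepath, engine='openpyxl', mode='a',
                 if_sheet_exists='replace') as wter:
    acct_data.to_excel(wter, sheet_name='acct_' + str(accid))
    if dr_acct_data is not None:
        dr_acct_data.to_excel(wter, sheet_name='dr_' + str(accid))
    if cr_acct_data is not None:
        cr_acct_data.to_excel(wter, sheet_name='cr_' + str(accid))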
Code Example #9
    part2_sample2 = f2.sample(frac=0.0333, random_state=3)
    part2_sample3 = f3.sample(frac=0.0333, random_state=4)
    part2_sample4 = f4.sample(frac=0.0333, random_state=5)
    # concat every part of the sample, with the randomly generated sample from the group, that it was not a part of.
    sample1 = pd.concat([part1_sample1, part2_sample1])
    sample2 = pd.concat([part1_sample2, part2_sample2])
    sample3 = pd.concat([part1_sample3, part2_sample3])
    sample4 = pd.concat([part1_sample4, part2_sample4])
    return sample1, sample2, sample3, sample4

#%%
samples = random_sample(4000)

#%%
from pandas import ExcelWriter
# save the samples as Excel files
writer1 = ExcelWriter('Niels_1.xlsx')
samples[0].to_excel(writer1,'Sheet1')
writer1.save()

writer2 = ExcelWriter('Simon_2.xlsx')
samples[1].to_excel(writer2,'Sheet1')
writer2.save()

writer3 = ExcelWriter('Elias_3.xlsx')
samples[2].to_excel(writer3,'Sheet1')
writer3.save()

writer4 = ExcelWriter('Julius_4.xlsx')
samples[3].to_excel(writer4,'Sheet1')
writer4.save()
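Note: the four writer blocks above repeat the same pattern; a compact sketch that loops over the sample/filename pairs from the snippet:

names = ['Niels_1.xlsx', 'Simon_2.xlsx', 'Elias_3.xlsx', 'Julius_4.xlsx']
for sample, name in zip(samples, names):
    # one workbook per sample, each with a single sheet
    with ExcelWriter(name) as writer:
        sample.to_excel(writer, 'Sheet1')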
Code Example #10
        except:
            Popup('Your path, sheet, or indicator is incorrect')
        toExport = list()
        sameResult = Same(dataINframes, dataINlist)
        diffResult = Different(dataINframes, sameResult)
        for df in sameResult:
            df.reset_index(drop=True, inplace=True)
        for df in diffResult:
            df.reset_index(drop=True, inplace=True)
        toExport.append(sameResult)
        toExport.append(diffResult)
        window.Close()
        window = Window('CompyrisonTool').Layout(layout3)

    if (event == 'export'):
        writer = ExcelWriter(values['exportPath'] + "/OUTPUT.xlsx",
                             engine='xlsxwriter')
        for same in range(0, len(toExport[0])):
            header = fileList[same][0].rsplit(
                '/', 1)[-1][:-5] + fileList[same][1] + fileList[same][2]
            if len(header) > 27:
                header = header[(len(header) - 27):]
            sameHeader = header + 'SAME'
            toExport[0][same].to_excel(writer, sheet_name=sameHeader)
        for diff in range(0, len(toExport[1])):
            header = fileList[diff][0].rsplit(
                '/', 1)[-1][:-5] + fileList[diff][1] + fileList[diff][2]
            if len(header) > 27:
                header = header[(len(header) - 27):]
            diffHeader = header + 'DIFF'
            toExport[1][diff].to_excel(writer, sheet_name=diffHeader)
        try:
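Note: the 27-character truncation above exists because Excel limits sheet names to 31 characters and the code appends a 4-character 'SAME'/'DIFF' suffix. A small hypothetical helper that makes that intent explicit:

def sheet_name(base, suffix, limit=31):
    # keep the tail of the base name so that base + suffix fits Excel's cap
    return base[-(limit - len(suffix)):] + suffix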
Code Example #11
def mtm_curva_debenture():

    import datetime, time
    import pandas as pd
    import pymysql as db
    import numpy as np
    import logging
    from findt import FinDt
    from pandas import ExcelWriter
    from dependencias.Metodos.funcoes_auxiliares import get_data_ultimo_dia_util_mes_anterior
    from dependencias.Metodos.funcoes_auxiliares import full_path_from_database

    logger = logging.getLogger(__name__)

    # Get the date of the last business day of the previous month, in the specific format required by the function
    dtbase = get_data_ultimo_dia_util_mes_anterior()
    dtbase_concat = dtbase[0] + dtbase[1] + dtbase[2]

    # Directory where the spreadsheets are saved
    save_path_puposicao_final = full_path_from_database(
        'get_output_quadro419') + 'puposicao_final_deb.xlsx'
    save_path_teste_dif_deb = full_path_from_database(
        'get_output_quadro419') + 'teste_dif_deb.xlsx'
    feriados_sheet = full_path_from_database(
        'feriados_nacionais') + 'feriados_nacionais.csv'

    tol = 0.20

    writer = ExcelWriter(save_path_puposicao_final)

    dt_base_rel = datetime.date(int(dtbase[0]), int(dtbase[1]), int(dtbase[2]))

    # 1 - Read and create tables
    # MTM calculation information
    logger.info("Conectando no Banco de dados")
    connection = db.connect('localhost',
                            user='******',
                            passwd='root',
                            db='projeto_inv',
                            use_unicode=True,
                            charset="utf8")
    logger.info("Conexão com DB executada com sucesso")

    query = "SELECT * FROM projeto_inv.mtm_titprivado WHERE tipo_ativo = 'DBS'"
    #query = "SELECT * FROM projeto_inv.mtm_titprivado"
    mtm_titprivado0 = pd.read_sql(query, con=connection)
    logger.info("Leitura do banco de dados executada com sucesso")

    mtm_titprivado = mtm_titprivado0.copy()

    # Select debentures

    # Select the latest debenture load for the position date
    mtm_titprivado['dtrel'] = mtm_titprivado['id_papel'].str.split('_')
    mtm_titprivado['dtrel'] = mtm_titprivado['dtrel'].str[0]

    mtm_titprivado = mtm_titprivado[mtm_titprivado.dtrel ==
                                    dtbase_concat].copy()
    mtm_titprivado = mtm_titprivado[mtm_titprivado.data_bd == max(
        mtm_titprivado.data_bd)]

    # Rename columns
    mtm_titprivado = mtm_titprivado.rename(columns={
        'data_fim': 'dt_ref',
        'dtoperacao': 'dtoperacao_mtm'
    })

    # Readjust securities indexed to DI
    mtm_titprivado['dt_ref'] = pd.to_datetime(mtm_titprivado['dt_ref'])
    mtm_titprivado['dt_ref'] = np.where(
        mtm_titprivado['indexador'] == 'DI1',
        mtm_titprivado['dt_ref'] + pd.DateOffset(months=0, days=1),
        mtm_titprivado['dt_ref'])
    mtm_titprivado['dt_ref'] = mtm_titprivado['dt_ref'].dt.date

    # Change the id_papel name to take the flag into account
    mtm_titprivado['id_papel_old'] = mtm_titprivado['id_papel']
    mtm_titprivado['id_papel'] = mtm_titprivado['id_papel_old'].str.split('_')
    mtm_titprivado['id_papel'] = mtm_titprivado['id_papel'].str[0] + '_' + mtm_titprivado['id_papel'].str[1] + '_' + \
                                 mtm_titprivado['id_papel'].str[2]

    del mtm_titprivado['data_bd']
    del mtm_titprivado['dtrel']

    query = 'SELECT * FROM projeto_inv.xml_debenture_org'
    xml_titprivado = pd.read_sql(query, con=connection)
    logger.info("Leitura do banco de dados executada com sucesso")

    # Select the latest debenture load for the position date
    xml_titprivado['dtrel'] = xml_titprivado['id_papel'].str.split('_')
    xml_titprivado['dtrel'] = xml_titprivado['dtrel'].str[0]

    xml_titprivado = xml_titprivado[xml_titprivado.dtrel ==
                                    dtbase_concat].copy()
    xml_titprivado = xml_titprivado[xml_titprivado.data_bd == max(
        xml_titprivado.data_bd)]

    original = xml_titprivado.copy()

    del xml_titprivado['data_bd']
    del xml_titprivado['indexador']
    del xml_titprivado['dtrel']

    # Pull Anbima secondary-market trading data for debentures -> dtspread row
    query = 'SELECT * FROM projeto_inv.anbima_debentures'
    anbima_deb = pd.read_sql(query, con=connection)
    logger.info("Leitura do banco de dados executada com sucesso")

    #Close connection
    connection.close()

    anbima_deb = anbima_deb[anbima_deb.data_referencia <= dt_base_rel]
    anbima_deb = anbima_deb.sort(['codigo', 'data_referencia', 'data_bd'],
                                 ascending=[True, True, True])
    anbima_deb = anbima_deb.drop_duplicates(subset=['codigo'], take_last=True)
    anbima_deb = anbima_deb[['codigo', 'data_referencia', 'pu']].copy()
    anbima_deb['codigo'] = anbima_deb['codigo'].astype(str)
    anbima_deb = anbima_deb.rename(columns={
        'codigo': 'coddeb',
        'data_referencia': 'data_spread'
    })
    anbima_deb.coddeb.unique().tolist()
    xml_titprivado.coddeb.unique().tolist()

    # Create the xml + anbima table
    xml_titprivado = xml_titprivado.merge(anbima_deb,
                                          on=['coddeb'],
                                          how='left')

    # For securities with no trades, assume data_spread = data_relatorio
    xml_titprivado['data_spread'] = np.where(
        xml_titprivado['data_spread'].isnull(), dt_base_rel,
        xml_titprivado['data_spread'])

    # Fill the Anbima pu with the xml puposicao when there is no pu
    # Take the average of all position pu values
    x = xml_titprivado[['isin', 'puposicao']].groupby(['isin']).agg(['mean'])
    x = x.reset_index(level=None,
                      drop=False,
                      inplace=False,
                      col_level=0,
                      col_fill='')
    x1 = pd.DataFrame(columns=['isin', 'pumedio'])
    x1['isin'] = x['isin']
    x1['pumedio'] = x['puposicao']
    xml_titprivado = xml_titprivado.merge(x1, on=['isin'], how='left')

    xml_titprivado['pu'] = np.where(xml_titprivado['pu'].isnull(),
                                    xml_titprivado['pumedio'],
                                    xml_titprivado['pu'])

    del xml_titprivado['pumedio']

    # Create the mtm + xml table
    titprivado = xml_titprivado.merge(mtm_titprivado,
                                      on=['id_papel'],
                                      how='left')

    # Create the position reference-date column
    titprivado['data_referencia'] = titprivado['id_papel'].str[0:8]
    titprivado['data_referencia'] = pd.to_datetime(
        titprivado['data_referencia']).dt.date

    titprivado.caracteristica.unique()

    logger.info("Cálculo marcação na curva")
    # 2 - Mark-to-curve calculation

    ###### IT ZEROES OUT HERE!! ################# Check this
    titprivado_curva = titprivado[titprivado.caracteristica == 'V'].copy()

    del titprivado_curva['vne']

    # Select the part of the cash flow between the purchase date and the position date
    titprivado_curva = titprivado_curva[
        titprivado_curva.dt_ref >= titprivado_curva.dtoperacao_mtm].copy()
    titprivado_curva = titprivado_curva[
        titprivado_curva.dt_ref <= titprivado_curva.data_referencia].copy()

    # Fill in the VNE on the purchase date
    tp_curva_dtop = titprivado_curva[[
        'id_papel_old', 'saldo_dev_juros_perc', 'pucompra'
    ]][titprivado_curva.dt_ref == titprivado_curva.dtoperacao_mtm].copy()

    tp_curva_dtop['vne'] = tp_curva_dtop['pucompra'] * (
        1 - tp_curva_dtop['saldo_dev_juros_perc'])

    del tp_curva_dtop['saldo_dev_juros_perc']
    del tp_curva_dtop['pucompra']

    titprivado_curva = titprivado_curva.merge(tp_curva_dtop,
                                              on=['id_papel_old'],
                                              how='left')

    titprivado_curva[
        'principal_perc_acum'] = 1 - titprivado_curva['principal_perc']
    titprivado_curva['principal_perc_acum'] = titprivado_curva[[
        'id_papel_old', 'principal_perc_acum'
    ]].groupby(['id_papel_old']).agg(['cumprod'])
    titprivado_curva['vne'] = titprivado_curva['vne'] * titprivado_curva[
        'principal_perc_acum']
    titprivado_curva['pagto_juros'] = titprivado_curva[
        'vne'] * titprivado_curva['pagto_juros_perc']
    titprivado_curva[
        'vna'] = titprivado_curva['vne'] * titprivado_curva['fator_index_per']
    titprivado_curva['vna'][titprivado_curva.indexador ==
                            'DI1'] = titprivado_curva['vne'][
                                titprivado_curva.indexador == 'DI1']
    titprivado_curva['saldo_dev_juros'] = titprivado_curva[
        'vna'] * titprivado_curva['saldo_dev_juros_perc']
    titprivado_curva['pupar'] = titprivado_curva['vna'] + titprivado_curva[
        'saldo_dev_juros'] + titprivado_curva['pagto_juros']
    titprivado_curva['dif_curva'] = titprivado_curva[
        'pupar'] / titprivado_curva['puposicao'] - 1
    titprivado_curva['dif_curva'] = titprivado_curva['dif_curva'].abs()
    titprivado_curva = titprivado_curva[
        titprivado_curva.dt_ref == titprivado_curva.data_referencia].copy()
    titprivado_curva = titprivado_curva[[
        'id_papel_old', 'id_papel', 'codigo_isin', 'dif_curva', 'pupar'
    ]].copy()
    titprivado_curva = titprivado_curva.sort(['dif_curva'], ascending=[True])
    titprivado_curva = titprivado_curva.drop_duplicates(subset=['id_papel'],
                                                        take_last=False)

    titprivado = titprivado.merge(
        titprivado_curva,
        on=['id_papel_old', 'id_papel', 'codigo_isin'],
        how='left')
    titprivado = titprivado[((titprivado.caracteristica == 'V') &
                             (titprivado.dif_curva.notnull())) |
                            (titprivado.caracteristica == 'N')].copy()
    titprivado = titprivado.reset_index(level=None,
                                        drop=True,
                                        inplace=False,
                                        col_level=0,
                                        col_fill='')

    logger.info("Cálculo do mtm na data_spread; a)prazo_du_spread")
    # 3 - MTM calculation on data_spread; a) prazo_du_spread

    # Check for securities without a reference date
    titprivado[titprivado.dt_ref.isnull()].to_excel(writer, 'dt_ref_NaN')

    # Remove securities without a reference date
    titprivado = titprivado[titprivado.dt_ref.notnull()].copy()

    # Build the vector of dates
    dt_min = min(titprivado['dt_ref'])
    dt_min = dt_min.replace(day=1)
    dt_min = datetime.date(dt_min.year, dt_min.month, dt_min.day)

    dt_max = max(titprivado['dt_ref'])
    dt_max = dt_max.replace(day=1, month=dt_max.month)
    dt_max = dt_max + pd.DateOffset(months=1)
    dt_max = dt_max - pd.DateOffset(days=1)
    dt_max = datetime.date(dt_max.year, dt_max.month, dt_max.day)

    dt_ref = pd.date_range(start=dt_min, end=dt_max, freq='D').date

    serie_dias = pd.DataFrame(columns=['dt_ref', 'aux'])
    serie_dias['dt_ref'] = dt_ref

    # Build the vector of business days
    per = FinDt.DatasFinanceiras(dt_min, dt_max, path_arquivo=feriados_sheet)

    du = pd.DataFrame(columns=['dt_ref'])
    du['dt_ref'] = per.dias(3)
    du['du_1'] = 1

    serie_dias = serie_dias.merge(du, on=['dt_ref'], how='left')

    serie_dias['du_1'] = serie_dias['du_1'].fillna(0)

    serie_dias['indice_du'] = np.cumsum(serie_dias['du_1'])

    del serie_dias['aux']
    del serie_dias['du_1']

    fim = serie_dias.copy()
    fim = fim.rename(columns={'indice_du': 'indice_du_fim_spread'})

    inicio = serie_dias.copy()
    inicio = inicio.rename(columns={
        'dt_ref': 'data_spread',
        'indice_du': 'indice_du_inicio_spread'
    })

    # Join the day series onto the titprivado table
    titprivado = titprivado.merge(fim, on=['dt_ref'], how='left')
    titprivado = titprivado.merge(inicio, on=['data_spread'], how='left')

    # Compute prazo_du_spread
    titprivado['prazo_du_spread'] = titprivado[
        'indice_du_fim_spread'] - titprivado['indice_du_inicio_spread']

    logger.info("Cálculo do mtm na data_spread; b) taxa_spot")
    # 4 - MTM calculation on data_spread; b) taxa_spot

    if len(titprivado[titprivado.indexador == 'PRE']) != 0:
        maximo_tp_PRE = max(
            titprivado['prazo_du_spread'][titprivado.indexador == 'PRE'])

    if len(titprivado[titprivado.indexador == 'IGP']) != 0:
        maximo_tp_IGPM = max(
            titprivado['prazo_du_spread'][titprivado.indexador == 'IGP'])

    if len(titprivado[titprivado.indexador == 'IPCA']) != 0:
        maximo_tp_IPCA = max(
            titprivado['prazo_du_spread'][titprivado.indexador == 'IPCA'])

    # ----Interpolation base for the spread calculation
    dt_min_interpol = str(min(titprivado['data_spread']))

    logger.info("Conectando no Banco de dados")
    connection = db.connect('localhost',
                            user='******',
                            passwd='root',
                            db='projeto_inv',
                            use_unicode=True,
                            charset="utf8")
    logger.info("Conexão com DB executada com sucesso")

    # 2020 is the cutoff date for creating the tables
    dt_ref = pd.date_range(start=datetime.date(int(dt_min_interpol[0:4]),
                                               int(dt_min_interpol[5:7]), 1),
                           end=datetime.date(int(2020), int(10), 31),
                           freq='M').date

    # Union of the created tables with UNION ALL
    query = ''
    for dt in dt_ref:
        month = '0' + str(dt.month) if len(str(dt.month)) == 1 else str(
            dt.month)
        year = '0' + str(dt.year) if len(str(dt.year)) == 1 else str(dt.year)
        query = query + 'SELECT * FROM projeto_inv.curva_ettj_interpol_' + year + "_" + month + " UNION ALL "
    query = query[:-11]

    query = 'select * from (' + query + ') AS a where dt_ref<=' + '"' + dtbase_concat + '" and dt_ref>=' + '"' + dt_min_interpol + '" and indexador_cod in("PRE","DIM","DIC");'

    ettj = pd.read_sql(query, con=connection)
    logger.info("Leitura do banco de dados executada com sucesso")

    #Close connection
    connection.close()

    # Select the latest load
    ettj = ettj.sort(['indexador_cod', 'dt_ref', 'data_bd'],
                     ascending=[True, False, False])
    ettj = ettj.drop_duplicates(subset=[
        'prazo', 'tx_spot', 'tx_spot_ano', 'tx_termo_dia', 'indexador_cod'
    ],
                                take_last=False)
    ettj['indexador'] = np.where(
        ettj['indexador_cod'] == 'DIC', 'IPCA',
        np.where(ettj['indexador_cod'] == 'DIM', 'IGP', 'PRE'))
    ettj = ettj.rename(columns={'prazo': 'prazo_du'})
    ettj_filtro = ettj[['prazo_du', 'tx_spot', 'tx_spot_ano', 'indexador']]
    ettj_filtro = ettj_filtro.rename(columns={'prazo_du': 'prazo_du_spread'})

    # PRE extrapolation, if necessary
    if len(titprivado[titprivado.indexador == 'PRE']) != 0:
        maximo_ettj = max(
            ettj_filtro['prazo_du_spread'][ettj_filtro.indexador == 'PRE'])
        # del ettj_fluxo
        if maximo_ettj < max(
                titprivado['prazo_du_spread'][titprivado.indexador == 'PRE']):
            ettj_filtro_PRE = ettj_filtro[[
                'prazo_du_spread', 'tx_spot_ano', 'indexador'
            ]][ettj_filtro.indexador == 'PRE'].copy()
            ettj_filtro_PRE = ettj_filtro_PRE.reset_index(level=None,
                                                          drop=True,
                                                          inplace=False,
                                                          col_level=0,
                                                          col_fill='')
            ettj_filtro_PRE = ettj_filtro_PRE[0:maximo_ettj - 1].copy()

            tx_max = ettj_filtro_PRE['tx_spot_ano'].loc[len(ettj_filtro_PRE) -
                                                        1]

            ettj_aux = pd.DataFrame(columns=['prazo_du_spread', 'indexador'])
            ettj_aux['prazo_du_spread'] = np.linspace(1, maximo_tp_PRE,
                                                      maximo_tp_PRE)
            ettj_aux['indexador'] = 'PRE'
            ettj_aux = ettj_aux.merge(ettj_filtro_PRE,
                                      on=['prazo_du_spread', 'indexador'],
                                      how='left')
            ettj_aux['tx_spot_ano'] = ettj_aux['tx_spot_ano'].fillna(tx_max)
            ettj_aux['tx_spot'] = (1 + ettj_aux['tx_spot_ano'])**(
                ettj_aux['prazo_du_spread'] / 252) - 1
            ettj_fluxo = ettj_fluxo.append(ettj_aux)
        else:
            ettj_aux = ettj_filtro.copy()
            ettj_fluxo = ettj_aux.copy()
    else:
        ettj_fluxo = ettj_filtro.copy()

    # IGPM extrapolation, if necessary
    if len(titprivado[titprivado.indexador == 'IGP']) != 0:
        maximo_ettj = max(
            ettj_filtro['prazo_du_spread'][ettj_filtro.indexador == 'IGP'])
        # del ettj_fluxo
        if maximo_ettj < max(
                titprivado['prazo_du_spread'][titprivado.indexador == 'IGP']):
            ettj_filtro_IGPM = ettj_filtro[[
                'prazo_du_spread', 'tx_spot_ano', 'indexador'
            ]][ettj_filtro.indexador == 'IGP'].copy()
            ettj_filtro_IGPM = ettj_filtro_IGPM.reset_index(level=None,
                                                            drop=True,
                                                            inplace=False,
                                                            col_level=0,
                                                            col_fill='')
            ettj_filtro_IGPM = ettj_filtro_IGPM[0:maximo_ettj - 1].copy()

            tx_max = ettj_filtro_IGPM['tx_spot_ano'].loc[len(ettj_filtro_IGPM)
                                                         - 1]

            ettj_aux = pd.DataFrame(columns=['prazo_du_spread', 'indexador'])
            ettj_aux['prazo_du_spread'] = np.linspace(1, maximo_tp_IGPM,
                                                      maximo_tp_IGPM)
            ettj_aux['indexador'] = 'IGP'
            ettj_aux = ettj_aux.merge(ettj_filtro_IGPM,
                                      on=['prazo_du_spread', 'indexador'],
                                      how='left')
            ettj_aux['tx_spot_ano'] = ettj_aux['tx_spot_ano'].fillna(tx_max)
            ettj_aux['tx_spot'] = (1 + ettj_aux['tx_spot_ano'])**(
                ettj_aux['prazo_du_spread'] / 252) - 1
            ettj_fluxo = ettj_fluxo.append(ettj_aux)
        else:
            ettj_aux = ettj_filtro.copy()
            ettj_fluxo = ettj_aux.copy()
    else:
        ettj_fluxo = ettj_filtro.copy()

    # IPCA extrapolation, if necessary
    if len(titprivado[titprivado.indexador == 'IPCA']) != 0:
        maximo_ettj = max(
            ettj_filtro['prazo_du_spread'][ettj_filtro.indexador == 'IPCA'])
        # del ettj_fluxo
        if maximo_ettj < max(
                titprivado['prazo_du_spread'][titprivado.indexador == 'IPCA']):
            ettj_filtro_IPCA = ettj_filtro[[
                'prazo_du_spread', 'tx_spot_ano', 'indexador'
            ]][ettj_filtro.indexador == 'IPCA'].copy()
            ettj_filtro_IPCA = ettj_filtro_IPCA.reset_index(level=None,
                                                            drop=True,
                                                            inplace=False,
                                                            col_level=0,
                                                            col_fill='')
            ettj_filtro_IPCA = ettj_filtro_IPCA[0:maximo_ettj - 1].copy()

            tx_max = ettj_filtro_IPCA['tx_spot_ano'].loc[len(ettj_filtro_IPCA)
                                                         - 1]

            ettj_aux = pd.DataFrame(columns=['prazo_du_spread', 'indexador'])
            ettj_aux['prazo_du_spread'] = np.linspace(1, maximo_tp_IPCA,
                                                      maximo_tp_IPCA)
            ettj_aux['indexador'] = 'IPCA'
            ettj_aux = ettj_aux.merge(ettj_filtro_IPCA,
                                      on=['prazo_du_spread', 'indexador'],
                                      how='left')
            ettj_aux['tx_spot_ano'] = ettj_aux['tx_spot_ano'].fillna(tx_max)
            ettj_aux['tx_spot'] = (1 + ettj_aux['tx_spot_ano'])**(
                ettj_aux['prazo_du_spread'] / 252) - 1
            ettj_fluxo = ettj_fluxo.append(ettj_aux)
        else:
            ettj_aux = ettj_filtro.copy()
            ettj_fluxo = ettj_aux.copy()
    else:
        ettj_fluxo = ettj_filtro.copy()

    # Join the ETTJ onto the titprivado table
    ettj_fluxo = ettj_fluxo.rename(columns={
        'tx_spot': 'tx_spot_spread',
        'tx_spot_ano': 'tx_spot_ano_spread'
    })
    titprivado = titprivado.merge(ettj_fluxo,
                                  on=['prazo_du_spread', 'indexador'],
                                  how='left')

    # Fill with 0 where there is no spot rate (prazo_du_spread<0, indexador=DI1)
    titprivado['tx_spot_spread'] = titprivado['tx_spot_spread'].fillna(0)
    titprivado['tx_spot_ano_spread'] = titprivado['tx_spot_ano_spread'].fillna(
        0)

    logger.info("Cálculo do mtm na data_spread; b) valor presente e mtm")
    # 5 - MTM calculation on data_spread; c) present value and mtm

    titprivado['fator_desconto_spread'] = 1 / (1 +
                                               titprivado['tx_spot_spread'])
    titprivado[
        'pv_spread'] = titprivado['fv'] * titprivado['fator_desconto_spread']
    titprivado['pv_spread'] = np.where(titprivado['prazo_du_spread'] < 0, 0,
                                       titprivado['pv_spread'])

    x = titprivado[['id_papel',
                    'pv_spread']].groupby(['id_papel']).agg(['sum'])
    x = x.reset_index(level=None,
                      drop=False,
                      inplace=False,
                      col_level=0,
                      col_fill='')
    x1 = pd.DataFrame(columns=['id_papel', 'mtm_spread'])
    x1['id_papel'] = x['id_papel']
    x1['mtm_spread'] = x['pv_spread']
    titprivado = titprivado.merge(x1, on=['id_papel'], how='left')
    titprivado['dif'] = titprivado['mtm_spread'] / titprivado['pu'] - 1

    writer = ExcelWriter(save_path_puposicao_final)

    aux = titprivado.drop_duplicates(subset=['codigo_isin'])
    aux[[
        'id_papel_old', 'codigo_isin', 'flag', 'indexador', 'puemissao',
        'data_emissao', 'data_expiracao', 'puposicao', 'percentual_indexador',
        'taxa_juros', 'pu', 'mtm_spread', 'dif'
    ]].to_excel(writer, 'dif')

    logger.info("Cálculo spread de crédito")
    # 6 - Credit spread calculation

    titprivado_spread = \
    titprivado[['id_papel', 'codigo_isin', 'data_spread', 'pu', 'dt_ref', 'prazo_du_spread', 'pv_spread']][
        (titprivado.caracteristica == 'N') & (titprivado.dif != 1)]

    # Keep only the cash flow rows with positive prazo_du
    titprivado_spread = titprivado_spread[
        titprivado_spread.dt_ref >= titprivado_spread.data_spread]
    titprivado_spread = titprivado_spread.drop_duplicates(
        subset=['id_papel', 'prazo_du_spread'])
    titprivado_spread['pv_pv_fluxo'] = np.where(
        titprivado_spread['dt_ref'] == titprivado_spread['data_spread'],
        -titprivado_spread['pu'], titprivado_spread['pv_spread'])
    tp_spread = titprivado_spread[[
        'id_papel', 'dt_ref', 'prazo_du_spread', 'pv_pv_fluxo'
    ]].copy()
    tp_spread['prazo_du'] = tp_spread['prazo_du_spread'].astype(float)
    tp_spread = tp_spread.drop_duplicates(
        subset=['id_papel', 'prazo_du_spread'], take_last=True)

    id_papel = titprivado_spread['id_papel'].unique()
    spread = np.zeros((len(id_papel)))

    spread_aux = pd.DataFrame(columns=['id_papel', 'spread'])
    spread_aux['id_papel'] = id_papel

    start_time = time.time()
    for i in range(0, len(id_papel)):
        v = tp_spread['pv_pv_fluxo'][tp_spread.id_papel == id_papel[i]].values
        v = np.meshgrid(v, sparse=True)

        s = np.linspace(-0.9999, 0.9999, 10000)
        t = tp_spread['prazo_du_spread'][tp_spread.id_papel ==
                                         id_papel[i]].values
        t, s = np.meshgrid(t, s, sparse=True)
        f_ts = 1. / (1 + s)**(t / 252)

        f_spread = v * f_ts

        f_sum = f_spread.sum(axis=1, dtype='float')

        min_index = abs(f_sum).argmin()

        spread[i] = s[min_index]

        print(time.time() - start_time, i, id_papel[i], spread[i])

        spread_aux['spread'].iloc[i] = spread[i]

    titprivado = titprivado.merge(spread_aux, on=['id_papel'], how='left')

    aux = titprivado.drop_duplicates(subset=['id_papel'])

    aux[[
        'id_papel_old', 'codigo_isin', 'valor_nominal', 'puposicao',
        'mtm_spread', 'pu', 'spread'
    ]].to_excel(save_path_teste_dif_deb)

    logger.info("Seleção dos papéis cuja marcação não ficou boa")
    # 7 - Select the securities whose mark did not come out well

    tp_bigdif = titprivado[[
        'data_spread', 'codigo_isin', 'id_papel', 'flag', 'indexador',
        'dtemissao', 'data_emissao', 'dtvencimento', 'data_expiracao',
        'valor_nominal', 'puemissao', 'juros_cada', 'coupom', 'taxa_juros',
        'percentual_indexador', 'percindex', 'perc_amortizacao', 'dt_ref',
        'vne', 'du_per', 'prazo_du', 'fator_index_per', 'fator_juros_per',
        'pv', 'fv', 'mtm', 'puposicao', 'pu', 'dif', 'spread'
    ]].copy()

    tp_bigdif['dif'] = tp_bigdif['mtm'] / tp_bigdif['pu'] - 1

    tp_bigdif[(tp_bigdif.dif > tol) | (tp_bigdif.dif < -tol) |
              (tp_bigdif.spread > tol) | (tp_bigdif.spread < -tol)].to_excel(
                  writer, 'bigdif')

    logger.info(
        "Atualização do fluxo de percentual de mtm com o spread e carregamento da tabela"
    )
    # 8 - Update the mtm percentage flow with the spread and load the table used to fill in quadro 419

    titprivado_perc = titprivado.copy()

    titprivado_perc = titprivado_perc.rename(
        columns={
            'mtm': 'mtm_old',
            'pv': 'pv_old',
            'pv_DV100': 'pv_DV100_old',
            'fator_desconto': 'fator_desconto_old',
            'fator_desconto_DV100': 'fator_desconto_DV100_old',
            'DV100': 'DV100_old'
        })

    # Choose the best spread - THAT MEANS THE BEST FLOW
    titprivado_perc['spread'] = titprivado_perc['spread'].fillna(0)

    # Take just one row to avoid problems
    x = titprivado_perc[[
        'id_papel', 'codigo_isin', 'spread'
    ]][titprivado_perc.dt_ref == titprivado_perc.data_referencia].copy()

    x = x.sort(['codigo_isin', 'spread'], ascending=[True, True])
    x = x.drop_duplicates(subset=['codigo_isin'], take_last=False)

    x['marker'] = 1

    titprivado_perc = titprivado_perc.merge(
        x, on=['codigo_isin', 'id_papel', 'spread'], how='left')

    titprivado_perc = titprivado_perc[titprivado_perc.marker == 1].copy()
    del titprivado_perc['marker']

    titprivado_perc = titprivado_perc.drop_duplicates(
        subset=['codigo_isin', 'dt_ref'], take_last=True)

    # titprivado_perc['puposicao_final'] = np.where(titprivado_perc['caracteristica']=='N',titprivado_perc['pu'],titprivado_perc['mtm_old'])

    titprivado_perc = titprivado_perc[titprivado_perc.prazo_du >= 0]

    aux = titprivado_perc[[
        'id_papel', 'codigo_isin'
    ]][titprivado_perc.dt_ref == titprivado_perc.data_referencia].copy()
    aux = aux.drop_duplicates(subset=['codigo_isin'], take_last=True)
    aux['marker'] = 1

    titprivado_perc = titprivado_perc.merge(aux,
                                            on=['id_papel', 'codigo_isin'],
                                            how='left')
    titprivado_perc = titprivado_perc[titprivado_perc.marker == 1].copy()
    del titprivado_perc['marker']

    # Recalculate only what is marked to market
    aux = titprivado_perc[titprivado_perc.caracteristica == 'V'].copy()
    aux['mtm_DV100_N'] = 0.0
    aux = aux.rename(columns={'mtm_old': 'mtm'})

    titprivado_perc = titprivado_perc[titprivado_perc.caracteristica ==
                                      'N'].copy()

    # Compute the discount factor updated by the spread
    titprivado_perc[
        'fator_desconto'] = titprivado_perc['fator_desconto_old'] / (
            1 + titprivado_perc['spread'])**(titprivado_perc['prazo_du'] / 252)
    titprivado_perc['fator_desconto_DV100'] = titprivado_perc[
        'fator_desconto_DV100_old'] / (1 + titprivado_perc['spread'])**(
            titprivado_perc['prazo_du'] / 252)

    # Compute the pv
    titprivado_perc[
        'pv'] = titprivado_perc['fv'] * titprivado_perc['fator_desconto']
    titprivado_perc['pv_DV100'] = titprivado_perc['fv'] * titprivado_perc[
        'fator_desconto_DV100']

    # Compute the MTM
    x = titprivado_perc[['codigo_isin', 'pv',
                         'pv_DV100']].groupby(['codigo_isin']).agg(['sum'])
    x = x.reset_index(level=None,
                      drop=False,
                      inplace=False,
                      col_level=0,
                      col_fill='')
    x1 = pd.DataFrame(columns=['codigo_isin', 'mtm', 'mtm_DV100_N'])
    x1['codigo_isin'] = x['codigo_isin']
    x1['mtm'] = x['pv']
    x1['mtm_DV100_N'] = x['pv_DV100']
    titprivado_perc = titprivado_perc.merge(x1, on=['codigo_isin'], how='left')

    titprivado_perc['mtm_DV100'] = 0.0

    # Choose the best flow
    titprivado_perc['dif_new'] = titprivado_perc['mtm'] - titprivado_perc['pu']
    titprivado_perc['dif_new'] = titprivado_perc['dif_new'].abs()
    titprivado_perc[
        'dif_old'] = titprivado_perc['mtm_old'] - titprivado_perc['pu']
    titprivado_perc['dif_old'] = titprivado_perc['dif_old'].abs()

    titprivado_perc['mtm'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['mtm_old'], titprivado_perc['mtm'])
    titprivado_perc['mtm_DV100'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['mtm_DV100'], titprivado_perc['mtm_DV100_N'])
    titprivado_perc['pv'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['pv_old'], titprivado_perc['pv'])
    titprivado_perc['pv_DV100'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['pv_DV100_old'], titprivado_perc['pv_DV100'])
    titprivado_perc['fator_desconto'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['fator_desconto_old'],
        titprivado_perc['fator_desconto'])
    titprivado_perc['fator_desconto_DV100'] = np.where(
        titprivado_perc['dif_old'] < titprivado_perc['dif_new'],
        titprivado_perc['fator_desconto_DV100_old'],
        titprivado_perc['fator_desconto_DV100'])

    titprivado_perc[
        'dif_perc'] = titprivado_perc['dif_new'] / titprivado_perc['pu']

    # titprivado_perc['mtm'] = np.where(titprivado_perc['dif_perc']>0.10,titprivado_perc['pu'],titprivado_perc['mtm'])

    # Compute the DV100
    titprivado_perc = titprivado_perc.append(aux)

    titprivado_perc[
        'DV100'] = titprivado_perc['mtm'] - titprivado_perc['mtm_DV100']

    # Compute perc_mtm
    titprivado_perc[
        'perc_mtm'] = titprivado_perc['pv'] / titprivado_perc['mtm']

    # Compute the duration
    titprivado_perc[
        'duration'] = titprivado_perc['perc_mtm'] * titprivado_perc['prazo_du']
    x = titprivado_perc[['codigo_isin',
                         'duration']].groupby(['codigo_isin']).agg(['sum'])
    x = x.reset_index(level=None,
                      drop=False,
                      inplace=False,
                      col_level=0,
                      col_fill='')
    x1 = pd.DataFrame(columns=['codigo_isin', 'duration'])
    x1['codigo_isin'] = x['codigo_isin']
    x1['duration'] = x['duration']
    del titprivado_perc['duration']
    titprivado_perc = titprivado_perc.merge(x1, on=['codigo_isin'], how='left')

    titprivado_perc[[
        'codigo_isin', 'pu', 'mtm', 'dt_ref', 'mtm_old', 'dif_new', 'dif_old',
        'duration', 'spread'
    ]].to_excel(writer, 'conclusao_fluxo')
    titprivado_perc[
        titprivado_perc.dt_ref == titprivado_perc.data_referencia][[
            'codigo_isin', 'pu', 'mtm', 'dt_ref', 'mtm_old', 'dif_new',
            'dif_old', 'duration', 'spread'
        ]].to_excel(writer, 'conclusao_resumido')

    finalizacao = titprivado_perc[[
        'codigo_isin', 'mtm'
    ]][titprivado_perc.dt_ref == titprivado_perc.data_referencia].copy()

    del titprivado_perc['fator_desconto_DV100_old']
    del titprivado_perc['fator_desconto_old']
    del titprivado_perc['mtm_old']
    del titprivado_perc['mtm_DV100_N']
    del titprivado_perc['pv_DV100_old']
    del titprivado_perc['pv_old']
    del titprivado_perc['DV100_old']
    del titprivado_perc['dif_new']
    del titprivado_perc['dif_old']
    del titprivado_perc['dif_perc']

    writer.save()

    #    Change the type of columns that are int but were read as float (who knows why...)

    # id_mtm_titprivado
    titprivado_perc['id_mtm_titprivado'] = titprivado_perc[
        'id_mtm_titprivado'].astype(int)

    # id_bmf_numeraca
    aux = titprivado_perc[[
        'id_papel', 'id_bmf_numeraca'
    ]][titprivado_perc.id_bmf_numeraca.notnull()].copy()
    aux['id_bmf_numeraca'] = aux['id_bmf_numeraca'].astype(int)
    del titprivado_perc['id_bmf_numeraca']
    aux = aux.drop_duplicates()
    titprivado_perc = titprivado_perc.merge(aux, on=['id_papel'], how='left')

    # pagto_amortizacao
    aux = titprivado_perc[[
        'id_papel', 'indexador_dc_du'
    ]][titprivado_perc.indexador_dc_du.notnull()].copy()
    aux['indexador_dc_du'] = aux['indexador_dc_du'].astype(int)
    del titprivado_perc['indexador_dc_du']
    aux = aux.drop_duplicates()
    titprivado_perc = titprivado_perc.merge(aux, on=['id_papel'])

    # juros_cada
    aux = titprivado_perc[['id_papel', 'juros_cada'
                           ]][titprivado_perc.juros_cada.notnull()].copy()
    aux['juros_cada'] = aux['juros_cada'].astype(int)
    del titprivado_perc['juros_cada']
    aux = aux.drop_duplicates()
    titprivado_perc = titprivado_perc.merge(aux, on=['id_papel'], how='left')

    # indexador_dc_du
    aux = titprivado_perc[[
        'id_papel', 'indexador_dc_du'
    ]][titprivado_perc.indexador_dc_du.notnull()].copy()
    aux['indexador_dc_du'] = aux['indexador_dc_du'].astype(int)
    del titprivado_perc['indexador_dc_du']
    aux = aux.drop_duplicates()
    titprivado_perc = titprivado_perc.merge(aux, on=['id_papel'])

    # juros_dc_du
    aux = titprivado_perc[['id_papel', 'juros_dc_du'
                           ]][titprivado_perc.juros_dc_du.notnull()].copy()
    aux['juros_dc_du'] = aux['juros_dc_du'].astype(int)
    del titprivado_perc['juros_dc_du']
    aux = aux.drop_duplicates()
    titprivado_perc = titprivado_perc.merge(aux, on=['id_papel'])

    # flag_inclusao
    titprivado_perc['flag_inclusao'] = titprivado_perc['flag_inclusao'].astype(
        int)

    # du_per
    titprivado_perc['du_per'] = titprivado_perc['du_per'].astype(int)

    # dc_per
    titprivado_perc['dc_per'] = titprivado_perc['dc_per'].astype(int)

    # dt_ref -> data_fim
    titprivado_perc['dt_ref'] = pd.to_datetime(titprivado_perc['dt_ref'])
    titprivado_perc['data_fim'] = np.where(
        titprivado_perc['indexador'] == 'DI1',
        titprivado_perc['dt_ref'] - pd.DateOffset(months=0, days=1),
        titprivado_perc['dt_ref'])

    titprivado_perc['id_papel'] = titprivado_perc['id_papel_old']

    titprivado_perc['data_mtm'] = titprivado_perc['data_referencia']
    titprivado_perc['data_negociacao'] = titprivado_perc['data_referencia']

    # Columns no longer needed - MTM
    del titprivado_perc['data_referencia']
    del titprivado_perc['id_papel_old']
    del titprivado_perc['indice_du_mtm']
    del titprivado_perc['indice_dc_mtm']
    del titprivado_perc['ano_dt_ref2']
    del titprivado_perc['mes_dt_ref2']
    del titprivado_perc['dia_dt_ref2']
    del titprivado_perc['vertices_positivo']
    del titprivado_perc['indice_dc_dt_ref2']
    del titprivado_perc['indice_du_dt_ref2']
    del titprivado_perc['prazo_dc']
    del titprivado_perc['ano_inicio']
    del titprivado_perc['mes_inicio']
    del titprivado_perc['dia_inicio']
    del titprivado_perc['indice_du_inicio']
    del titprivado_perc['indice_dc_inicio']
    del titprivado_perc['ano_fim']
    del titprivado_perc['mes_fim']
    del titprivado_perc['dia_fim']
    del titprivado_perc['indice_du_fim']
    del titprivado_perc['indice_dc_fim']
    del titprivado_perc['dt_ref']
    del titprivado_perc['dtoperacao_mtm']
    del titprivado_perc['dif']
    del titprivado_perc['pu_mercado']
    del titprivado_perc['pu_curva']
    del titprivado_perc['mtm_mercado']
    del titprivado_perc['mtm_curva']
    del titprivado_perc['pu_regra_xml']
    del titprivado_perc['mtm_regra_xml']
    del titprivado_perc['data_spread']
    del titprivado_perc['pu']
    del titprivado_perc['dif_curva']
    del titprivado_perc['pupar']
    del titprivado_perc['indice_du_fim_spread']
    del titprivado_perc['indice_du_inicio_spread']
    del titprivado_perc['prazo_du_spread']
    del titprivado_perc['mtm_spread']
    del titprivado_perc['pv_spread']
    del titprivado_perc['tx_spot_spread']
    del titprivado_perc['tx_spot_ano_spread']
    del titprivado_perc['fator_desconto_spread']

    # Columns no longer needed - XML
    del titprivado_perc['id_xml_debenture']
    del titprivado_perc['isin']
    del titprivado_perc['coddeb']
    del titprivado_perc['dtemissao']
    del titprivado_perc['dtoperacao']
    del titprivado_perc['dtvencimento']
    del titprivado_perc['cnpjemissor']
    del titprivado_perc['qtdisponivel']
    del titprivado_perc['qtgarantia']
    del titprivado_perc['pucompra']
    del titprivado_perc['puvencimento']
    del titprivado_perc['puposicao']
    del titprivado_perc['puemissao']
    del titprivado_perc['principal']
    del titprivado_perc['tributos']
    del titprivado_perc['valorfindisp']
    del titprivado_perc['valorfinemgar']
    del titprivado_perc['coupom']
    del titprivado_perc['percindex']
    del titprivado_perc['caracteristica']
    del titprivado_perc['percprovcred']
    del titprivado_perc['classeoperacao']
    del titprivado_perc['idinternoativo']
    del titprivado_perc['nivelrsc']
    del titprivado_perc['header_id']
    del titprivado_perc['cusip']
    del titprivado_perc['depgar']
    del titprivado_perc['debconv']
    del titprivado_perc['debpartlucro']
    del titprivado_perc['SPE']
    del titprivado_perc['dtretorno']
    del titprivado_perc['puretorno']
    del titprivado_perc['indexadorcomp']
    del titprivado_perc['perindexcomp']
    del titprivado_perc['txoperacao']
    del titprivado_perc['classecomp']

    # Remove duplicate isin rows
    titprivado_perc = titprivado_perc.sort(
        ['codigo_isin', 'id_papel', 'data_fim'], ascending=[True, True, True])
    titprivado_perc = titprivado_perc.drop_duplicates(
        subset=['codigo_isin', 'data_fim'])

    # titprivado_perc['data_bd'] = horario_bd
    titprivado_perc['data_bd'] = datetime.datetime.today()

    titprivado_perc = titprivado_perc.where((pd.notnull(titprivado_perc)),
                                            None)

    titprivado_perc['flag1'] = titprivado_perc['flag'].str[0:2]
    del titprivado_perc['flag']
    titprivado_perc = titprivado_perc.rename(columns={'flag1': 'flag'})

    logger.info("Conectando no Banco de dados")
    connection = db.connect('localhost',
                            user='******',
                            passwd='root',
                            db='projeto_inv',
                            use_unicode=True,
                            charset="utf8")
    logger.info("Conexão com DB executada com sucesso")

    logger.info("Salvando base de dados - mtm_renda_fixa")
    pd.io.sql.to_sql(titprivado_perc,
                     name='mtm_renda_fixa',
                     con=connection,
                     if_exists='append',
                     flavor='mysql',
                     index=0)

    #Close connection
    connection.close()

    logger.info("Preenchimento tabela xml")
    # 9 - Fill in the xml table

    del original['id_xml_debenture']
    del original['pu_mercado']
    del original['mtm_mercado']
    del original['pu_curva']
    del original['mtm_curva']
    del original['pu_regra_xml']
    del original['mtm_regra_xml']
    del original['data_referencia']

    del titprivado['mtm']

    titprivado = titprivado.merge(finalizacao, on=['codigo_isin'], how='left')

    titprivado_xml = titprivado[titprivado.dt_ref ==
                                titprivado.data_referencia].copy()
    titprivado_xml = titprivado_xml.drop_duplicates(subset=['id_papel'],
                                                    take_last=True)
    titprivado_xml = titprivado_xml.rename(columns={'mtm': 'mtm_calculado'})

    anbima_deb = anbima_deb.rename(columns={'pu': 'pu_n'})

    titprivado_xml = titprivado_xml.merge(anbima_deb,
                                          on=['coddeb', 'data_spread'],
                                          how='left')
    titprivado_xml['pu'] = np.where(titprivado_xml['pu_n'].notnull(),
                                    titprivado_xml['pu_n'],
                                    titprivado_xml['mtm_calculado'])
    titprivado_xml['pu_mercado'] = np.where(
        titprivado_xml['caracteristica'] == 'N', titprivado_xml['pu'], 0)
    titprivado_xml['pu_curva'] = np.where(
        titprivado_xml['caracteristica'] == 'V', titprivado_xml['pupar'], 0)
    titprivado_xml = titprivado_xml[[
        'id_papel', 'pu_mercado', 'pu_curva', 'data_referencia'
    ]].copy()

    final = original.merge(titprivado_xml, on=['id_papel'], how='left')
    final['data_referencia'] = dt_base_rel
    final['pu_mercado'] = np.where(
        (final['pu_mercado'].isnull()) | (final['pu_mercado'] == 0),
        final['puposicao'], final['pu_mercado'])
    final['pu_mercado'] = np.where(final['dtretorno'].notnull(),
                                   final['puposicao'], final['pu_mercado'])
    final['mtm_mercado'] = final['pu_mercado'] * (final['qtdisponivel'] +
                                                  final['qtgarantia'])
    final['pu_curva'] = np.where(final['pu_curva'].isnull(),
                                 final['puposicao'], final['pu_curva'])
    final['pu_curva'] = np.where(final['dtretorno'].notnull(),
                                 final['puposicao'], final['pu_curva'])
    final['mtm_curva'] = final['pu_curva'] * (final['qtdisponivel'] +
                                              final['qtgarantia'])
    final['pu_regra_xml'] = np.where(final['caracteristica'] == 'N',
                                     final['pu_mercado'], final['pu_curva'])
    final['mtm_regra_xml'] = np.where(final['caracteristica'] == 'N',
                                      final['mtm_mercado'], final['mtm_curva'])
    final['data_bd'] = datetime.datetime.today()

    del final['dtrel']

    final['indexador'] = final['indexador'].str.replace('IGPM', 'IGP')
    final['indexador'] = final['indexador'].str.replace('IAP', 'IPC')
    final['indexador'] = final['indexador'].str.replace('SEM-ÍNDICE', 'PRE')
    final['indexador'] = final['indexador'].str.replace('ANB', 'DI1')
    final['indexador'] = final['indexador'].str.replace('ANBID', 'DI1')
    final['indexador'] = final['indexador'].str.replace('CDI', 'DI1')
    final['indexador'] = final['indexador'].str.replace('IPCA', 'IPC')

    logger.info("Conectando no Banco de dados")
    connection = db.connect('localhost',
                            user='******',
                            passwd='root',
                            db='projeto_inv',
                            use_unicode=True,
                            charset="utf8")
    logger.info("Conexão com DB executada com sucesso")

    logger.info("Salvando base de dados - xml_debenture")
    pd.io.sql.to_sql(final,
                     name='xml_debenture',
                     con=connection,
                     if_exists='append',
                     flavor='mysql',
                     index=0)

    #Close connection
    connection.close()
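Note: this function relies on several pandas APIs that have since been removed: DataFrame.sort (now sort_values), the take_last argument of drop_duplicates (now keep='last'), DataFrame.append (now pd.concat), and the flavor='mysql' argument of to_sql (dropped; a SQLAlchemy connectable is expected instead). A hedged sketch of the modern equivalents for a few of the calls above; the connection string is a placeholder:

import pandas as pd
from sqlalchemy import create_engine

# sort / take_last  ->  sort_values / keep
anbima_deb = anbima_deb.sort_values(['codigo', 'data_referencia', 'data_bd'])
anbima_deb = anbima_deb.drop_duplicates(subset=['codigo'], keep='last')

# DataFrame.append  ->  pd.concat
titprivado_perc = pd.concat([titprivado_perc, aux], ignore_index=True)

# pd.io.sql.to_sql with flavor='mysql'  ->  DataFrame.to_sql with a SQLAlchemy engine
engine = create_engine('mysql+pymysql://user:password@localhost/projeto_inv')
titprivado_perc.to_sql('mtm_renda_fixa', con=engine, if_exists='append', index=False)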
Code Example #12
nullNum = list(data.isnull().sum())
ratioNull = dict()
parametersList = list(data)
# print(parametersList)
for i in range(len(parametersList)):
    ratioNull[parametersList[i]] = nullNum[i]
# print(ratioNull)
# with open('ratioNull.csv', 'w') as csv_file:
#     writer = csv.writer(csv_file)
#     for key, value in ratioNull.items():
#        writer.writerow([key, value])
tempData = pd.DataFrame({
    'Number of Null': nullNum,
    'Param': parametersList,
})
newData = data.iloc[:, :-4]
newData.__delitem__('Chla_Picoeuk.')
newData.__delitem__('NH4')
# newData.fillna(newData.mean(),inplace=True)
print(newData)
# sns.set(style="white")
# ax = sns.barplot(x="Param", y="Number of Null", data=tempData, order=parametersList)
# ax.set_xticklabels(labels=parametersList,rotation=90)
# fig = ax.get_figure()
# fig.savefig("NumberNull.png")
# plt.show()

writer = ExcelWriter('NumberNullWithNull.xlsx')
newData.to_excel(writer, 'Sheet1')
writer.save()
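Note: calling __delitem__ directly works but is unidiomatic; a sketch of the same column removal with DataFrame.drop:

# drop the last four columns and the two unwanted parameters in one expression
newData = data.iloc[:, :-4].drop(columns=['Chla_Picoeuk.', 'NH4'])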
Code Example #13
                 print('error')
                 if count>len(class_card):
                     break
                 continue
             p+=1     
        n+=1
       
driver.close()

# Convert the lists into pandas DataFrames
df1 = pd.DataFrame(sr_no, columns=['sr_no'])
df2=pd.DataFrame(description_list, columns=['Description'])
df3=pd.DataFrame(price_list, columns=['Price'])
df4=pd.DataFrame(address_list, columns=['Address'])
df5=pd.DataFrame(category_list, columns=['Category'])
df6=pd.DataFrame(area_list, columns=['Area'])            
df7=pd.DataFrame(NumBath_list, columns=['Bathrooms'])            
        
# Join all the DataFrames
df12=df1.join(df2)
df123=df12.join(df3)
df1234=df123.join(df4)
df12345=df1234.join(df5)
df123456=df12345.join(df6)
df1234567=df123456.join(df7)

# Write the combined DataFrame to an Excel file
writer = ExcelWriter('PropertyFinder' + '.xlsx')
df1234567.to_excel(writer, 'Sheet1', index=False)
writer.save()   
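# --- Added note (not part of the original snippet) --------------------------
# The chain of .join() calls above can be collapsed into a single pd.concat,
# which aligns the seven frames on their shared row index. A minimal sketch:
listings = pd.concat([df1, df2, df3, df4, df5, df6, df7], axis=1)
with ExcelWriter('PropertyFinder.xlsx') as writer:
    listings.to_excel(writer, sheet_name='Sheet1', index=False)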
コード例 #14
0
    mainDF.set_value(row, 'P_Skills_Number', skillsReqNo)
    mainDF.set_value(row, 'F_Skill_Score_1', skillScore1)
    mainDF.set_value(row, 'F_Skill_Score_2', skillScore2)

#print(mainDF[0:30])

## Geo Matching
mainDF['F_Country_Match'] = np.nan

row = -1
while True:
    row += 1
    try:
        empNat = mainDF['P_country'][row].lower().replace(' ', '')
    except:
        break

    freNat = mainDF['F_Location_Country'][row].lower().replace(' ', '')
    if freNat == empNat:
        mainDF.set_value(row, 'F_Country_Match', 1)
    else:
        mainDF.set_value(row, 'F_Country_Match', 0)

#print(mainDF[0:100])

## Output to Excel
writer = ExcelWriter(outFileLoc)
mainDF.to_excel(writer, 'data')
writer.save()
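# --- Added note (not part of the original snippet) --------------------------
# DataFrame.set_value() was removed in pandas 1.0 (use .at[row, col] for single
# cells), and the row-by-row country comparison above can be vectorised. A
# minimal sketch on the same mainDF columns; note that rows with a missing
# country simply score 0 here instead of stopping the loop:
emp_country = mainDF['P_country'].str.lower().str.replace(' ', '', regex=False)
fre_country = mainDF['F_Location_Country'].str.lower().str.replace(' ', '', regex=False)
mainDF['F_Country_Match'] = (emp_country == fre_country).astype(int)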
コード例 #15
0
    height = int(input("Height : "))
    gender = input("Gender : ")

    d = {
        'name': name,
        'age': age,
        'height': height,
        'weight': weight,
        'gender': gender
    }
    l.append(d)

df = pd.DataFrame(l)

#Save data to file
writer = ExcelWriter('Student-data.xlsx')
df.to_excel(writer, 'Sheet1', index=False)
writer.save()

#Retrieve data
df = pd.read_excel("Student-data.xlsx")

#Sort based on height

print(df.sort_values(['height']))

#Calculate BMI
df['bmi'] = df['weight'] / df['height']**2
print(df)

#Grouping BMI (a possible continuation is sketched below)
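# --- Added note (not part of the original snippet) --------------------------
# One plausible way to finish the BMI-grouping step: bin the BMI values into
# the usual WHO categories with pd.cut and count the students in each group.
# This assumes height was entered in metres; if it is in centimetres, divide
# by 100 before computing the BMI above.
df['bmi_group'] = pd.cut(df['bmi'],
                         bins=[0, 18.5, 25, 30, float('inf')],
                         labels=['underweight', 'normal', 'overweight', 'obese'])
print(df.groupby('bmi_group')['name'].count())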
コード例 #16
0
                'Cookie': jsessionid,
                'X-XSRF-TOKEN': token
            }
        else:
            headers = {
                'Content-Type': "application/json",
                'Cookie': jsessionid
            }

        base_url = "https://%s:%s/dataservice" % (vmanage_host, vmanage_port)

        # Get security events

        # open excel file
        filename = 'Security Events %s.xlsx' % time.strftime("%Y-%m-%d")
        writer = ExcelWriter(filename)

        api_url = "/event"

        payload = {
            "query": {
                "condition":
                "AND",
                "rules": [{
                    "value":
                    [start_date + "T00:00:00 UTC", end_date + "T00:00:00 UTC"],
                    "field":
                    "entry_time",
                    "type":
                    "date",
                    "operator":
コード例 #17
0
                    'Line': line,
                    'True Positive': positive
                },
                ignore_index=True)
        elif tool == 'gt-csonar':
            data = data.append(
                {
                    'Severity': 'N/A',
                    'CWE': 'N/A',
                    'Clang Alert': 0,
                    'CodeSonar Alert': 1,
                    'Clang Rule': 'N/A',
                    'CodeSonar Rule': rule,
                    'Line': line,
                    'True Positive': positive
                },
                ignore_index=True)
    else:
        prev_entry = indexmap[alertpath][line]
        if tool == 'Clang':
            data.set_value(prev_entry, 'Clang Rule', rule)
            data.set_value(prev_entry, 'Clang Alert', 1)
            data.set_value(prev_entry, 'CWE', cwe)
            data.set_value(prev_entry, 'Severity', severity)
        elif tool == 'gt-csonar':
            data.set_value(prev_entry, 'CodeSonar Rule', rule)
            data.set_value(prev_entry, 'CodeSonar Alert', 1)
writer = ExcelWriter('combined_data_table.xlsx')
data.to_excel(writer, 'sheet 1', index=False)
writer.save()
コード例 #18
0
def create_m5():
    """
	The CREATE_M5 operation builds the fifth data matrix for each gene of interest, adding to the fourth matrix the expression of the candidate regulatory genes of the genes of interest that belong to gene sets other than the model gene's. One data matrix per target gene is created and exported locally as an Excel file; the whole set of M5 matrices is also returned as a Python dictionary (dict_model_v5.p), where each target gene (key) is associated with a Pandas dataframe containing its M5 data (value).

	:return: a Python dictionary
	
	Example::
	
		import genereg as gr
		m5_dict = gr.DataMatrixes.create_m5()
	"""

    # Load input data:

    # Genes of interest
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheetname='Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    # Models_v4 of genes of interest
    dict_model_v4 = pickle.load(
        open('./4_Data_Matrix_Construction/Model4/dict_model_v4.p', 'rb'))

    # Distinct regulatory genes for each gene of interest
    dict_RegulGenes = pickle.load(
        open('./2_Regulatory_Genes/dict_RegulGenes.p', 'rb'))

    # Gene expression values for regulatory genes
    expr_regulatory_df = pd.read_excel(
        './3_TCGA_Data/Gene_Expression/Gene_Expression-RegulatoryGenes.xlsx',
        sheetname='Sheet1',
        header=0)

    # Create a list containing the Gene Symbols of the genes of interest
    gene_interest_SYMs = []
    for i, r in EntrezConversion_df.iterrows():
        sym = r['GENE_SYMBOL']
        if sym not in gene_interest_SYMs:
            gene_interest_SYMs.append(sym)

    # Get the TCGA aliquots
    aliquots = []
    for i, r in expr_regulatory_df.iterrows():
        if i != 'ENTREZ_GENE_ID':
            aliquots.append(i)

    # Create a dictionary where, for each gene of interest set as key (the model gene), we have a dataframe representing the model (matrix of data) of that gene.
    # This model contains all the information of the fourth model, plus additional columns with the expression of the regulatory genes for each gene of interest belonging to other gene sets with respect to the model gene,
    # while the different TCGA aliquots are the indexes of the rows
    dict_model_v5 = {}

    # Define the variables we need for the computation
    model_gene_pathways = []  # list of the gene sets the model gene belongs to
    other_pathway_genes = [
    ]  # list of the gene symbol of the genes belonging to different gene sets
    other_pathway_genes_RegulGenes_SYM = [
    ]  # list of gene symbols for the regulatory genes of gene in other gene sets
    new_columns = []  # list of the new columns names to be added to the model

    # Execute the following code for each gene of interest
    for gene in gene_interest_SYMs:

        model_gene_SYM = gene  # get the Gene Symbol of the current gene

        # Get the gene sets of the model gene
        for i, r in EntrezConversion_df.iterrows():
            sym = r['GENE_SYMBOL']
            if sym == model_gene_SYM:
                p = r['GENE_SET']
                model_gene_pathways.append(p)

        # Get the genes of interest belonging to other gene sets
        for i, r in EntrezConversion_df.iterrows():
            path = r['GENE_SET']
            if (path not in model_gene_pathways) and (path !=
                                                      'GLUCOSE_METABOLISM'):
                symbol = r['GENE_SYMBOL']
                if symbol not in other_pathway_genes:  # consider only once the genes belonging to multiple gene sets
                    other_pathway_genes.append(symbol)

        # Get the list of regulatory genes for the genes in the other gene sets
        for elem in other_pathway_genes:
            elem_regulatory_genes = dict_RegulGenes[elem]
            other_pathway_genes_RegulGenes_SYM = other_pathway_genes_RegulGenes_SYM + elem_regulatory_genes
        other_pathway_genes_RegulGenes_SYM = list(
            set(other_pathway_genes_RegulGenes_SYM)
        )  # keep only distinct regulatory genes

        # Get the fourth model for the current gene (model_v4)
        model_4_df = dict_model_v4[model_gene_SYM]

        # Identify the new columns to be added to the matrix:
        # in this case they are the columns corresponding to regulatory genes of genes in other gene sets
        # (be careful not to have duplicated columns, so add only the symbols of the genes
        # that are not already contained in the previous model)
        old_columns = list(model_4_df.columns.values)
        for g in other_pathway_genes_RegulGenes_SYM:
            if g not in old_columns:
                new_columns.append(g)

        # Create the new part of the model to add
        new_df = pd.DataFrame(index=aliquots, columns=new_columns)

        # Add the expression values for all the new regulatory genes and for each TCGA aliquot
        for index, row in new_df.iterrows():
            for column_name, values in new_df.iteritems(
            ):  # iterate along the columns of the dataframe
                expr = expr_regulatory_df.get_value(index, column_name)
                new_df.set_value(index, column_name, expr)

        # Join the two dataframes and create the new model (model_v5)
        model_5_df = model_4_df.join(new_df)

        # Set the new model in correspondence of the correct model gene key in the new dictionary
        dict_model_v5[model_gene_SYM] = model_5_df

        # Reset the variables for the next iteration on the next gene of interest
        model_gene_pathways = []
        other_pathway_genes = []
        other_pathway_genes_RegulGenes_SYM = []
        new_columns = []

    # Remove duplicate columns of the model gene
    for gene in gene_interest_SYMs:
        data_matrix = dict_model_v5[gene]
        matrix_cols = list(data_matrix.columns.values)
        if gene in matrix_cols:
            data_matrix.drop(gene, axis=1, inplace=True)

    # Export the dictionary into a pickle file in order to be able to import it back and use it to progressively build the next models for the genes of interest, adding further information
    pickle.dump(
        dict_model_v5,
        open('./4_Data_Matrix_Construction/Model5/dict_model_v5.p', 'wb'))

    # Export the models as .xlsx files
    for gene in gene_interest_SYMs:

        model_gene_SYM = gene
        pathway = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          model_gene_SYM, 'GENE_SET'].iloc[0]
        gene_ID = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          model_gene_SYM,
                                          'ENTREZ_GENE_ID'].iloc[0]
        file_name = 'Gene_' + gene_ID + '_[' + model_gene_SYM + ']' + '_(' + pathway + ')-Model_v5.xlsx'

        writer = ExcelWriter('./4_Data_Matrix_Construction/Model5/' +
                             file_name)
        output_df = dict_model_v5[model_gene_SYM]
        output_df.to_excel(writer, 'Sheet1')
        writer.save()

    # Handle genes belonging to multiple gene sets
    multiple_pathway_genes = []
    n = EntrezConversion_df['GENE_SYMBOL'].value_counts()
    for i, v in n.items():
        if v > 1:
            multiple_pathway_genes.append(i)

    for g in multiple_pathway_genes:
        filtered_df = EntrezConversion_df.loc[
            EntrezConversion_df['GENE_SYMBOL'] == g]
        pathways = (filtered_df.GENE_SET.unique()).tolist()
        gene_ID = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          g, 'ENTREZ_GENE_ID'].iloc[0]

        for p in pathways:
            # Import the 'model_v4' matrix for the current gene
            current_pathway_model = pd.read_excel(
                './4_Data_Matrix_Construction/Model4/Gene_' + gene_ID + '_[' +
                g + ']_(' + p + ')-Model_v4.xlsx',
                sheetname='Sheet1',
                header=0)

            # Current gene set model
            current_pathway_other_genes = []
            current_pathway_other_RegulGenes_SYM = []
            current_pathway_new_columns = []

            # Get the genes of interest belonging to other gene sets
            for i, r in EntrezConversion_df.iterrows():
                path = r['GENE_SET']
                if (path != p):
                    sym = r['GENE_SYMBOL']
                    if sym != g:
                        current_pathway_other_genes.append(sym)

            # Get the list of regulatory genes for each one of the genes belonging to other gene sets
            for elem in current_pathway_other_genes:
                elem_regulatory_genes = dict_RegulGenes[elem]
                current_pathway_other_RegulGenes_SYM = current_pathway_other_RegulGenes_SYM + elem_regulatory_genes
            current_pathway_other_RegulGenes_SYM = list(
                set(current_pathway_other_RegulGenes_SYM)
            )  # keep only distinct regulatory genes

            # Identify the new columns to be added to the matrix
            current_pathway_old_columns = list(
                current_pathway_model.columns.values)
            for gene in current_pathway_other_RegulGenes_SYM:
                if gene not in current_pathway_old_columns:
                    current_pathway_new_columns.append(gene)

            # Create the new part of the model to add
            current_pathway_new_df = pd.DataFrame(
                index=aliquots, columns=current_pathway_new_columns)

            # Add the expression values for all the new regulatory genes and for each TCGA aliquot
            for index, row in current_pathway_new_df.iterrows():
                for column_name, values in current_pathway_new_df.iteritems():
                    expr = expr_regulatory_df.get_value(index, column_name)
                    current_pathway_new_df.set_value(index, column_name, expr)

            # Join the two dataframes and create the new model (model_v5)
            current_pathway_model_5_df = current_pathway_model.join(
                current_pathway_new_df)

            # Remove duplicate columns of the model gene
            current_pathway_matrix_cols = list(
                current_pathway_model_5_df.columns.values)
            if g in current_pathway_matrix_cols:
                current_pathway_model_5_df.drop(g, axis=1, inplace=True)

            writer = ExcelWriter('./4_Data_Matrix_Construction/Model5/Gene_' +
                                 gene_ID + '_[' + g + ']_(' + p +
                                 ')-Model_v5.xlsx')
            current_pathway_model_5_df.to_excel(writer, 'Sheet1')
            writer.save()

    return dict_model_v5
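# --- Added note (not part of the original snippet) --------------------------
# The function above relies on pandas APIs that were later removed:
# read_excel(sheetname=...) became sheet_name=..., DataFrame.get_value /
# set_value became the .at[] accessor, and iteritems() became items(). A
# minimal sketch of the inner fill loop rewritten with those replacements,
# assuming the same new_df / expr_regulatory_df frames:
def fill_expression(new_df, expr_regulatory_df):
    # Copy each regulatory-gene expression value into the matching cell.
    for index in new_df.index:
        for column_name in new_df.columns:
            new_df.at[index, column_name] = expr_regulatory_df.at[index, column_name]
    return new_df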
コード例 #19
0
ファイル: excel_io.py プロジェクト: lijiansong/lang
def extract_into_excel(layer_info_file='layer_info.txt',
                       layer_shape_file='layer_shape.txt',
                       debug=False):
    layer_info_reader = open(layer_info_file, 'rb')
    layer_info = pickle.load(layer_info_reader)
    net_name = layer_info['name']
    print(net_name)
    layer_shape_reader = open(layer_shape_file, 'rb')
    layer_shape = pickle.load(layer_shape_reader)

    layer_name_time_reader = open(str(net_name) + '-each_layer_time.txt', 'rb')
    layer_type_time_reader = open(str(net_name) + '-layer_type_time.txt', 'rb')

    layer_name_record_dict = pickle.load(layer_name_time_reader)
    layer_type_record_dict = pickle.load(layer_type_time_reader)

    flops_membwd_reader = open(str(net_name) + '-flops_membwd.txt', 'rb')
    flops_membwd_dict = pickle.load(flops_membwd_reader)

    layer_name_list = []
    layer_name_type_list = []
    layer_name_time_list = []
    layer_type_list = []
    layer_type_time_list = []
    flops_membwd_type_list = []
    flops_membwd_values_list = []

    max_bottoms_length = max_tops_length = 0
    # multiple input and output shape info
    layer_shape_list_dict = {}
    kernel_size_list = []
    stride_list = []
    pad_list = []

    try:
        for k, v in list(layer_shape.items()):
            if v['type'] == 'Input' or v['type'] == 'Accuracy':
                del layer_shape[k]
                continue
            max_bottoms_length = max(max_bottoms_length, len(v['bottoms']))
            max_tops_length = max(max_tops_length, len(v['tops']))
        # determine the input and output tuples length
        for i in range(0, max_bottoms_length):
            layer_shape_list_dict['Input' + str(i) + ' N'] = []
            layer_shape_list_dict['Input' + str(i) + ' C'] = []
            layer_shape_list_dict['Input' + str(i) + ' H'] = []
            layer_shape_list_dict['Input' + str(i) + ' W'] = []
        for i in range(0, max_tops_length):
            layer_shape_list_dict['Output' + str(i) + ' N'] = []
            layer_shape_list_dict['Output' + str(i) + ' C'] = []
            layer_shape_list_dict['Output' + str(i) + ' H'] = []
            layer_shape_list_dict['Output' + str(i) + ' W'] = []

        for k, v in layer_name_record_dict.items():
            layer_name_list.append(str(k))
            layer_name_type_list.append(str(layer_info[str(k)]))
            layer_name_time_list.append(float(v))
            # 'kernel_size', 'stride', 'pad'
            if layer_info[str(k)] == 'Convolution' or layer_info[str(
                    k)] == 'Pooling':
                kernel_size_list.append(int(
                    layer_shape[str(k)]['kernel_size']))
                stride_list.append(int(layer_shape[str(k)]['stride']))
                pad_list.append(int(layer_shape[str(k)]['pad']))
            else:
                kernel_size_list.append(float('nan'))
                stride_list.append(float('nan'))
                pad_list.append(float('nan'))
            # Input and output shape
            for i in range(0, len(layer_shape[str(k)]['bottoms'])):
                if debug:
                    print('max tops len:', max_tops_length, 'max bottoms len:',
                          max_bottoms_length)
                    print('layer:', str(k), 'type:',
                          layer_shape[str(k)]['type'], 'bottoms:',
                          layer_shape[str(k)]['bottoms'][i])
                if len(layer_shape[str(k)]['bottoms'][i]) == 1:
                    layer_shape_list_dict['Input' + str(i) + ' N'].append(
                        int(layer_shape[str(k)]['bottoms'][i][0]))
                    layer_shape_list_dict['Input' + str(i) + ' C'].append(
                        float('nan'))
                    layer_shape_list_dict['Input' + str(i) + ' H'].append(
                        float('nan'))
                    layer_shape_list_dict['Input' + str(i) + ' W'].append(
                        float('nan'))
                elif len(layer_shape[str(k)]['bottoms'][i]) == 2:
                    layer_shape_list_dict['Input' + str(i) + ' N'].append(
                        int(layer_shape[str(k)]['bottoms'][i][0]))
                    layer_shape_list_dict['Input' + str(i) + ' C'].append(
                        int(layer_shape[str(k)]['bottoms'][i][1]))
                    layer_shape_list_dict['Input' + str(i) + ' H'].append(
                        float('nan'))
                    layer_shape_list_dict['Input' + str(i) + ' W'].append(
                        float('nan'))
                elif len(layer_shape[str(k)]['bottoms'][i]) == 3:
                    layer_shape_list_dict['Input' + str(i) + ' N'].append(
                        int(layer_shape[str(k)]['bottoms'][i][0]))
                    layer_shape_list_dict['Input' + str(i) + ' C'].append(
                        int(layer_shape[str(k)]['bottoms'][i][1]))
                    layer_shape_list_dict['Input' + str(i) + ' H'].append(
                        int(layer_shape[str(k)]['bottoms'][i][2]))
                    layer_shape_list_dict['Input' + str(i) + ' W'].append(
                        float('nan'))
                elif len(layer_shape[str(k)]['bottoms'][i]) == 4:
                    layer_shape_list_dict['Input' + str(i) + ' N'].append(
                        int(layer_shape[str(k)]['bottoms'][i][0]))
                    layer_shape_list_dict['Input' + str(i) + ' C'].append(
                        int(layer_shape[str(k)]['bottoms'][i][1]))
                    layer_shape_list_dict['Input' + str(i) + ' H'].append(
                        int(layer_shape[str(k)]['bottoms'][i][2]))
                    layer_shape_list_dict['Input' + str(i) + ' W'].append(
                        int(layer_shape[str(k)]['bottoms'][i][3]))
            for i in range(len(layer_shape[str(k)]['bottoms']),
                           max_bottoms_length):
                layer_shape_list_dict['Input' + str(i) + ' N'].append(
                    float('nan'))
                layer_shape_list_dict['Input' + str(i) + ' C'].append(
                    float('nan'))
                layer_shape_list_dict['Input' + str(i) + ' H'].append(
                    float('nan'))
                layer_shape_list_dict['Input' + str(i) + ' W'].append(
                    float('nan'))

            for i in range(0, len(layer_shape[str(k)]['tops'])):
                if len(layer_shape[str(k)]['tops'][i]) == 1:
                    layer_shape_list_dict['Output' + str(i) + ' N'].append(
                        int(layer_shape[str(k)]['tops'][i][0]))
                    layer_shape_list_dict['Output' + str(i) + ' C'].append(
                        float('nan'))
                    layer_shape_list_dict['Output' + str(i) + ' H'].append(
                        float('nan'))
                    layer_shape_list_dict['Output' + str(i) + ' W'].append(
                        float('nan'))
                elif len(layer_shape[str(k)]['tops'][i]) == 2:
                    layer_shape_list_dict['Output' + str(i) + ' N'].append(
                        int(layer_shape[str(k)]['tops'][i][0]))
                    layer_shape_list_dict['Output' + str(i) + ' C'].append(
                        int(layer_shape[str(k)]['tops'][i][1]))
                    layer_shape_list_dict['Output' + str(i) + ' H'].append(
                        float('nan'))
                    layer_shape_list_dict['Output' + str(i) + ' W'].append(
                        float('nan'))
                elif len(layer_shape[str(k)]['tops'][i]) == 3:
                    layer_shape_list_dict['Output' + str(i) + ' N'].append(
                        int(layer_shape[str(k)]['tops'][i][0]))
                    layer_shape_list_dict['Output' + str(i) + ' C'].append(
                        int(layer_shape[str(k)]['tops'][i][1]))
                    layer_shape_list_dict['Output' + str(i) + ' H'].append(
                        int(layer_shape[str(k)]['tops'][i][2]))
                    layer_shape_list_dict['Output' + str(i) + ' W'].append(
                        float('nan'))
                elif len(layer_shape[str(k)]['tops'][i]) == 4:
                    layer_shape_list_dict['Output' + str(i) + ' N'].append(
                        int(layer_shape[str(k)]['tops'][i][0]))
                    layer_shape_list_dict['Output' + str(i) + ' C'].append(
                        int(layer_shape[str(k)]['tops'][i][1]))
                    layer_shape_list_dict['Output' + str(i) + ' H'].append(
                        int(layer_shape[str(k)]['tops'][i][2]))
                    layer_shape_list_dict['Output' + str(i) + ' W'].append(
                        int(layer_shape[str(k)]['tops'][i][3]))
            for i in range(len(layer_shape[str(k)]['tops']), max_tops_length):
                if debug:
                    print('max tops len:', max_tops_length, 'max bottoms len:',
                          max_bottoms_length)
                    print('layer:', str(k), 'type:',
                          layer_shape[str(k)]['type'], 'tops:',
                          layer_shape[str(k)]['tops'])
                layer_shape_list_dict['Output' + str(i) + ' N'].append(
                    float('nan'))
                layer_shape_list_dict['Output' + str(i) + ' C'].append(
                    float('nan'))
                layer_shape_list_dict['Output' + str(i) + ' H'].append(
                    float('nan'))
                layer_shape_list_dict['Output' + str(i) + ' W'].append(
                    float('nan'))

        for k, v in layer_type_record_dict.items():
            layer_type_list.append(str(k))
            layer_type_time_list.append(float(v))

        for k, v in flops_membwd_dict.items():
            flops_membwd_type_list.append(str(k))
            flops_membwd_values_list.append(float(v))

    finally:
        layer_info_reader.close()
        layer_shape_reader.close()
        layer_name_time_reader.close()
        layer_type_time_reader.close()
        flops_membwd_reader.close()

    assert len(layer_name_list) == len(layer_name_time_list) and \
            len(layer_name_time_list) == len(kernel_size_list) and \
            len(kernel_size_list) == len(stride_list) and \
            len(stride_list) == len(pad_list) and \
            len(layer_type_list) == len(layer_type_time_list) and \
            len(flops_membwd_type_list) == len(flops_membwd_values_list), \
            " Error! Must have same records length!"

    # calculate flops and memory accessing bytes
    ops_list = []
    mem_bytes_list = []
    for layer_name in layer_name_list:
        flops, mem_bytes = calculate_mlu_ops_byte(layer_name, layer_shape)
        ops_list.append(flops)
        mem_bytes_list.append(mem_bytes)

    gflops_list = []
    intensity_list = []
    total_model_ops = 0.0
    total_model_mem_bytes = 0.0
    for i, exe_time in enumerate(layer_name_time_list):
        gflops_list.append(ops_list[i] / 1e9 / (exe_time / 1e3))
        intensity_list.append(float(ops_list[i] / mem_bytes_list[i]))
        total_model_ops += ops_list[i]
        total_model_mem_bytes += mem_bytes_list[i]

    avg_model_intensity = float(total_model_ops / total_model_mem_bytes)
    total_model_time = 0
    for time in layer_type_time_list:
        total_model_time += time
    avg_model_gflops = total_model_ops / 1e9 / (total_model_time / 1e3)
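    # --- Added note (not part of the original snippet) ----------------------
    # Units implied by the formulas above: `ops` are raw FLOP counts and the
    # recorded times are in milliseconds, so GFLOPS = ops / 1e9 / (time_ms / 1e3),
    # and the arithmetic intensity is FLOPs per byte moved (ops / mem_bytes).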

    # for sheet4 columns
    value_list = [
        total_model_ops, total_model_mem_bytes, total_model_time,
        avg_model_gflops, avg_model_intensity
    ]
    name_list = [
        'model ops', 'model bytes', 'model time(ms)', 'model GFLOPS',
        'model intensity'
    ]

    sheet1_od = collections.OrderedDict()
    sheet1_od['layer name'] = layer_name_list
    sheet1_od['layer type'] = layer_name_type_list
    sheet1_od['time(ms)'] = layer_name_time_list
    sheet1_od['Ops'] = ops_list
    sheet1_od['Bytes'] = mem_bytes_list
    sheet1_od['GFLOPS'] = gflops_list
    sheet1_od['Intensity'] = intensity_list
    for i in range(0, max_bottoms_length):
        sheet1_od['Input' + str(i) +
                  ' N'] = layer_shape_list_dict['Input' + str(i) + ' N']
        sheet1_od['Input' + str(i) +
                  ' C'] = layer_shape_list_dict['Input' + str(i) + ' C']
        sheet1_od['Input' + str(i) +
                  ' H'] = layer_shape_list_dict['Input' + str(i) + ' H']
        sheet1_od['Input' + str(i) +
                  ' W'] = layer_shape_list_dict['Input' + str(i) + ' W']
    sheet1_od['kernel size'] = kernel_size_list
    sheet1_od['stride'] = stride_list
    sheet1_od['pad'] = pad_list
    for i in range(0, max_tops_length):
        sheet1_od['Output' + str(i) +
                  ' N'] = layer_shape_list_dict['Output' + str(i) + ' N']
        sheet1_od['Output' + str(i) +
                  ' C'] = layer_shape_list_dict['Output' + str(i) + ' C']
        sheet1_od['Output' + str(i) +
                  ' H'] = layer_shape_list_dict['Output' + str(i) + ' H']
        sheet1_od['Output' + str(i) +
                  ' W'] = layer_shape_list_dict['Output' + str(i) + ' W']
    sheet1_df = pd.DataFrame(sheet1_od)

    sheet2_od = collections.OrderedDict()
    sheet2_od['layer type'] = layer_type_list
    sheet2_od['time(ms)'] = layer_type_time_list
    sheet2_df = pd.DataFrame(sheet2_od)

    sheet3_od = collections.OrderedDict()
    sheet3_od['type'] = flops_membwd_type_list
    sheet3_od['values'] = flops_membwd_values_list
    sheet3_df = pd.DataFrame(sheet3_od)

    sheet4_od = collections.OrderedDict()
    sheet4_od['name'] = name_list
    sheet4_od['values'] = value_list
    sheet4_df = pd.DataFrame(sheet4_od)

    excel_file_name = str(net_name) + '.xlsx'
    writer = ExcelWriter(excel_file_name)
    sheet1_df.to_excel(writer, 'Sheet1', index=False)
    sheet2_df.to_excel(writer, 'Sheet2', index=False)
    sheet3_df.to_excel(writer, 'Sheet3', index=False)
    sheet4_df.to_excel(writer, 'Sheet4', index=False)
    writer.save()
コード例 #20
0
def create_m1():
    """
	The CREATE_M1 operation builds the first data matrix for each gene of interest, collecting the gene's expression and methylation values along with the expression values of all the genes in the same gene set. One data matrix per target gene is created and exported locally as an Excel file; the whole set of M1 matrices is also returned as a Python dictionary (dict_model_v1.p), where each target gene (key) is associated with a Pandas dataframe containing its M1 data (value).

	:return: a Python dictionary
	
	Example::
	
		import genereg as gr
		m1_dict = gr.DataMatrixes.create_m1()
	"""

    # Load input data:

    # Genes of interest
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheetname='Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    # Methylation values for genes of interest
    methyl_df = pd.read_excel(
        './3_TCGA_Data/Methylation/Methylation_Values.xlsx',
        sheetname='Sheet1',
        header=0)

    # Gene expression values for genes of interest
    expr_interest_df = pd.read_excel(
        './3_TCGA_Data/Gene_Expression/Gene_Expression-InterestGenes.xlsx',
        sheetname='Sheet1',
        header=0)

    # Create a list containing the Gene Symbols of the genes of interest
    gene_interest_SYMs = []
    for i, r in EntrezConversion_df.iterrows():
        sym = r['GENE_SYMBOL']
        if sym not in gene_interest_SYMs:
            gene_interest_SYMs.append(sym)

    # Get the TCGA aliquots
    aliquots = []
    for i, r in methyl_df.iterrows():
        if i != 'ENTREZ_GENE_ID':
            aliquots.append(i)

    # Create a dictionary where, for each gene of interest set as key (the model gene), we have a dataframe representing the model (matrix of data) of that gene.
    # This model contains the expression and methylation values of the model gene in the first and second columns, and the expression of all the genes that belong to the
    # model gene set in the other columns, while the different TCGA aliquots are the indexes of the rows.
    dict_model_v1 = {}

    # Define the variables we need for the computation
    model_gene_pathways = []  # list of the gene sets the model gene belongs to
    same_pathway_genes = [
    ]  # list of the symbols of the genes belonging to the same gene sets as the model gene
    df_columns = []  # list of the model columns names

    # Execute the following code for each gene of interest
    for gene in gene_interest_SYMs:

        model_gene_SYM = gene  # get the Gene Symbol of the current gene

        # Get the gene sets of the model gene
        for i, r in EntrezConversion_df.iterrows():
            sym = r['GENE_SYMBOL']
            if sym == model_gene_SYM:
                p = r['GENE_SET']
                model_gene_pathways.append(p)

        # Get the genes of interest belonging to the model gene set
        for i, r in EntrezConversion_df.iterrows():
            path = r['GENE_SET']
            if path in model_gene_pathways:
                symbol = r['GENE_SYMBOL']
                if symbol != model_gene_SYM:
                    same_pathway_genes.append(symbol)

        # Define the columns of the model gene matrix of data
        df_columns.append(
            'EXPRESSION (' + model_gene_SYM +
            ')')  # the second column contains the expression of the model gene
        df_columns.append(
            'METHYLATION (' + model_gene_SYM +
            ')')  # the third column contains the methylation of the model gene
        for g in same_pathway_genes:
            df_columns.append(
                g
            )  # we have a column for each gene in the same gene set of the model gene

        # In correspondence of the model gene key in the dictionary,
        # set its model as value, with the proper indexes and column names
        dict_model_v1[model_gene_SYM] = pd.DataFrame(index=aliquots,
                                                     columns=df_columns)

        # Reset the variables for the next iteration on the next gene of interest
        model_gene_pathways = []
        same_pathway_genes = []
        df_columns = []

    # Fill the models for each gene of interest
    for gene, matrix in dict_model_v1.items():

        first_col = 'EXPRESSION (' + gene + ')'
        second_col = 'METHYLATION (' + gene + ')'

        # Add the expression and methylation values of each model gene and for each TCGA aliquot
        for index, row in matrix.iterrows():
            model_expr = expr_interest_df.get_value(index,
                                                    gene)  # get the expression
            model_methyl = methyl_df.get_value(
                index, gene)  # get the methylation value
            # set the two values in the correct cell of the matrix
            matrix.set_value(index, first_col, model_expr)
            matrix.set_value(index, second_col, model_methyl)

        # Add the expression values for all the other genes belonging to the same gene set of the model gene
        for index, row in matrix.iterrows():
            for column_name, values in matrix.iteritems(
            ):  # iterate along the columns of the dataframe
                # skip the first two columns and add the proper values
                if (column_name != first_col) and (column_name != second_col):
                    expr = expr_interest_df.get_value(index, column_name)
                    matrix.set_value(index, column_name, expr)

    # Export the dictionary into a pickle file in order to be able to import it back and use it to progressively build the next models for the genes of interest, adding further information.
    pickle.dump(
        dict_model_v1,
        open('./4_Data_Matrix_Construction/Model1/dict_model_v1.p', 'wb'))

    # Export the models as .xlsx files
    for gene in gene_interest_SYMs:

        model_gene_SYM = gene
        pathway = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          model_gene_SYM, 'GENE_SET'].iloc[0]
        gene_ID = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          model_gene_SYM,
                                          'ENTREZ_GENE_ID'].iloc[0]
        file_name = 'Gene_' + gene_ID + '_[' + model_gene_SYM + ']' + '_(' + pathway + ')-Model_v1.xlsx'

        writer = ExcelWriter('./4_Data_Matrix_Construction/Model1/' +
                             file_name)
        output_df = dict_model_v1[model_gene_SYM]
        output_df.to_excel(writer, 'Sheet1')
        writer.save()

    # Handle genes belonging to multiple gene sets
    multiple_pathway_genes = []
    n = EntrezConversion_df['GENE_SYMBOL'].value_counts()
    for i, v in n.items():
        if v > 1:
            multiple_pathway_genes.append(i)

    for g in multiple_pathway_genes:
        filtered_df = EntrezConversion_df.loc[
            EntrezConversion_df['GENE_SYMBOL'] == g]
        pathways = (filtered_df.GENE_SET.unique()).tolist()
        gene_ID = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                          g, 'ENTREZ_GENE_ID'].iloc[0]

        for p in pathways:
            current_pathway_model = dict_model_v1[g].copy()

            # Extract the genes of interest in the current gene set
            current_pathway_genes = []
            for i, r in EntrezConversion_df.iterrows():
                sym = r['GENE_SYMBOL']
                path = r['GENE_SET']
                if path == p:
                    current_pathway_genes.append(sym)

            # Extract list of columns in the full model
            all_columns = []
            for column_name, values in current_pathway_model.iteritems():
                if (column_name != 'EXPRESSION (' + g + ')') and (
                        column_name != 'METHYLATION (' + g + ')'):
                    all_columns.append(column_name)

            # Extract the columns to remove from the model
            other_pathway_genes = list(
                set(all_columns) - set(current_pathway_genes))

            for i in other_pathway_genes:
                if (i != g):
                    current_pathway_model.drop(i, axis=1, inplace=True)

            writer = ExcelWriter('./4_Data_Matrix_Construction/Model1/Gene_' +
                                 gene_ID + '_[' + g + ']_(' + p +
                                 ')-Model_v1.xlsx')
            current_pathway_model.to_excel(writer, 'Sheet1')
            writer.save()

    return dict_model_v1
コード例 #21
0
 def UpdateExcel(self):
     df1 = self.getArticles()
     writer = ExcelWriter(destFolder + '\\ElConfidencial.xlsx')
     df1 = df1.drop_duplicates(subset='title', keep='first')
     df1.to_excel(writer, sheet_name='ElConfidencial', index = False)
     writer.save()
コード例 #22
0
# Clean the tweets column-wise: assigning to `row` inside iterrows() only
# modifies a copy, so the byte-string prefix strip and the timestamp parsing
# are applied directly to the DataFrame columns instead.
tweets["Tweet_Content"] = tweets["Tweet_Content"].str.strip("b'")
tweets["Timestamp"] = pd.to_datetime(tweets["Timestamp"]).dt.date
SP500 = SP500.reindex(index=SP500.index[::-1])
# Set the column names of the tweets DataFrame
tweets.columns = ["Twitter_ID", "Tweet_ID", "Date", "Tweet_content"]
# changing the type of SP500 Date column to datetime
SP500 = SP500.astype({"Date": "datetime64"})
# converting the Date column of tweets dataframe to datetime
tweets = tweets.astype({"Date": "datetime64"})

#Writing the dataframe into Excel
output_filename = 'cleaned_twitter.xlsx'
writer = ExcelWriter(output_filename)
tweets.to_excel(writer)
writer.save()

#---MERGING---
# dataframe that stores merged data of S&P500 and tweets
# merging on common column Date
tweets_stock = tweets.merge(SP500, how='left', on='Date')
tweets_stock = tweets_stock[pd.notnull(tweets_stock["Open"])]
tweets_stock.reset_index(drop=True)

#---CLEANING AND MERGING AIRLINE DATA (with Twitter Data)---
#Southwest_Airlines
# reading the data scraped from southwest file
airline_filename = 'Southwest.xlsx'
southwest_airlines = pd.read_excel(airline_filename)
コード例 #23
0
ファイル: meBot.py プロジェクト: rabdulkader/meBOT-Python
    #print (counter_b)
    counter_b +=1
    prog_b1 += '#'
    per_b=(counter_b/16)*100
    print('B1_Config Downloading ',str(stat[0]) + '[',prog_b1,'] ',str(round(per_b,0)) + '%', end='\r')
    
    if counter_b == 16:
        print('B1_Config Downloading ',str(stat[1]) + '[',prog_b1,'] ',str(round(per_b,0)) + '%', end='')
        break
        
print('\n Building 1 Download Complete')
    
        
print('################## WRITING ##################')
    
with ExcelWriter('test.xlsx') as writer:
    for x in st_erdc:
        
        df_erdc[counter_c]=pd.read_csv(*PATH-CONFIDENTIAL*+str(st_erdc[counter_c]) +'.csv')
        
        df_erdc[counter_c].to_excel(writer, sheet_name=str(st_erdc[counter_c]))
        
        #print(counter_c)
        counter_c +=1
        if counter_c==14:
                break
                
    print('ERDC Writing Complete')

    for x in st_b1:
コード例 #24
0
                tagData.append(process_url(url, driver))
            except Exception as e1:
                print(e1)
                continue
        counter += 1
        if (counter % 1 == 0):
            print('Done fetching tags for', counter, 'blogs')
    driver.close()
    return tagData


start = time.time()

tags = get_tags(urls[28000:29000])

done = time.time()
elapsed = done - start
print('Total Time Taken', elapsed, 'seconds')

tags_df = pd.DataFrame(tags)
tags_df.columns = ['url', 'tags']
tags_df

df = df.drop_duplicates(subset='url')
final = pd.merge(df, tags_df, on='url')
final

writer = ExcelWriter('TC_final29.xlsx')
final.to_excel(writer, 'Sheet1')
writer.save()
コード例 #25
0
                                              ignore_index=True)

# Rename the files in the folder whose names match the file names listed in the Excel data
for i, f in enumerate(os.listdir(FolderPathToCheck)):
    src = os.path.join(FolderPathToCheck, f)
    for idx, row in excel_data.iterrows():
        if f == excel_data.loc[idx, 'FileName']:
            dst = os.path.join(FolderPathToCheck,
                               excel_data.loc[idx, 'FileRenamed'])
            os.rename(src, dst)

# Split the "File_Path_Name" column into seperate columns as File Name, File Path and File extention

for idx, row in filesInFolder_data.iterrows():
    filesInFolder_data.loc[idx, 'File_Path'], filesInFolder_data.loc[
        idx,
        'File_Name'] = os.path.split(filesInFolder_data.loc[idx,
                                                            'File_Path_Name'])
    filesInFolder_data.loc[idx, 'File_Extention'] = Path(
        filesInFolder_data.loc[idx, 'File_Path_Name']).suffix
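# --- Added note (not part of the original snippet) --------------------------
# The same split can be done without an explicit loop by applying pathlib to
# the column; a minimal sketch, keeping the original column names:
paths = filesInFolder_data['File_Path_Name'].apply(Path)
filesInFolder_data['File_Path'] = paths.apply(lambda p: str(p.parent))
filesInFolder_data['File_Name'] = paths.apply(lambda p: p.name)
filesInFolder_data['File_Extention'] = paths.apply(lambda p: p.suffix)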

# Write the details of the files in the folder to an Excel sheet.

writer = ExcelWriter(FolderPathToSaveResult + "/All_File_Header_Data.xlsx")
filesInFolder_data.to_excel(writer, 'Sheet1', index=False)
writer.save()

# Write the updated Excel data to a new Excel file.
writer = ExcelWriter(FolderPathToSaveResult + "/excel_data_new.xlsx")
excel_data.to_excel(writer, 'Sheet1', index=False)
writer.save()
コード例 #26
0
            if i == j:
                final[i] = {}
                for idItemA in android_final[i]:
                    final[i][idItemA] = android_final[i][idItemA]

                for idItemB in ios_final[j]:
                    final[i][idItemB] = ios_final[j][idItemB]

    return final


final = mergDictionaries(android_final, ios_final)

from pandas import ExcelWriter
df = pandas.DataFrame.from_dict(final, orient='index')
df.to_csv('data.csv',
          header=True,
          index=True,
          index_label='datetime',
          encoding='utf-8')
writer = ExcelWriter('data.xlsx')
df.to_excel(writer,
            sheet_name='PokemonGoAnalysisData',
            header=True,
            index=True,
            index_label='datetime')
writer.save()

#dumping the dictionary in the json file
json.dump(final, outfile)
コード例 #27
0
def ExcelHandler(FileName):
    ExcelInput = read_excel(FileName, converters={'Date': dateFix})
    with ExcelWriter('Fixed_File.xlsx', engine='xlsxwriter') as writer:
        ExcelInput.to_excel(writer)
コード例 #28
0
    def get_woe_iv(self, bins=5):
        bin_interval = 1.0 / bins
        x = self.x
        y = self.y
        event = self.event
        y_column = y.name

        xy_concat = pd.concat([x, y], axis=1).copy()
        list_q = np.arange(0, 1, bin_interval)

        xy_concat.loc[xy_concat[y_column] == event, y_column] = 1
        xy_concat.loc[xy_concat[y_column] <> event, y_column] = 0
        total_count = xy_concat.shape[0]
        total_pos_count = xy_concat[y_column].sum()
        total_neg_count = total_count - total_pos_count
        pos_rt = (total_pos_count + 0.0) / total_count

        columns_bin_dict = self.columns_bin_dict
        woe_t = pd.DataFrame()
        for i in x.columns:
            nunique_i = x[i].nunique()
            type_i = str(x[i].dtype)

            if nunique_i <= 1:
                print i, " : 0 or 1 level, no woe/iv calculated."
                columns_bin_dict[i] = []
                continue
            elif nunique_i <= 10:
                #value_counts
                xy_concat[i] = xy_concat[i].fillna("nan")
                df_temp = xy_concat.groupby(i)[y_column].agg([pd.Series.sum,pd.Series.count]).reset_index()\
                            .rename(columns = {"sum":"pos_count","count":"cat_total_count",i:"var_cat"})
                columns_bin_dict[i] = []
            else:
                if type_i == "object":
                    print i, ": too many values for discrete variables."
                    continue
                else:
                    var_name = i
                    var_name_bin = str(i) + "_bin"
                    if i not in columns_bin_dict.keys():
                        list_q2 = list(xy_concat[i].dropna().quantile(
                            list_q).unique()) + [np.inf]
                        list_q2[0] = -np.inf
                        if len(list_q2) == 2:
                            list_q2 = list(pd.Series(xy_concat[i].dropna().unique()) \
                                .quantile(list_q)) + [np.inf]
                            list_q2[0] = -np.inf
                    else:
                        list_q2 = columns_bin_dict[i]
                    xy_concat[var_name_bin] = pd.cut(x[var_name],
                                                     list_q2).astype("string")
                    df_temp = xy_concat.groupby(var_name_bin)[y_column].agg([pd.Series.sum,pd.Series.count]).reset_index()\
                            .rename(columns = {"sum":"pos_count","count":"cat_total_count",var_name_bin:"var_cat"})

                    columns_bin_dict[i] = list_q2

            df_temp["var_name"] = i
            df_temp["neg_count"] = df_temp["cat_total_count"] - df_temp[
                "pos_count"]
            df_temp["p_ni"] = df_temp["neg_count"] / total_neg_count
            df_temp["p_yi"] = df_temp["pos_count"] / total_pos_count
            df_temp["woe"] = np.log(df_temp["pos_count"]/total_pos_count / \
                            df_temp["p_ni"])
            woe_t = woe_t.append(df_temp)
        woe_t["odds"] = woe_t["pos_count"] / woe_t["neg_count"]
        woe_t.loc[woe_t["woe"] >= self.max_woe, "woe"] = self.max_woe
        woe_t.loc[woe_t["woe"] <= self.min_woe, "woe"] = self.min_woe
        woe_t["iv_i"] = (woe_t["p_yi"] - woe_t["p_ni"]) * woe_t["woe"]
        woe_t["p_y_total"] = pos_rt
        woe_show_cols = [
            'var_name', 'var_cat', 'cat_total_count', 'pos_count', 'neg_count',
            'p_ni', "p_y_total", "odds", 'p_yi', 'woe', 'iv_i'
        ]
        woe_t = woe_t[woe_show_cols]
        woe_t["var_cat"] = woe_t["var_cat"].astype("string")
        iv_t = woe_t.groupby("var_name")["iv_i"].sum().reset_index().rename(
            columns={"iv_i": "iv"})
        filePath = self.workpath + "woe_t.xlsx"
        excel_writer = ExcelWriter(filePath)
        woe_t.to_excel(excel_writer, sheet_name="woe", index=False)
        iv_t.to_excel(excel_writer, sheet_name="iv", index=False)
        excel_writer.save()

        self.woe_t = woe_t
        self.iv_t = iv_t
        print(filePath, " generated;\n <name>.woe_t, <name>.iv_t available")
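        # --- Added note (not part of the original snippet) ------------------
        # For reference, the quantities computed above follow the usual
        # weight-of-evidence definitions: for each bin i,
        #     WOE_i = ln( (pos_i / total_pos) / (neg_i / total_neg) )
        #     IV    = sum_i (pos_i / total_pos - neg_i / total_neg) * WOE_i
        # which is what the "woe" and "iv_i" columns hold, with the WOE values
        # clipped to [min_woe, max_woe] before the IV sum.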
コード例 #29
0
#!/usr/bin/env python
# -*- coding=utf-8 -*-
__author__ = "柯博文老師 Powen Ko, www.powenko.com"
import pandas as pd
#data = pd.read_csv('ExpensesRecord.csv')
df = pd.read_excel('ExpensesRecord.xls', 'sheet')
#data = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html')
print(df.head(5))

from pandas import ExcelWriter
writer = ExcelWriter('test.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='sheet2')
writer.save()
コード例 #30
0
def save_xls(list_dfs, xls_path):
    writer = ExcelWriter(xls_path)
    for n, df in enumerate(list_dfs):
        sheet_name = str(classifiers[n])
        df.to_excel(writer, sheet_name)
    writer.save()