Example #1
conn_oracle.close()

#Wrangle data into an Access acceptable form using numpy and pandas
res = [list(elem) for elem in res]
res = np.array(res)

column_names = []
for i in range(len(cursor.description)):
    column_names.append(cursor.description[i][0])

res = pd.DataFrame(res, columns=column_names)
'''The shortcut to the entire data wrangling process is:
   res = pd.read_sql_query(sql, conn_oracle)'''
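
# A note on the shortcut above: it has to run *before* the earlier
# conn_oracle.close() call, since read_sql_query needs the open connection.
# A sketch:
#     res = pd.read_sql_query(sql, conn_oracle)
#     conn_oracle.close()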

#write the dataframe to MS Excel
writer = pd.ExcelWriter("H:\oracle_data.xlsx", engine="xlsxwriter")
res.to_excel(writer, sheet_name="Oracle", index=False)
writer.save()

#write the dataframe to sqlite database
conn_sqlit = sqlite3.connect(r"H:\testdata.db")
cursor_sq = conn_sqlit.cursor()

res.to_sql("First_table", conn_sqlit, if_exists='replace', index=False)

data = pd.read_sql_query("Select * from First_table", conn_sqlit)

print(data)

conn_sqlit.close()
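
# Note: in pandas >= 2.0, ExcelWriter.save() was removed; the context-manager
# form below is the usual replacement (a sketch reusing `res` from above, with
# a hypothetical output path):
with pd.ExcelWriter(r"H:\oracle_data_v2.xlsx", engine="xlsxwriter") as writer:
    res.to_excel(writer, sheet_name="Oracle", index=False)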
Example #2
    beg = datetime.now()

    order_list = []
    workpieces_in_time(8 * 60 * 60)
    data = np.array(data)
    print('工件完成数', len(data))  # number of completed workpieces
    print(data[:, 0])

    if err_rate > 0:
        # ATTENTION: if this block raises an error, it means there were no machine failures during the run
        is_err_str = 'err'
        data_err_df = pd.DataFrame(err_data)
        data_err_df.columns = ['故障cnc编号', '故障开始时间', '故障结束时间']  # faulty CNC id, fault start time, fault end time
        data_err_df.index += 1
        # create an ExcelWriter and write the pd.DataFrame to excel
        writer = pd.ExcelWriter('Save_Excel_case1_' + is_err_str + '_how.xlsx')
        data_err_df.to_excel(writer, 'page_1', float_format='%.5f')  # float_format controls the output precision
        writer.save()
    else:
        is_err_str = 'no_err'

    en = datetime.now()

    # generate the Excel sheet
    # note the group number
    data_df = pd.DataFrame(data)
    # change the index and column name
    data_df.columns = ['加工编号', '上料时间', '下料时间']  # job id, loading time, unloading time
    data_df.index += 1
    # create an ExcelWriter and write the pd.DataFrame to excel
    writer = pd.ExcelWriter('Save_Excel_case1_' + is_err_str + '.xlsx')
Example #3
def main():

    # import xls

    # case logic
    '''
    1	ID
    2	GeoMarket
    3	Country
    4	Region
    5	Product Line
    6	IncidentType
    7	FormStatus
    8	Description
    9	IncidentDate
    10	EmploymentType
    11	InjuryNature
    12	RiskRanking
    13	RiskRating
    14	Root Cause(5 Why's)
    15	Created By
    16	FormSubmittedBy
    17	QHSE Report Workflow
    18	InjuryLocation
    19	InjuryNatureMechanism
    20	Primary Root Cause
    21	NonProductiveTime
    22	Test XML
    23	PINType
    24	Cost of Poor Quality (USD)
    25	Job Number
    26	Item Type
    27	Path
    '''

    # import xls file
    data = pd.read_excel(
        r'C:\Users\stacy\My WrWx\00_projects\reservoirGroup\Adam\Oil and Gas PIN System Summary Dashboard.xlsx',
        sheet_name='PIN Data')
    print(data)  # print a summary table of the xlsx contents
    print('Col Headers:\n', data.columns)  # print a list of the headers
    print(data['Region'])  # print all rows within a column as a list

    # iterate over the region list from above using a loop
    for i in data.index:
        print(data['Region'][i])

    # take entire columns from the sheet and put into lists
    region = data['Region']
    company = data['Company']
    raisedBy = data['RaisedBy']

    nu_region = []
    nu_company = []
    nu_raisedBy = []

    i = 0
    for item in region:
        if i == 10:
            nu_region.append('Bananas')
            nu_company.append('Apples')
            nu_raisedBy.append('Oranges')
        else:
            nu_region.append(region[i])
            nu_company.append(company[i])
            nu_raisedBy.append(raisedBy[i])
        i += 1

    # show a subtable of the imported excel file
    df = pd.DataFrame(data, columns=['PinID', 'Risk', 'Region', 'Company'])
    print(df)

    # pandas output
    pandas_file = 'C:/Users/stacy/My WrWx/00_projects/reservoirGroup/Adam/pandas_test.xlsx'
    pandas_file_2 = 'C:/Users/stacy/My WrWx/00_projects/reservoirGroup/Adam/pandas_test_apples.xlsx'
    pandas_file_3 = 'C:/Users/stacy/My WrWx/00_projects/reservoirGroup/Adam/historical_pin_hse.xlsx'

    writer = pd.ExcelWriter(pandas_file)
    df.to_excel(writer, 'PIN_Data', index=False)
    writer.save()

    df2 = pd.DataFrame({
        'Region': nu_region,
        'Company': nu_company,
        'RaisedBy': nu_raisedBy
    })
    writer2 = pd.ExcelWriter(pandas_file_2)
    df2.to_excel(writer2, 'Historical PIN and HSE', index=False)
    writer2.save()

    print('Done.')
Example #4
    value = df.loc[i][3]
    if value[-1] != ',':
        value = value + ','
    value = value.replace(',,', ',')
    value = value.replace(',', ',')  # NOTE: a no-op as written; probably meant to normalize a different comma character (e.g. '，') to ','
    value = value.replace('\n', '')
    text = str(value).split(',')
    # print(text)
    valuelist = text[2: -4]
    # print(valuelist)
    pointlist = []
    x = 0
    while x < len(valuelist):
        pointlist.append(valuelist[x+1] + '#' + valuelist[x+2])
        x += 3
    # print(pointlist)
    df.loc[i, '坐标1'] = str(pointlist).replace('[', '').replace('\'', '').replace(']', '')

del df['坐标']
df2 = df.drop('坐标1', axis=1).join(df['坐标1'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('坐标2'))
# print(df2)
df2['经度'], df2['纬度'] = df2['坐标2'].str.split('#', 1).str  # split the coordinate into 经度 (longitude) and 纬度 (latitude)
del df2['坐标2']
print(df2)

outputroute = 'F:\\测试数据\\酉阳\\cxy探矿权处理\\test.xls'
writer = pd.ExcelWriter(outputroute)
df2.to_excel(writer, float_format='%.5f')
writer.save()

Example #5
def merge_converter(filename):
    df = pandas.read_excel(filename)
    #df.drop_duplicates(subset=['one'], keep=False, inplace=True)
    #df.loc['Total'] = pandas.Series(df.sum())
    df.insert(7, 'seven', '')
    # PERCENTAGE CALCULATIONS
    df['two_percentage'] = df['two'].apply(lambda a:
                                           (a / df['two'].sum()) * 100)
    df['five_percentage'] = df['five'].apply(lambda a:
                                             (a / df['five'].sum()) * 100)
    df['three_percentage'] = df['three'].apply(lambda a:
                                               (a / df['three'].sum()) * 100)

    df['six_percentage'] = df['six'].apply(lambda a:
                                           (a / df['six'].sum()) * 100)

    df.insert(12, 'twelve', '')

    # DIFFERENCE

    df['two_p_diff'] = df.two_percentage.diff()
    df['five_p_diff'] = df.five_percentage.diff()

    df.insert(15, 'fifteen', '')
    # DIFFERENCE ENDS

    # SAME SAME

    df['two_p_same'] = df.apply(
        lambda x: x['two_p_diff']
        if x['two_p_diff'] * x['five_p_diff'] > 0 else np.NaN,
        axis=1)
    df['five_p_same'] = df.apply(
        lambda x: x['five_p_diff']
        if x['five_p_diff'] * x['two_p_diff'] > 0 else np.NaN,
        axis=1)

    #Changing p_diff to p_same and deleting  same

    df['two_p_diff'] = df['two_p_same']
    df['five_p_diff'] = df['five_p_same']

    del df['two_p_same']
    del df['five_p_same']

    ## CHECK IF ALL FOUR COLUMNS HAVE DATA IN THEM, OTHERWISE ENTER NULL DATA

    #GET VALUE OF ABOVE ROW FOR TWO PERCENTAGE

    index_of_not_null_two_p_diff = df[~df.two_p_diff.isnull()].index.tolist()
    print(index_of_not_null_two_p_diff)

    total_rows = df.shape[0] - 3
    blank_list = [np.NaN] * (total_rows)

    #SHOW ROWS TWO THREE FIVE SIX ABOVE AND SAME

    two_p_values = []
    three_p_values = []
    five_p_values = []
    six_p_values = []

    two_p_v_d = []
    three_p_v_d = []
    five_p_v_d = []
    six_p_v_d = []

    for j in index_of_not_null_two_p_diff:
        two_p_values.append('')
        two_p_values.append(df._get_value(j - 1, 'two_percentage'))
        two_p_values.append(df._get_value(j, 'two_percentage'))
        two_p_v_d.append(
            df._get_value(j, 'two_percentage') -
            df._get_value(j - 1, 'two_percentage'))

        three_p_values.append('')
        three_p_values.append(df._get_value(j - 1, 'three_percentage'))
        three_p_values.append(df._get_value(j, 'three_percentage'))  #
        three_p_v_d.append(df._get_value(j, 'three_percentage'))

        five_p_values.append('')
        five_p_values.append(df._get_value(j - 1, 'five_percentage'))
        five_p_values.append(df._get_value(j, 'five_percentage'))  #
        five_p_v_d.append(
            df._get_value(j, 'five_percentage') -
            df._get_value(j - 1, 'five_percentage'))

        six_p_values.append('')
        six_p_values.append(df._get_value(j - 1, 'six_percentage'))
        six_p_values.append(df._get_value(j, 'six_percentage'))
        six_p_v_d.append(df._get_value(j, 'six_percentage'))

    #df['two_p_values'] = two_p_values + another_blank_list*(total_rows_another - len(two_p_values))
    #df['three_p_values'] = three_p_values + another_blank_list*(total_rows_another - len(three_p_values))
    #df['five_p_values'] = five_p_values + another_blank_list*(total_rows_another - len(five_p_values))
    #df['six_p_values'] = six_p_values + another_blank_list*(total_rows_another - len(six_p_values))

    additional1 = pandas.DataFrame({'two_p_values': two_p_values})
    additional2 = pandas.DataFrame({'three_p_values': three_p_values})
    additional3 = pandas.DataFrame({'five_p_values': five_p_values})
    additional4 = pandas.DataFrame({'six_p_values': six_p_values})

    df = pandas.concat([df, additional1], axis=1)
    df = pandas.concat([df, additional2], axis=1)
    df = pandas.concat([df, additional3], axis=1)
    df = pandas.concat([df, additional4], axis=1)

    df.insert(20, 'twenty', '')

    total_rows_another = df.shape[0]
    another_blank_list = [np.NaN]

    #two_p_v_d = sorted(two_p_v_d)
    #three_p_v_d = sorted(three_p_v_d)
    #five_p_v_d = sorted(five_p_v_d)
    #six_p_v_d = sorted(six_p_v_d)

    two_p_v_d_neg = [abs(x) for x in two_p_v_d if x < 0]
    three_p_v_d_neg = []

    for ind in range(0, len(two_p_v_d_neg)):
        three_p_v_d_neg.append(three_p_v_d[ind])
        #del three_p_v_d[0]

    five_p_v_d_neg = [abs(x) for x in five_p_v_d if x < 0]
    six_p_v_d_neg = []

    for ind in range(0, len(five_p_v_d_neg)):
        six_p_v_d_neg.append(six_p_v_d[ind])
        #del six_p_v_d[0]

    two_p_v_d_pos = [x for x in two_p_v_d if x > 0]
    three_p_v_d_pos = []

    for indi in range(len(two_p_v_d_neg), len(two_p_v_d)):
        three_p_v_d_pos.append(three_p_v_d[indi])  # use the loop variable (the original indexed with the stale `ind`)

    five_p_v_d_pos = [x for x in five_p_v_d if x > 0]
    six_p_v_d_pos = []

    for indi in range(len(five_p_v_d_neg), len(five_p_v_d)):
        six_p_v_d_pos.append(six_p_v_d[indi])  # use the loop variable (the original indexed with the stale `ind`)

    print("####################")
    print(f"length of two_p_v_d {len(two_p_v_d)}")
    print(f"length of two_p_v_d_pos {len(two_p_v_d_pos)}")
    print(f"length of two_p_v_d_neg {len(two_p_v_d_neg)}")
    print("####################")

    print("####################")
    print(f"length of three_p_v_d {len(three_p_v_d)}")
    print(f"length of three_p_v_d_pos {len(three_p_v_d_pos)}")
    print(f"length of three_p_v_d_neg {len(three_p_v_d_neg)}")
    print("####################")

    print("####################")
    print(f"length of five_p_v_d {len(five_p_v_d)}")
    print(f"length of five_p_v_d_pos {len(five_p_v_d_pos)}")
    print(f"length of five_p_v_d_neg {len(five_p_v_d_neg)}")
    print("####################")

    print("####################")
    print(f"length of six_p_v_d {len(six_p_v_d)}")
    print(f"length of six_p_v_d_pos {len(six_p_v_d_pos)}")
    print(f"length of six   _p_v_d_neg {len(six_p_v_d_neg)}")
    print("####################")

    minus_two_three_neg = list(
        map(operator.sub, two_p_v_d_neg, three_p_v_d_neg))
    minus_two_three_pos = list(
        map(operator.sub, two_p_v_d_pos, three_p_v_d_pos))

    minus_five_six_neg = list(map(operator.sub, five_p_v_d_neg, six_p_v_d_neg))
    minus_five_six_pos = list(map(operator.sub, five_p_v_d_pos, six_p_v_d_pos))

    df_sort = pandas.DataFrame()

    df_sort['two_p_v_d'] = two_p_v_d + another_blank_list * (
        total_rows_another - len(two_p_v_d))
    df_sort['three_p_v_d'] = three_p_v_d + another_blank_list * (
        total_rows_another - len(three_p_v_d))
    df_sort['five_p_v_d'] = five_p_v_d + another_blank_list * (
        total_rows_another - len(five_p_v_d))
    df_sort['six_p_v_d'] = six_p_v_d + another_blank_list * (
        total_rows_another - len(six_p_v_d))

    df['two_p_v_d'] = df_sort['two_p_v_d']
    df['three_p_v_d'] = df_sort['three_p_v_d']
    df['five_p_v_d'] = df_sort['five_p_v_d']
    df['six_p_v_d'] = df_sort['six_p_v_d']

    df['two_p_v_d_neg'] = df.apply(lambda x: x['two_p_v_d']
                                   if x['two_p_v_d'] < 0 else np.NaN,
                                   axis=1)
    df['three_p_v_d_neg'] = df.apply(lambda x: x['three_p_v_d']
                                     if x['two_p_v_d'] < 0 else np.NaN,
                                     axis=1)
    df['five_p_v_d_neg'] = df.apply(lambda x: x['five_p_v_d']
                                    if x['two_p_v_d'] < 0 else np.NaN,
                                    axis=1)
    df['six_p_v_d_neg'] = df.apply(lambda x: x['six_p_v_d']
                                   if x['two_p_v_d'] < 0 else np.NaN,
                                   axis=1)

    df['two_p_v_d_neg'] = df['two_p_v_d_neg'].abs()
    df['three_p_v_d_neg'] = df['three_p_v_d_neg'].abs()
    df['five_p_v_d_neg'] = df['five_p_v_d_neg'].abs()
    df['six_p_v_d_neg'] = df['six_p_v_d_neg'].abs()

    df_sort['two_p_v_d_pos'] = df.apply(lambda x: x['two_p_v_d']
                                        if x['two_p_v_d'] > 0 else np.NaN,
                                        axis=1)
    df_sort['three_p_v_d_pos'] = df.apply(lambda x: x['three_p_v_d']
                                          if x['two_p_v_d'] > 0 else np.NaN,
                                          axis=1)
    df_sort['five_p_v_d_pos'] = df.apply(lambda x: x['five_p_v_d']
                                         if x['two_p_v_d'] > 0 else np.NaN,
                                         axis=1)
    df_sort['six_p_v_d_pos'] = df.apply(lambda x: x['six_p_v_d']
                                        if x['two_p_v_d'] > 0 else np.NaN,
                                        axis=1)

    df_sort = df_sort[df_sort['two_p_v_d_pos'].notna()].reset_index()

    df['two_p_v_d_pos'] = df_sort['two_p_v_d_pos']
    df['three_p_v_d_pos'] = df_sort['three_p_v_d_pos']
    df['five_p_v_d_pos'] = df_sort['five_p_v_d_pos']
    df['six_p_v_d_pos'] = df_sort['six_p_v_d_pos']

    df['2_p_v_d_pos_percentage'] = df['two_p_v_d_pos'].apply(
        lambda a: (a / df['two_p_v_d_pos'].sum()) * 100)
    df['3_p_v_d_pos_percentage'] = df['three_p_v_d_pos'].apply(
        lambda a: (a / df['three_p_v_d_pos'].sum()) * 100)
    df['5_p_v_d_pos_percentage'] = df['five_p_v_d_pos'].apply(
        lambda a: (a / df['five_p_v_d_pos'].sum()) * 100)
    df['6_p_v_d_pos_percentage'] = df['six_p_v_d_pos'].apply(
        lambda a: (a / df['six_p_v_d_pos'].sum()) * 100)

    ## DF2 FOR DROPPING ROWS WITH NAN VALUES IN NEGATIVE PERCENTAGES OF TWO THREE FIVE SIX VALUES
    df2 = pandas.DataFrame()

    df2['2_p_v_d_neg_percentage'] = df['two_p_v_d_neg'].apply(
        lambda a: (a / df['two_p_v_d_neg'].sum()) * 100)
    df2['3_p_v_d_neg_percentage'] = df['three_p_v_d_neg'].apply(
        lambda a: (a / df['three_p_v_d_neg'].sum()) * 100)
    df2['5_p_v_d_neg_percentage'] = df['five_p_v_d_neg'].apply(
        lambda a: (a / df['five_p_v_d_neg'].sum()) * 100)
    df2['6_p_v_d_neg_percentage'] = df['six_p_v_d_neg'].apply(
        lambda a: (a / df['six_p_v_d_neg'].sum()) * 100)

    print("THIS IS DF2 WITHOUT DROP NA")
    print(df2)
    df2 = df2.dropna().reset_index()

    print("THIS IS DF2 WITH DROP NA")
    print(df2)

    df['2_p_v_d_neg_percentage'] = df2['2_p_v_d_neg_percentage']
    df['3_p_v_d_neg_percentage'] = df2['3_p_v_d_neg_percentage']
    df['5_p_v_d_neg_percentage'] = df2['5_p_v_d_neg_percentage']
    df['6_p_v_d_neg_percentage'] = df2['6_p_v_d_neg_percentage']

    df.insert(25, 'twenty_fifth', '')
    df.insert(30, '29', '')
    df.insert(35, '35', '')

    # Now look at 3 and 5: if both are the smaller ones, keep the whole row; otherwise drop it

    df_sort['2_p_v_d_pos_p_c'] = df.apply(
        lambda x: x['2_p_v_d_pos_percentage']
        if x['3_p_v_d_pos_percentage'] < x['2_p_v_d_pos_percentage'] and
        x['6_p_v_d_pos_percentage'] < x['5_p_v_d_pos_percentage'] else np.NaN,
        axis=1)
    df_sort['3_p_v_d_pos_p_c'] = df.apply(
        lambda x: x['3_p_v_d_pos_percentage']
        if x['3_p_v_d_pos_percentage'] < x['2_p_v_d_pos_percentage'] and
        x['6_p_v_d_pos_percentage'] < x['5_p_v_d_pos_percentage'] else np.NaN,
        axis=1)
    df_sort['5_p_v_d_pos_p_c'] = df.apply(
        lambda x: x['5_p_v_d_pos_percentage']
        if x['3_p_v_d_pos_percentage'] < x['2_p_v_d_pos_percentage'] and
        x['6_p_v_d_pos_percentage'] < x['5_p_v_d_pos_percentage'] else np.NaN,
        axis=1)
    df_sort['6_p_v_d_pos_p_c'] = df.apply(
        lambda x: x['6_p_v_d_pos_percentage']
        if x['3_p_v_d_pos_percentage'] < x['2_p_v_d_pos_percentage'] and
        x['6_p_v_d_pos_percentage'] < x['5_p_v_d_pos_percentage'] else np.NaN,
        axis=1)

    df_sort['2_p_v_d_neg_p_c'] = df.apply(
        lambda x: x['2_p_v_d_neg_percentage']
        if x['3_p_v_d_neg_percentage'] < x['2_p_v_d_neg_percentage'] and
        x['6_p_v_d_neg_percentage'] < x['5_p_v_d_neg_percentage'] else np.NaN,
        axis=1)
    df_sort['3_p_v_d_neg_p_c'] = df.apply(
        lambda x: x['3_p_v_d_neg_percentage']
        if x['3_p_v_d_neg_percentage'] < x['2_p_v_d_neg_percentage'] and
        x['6_p_v_d_neg_percentage'] < x['5_p_v_d_neg_percentage'] else np.NaN,
        axis=1)
    df_sort['5_p_v_d_neg_p_c'] = df.apply(
        lambda x: x['5_p_v_d_neg_percentage']
        if x['3_p_v_d_neg_percentage'] < x['2_p_v_d_neg_percentage'] and
        x['6_p_v_d_neg_percentage'] < x['5_p_v_d_neg_percentage'] else np.NaN,
        axis=1)
    df_sort['6_p_v_d_neg_p_c'] = df.apply(
        lambda x: x['6_p_v_d_neg_percentage']
        if x['3_p_v_d_neg_percentage'] < x['2_p_v_d_neg_percentage'] and
        x['6_p_v_d_neg_percentage'] < x['5_p_v_d_neg_percentage'] else np.NaN,
        axis=1)

    list_2_p_v_d_pos_p_c = df_sort['2_p_v_d_pos_p_c'].values.tolist()
    list_3_p_v_d_pos_p_c = df_sort['3_p_v_d_pos_p_c'].values.tolist()
    list_5_p_v_d_pos_p_c = df_sort['5_p_v_d_pos_p_c'].values.tolist()
    list_6_p_v_d_pos_p_c = df_sort['6_p_v_d_pos_p_c'].values.tolist()

    list_2_p_v_d_neg_p_c = df_sort['2_p_v_d_neg_p_c'].values.tolist()
    list_3_p_v_d_neg_p_c = df_sort['3_p_v_d_neg_p_c'].values.tolist()
    list_5_p_v_d_neg_p_c = df_sort['5_p_v_d_neg_p_c'].values.tolist()
    list_6_p_v_d_neg_p_c = df_sort['6_p_v_d_neg_p_c'].values.tolist()

    list_2_p_v_d_pos_p_c.append(df_sort['2_p_v_d_pos_p_c'].mean())
    list_3_p_v_d_pos_p_c.append(df_sort['3_p_v_d_pos_p_c'].mean())
    list_5_p_v_d_pos_p_c.append(df_sort['5_p_v_d_pos_p_c'].mean())
    list_6_p_v_d_pos_p_c.append(df_sort['6_p_v_d_pos_p_c'].mean())

    list_2_p_v_d_neg_p_c.append(df_sort['2_p_v_d_neg_p_c'].mean())
    list_3_p_v_d_neg_p_c.append(df_sort['3_p_v_d_neg_p_c'].mean())
    list_5_p_v_d_neg_p_c.append(df_sort['5_p_v_d_neg_p_c'].mean())
    list_6_p_v_d_neg_p_c.append(df_sort['6_p_v_d_neg_p_c'].mean())

    list_2_p_v_d_pos_p_c.append(df_sort['2_p_v_d_pos_p_c'].count())
    list_3_p_v_d_pos_p_c.append(df_sort['3_p_v_d_pos_p_c'].count())
    list_5_p_v_d_pos_p_c.append(df_sort['5_p_v_d_pos_p_c'].count())
    list_6_p_v_d_pos_p_c.append(df_sort['6_p_v_d_pos_p_c'].count())

    list_2_p_v_d_neg_p_c.append(df_sort['2_p_v_d_neg_p_c'].count())
    list_3_p_v_d_neg_p_c.append(df_sort['3_p_v_d_neg_p_c'].count())
    list_5_p_v_d_neg_p_c.append(df_sort['5_p_v_d_neg_p_c'].count())
    list_6_p_v_d_neg_p_c.append(df_sort['6_p_v_d_neg_p_c'].count())

    df.insert(40, '40', '')
    print("THIS IS DF SORT NEWWWW")
    del df_sort['index']
    del df_sort['two_p_v_d']
    del df_sort['three_p_v_d']
    del df_sort['five_p_v_d']
    del df_sort['six_p_v_d']
    del df_sort['two_p_v_d_pos']
    del df_sort['three_p_v_d_pos']
    del df_sort['five_p_v_d_pos']
    del df_sort['six_p_v_d_pos']

    #df_sort.loc[df_sort.shape[0]] = df_sort.mean()
    #df_sort.loc[df_sort.shape[0]] = df_sort.count()

    #### FINAL PART: count, then compute the percentage, then take the difference

    twopospercentage = (df_sort['2_p_v_d_pos_p_c'].mean() * 100) / (
        df_sort['2_p_v_d_pos_p_c'].mean() + df_sort['2_p_v_d_neg_p_c'].mean())
    threepospercentage = (df_sort['3_p_v_d_pos_p_c'].mean() * 100) / (
        df_sort['3_p_v_d_pos_p_c'].mean() + df_sort['3_p_v_d_neg_p_c'].mean())
    fivepospercentage = (df_sort['5_p_v_d_pos_p_c'].mean() * 100) / (
        df_sort['5_p_v_d_pos_p_c'].mean() + df_sort['5_p_v_d_neg_p_c'].mean())
    sixpospercentage = (df_sort['6_p_v_d_pos_p_c'].mean() * 100) / (
        df_sort['6_p_v_d_pos_p_c'].mean() + df_sort['6_p_v_d_neg_p_c'].mean())

    twonegpercentage = (df_sort['2_p_v_d_neg_p_c'].mean() * 100) / (
        df_sort['2_p_v_d_pos_p_c'].mean() + df_sort['2_p_v_d_neg_p_c'].mean())
    threenegpercentage = (df_sort['3_p_v_d_neg_p_c'].mean() * 100) / (
        df_sort['3_p_v_d_pos_p_c'].mean() + df_sort['3_p_v_d_neg_p_c'].mean())
    fivenegpercentage = (df_sort['5_p_v_d_neg_p_c'].mean() * 100) / (
        df_sort['5_p_v_d_pos_p_c'].mean() + df_sort['5_p_v_d_neg_p_c'].mean())
    sixnegpercentage = (df_sort['6_p_v_d_neg_p_c'].mean() * 100) / (
        df_sort['6_p_v_d_pos_p_c'].mean() + df_sort['6_p_v_d_neg_p_c'].mean())

    list_2_p_v_d_pos_p_c.append(twopospercentage)
    list_3_p_v_d_pos_p_c.append(threepospercentage)
    list_3_p_v_d_pos_p_c.append(threepospercentage / twopospercentage)
    list_5_p_v_d_pos_p_c.append(fivepospercentage)
    list_6_p_v_d_pos_p_c.append(sixpospercentage)
    list_6_p_v_d_pos_p_c.append(sixpospercentage / fivepospercentage)

    list_2_p_v_d_neg_p_c.append(twonegpercentage)
    list_3_p_v_d_neg_p_c.append(threenegpercentage)
    list_3_p_v_d_neg_p_c.append(threenegpercentage / twonegpercentage)
    list_5_p_v_d_neg_p_c.append(fivenegpercentage)
    list_6_p_v_d_neg_p_c.append(sixnegpercentage)
    list_6_p_v_d_neg_p_c.append(sixnegpercentage / fivenegpercentage)

    total_rows = df.shape[0]
    blank_list = [np.NaN]

    df['2_p_v_d_pos_p_c'] = list_2_p_v_d_pos_p_c + blank_list * (
        total_rows - len(list_2_p_v_d_pos_p_c))
    df['3_p_v_d_pos_p_c'] = list_3_p_v_d_pos_p_c + blank_list * (
        total_rows - len(list_3_p_v_d_pos_p_c))
    df['5_p_v_d_pos_p_c'] = list_5_p_v_d_pos_p_c + blank_list * (
        total_rows - len(list_5_p_v_d_pos_p_c))
    df['6_p_v_d_pos_p_c'] = list_6_p_v_d_pos_p_c + blank_list * (
        total_rows - len(list_6_p_v_d_pos_p_c))

    df['2_p_v_d_neg_p_c'] = list_2_p_v_d_neg_p_c + blank_list * (
        total_rows - len(list_2_p_v_d_neg_p_c))
    df['3_p_v_d_neg_p_c'] = list_3_p_v_d_neg_p_c + blank_list * (
        total_rows - len(list_3_p_v_d_neg_p_c))
    df['5_p_v_d_neg_p_c'] = list_5_p_v_d_neg_p_c + blank_list * (
        total_rows - len(list_5_p_v_d_neg_p_c))
    df['6_p_v_d_neg_p_c'] = list_6_p_v_d_neg_p_c + blank_list * (
        total_rows - len(list_6_p_v_d_neg_p_c))

    #LAST COLUMN ADDED

    df2 = df2.iloc[0:0]

    df2['AV DIVIDE BY AU'] = df_sort.apply(
        lambda x: x['3_p_v_d_pos_p_c'] / x['2_p_v_d_pos_p_c'], axis=1)
    df2['AX DIVIDE BY AW'] = df_sort.apply(
        lambda x: x['6_p_v_d_pos_p_c'] / x['5_p_v_d_pos_p_c'], axis=1)

    df2['BA DIVIDE BY AZ'] = df_sort.apply(
        lambda x: x['3_p_v_d_neg_p_c'] / x['2_p_v_d_neg_p_c'], axis=1)
    df2['BC DIVIDE BY BB'] = df_sort.apply(
        lambda x: x['6_p_v_d_neg_p_c'] / x['5_p_v_d_neg_p_c'], axis=1)

    #df2.loc[df2.shape[0]] = df2.mean()

    AV_DIVIDE_BY_AU = df2['AV DIVIDE BY AU'].values.tolist()
    AX_DIVIDE_BY_AW = df2['AX DIVIDE BY AW'].values.tolist()

    BA_DIVIDE_BY_AZ = df2['BA DIVIDE BY AZ'].values.tolist()
    BC_DIVIDE_BY_BB = df2['BC DIVIDE BY BB'].values.tolist()

    print(df_sort)
    df.insert(45, '45', '')
    df.insert(50, '50', '')
    df.insert(55, '55', '')

    AV_DIVIDE_BY_AU.append(df2['AV DIVIDE BY AU'].mean())
    AX_DIVIDE_BY_AW.append(df2['AX DIVIDE BY AW'].mean())
    BA_DIVIDE_BY_AZ.append(df2['BA DIVIDE BY AZ'].mean())
    BC_DIVIDE_BY_BB.append(df2['BC DIVIDE BY BB'].mean())

    AV_DIVIDE_BY_AU.append(
        df2['AV DIVIDE BY AU'].mean() * 100 /
        (df2['BA DIVIDE BY AZ'].mean() + df2['AV DIVIDE BY AU'].mean()))
    BA_DIVIDE_BY_AZ.append(
        df2['BA DIVIDE BY AZ'].mean() * 100 /
        (df2['BA DIVIDE BY AZ'].mean() + df2['AV DIVIDE BY AU'].mean()))

    AX_DIVIDE_BY_AW.append(
        df2['AX DIVIDE BY AW'].mean() * 100 /
        (df2['AX DIVIDE BY AW'].mean() + df2['BC DIVIDE BY BB'].mean()))
    BC_DIVIDE_BY_BB.append(
        df2['BC DIVIDE BY BB'].mean() * 100 /
        (df2['AX DIVIDE BY AW'].mean() + df2['BC DIVIDE BY BB'].mean()))

    AV_DIVIDE_BY_AU.append(
        abs(df2['AV DIVIDE BY AU'].mean() - df2['BA DIVIDE BY AZ'].mean()))
    BA_DIVIDE_BY_AZ.append(
        abs(df2['AX DIVIDE BY AW'].mean() - df2['BC DIVIDE BY BB'].mean()))

    total_rows = df.shape[0]
    blank_list = [np.NaN]

    df['AV DIVIDE BY AU'] = AV_DIVIDE_BY_AU + blank_list * (
        total_rows - len(AV_DIVIDE_BY_AU))
    df['AX DIVIDE BY AW'] = AX_DIVIDE_BY_AW + blank_list * (
        total_rows - len(AX_DIVIDE_BY_AW))
    df['BA DIVIDE BY AZ'] = BA_DIVIDE_BY_AZ + blank_list * (
        total_rows - len(BA_DIVIDE_BY_AZ))
    df['BC DIVIDE BY BB'] = BC_DIVIDE_BY_BB + blank_list * (
        total_rows - len(BC_DIVIDE_BY_BB))

    #df['two_p_v_d_abs'] = df['two_p_v_d'].abs()
    #df['three_p_v_d_abs'] = df['three_p_v_d'].abs()
    #df['five_p_v_d_abs'] = df['five_p_v_d'].abs()
    #df['six_p_v_d_abs'] = df['six_p_v_d'].abs()

    #df.insert(30, 'thirty', '')

    #df['2-3abs'] = df['two_p_v_d_abs'] - df['three_p_v_d_abs']
    #df['5-6abs'] = df['five_p_v_d_abs'] - df['six_p_v_d_abs']

    #df['2-3abs'] = df['2-3abs'].abs()
    #df['5-6abs'] = df['5-6abs'].abs()

    #df.insert(33, '33', '')
    #two_p_v_d_pos_list = df.apply(lambda x :x['two_p_v_d'] if x['two_p_v_d'] > 0 else np.NaN ,axis = 1).values.tolist()
    #two_p_v_d_pos_list = [x for x in two_p_v_d_pos_list if math.isnan(x) == False]
    #df['two_p_v_d_pos'] = two_p_v_d_pos_list + another_blank_list*(total_rows_another - len(two_p_v_d_pos_list))

    #three_p_v_d_pos_list = df.apply(lambda x :x['three_p_v_d'] if x['two_p_v_d'] > 0 else np.NaN ,axis = 1).values.tolist()
    #three_p_v_d_pos_list = [x for x in three_p_v_d_pos_list if math.isnan(x) == False]
    #df['three_p_v_d_pos'] = three_p_v_d_pos_list + another_blank_list*(total_rows_another - len(three_p_v_d_pos_list))

    #five_p_v_d_pos_list = df.apply(lambda x :x['five_p_v_d'] if x['five_p_v_d'] > 0 else np.NaN ,axis = 1).values.tolist()
    #five_p_v_d_pos_list = [x for x in five_p_v_d_pos_list if math.isnan(x) == False]
    #df['five_p_v_d_pos'] = five_p_v_d_pos_list + another_blank_list*(total_rows_another - len(five_p_v_d_pos_list))

    #six_p_v_d_pos_list = df.apply(lambda x :x['six_p_v_d'] if x['five_p_v_d'] > 0 else np.NaN ,axis = 1).values.tolist()
    #six_p_v_d_pos_list = [x for x in six_p_v_d_pos_list if math.isnan(x) == False]
    #df['six_p_v_d_pos'] = six_p_v_d_pos_list + another_blank_list*(total_rows_another - len(six_p_v_d_pos_list))

    #df.insert(38, '38', '')
    #two_p_v_d_neg_list = df.apply(lambda x :x['two_p_v_d'] if x['two_p_v_d'] < 0 else np.NaN ,axis = 1).values.tolist()
    #two_p_v_d_neg_list = [x for x in two_p_v_d_neg_list if math.isnan(x) == False]
    #df['two_p_v_d_neg'] = two_p_v_d_neg_list + another_blank_list*(total_rows_another - len(two_p_v_d_neg_list))

    #three_p_v_d_neg_list = df.apply(lambda x :x['three_p_v_d'] if x['two_p_v_d'] < 0 else np.NaN ,axis = 1).values.tolist()
    #three_p_v_d_neg_list = [x for x in three_p_v_d_neg_list if math.isnan(x) == False]
    #df['three_p_v_d_neg'] = three_p_v_d_neg_list + another_blank_list*(total_rows_another - len(three_p_v_d_neg_list))

    #five_p_v_d_neg_list = df.apply(lambda x :x['five_p_v_d'] if x['five_p_v_d'] < 0 else np.NaN ,axis = 1).values.tolist()
    #five_p_v_d_neg_list = [x for x in five_p_v_d_neg_list if math.isnan(x) == False]
    #df['five_p_v_d_neg'] = five_p_v_d_neg_list + another_blank_list*(total_rows_another - len(five_p_v_d_neg_list))

    #six_p_v_d_neg_list = df.apply(lambda x :x['six_p_v_d'] if x['five_p_v_d'] < 0 else np.NaN ,axis = 1).values.tolist()
    #six_p_v_d_neg_list = [x for x in six_p_v_d_neg_list if math.isnan(x) == False]
    #df['six_p_v_d_neg'] = six_p_v_d_neg_list + another_blank_list*(total_rows_another - len(six_p_v_d_neg_list))

    #df['two_p_v_d_neg'] = df['two_p_v_d_neg'].abs()
    #df['three_p_v_d_neg'] = df['three_p_v_d_neg'].abs()
    #df['five_p_v_d_neg'] = df['five_p_v_d_neg'].abs()
    #df['six_p_v_d_neg'] = df['six_p_v_d_neg'].abs()

    #df.insert(43, '43', '')

    #df['2-3pos'] = df['two_p_v_d_pos'] - df['three_p_v_d_pos']
    #df['5-6pos'] = df['five_p_v_d_pos'] - df['six_p_v_d_pos']

    #df['2-3pos'] = df['2-3pos'].abs()
    #df['5-6pos'] = df['5-6pos'].abs()
    #df.insert(46, '46', '')

    #df['2-3neg'] = df['two_p_v_d_neg'] - df['three_p_v_d_neg']
    #df['5-6neg'] = df['five_p_v_d_neg'] - df['six_p_v_d_neg']

    #df['2-3neg'] = df['2-3neg'].abs()
    #df['5-6neg'] = df['5-6neg'].abs()

    # keep the smaller value, drop the larger one
    #df.insert(49, '49', '')

    #df['2-3pos_a'] = df.apply(lambda x : x['2-3pos'] if x['2-3pos'] < x['5-6pos'] else np.NaN, axis = 1)
    #df['5-6pos_a'] = df.apply(lambda x : x['5-6pos'] if x['5-6pos'] < x['2-3pos'] else np.NaN, axis = 1)

    #df['2-3neg_a'] = df.apply(lambda x : x['2-3neg'] if x['2-3neg'] < x['5-6neg'] else np.NaN, axis = 1)
    #df['5-6neg_a'] = df.apply(lambda x : x['5-6neg'] if x['5-6neg'] < x['2-3neg'] else np.NaN, axis = 1)

    #av_23_pos_a = df['2-3pos_a'].mean()
    #av_56_pos_a = df['5-6pos_a'].mean()

    #av_23_neg_a = df['2-3neg_a'].mean()
    #av_56_neg_a = df['5-6neg_a'].mean()

    #av_23_pos_a_percent = av_23_pos_a*100/(av_23_pos_a + av_23_neg_a)
    #av_23_neg_a_percent = av_23_neg_a*100/(av_23_pos_a + av_23_neg_a)

    #av_56_pos_a_percent = av_56_pos_a*100/(av_56_pos_a+av_56_neg_a)
    #av_56_neg_a_percent = av_56_neg_a*100/(av_56_pos_a+av_56_neg_a)

    #a = df['2-3pos_a'].values.tolist()
    #a = [x for x in a if math.isnan(x) == False]
    #a.append(av_23_pos_a)
    #a.append(av_23_pos_a_percent)

    #b = df['5-6pos_a'].values.tolist()
    #b = [x for x in b if math.isnan(x) == False]
    #b.append(av_56_pos_a)
    #b.append(av_56_pos_a_percent)

    #c = df['2-3neg_a'].values.tolist()
    #c = [x for x in c if math.isnan(x) == False]
    #c.append(av_23_neg_a)
    #c.append(av_23_neg_a_percent)

    #d = df['5-6neg_a'].values.tolist()
    #d = [x for x in d if math.isnan(x) == False]
    #d.append(av_56_neg_a)
    #d.append(av_56_neg_a_percent)
    #    total_rows_another = df.shape[0]

    #   print(f'length of a {len(a)}')
    #   print(f'total rows {total_rows_another}')
    #   print(f'remainder {total_rows_another - len(a)}')
    #   df['2-3pos_a'] = a + another_blank_list*(total_rows_another - len(a))
    #   df['5-6pos_a'] = b + another_blank_list*(total_rows_another - len(b))
    #   df['2-3neg_a'] = c + another_blank_list*(total_rows_another - len(c))
    #   df['5-6neg_a'] = d + another_blank_list*(total_rows_another - len(d))

    writer = pandas.ExcelWriter(path + "merge_converter.xlsx",
                                engine='xlsxwriter')
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    writer.save()
    print(df)
Example #6
    def recolectarrep(self):
        localtime = str(date.today())
        urllib3.disable_warnings()
        if self.capturaCarrera.get() == "Computacion":
            self.carrera.set('10')
        elif self.capturaCarrera.get() == "Electrica":
            self.carrera.set('9')
        elif self.capturaCarrera.get() == "Telecom":
            self.carrera.set('11')
        rutadeldirectorio = askdirectory()
        http = urllib3.PoolManager()
        url = 'https://www.siass.unam.mx/consulta?numero_cuenta=' + str(
            self.numCta.get(
            )) + '&sistema_pertenece=dgae&facultad_id=11&carrera_id=' + str(
                self.carrera.get())
        r = http.request('GET', url)
        r.status
        soup = bs.BeautifulSoup(r.data, 'html.parser')
        link = soup.find_all('a')  # get all <a> tags so we can collect the links
        arrlinks = []  # list that will hold every link found in the tags
        for i in link:
            arrlinks.append(i['href'])  # store each <a> tag's link in arrlinks
        linkstemp = []  # links that do NOT lead to the description of the social-service programs
        linkstemporales = []  # links of the tabs that paginate the listing (pages 1 2 3 4 ...)
        numerosConsulta = []  # only the numeric part of the links
        for i in arrlinks:
            if ("&page=" in i):
                linkstemp.append(i)

        for x in linkstemp:
            # strip the rest of the URL so that only the page number the link points to remains
            linkstemporales.append(
                x.replace(
                    "http://www.siass.unam.mx/consulta?numero_cuenta=" +
                    self.numCta.get() +
                    "&sistema_pertenece=dgae&facultad_id=11&carrera_id=" +
                    self.carrera.get() + "&page=", ""))
        # the next lines determine the number of pages by finding the largest value in "linkstemporales"

        z = int(linkstemporales[1])
        linkstemporales[0] = 0
        max = 0
        for j in linkstemporales:
            if (int(j) > z):
                max = int(j)
            if int(j) > max:
                max = int(j)
        # at this point we have the maximum page number
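        # A simpler way to get the maximum page number (a sketch; assumes every
        # entry in linkstemporales can be parsed as an int):
        # max_page = max(int(p) for p in linkstemporales)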
        numerosConsulta = []  # every query number (the trailing number of the links that hold each service description)
        arregloDic = []  # the dictionaries produced by the web scraping
        contenedorxl = pd.ExcelWriter('pruebaexcelxlsx', engine='xlsxwriter')  # note: the file name has no '.xlsx' extension as written
        for e in range(2, max):  # walk through every page tab of the SIASS site
            # overwrite our variables; we already have what we need to traverse the pages
            url = 'https://www.siass.unam.mx/consulta?numero_cuenta=' + self.numCta.get(
            ) + '&sistema_pertenece=dgae&facultad_id=11&carrera_id=' + self.carrera.get(
            ) + '&page=' + str(e)
            r = http.request('GET', url)
            r.status
            soup = bs.BeautifulSoup(r.data, 'html.parser')
            link = soup.find_all('a')  # get every <a> tag in the HTML
            arrlinks = []
            for i in link:
                arrlinks.append(i['href'])  # collect every link inside the <a> tags

            for i in arrlinks:
                if ("https://www.siass.unam.mx/consulta/" in i):
                    numerosConsulta.append(
                        i.replace("https://www.siass.unam.mx/consulta/",
                                  ""))  # keep only the trailing numbers
        for r in numerosConsulta:  # go through the description of every social-service program the user can see
            url2 = 'https://www.siass.unam.mx/consulta/' + str(r)
            r = http.request('GET', url2)
            r.status
            diccionario = {}
            soup = bs.BeautifulSoup(r.data, 'html.parser')
            tabla = soup.find_all('tr')
            # open a file to save the dictionaries we build, so they can be loaded into the database later
            f = open("diccionario.txt", "a", encoding="utf8")
            links = soup.find_all('li')  # find all <li> tags
            # every tag whose id starts with 'carrera'
            divs = soup.find_all("div", {"id": re.compile('carrera_*')})
            # extract the days and shifts of the social service
            dias = soup.find_all('label', {"class": "btn btn-success disabled"})
            # social-service activities
            tablaActividades = soup.find_all(
                'table', {"class": "table table-striped table-bordered"})
            for a in dias:
                # record the working days of the service, stripping spaces and newlines
                diccionario[a.text.replace(" ", "").replace("\n", "")] = "x"
            for i in links:
                # get every link tag that points to a degree program
                for a in i.find_all('a', href=re.compile('#carrera_*')):
                    for j in divs:
                        # get the text of the tags that hold the degree program
                        for b in j.find_all('p', {"class": "alert alert-info"}):
                            # add a dictionary entry pairing the degree-program tag's
                            # content with the tag that lists the participants
                            diccionario[(
                                a.text.replace("  ", "").replace("\n", "")
                            )] = (
                                b.text.replace("  ", "").replace("\n", ""))
                        for b in tablaActividades:  # start reading the activities
                            # keep only the activities table of each degree program
                            columna = b.find_all(
                                'td', {"style": "padding-left: 20px;"})
                            texto = ""
                            for c in columna:
                                texto = texto + c.text.replace(
                                    "  ", "").replace("\n\n", "")
                                diccionario["Actividad " + (a.text.replace(
                                    "  ", "").replace("\n", ""))] = texto
            for i in tabla:  # build and fill a dictionary with the tables' content
                for a in i.find_all('td'):
                    for z in i.find_all('th'):
                        llave = " ".join(z.text.split())
                        valor = " ".join(a.text.split())
                        if llave in diccionario:
                            diccionario[llave + " jefe directo"] = valor
                        else:
                            diccionario[llave] = valor
            arregloDic.append(diccionario)
            f.write(str(diccionario) + "\n")
            f.close()
        toJson = json.dumps(arregloDic)
        dfPrueba = pd.read_json(toJson)
        #This part generates a separate file for each degree program
        if self.carrera.get() == "10":
            dfPrueba.to_excel(rutadeldirectorio + '/programascompu' +
                              localtime + '.xls',
                              index=False)
        elif self.carrera.get() == '11':
            dfPrueba.to_excel(rutadeldirectorio + '/programastelecom' +
                              localtime + '.xls',
                              index=False)
        elif self.carrera.get() == '9':
            dfPrueba.to_excel(rutadeldirectorio + '/programaselectr' +
                              localtime + '.xls',
                              index=False)
        contenedorxl.save()
Example #7
def feature_importance_Einstein(base):

    df1 = base['inputs']
    df_out = base['outputs']

    try:
        df_out = df_out.drop(columns=['Timestamp'])
        df1 = df1.drop(columns=['Timestamp'])
    except KeyError:
        # the Timestamp column may not be present
        pass

    # Find the variables most relevant to COVID-19 incidence
    model = ExtraTreesClassifier(verbose=False)
    a = model.fit(df1, df_out)

    lista_importances = pd.DataFrame([model.feature_importances_])
    lista_importances.columns = list(df1.columns)
    lista_importances = lista_importances * 100

    lista_importances = lista_importances.sort_values(by=0,
                                                      axis=1,
                                                      ascending=False)

    top15 = list(lista_importances.columns[0:15])
    top15_values = []
    # print("Variáveis mais impactantes:")
    for l in lista_importances.columns[0:15]:
        # print("Nome: " + str(l) + " - " + str(lista_importances[l][0]) + " %")
        top15_values.append(lista_importances[l][0])
    # print(top15)

    # build the dataset for prediction
    df_in = df1[top15]
    df_out = df_out

    # take the list of the most relevant variables and build another sheet for the neural network
    lista_neural_in = df_in
    lista_neural_out = df_out

    ### since the timestamp does not matter in this case, it can be filled with any sequential values ###
    # get the number of rows
    qtde_linhas = len(lista_neural_in.index)
    # create a column of sequential Timestamps in the first position
    lista_neural_in.insert(
        0, "Timestamp",
        pd.date_range(start='1/1/2020', periods=qtde_linhas, freq='H'))
    lista_neural_out.insert(
        0, "Timestamp",
        pd.date_range(start='1/1/2020', periods=qtde_linhas, freq='H'))

    df2_in = lista_neural_in.copy()
    df2_out = lista_neural_out.copy()
    writer = pd.ExcelWriter('base_simulate.xlsx', engine='openpyxl')
    lista_neural_in.to_excel(writer, sheet_name="INPUTS")
    lista_neural_out.to_excel(writer, sheet_name="OUTPUTS")
    writer.save()

    top15_aws = zip(top15, top15_values)

    response = {
        'top15': top15_aws,
        'top15_names': top15,
        'df_in': df2_in,
        'df_out': df2_out,
        'model': model.get_params(),
    }

    return response
Example #8
    endsi = []

    for index, tt in enumerate(starts):
        startsi.append(find_nearest(data[1], starts[index]))
        endsi.append(find_nearest(data[1], ends[index]))

    mm = [dirr]
    for index, tt in enumerate(startsi):
        mm.append(max(data[0][startsi[index]:endsi[index]]))

    current_frame = pd.DataFrame([mm], columns=columns)
    llist.append(current_frame)


new_data_frame = pd.concat(llist, ignore_index=True)

writer = pd.ExcelWriter('{}/results/{}'.format(cwd, "results.xlsx"), engine='xlsxwriter')
new_data_frame.to_excel(writer, 'Sheet1', index=False)
writer.save()
Example #9
    today.day) + "_" + month + "_" + str(today.year) + ".xlsx"

#reorder the columns; if a column is not listed, it will not be displayed.
# myColumns = ['RHSA', 'released_packages', 'severity', 'released_on', 'resource_url', 'package']
# myColumns = ['version']
myColumns = [
    'RHSA', 'ADVISORY_TITLE_TRIMMED', 'SEVERITY', 'VULNERABILITY_IMPACT',
    'RESTART', 'AFFECTED'
]

#creates a pandas DataFrame from the data pulled from the API.
advisoryDF = pandas.DataFrame(
    daddy_list, columns=myColumns)  #used to be "data", not daddy_list

#create the excel spreadsheet
writer = pandas.ExcelWriter(workbook_name, engine='xlsxwriter')

#adds the data frame to the excel workbook on the month's RHEL_Analysis sheet
advisoryDF.to_excel(writer, index=False, sheet_name=month + " RHEL_Analysis")

workbook = writer.book
rhel_analysis_worksheet = writer.sheets[month + " RHEL_Analysis"]

bold_format = workbook.add_format({
    'bold': True,
    'bg_color': '#A6A6A6',
    'font_name': 'Verdana',
    'font_size': '10',
    'text_wrap': True,
    'align': 'center',
    'valign': 'vcenter',
Example #10
def append_df_to_excel(filename,
                       df,
                       sheet_name='Sheet1',
                       startrow=None,
                       truncate_sheet=False,
                       **to_excel_kwargs):
    """
    Append a DataFrame [df] to existing Excel file [filename]
    into [sheet_name] Sheet.
    If [filename] doesn't exist, then this function will create it.

    Parameters:
      filename : File path or existing ExcelWriter
                 (Example: '/path/to/file.xlsx')
      df : dataframe to save to workbook
      sheet_name : Name of sheet which will contain DataFrame.
                   (default: 'Sheet1')
      startrow : upper left cell row to dump data frame.
                 Per default (startrow=None) calculate the last row
                 in the existing DF and write to the next row...
      truncate_sheet : truncate (remove and recreate) [sheet_name]
                       before writing DataFrame to Excel file
      to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()`
                        [can be dictionary]

    Returns: None

    (c) [MaxU](https://stackoverflow.com/users/5741205/maxu?tab=profile)
    """
    from openpyxl import load_workbook

    # ignore [engine] parameter if it was passed
    if 'engine' in to_excel_kwargs:
        to_excel_kwargs.pop('engine')

    writer = pd.ExcelWriter(filename, engine='openpyxl')

    # Python 2.x: define [FileNotFoundError] exception if it doesn't exist
    try:
        FileNotFoundError
    except NameError:
        FileNotFoundError = IOError

    try:
        # try to open an existing workbook
        # (assigning writer.book relies on older pandas/openpyxl behaviour; newer
        #  pandas versions support pd.ExcelWriter(filename, mode='a') instead)
        writer.book = load_workbook(filename)

        # get the last row in the existing Excel sheet
        # if it was not specified explicitly
        if startrow is None and sheet_name in writer.book.sheetnames:
            startrow = writer.book[sheet_name].max_row

        # truncate sheet
        if truncate_sheet and sheet_name in writer.book.sheetnames:
            # index of [sheet_name] sheet
            idx = writer.book.sheetnames.index(sheet_name)
            # remove [sheet_name]
            writer.book.remove(writer.book.worksheets[idx])
            # create an empty sheet [sheet_name] using old index
            writer.book.create_sheet(sheet_name, idx)

        # copy existing sheets
        writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
    except FileNotFoundError:
        # file does not exist yet, we will create it
        pass

    if startrow is None:
        startrow = 0

    # write out the new sheet
    df.to_excel(writer,
                sheet_name,
                startrow=startrow,
                header=False,
                **to_excel_kwargs)

    # save the workbook
    writer.save()
    writer.close()
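
# A minimal usage sketch for the helper above (assumes `import pandas as pd`;
# the file name and sample frame below are hypothetical):
if __name__ == '__main__':
    sample = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
    append_df_to_excel('report.xlsx', sample, sheet_name='Log', index=False)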
Example #11
    def calc_imp_vols(self):

        vcs_pairs = self.vcs_pairs
        vcs_pairs['validQ'] = False
        print('YYYYUUUUUPPPP!')

        for i in range(len(vcs_pairs.index)):
            for j in [1, 2]:
                if np.isnan(vcs_pairs.loc[i, 'current_strike' + str(j)]):
                    continue
                call_option_ticker_string = vcs_pairs.loc[
                    i, 'ticker' + str(j)] + '_C_' + str(
                        vcs_pairs.loc[i, 'current_strike' + str(j)])
                put_option_ticker_string = vcs_pairs.loc[
                    i, 'ticker' + str(j)] + '_P_' + str(
                        vcs_pairs.loc[i, 'current_strike' + str(j)])

                ib_underlying_multiplier = ib_contract.ib_underlying_multiplier_dictionary.get(
                    vcs_pairs.loc[i, 'tickerHead'], 1)

                if (self.bid_price_dictionary[call_option_ticker_string] > 0
                    ) and (self.ask_price_dictionary[call_option_ticker_string]
                           > 0):
                    vcs_pairs.loc[i, 'call_mid_price' + str(j)] = (
                        self.bid_price_dictionary[call_option_ticker_string] +
                        self.ask_price_dictionary[call_option_ticker_string]
                    ) / (2 * ib_underlying_multiplier)

                    option_greeks = qom.get_option_greeks(
                        underlying=vcs_pairs.loc[i, 'underlying_mid_price' +
                                                 str(j)],
                        option_price=vcs_pairs.loc[i,
                                                   'call_mid_price' + str(j)],
                        strike=vcs_pairs.loc[i, 'current_strike' + str(j)],
                        risk_free_rate=vcs_pairs.loc[i,
                                                     'interest_date' + str(j)],
                        expiration_date=vcs_pairs.loc[i, 'expiration_date' +
                                                      str(j)],
                        calculation_date=self.todays_date,
                        option_type='C',
                        exercise_type=vcs_pairs.loc[i, 'exercise_type'])

                    vcs_pairs.loc[i, 'call_iv' +
                                  str(j)] = 100 * option_greeks['implied_vol']

                if (self.bid_price_dictionary[put_option_ticker_string] > 0
                    ) and (self.ask_price_dictionary[put_option_ticker_string]
                           > 0):
                    vcs_pairs.loc[i, 'put_mid_price' + str(j)] = (
                        self.bid_price_dictionary[put_option_ticker_string] +
                        self.ask_price_dictionary[put_option_ticker_string]
                    ) / (2 * ib_underlying_multiplier)

                    option_greeks = qom.get_option_greeks(
                        underlying=vcs_pairs.loc[i, 'underlying_mid_price' +
                                                 str(j)],
                        option_price=vcs_pairs.loc[i,
                                                   'put_mid_price' + str(j)],
                        strike=vcs_pairs.loc[i, 'current_strike' + str(j)],
                        risk_free_rate=vcs_pairs.loc[i,
                                                     'interest_date' + str(j)],
                        expiration_date=vcs_pairs.loc[i, 'expiration_date' +
                                                      str(j)],
                        calculation_date=self.todays_date,
                        option_type='P',
                        exercise_type=vcs_pairs.loc[i, 'exercise_type'])

                    vcs_pairs.loc[i, 'put_iv' +
                                  str(j)] = 100 * option_greeks['implied_vol']

        for j in [1, 2]:
            vcs_pairs['straddle_iv' +
                      str(j)] = (vcs_pairs['put_iv' + str(j)] +
                                 vcs_pairs['call_iv' + str(j)]) / 2
            vcs_pairs['straddle_price' +
                      str(j)] = (vcs_pairs['call_mid_price' + str(j)] +
                                 vcs_pairs['put_mid_price' + str(j)])

        vcs_pairs['current_atm_vol_ratio'] = vcs_pairs[
            'straddle_iv1'] / vcs_pairs['straddle_iv2']

        for i in range(len(vcs_pairs.index)):
            if np.isnan(vcs_pairs.loc[i, 'current_atm_vol_ratio']):
                continue

            intraday_vcs_output = ic.get_intraday_vcs(
                report_date=self.report_date,
                ticker1=vcs_pairs.loc[i, 'ticker1'],
                ticker2=vcs_pairs.loc[i, 'ticker2'],
                atm_vol_ratio=vcs_pairs.loc[i, 'current_atm_vol_ratio'])
            vcs_pairs.loc[i, 'QC'] = intraday_vcs_output['Q']
            vcs_pairs.loc[i, 'Q1C'] = intraday_vcs_output['Q1']
            vcs_pairs.loc[i, 'validQ'] = intraday_vcs_output['validQ']

        writer = pd.ExcelWriter(r'C:\Research\daily\kuzu.xlsx')  # raw string avoids escape issues in the Windows path
        vcs_pairs.to_excel(writer, 'Sheet1')
        writer.save()
        self.vcs_pairs = vcs_pairs
        self.prepare_orders()
Example #12
def run_bindetect(args):
    """ Main function to run bindetect algorithm with input files and parameters given in args """

    #Checking input and setting cond_names
    check_required(args, ["signals", "motifs", "genome", "peaks"])
    args.cond_names = [
        os.path.basename(os.path.splitext(bw)[0]) for bw in args.signals
    ] if args.cond_names is None else args.cond_names
    args.outdir = os.path.abspath(args.outdir)

    #Set output files
    states = ["bound", "unbound"]
    outfiles = [
        os.path.abspath(
            os.path.join(args.outdir, "*", "beds",
                         "*_{0}_{1}.bed".format(condition, state)))
        for (condition, state) in itertools.product(args.cond_names, states)
    ]
    outfiles.append(
        os.path.abspath(os.path.join(args.outdir, "*", "beds", "*_all.bed")))
    outfiles.append(
        os.path.abspath(
            os.path.join(args.outdir, "*", "plots", "*_log2fcs.pdf")))
    outfiles.append(
        os.path.abspath(os.path.join(args.outdir, "*", "*_overview.txt")))
    outfiles.append(
        os.path.abspath(os.path.join(args.outdir, "*", "*_overview.xlsx")))

    outfiles.append(
        os.path.abspath(
            os.path.join(args.outdir, args.prefix + "_distances.txt")))
    outfiles.append(
        os.path.abspath(os.path.join(args.outdir,
                                     args.prefix + "_results.txt")))
    outfiles.append(
        os.path.abspath(
            os.path.join(args.outdir, args.prefix + "_results.xlsx")))
    outfiles.append(
        os.path.abspath(os.path.join(args.outdir,
                                     args.prefix + "_figures.pdf")))

    #-------------------------------------------------------------------------------------------------------------#
    #-------------------------------------------- Setup logger and pool ------------------------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    logger = TobiasLogger("BINDetect", args.verbosity)
    logger.begin()

    parser = add_bindetect_arguments(argparse.ArgumentParser())
    logger.arguments_overview(parser, args)
    logger.output_files(outfiles)

    # Setup pool
    args.cores = check_cores(args.cores, logger)
    writer_cores = max(1, int(args.cores * 0.1))
    worker_cores = max(1, args.cores - writer_cores)
    logger.debug("Worker cores: {0}".format(worker_cores))
    logger.debug("Writer cores: {0}".format(writer_cores))

    pool = mp.Pool(processes=worker_cores)
    writer_pool = mp.Pool(processes=writer_cores)

    #-------------------------------------------------------------------------------------------------------------#
    #-------------------------- Pre-processing data: Reading motifs, sequences, peaks ----------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    logger.info("----- Processing input data -----")

    #Check that cond_names are the right length and are unique:
    if len(args.cond_names) != len(args.signals):
        logger.error(
            "The given number of given '--cond-names' ({0}) differ from the given input '--signals' ({1}). Please enter one condition name per signal."
            .format(len(args.cond_names), len(args.signals)))
        sys.exit(1)

    if len(args.cond_names) != len(set(args.cond_names)):
        logger.error(
            "The condition names are not unique ({0}). Please use --cond-names to set a unique set of condition names."
            .format(args.cond_names))
        sys.exit(1)

    #Check opening/writing of files
    logger.info("Checking reading/writing of files")
    check_files([args.signals, args.motifs, args.genome, args.peaks],
                action="r")
    check_files(outfiles[-3:], action="w")
    make_directory(args.outdir)

    #Comparisons between conditions
    no_conditions = len(args.signals)
    if args.time_series:
        comparisons = list(zip(args.cond_names[:-1], args.cond_names[1:]))
        args.comparisons = comparisons
    else:
        comparisons = list(itertools.combinations(args.cond_names,
                                                  2))  #all-against-all
        args.comparisons = comparisons

    #Pdf for debug output
    if args.debug:
        debug_out = os.path.abspath(
            os.path.join(args.outdir, args.prefix + "_debug.pdf"))
        debug_pdf = PdfPages(debug_out, keep_empty=True)

    #Open figure pdf and write overview
    fig_out = os.path.abspath(
        os.path.join(args.outdir, args.prefix + "_figures.pdf"))
    figure_pdf = PdfPages(fig_out, keep_empty=True)

    plt.figure()
    plt.axis('off')
    plt.text(0.5,
             0.8,
             "BINDETECT FIGURES",
             ha="center",
             va="center",
             fontsize=20)

    #output and order
    titles = []
    titles.append("Raw score distributions")
    if no_conditions > 1 and args.norm_off == False:
        titles.append("Normalized score distributions")
    if args.debug:
        for (cond1, cond2) in comparisons:
            titles.append("Background log2FCs ({0} / {1})".format(
                cond1, cond2))

    for (cond1, cond2) in comparisons:
        titles.append("BINDetect plot ({0} / {1})".format(cond1, cond2))

    plt.text(0.1,
             0.6,
             "\n".join([
                 "Page {0}) {1}".format(i + 2, titles[i])
                 for i in range(len(titles))
             ]) + "\n\n",
             va="top")
    figure_pdf.savefig(bbox_inches='tight')
    plt.close()

    ################# Read peaks ################
    #Read peak and peak_header
    logger.info("Reading peaks")
    peaks = RegionList().from_bed(args.peaks)
    logger.info("- Found {0} regions in input peaks".format(len(peaks)))

    #Check number of columns in peaks
    n_cols = len(peaks[0])
    for i, peak in enumerate(peaks):
        if len(peak) != n_cols:
            logger.error(
                "The lines in --peaks have a varying number of columns. Line 1 has {0} columns, but line {1} has {2} columns! Please adjust the format of this file to run TOBIAS BINDetect."
                .format(n_cols, i + 1, len(peak)))
            sys.exit(1)

    #Merge overlapping peaks
    peaks = peaks.merge()
    logger.info("- Merged to {0} regions".format(len(peaks)))

    if len(peaks) == 0:
        logger.error("Input --peaks file is empty!")
        sys.exit(1)

    #Read header and check match with number of peak columns
    peak_columns = len(peaks[0])  #number of columns
    logger.debug("--peaks have {0} columns".format(peak_columns))
    if args.peak_header is not None:
        content = open(args.peak_header, "r").read()
        args.peak_header_list = content.split()
        logger.debug("Peak header: {0}".format(args.peak_header_list))

        #Check whether peak header fits with number of peak columns
        if len(args.peak_header_list) != peak_columns:
            logger.error(
                "Length of --peak_header ({0}) does not fit number of columns in --peaks ({1})."
                .format(len(args.peak_header_list), peak_columns))
            sys.exit(1)
    else:
        args.peak_header_list = ["peak_chr", "peak_start", "peak_end"] + [
            "additional_" + str(num + 1) for num in range(peak_columns - 3)
        ]
    logger.debug("Peak header list: {0}".format(args.peak_header_list))

    ################# Check for match between peaks and fasta/bigwig #################
    logger.info(
        "Checking for match between --peaks and --fasta/--signals boundaries")
    logger.info("- Comparing peaks to {0}".format(args.genome))
    fasta_obj = pysam.FastaFile(args.genome)
    fasta_boundaries = dict(zip(fasta_obj.references, fasta_obj.lengths))
    fasta_obj.close()
    logger.debug("Fasta boundaries: {0}".format(fasta_boundaries))
    peaks = peaks.apply_method(OneRegion.check_boundary, fasta_boundaries,
                               "exit")  #will exit if peaks are outside borders

    #Check boundaries of each bigwig signal individually
    for signal in args.signals:
        logger.info("- Comparing peaks to {0}".format(signal))
        pybw_obj = pybw.open(signal)
        pybw_header = pybw_obj.chroms()
        pybw_obj.close()
        logger.debug("Signal boundaries: {0}".format(pybw_header))
        peaks = peaks.apply_method(OneRegion.check_boundary, pybw_header,
                                   "exit")

    ##### GC content for motif scanning ######
    #Make chunks of regions for multiprocessing
    logger.info("Estimating GC content from peak sequences")
    peak_chunks = peaks.chunks(args.split)
    gc_content_pool = pool.starmap(
        get_gc_content, itertools.product(peak_chunks, [args.genome]))
    gc_content = np.mean(gc_content_pool)  #fraction
    args.gc = gc_content
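    #bg holds the background nucleotide frequencies in A/C/G/T order: the GC fraction is split evenly between C and G, the rest between A and T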
    bg = np.array([(1 - args.gc) / 2.0, args.gc / 2.0, args.gc / 2.0,
                   (1 - args.gc) / 2.0])
    logger.info("- GC content estimated at {0:.2f}%".format(gc_content * 100))

    ################ Get motifs ################
    logger.info("Reading motifs from file")
    motif_list = MotifList()
    args.motifs = expand_dirs(args.motifs)
    for f in args.motifs:
        motif_list += MotifList().from_file(f)  #List of OneMotif objects
    no_pfms = len(motif_list)
    logger.info("- Read {0} motifs".format(no_pfms))

    logger.debug("Getting motifs ready")
    motif_list.bg = bg

    #Set prefixes
    for motif in motif_list:
        motif.set_prefix(args.naming)
        motif.bg = bg

        logger.spam("Getting pssm for motif {0}".format(motif.name))
        motif.get_pssm()

    #Check that prefixes are unique regardless of upper/lower case name
    motif_prefixes = [motif.prefix.upper() for motif in motif_list]
    name_count = Counter(motif_prefixes)
    if max(name_count.values()) > 1:

        duplicated = [key for key, value in name_count.items() if value > 1]
        logger.warning(
            "The motif output names (as given by --naming) are not unique.")
        logger.warning(
            "The following names occur more than once: {0}".format(duplicated))
        logger.warning(
            "These motifs will be renamed with '_1', '_2' etc. To prevent this renaming, please make the names of the input --motifs unique"
        )

        motif_count = {dup_motif: 1 for dup_motif in duplicated}
        for i, motif in enumerate(motif_list):
            if motif.prefix.upper() in duplicated:

                original_name = motif.prefix
                motif.prefix = motif.prefix + "_{0}".format(
                    motif_count[motif.prefix.upper()]
                )  #Add number to make prefix unique
                logger.debug("Renamed motif {0}: {1} -> {2}".format(
                    i + 1, original_name, motif.prefix))
                motif_count[original_name.upper()] += 1

    motif_names = [motif.prefix for motif in motif_list]

    #Get threshold for motifs
    logger.debug("Getting match threshold per motif")
    outlist = pool.starmap(OneMotif.get_threshold,
                           itertools.product(motif_list, [args.motif_pvalue]))

    logger.spam(motif_list)

    motif_list = MotifList(outlist)
    for motif in motif_list:
        logger.debug("Motif {0}: threshold {1}".format(motif.name,
                                                       motif.threshold))

    logger.info("Creating folder structure for each TF")
    for TF in motif_names:
        logger.spam("Creating directories for {0}".format(TF))
        make_directory(os.path.join(args.outdir, TF))
        make_directory(os.path.join(args.outdir, TF, "beds"))
        make_directory(os.path.join(args.outdir, TF, "plots"))

    #-------------------------------------------------------------------------------------------------------------#
    #----------------------------------------- Plot logos for all motifs -----------------------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    logo_filenames = {
        motif.prefix: os.path.join(args.outdir, motif.prefix,
                                   motif.prefix + ".png")
        for motif in motif_list
    }

    logger.info("Plotting sequence logos for each motif")
    task_list = [
        pool.apply_async(OneMotif.logo_to_file, (
            motif,
            logo_filenames[motif.prefix],
        )) for motif in motif_list
    ]
    monitor_progress(task_list, logger)
    results = [task.get() for task in task_list]
    logger.comment("")

    logger.debug("Getting base64 strings per motif")
    for motif in motif_list:
        #motif.get_base()
        with open(logo_filenames[motif.prefix], "rb") as png:
            motif.base = base64.b64encode(png.read()).decode("utf-8")

    #-------------------------------------------------------------------------------------------------------------#
    #--------------------- Motif scanning: Find binding sites and match to footprint scores ----------------------#
    #-------------------------------------------------------------------------------------------------------------#

    logger.comment("")
    logger.start_logger_queue(
    )  #start process for listening and handling through the main logger queue
    args.log_q = logger.queue  #queue for multiprocessing logging
    manager = mp.Manager()
    logger.info("Scanning for motifs and matching to signals...")

    #Create writer queues for bed-file output
    logger.debug("Setting up writer queues")
    qs_list = []
    writer_qs = {}

    #writer_queue = create_writer_queue(key2file, writer_cores)
    #writer_queue.stop()	#wait until all are done

    manager = mp.Manager()
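    #Split the TF names round-robin across the writer cores, so each writer process handles the bed files of its own subset of TFs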
    TF_names_chunks = [
        motif_names[i::writer_cores] for i in range(writer_cores)
    ]
    writer_tasks = []
    for TF_names_sub in TF_names_chunks:
        logger.debug("Creating writer queue for {0}".format(TF_names_sub))
        files = [
            os.path.join(args.outdir, TF, "beds", TF + ".tmp")
            for TF in TF_names_sub
        ]

        q = manager.Queue()
        qs_list.append(q)

        writer_tasks.append(
            writer_pool.apply_async(file_writer,
                                    args=(q, dict(zip(TF_names_sub,
                                                      files)), args))
        )  #, callback = lambda x: finished.append(x) print("Writing time: {0}".format(x)))
        for TF in TF_names_sub:
            writer_qs[TF] = q
    writer_pool.close()  #no more jobs applied to writer_pool

    #todo: use run_parallel
    #Start working on data
    if worker_cores == 1:
        logger.debug("Running with cores = 1")
        results = []
        for chunk in peak_chunks:
            results.append(
                scan_and_score(chunk, motif_list, args, args.log_q, writer_qs))

    else:
        logger.debug("Sending jobs to worker pool")

        task_list = [
            pool.apply_async(scan_and_score, (
                chunk,
                motif_list,
                args,
                args.log_q,
                writer_qs,
            )) for chunk in peak_chunks
        ]
        monitor_progress(task_list, logger)
        results = [task.get() for task in task_list]

    logger.info("Done scanning for TFBS across regions!")
    #logger.stop_logger_queue()	#stop the listening process (wait until all was written)

    #--------------------------------------#
    logger.info("Waiting for bedfiles to write")

    #Stop all queues for writing
    logger.debug("Stop all queues by inserting None")
    for q in qs_list:
        q.put((None, None))

    #Wait for all writer tasks to finish
    finished = 0
    while finished == 0:
        logger.debug("Writer task return status: {0}".format(
            [task.get() if task.ready() else "NA" for task in writer_tasks]))
        if sum([task.ready() for task in writer_tasks]) == len(writer_tasks):
            finished = 1
            return_codes = [task.get() for task in writer_tasks]
            if sum(return_codes) != 0:
                logger.error(
                    "Bedfile writer finished with an error ({0})".format(
                        return_codes))
            else:
                logger.debug("Bedfile writer(s) finished!")
        time.sleep(0.5)

    logger.debug("Joining bed_writer queues")
    for i, q in enumerate(qs_list):
        logger.debug("- Queue {0} (size {1})".format(i, q.qsize()))

    #Waits until all queues are closed
    writer_pool.join()

    #-------------------------------------------------------------------------------------------------------------#
    #---------------------------------- Process information on background scores  --------------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    logger.info("Merging results from subsets")
    background = merge_dicts([result[0] for result in results])
    TF_overlaps = merge_dicts([result[1] for result in results])
    results = None

    #Add missing TF overlaps (if some TFs had zero sites)
    for TF1 in motif_list:
        if TF1.prefix not in TF_overlaps:
            TF_overlaps[TF1.prefix] = 0

        for TF2 in motif_list:
            tup = (TF1.prefix, TF2.prefix)
            if tup not in TF_overlaps:
                TF_overlaps[tup] = 0

    #Collect sampled background values
    for bigwig in args.cond_names:
        background["signal"][bigwig] = np.array(background["signal"][bigwig])

    #Check how many values were fetched from background
    n_bg_values = len(background["signal"][args.cond_names[0]])
    logger.debug("Collected {0} values from background".format(n_bg_values))
    if n_bg_values < 1000:
        err_str = "Number of background values collected from peaks is low (={0}) ".format(
            n_bg_values)
        err_str += "- this affects estimation of the bound/unbound threshold and the normalization between conditions. "
        err_str += "To improve this estimation, please run BINDetect with --peaks = the full peak set across all conditions."
        logger.warning(err_str)

    #Plot score distribution
    fig = plot_score_distribution(
        [background["signal"][bigwig] for bigwig in args.cond_names],
        labels=args.cond_names,
        title="Raw scores per condition")
    figure_pdf.savefig(fig, bbox_inches='tight')
    plt.close()

    #Normalize arrays
    args.norm_objects = {}
    if args.norm_off == True or len(args.cond_names) == 1:  #if norm_off or length of cond is 1 - create constant normalization
        for bigwig in args.cond_names:
            args.norm_objects[bigwig] = ArrayNorm(
                "constant", popt=1.0, value_min=0, value_max=1
            )  #no normalization; min/max don't matter for constant norm

    else:
        logger.comment("")
        logger.info("Normalizing scores across conditions")

        list_of_vals = [
            background["signal"][bigwig] for bigwig in args.cond_names
        ]
        if args.debug:
            args.norm_objects = quantile_normalization(list_of_vals,
                                                       args.cond_names,
                                                       pdfpages=debug_pdf,
                                                       logger=logger)
        else:
            args.norm_objects = quantile_normalization(list_of_vals,
                                                       args.cond_names,
                                                       logger=logger)

        #Normalize background and visualize score distribution
        for bigwig in args.cond_names:

            original = background["signal"][bigwig]

            #Check for nan
            logger.debug("Background nans ({0}): {1}".format(
                bigwig, sum(np.isnan(original))))
            normalized = args.norm_objects[bigwig].normalize(original)

            #Replace negative values with 0
            negatives = normalized < 0
            normalized[negatives] = 0

            background["signal"][bigwig] = normalized
            logger.debug(
                "Background nans after normalization ({0}): {1}".format(
                    bigwig, sum(np.isnan(background["signal"][bigwig]))))

        fig = plot_score_distribution(
            [background["signal"][bigwig] for bigwig in args.cond_names],
            labels=args.cond_names,
            title="Normalized scores per condition")
        figure_pdf.savefig(fig, bbox_inches='tight')
        plt.close()

    #-------------------------------------------------------------------------------------------------------------#
    #-------------------------------------- Estimate bound/unbound threshold -------------------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    logger.info("Estimating bound/unbound threshold")

    #Prepare scores (remove 0's etc.)
    bg_values = np.array([
        background["signal"][bigwig] for bigwig in args.cond_names
    ]).flatten()  #scores from all conditions
    logger.debug("Size of background array collected: {0}".format(
        bg_values.size))
    bg_values = bg_values[np.logical_not(np.isclose(
        bg_values, 0.0))]  #only non-zero counts
    logger.debug("Size of background array after filtering > 0: {0}".format(
        bg_values.size))
    if len(bg_values) == 0:
        logger.error(
            "Error processing bigwig scores from background. It could be that there are no scores in the bigwig (= all scores are 0) assigned for the peaks. Please check your input files."
        )
        sys.exit(1)

    x_max = np.percentile(bg_values, [99])
    bg_values = bg_values[bg_values < x_max]
    logger.debug(
        "Size of background array after filtering < x_max ({0}): {1}".format(
            x_max, bg_values.size))

    #Fit mixture of normals
    log_vals = np.log(bg_values).reshape(-1, 1)
    lowest_bic = np.inf
    for n_components in [2]:  #2 components; one for 0's and one for true signal
        gmm = sklearn.mixture.GaussianMixture(n_components=n_components,
                                              random_state=1)
        gmm.fit(log_vals)

        bic = gmm.bic(log_vals)
        logger.debug("n_compontents: {0} | bic: {1}".format(n_components, bic))
        if bic < lowest_bic:
            lowest_bic = bic
            best_gmm = gmm
    gmm = best_gmm

    #Obtain parameters for each component
    means = gmm.means_.flatten()
    stds = np.sqrt(gmm.covariances_).flatten()

    #Plot components for debugging
    if args.debug:

        fig, ax = plt.subplots(nrows=2, ncols=1, constrained_layout=True)

        #Plot background distribution
        ax[0].hist(log_vals, bins='auto', density=True,
                   color="grey")  #log space
        ax[1].hist(bg_values, bins='auto', density=True,
                   color="grey")  #normal space

        #Plot components
        x_log = np.linspace(np.min(log_vals), np.max(log_vals), 1000)
        x_norm = np.exp(x_log)
        for i in range(len(means)):
            pdf = scipy.stats.norm.pdf(x_log, loc=means[i], scale=stds[i])
            ax[0].plot(x_log, pdf, label="Component {0}".format(i + 1))

            #Plot component in normal space
            log_params = scipy.stats.lognorm.fit(bg_values,
                                                 f0=stds[i],
                                                 fscale=np.exp(means[i]))
            pdf = scipy.stats.lognorm.pdf(x_norm, *log_params)
            ax[1].plot(x_norm, pdf, label="Component {0}".format(i + 1))

        ax[0].set_title("Background score distribution")
        ax[0].set_xlabel("log(background score)")
        ax[0].set_ylabel("Density")
        ax[0].legend()

        ax[1].set_xlabel("Background score")
        ax[1].set_ylabel("Density")
        ax[1].legend()

        debug_pdf.savefig(fig)
        plt.close()

    #Extract most-right gaussian
    chosen_i = np.argmax(means)  #Mixture with largest mean
    log_params = scipy.stats.lognorm.fit(bg_values,
                                         f0=stds[chosen_i],
                                         fscale=np.exp(means[chosen_i]))

    #Mode of distribution
    mode = scipy.optimize.fmin(
        lambda x: -scipy.stats.lognorm.pdf(x, *log_params), 0, disp=False)[0]
    logger.debug("- Mode estimated at: {0}".format(mode))
    pseudo = mode / 2.0  #pseudo is half the mode
    args.pseudo = pseudo
    logger.debug("Pseudocount estimated at: {0}".format(round(args.pseudo, 5)))

    # Estimate theoretical normal for threshold
    leftside_x = np.linspace(
        scipy.stats.lognorm(*log_params).ppf([0.01]), mode, 100)
    leftside_pdf = scipy.stats.lognorm.pdf(leftside_x, *log_params)

    #Flip over
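    #Mirror the left half of the log-normal (up to its mode) into a symmetric pseudo-distribution; a scaled normal is then fitted to it below and used as the null for the bound/unbound threshold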
    leftside_x_scale = leftside_x - np.min(leftside_x)  #scale to min 0
    mirrored_x = np.concatenate(
        [leftside_x, np.max(leftside_x) + leftside_x_scale]).flatten()
    mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]]).flatten()
    popt, cov = scipy.optimize.curve_fit(
        lambda x, std, sc: sc * scipy.stats.norm.pdf(x, mode, std), mirrored_x,
        mirrored_pdf)
    norm_params = (mode, popt[0])
    logger.debug("Theoretical normal parameters: {0}".format(norm_params))

    #Set threshold for bound/unbound
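    #The threshold is the (1 - bound_pvalue) quantile of the fitted theoretical normal; footprint scores above it are treated as bound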
    threshold = round(
        scipy.stats.norm.ppf(1 - args.bound_pvalue, *norm_params), 5)

    args.thresholds = {bigwig: threshold for bigwig in args.cond_names}
    logger.stats("- Threshold estimated at: {0}".format(threshold))

    #Only plot if args.debug is True
    if args.debug:

        #Plot mirrored data
        fig, ax = plt.subplots(1, 1)
        ax.hist(bg_values[bg_values < x_max],
                bins='auto',
                density=True,
                label="Observed score distribution")
        ax.plot(mirrored_x, mirrored_pdf, color="black")
        plt.xlabel("Bigwig score")
        plt.title("Theoretical normal")
        debug_pdf.savefig(fig)
        plt.close(fig)

        #Plot fit and threshold
        fig, ax = plt.subplots(1, 1)
        ax.hist(bg_values[bg_values < x_max],
                bins='auto',
                density=True,
                label="Observed score distribution")

        xvals = np.linspace(0, x_max, 1000)
        log_probas = scipy.stats.lognorm.pdf(xvals, *log_params)
        ax.plot(xvals, log_probas, label="Log-normal fit", color="orange")

        #Theoretical normal
        norm_probas = scipy.stats.norm.pdf(xvals, *norm_params)
        ax.plot(xvals,
                norm_probas * (np.max(log_probas) / np.max(norm_probas)),
                color="grey",
                linestyle="--",
                label="Theoretical normal")

        ax.axvline(threshold, color="black", label="Bound/unbound threshold")
        ymax = plt.ylim()[1]
        ax.text(threshold, ymax, "\n {0:.3f}".format(threshold), va="top")

        #Decorate plot
        plt.title("Score distribution")
        plt.xlabel("Bigwig score")
        plt.ylabel("Density")
        plt.legend(fontsize=8)
        plt.xlim((0, x_max))

        debug_pdf.savefig(fig)
        plt.close(fig)

    #-------------------------------------------------------------------------------------------------------------#
    #--------------------------------------- Foldchanges between conditions --------------------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    logger.comment("")
    log2fc_params = {}
    if len(args.signals) > 1:
        logger.info(
            "Calculating background log2 fold-changes between conditions")

        for (bigwig1, bigwig2) in comparisons:  #cond1, cond2
            logger.info("- {0} / {1}".format(bigwig1, bigwig2))

            #Estimate background log2fc
            scores1 = np.copy(background["signal"][bigwig1])
            scores2 = np.copy(background["signal"][bigwig2])

            included = np.logical_or(scores1 > 0, scores2 > 0)
            scores1 = scores1[included]
            scores2 = scores2[included]

            #Calculate background log2fc normal distribution
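            #The pseudocount (half the background mode) keeps the ratio defined when one of the conditions has a score of zero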
            log2fcs = np.log2(
                np.true_divide(scores1 + args.pseudo, scores2 + args.pseudo))

            lower, upper = np.percentile(log2fcs, [1, 99])
            log2fcs_fit = log2fcs[np.logical_and(log2fcs >= lower,
                                                 log2fcs <= upper)]

            #Decide on diff_dist
            diff_dist = scipy.stats.norm
            norm_params = diff_dist.fit(log2fcs_fit)

            logger.debug(
                "({0} / {1}) Background log2fc distribution: {2}".format(
                    bigwig1, bigwig2, norm_params))
            log2fc_params[(bigwig1, bigwig2)] = norm_params

            #If debug: plot background log2fc to figures
            if args.debug:
                fig, ax = plt.subplots(1, 1)
                plt.hist(log2fcs,
                         density=True,
                         bins='auto',
                         label="Background log2fc ({0} / {1})".format(
                             bigwig1, bigwig2))

                xvals = np.linspace(plt.xlim()[0], plt.xlim()[1], 100)
                pdf = diff_dist.pdf(xvals, *log2fc_params[(bigwig1, bigwig2)])
                plt.plot(xvals, pdf, label="Distribution fit")
                plt.title("Background log2FCs ({0} / {1})".format(
                    bigwig1, bigwig2))
                plt.xlabel("Log2 fold change")
                plt.ylabel("Density")

                debug_pdf.savefig(fig, bbox_inches='tight')
                plt.close()

                #f = open(os.path.join(args.outdir, "{0}_{1}_log2fcs.txt".format(bigwig1, bigwig2)), "w")
                #f.write("\n".join([str(val) for val in log2fcs]))
                #f.close()

    background = None  #free up space

    #-------------------------------------------------------------------------------------------------------------#
    #----------------------------- Read total sites per TF to estimate bound/unbound -----------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    logger.comment("")
    logger.info("Processing scanned TFBS individually")

    #Getting bindetect table ready
    info_columns = ["total_tfbs"]
    info_columns.extend([
        "{0}_{1}".format(cond, metric)
        for (cond, metric
             ) in itertools.product(args.cond_names, ["threshold", "bound"])
    ])
    info_columns.extend([
        "{0}_{1}_{2}".format(comparison[0], comparison[1], metric)
        for (comparison,
             metric) in itertools.product(comparisons, ["change", "pvalue"])
    ])

    cols = len(info_columns)
    rows = len(motif_names)
    info_table = pd.DataFrame(np.zeros((rows, cols)),
                              columns=info_columns,
                              index=motif_names)

    #Starting calculations
    results = []
    if args.cores == 1:
        for name in motif_names:
            logger.info("- {0}".format(name))
            results.append(process_tfbs(name, args, log2fc_params))
    else:
        logger.debug("Sending jobs to worker pool")

        task_list = [
            pool.apply_async(process_tfbs, (
                name,
                args,
                log2fc_params,
            )) for name in motif_names
        ]
        monitor_progress(task_list,
                         logger)  #will not exit before all jobs are done
        results = [task.get() for task in task_list]

    logger.info("Concatenating results from subsets")
    info_table = pd.concat(results)  #pandas tables

    pool.terminate()
    pool.join()

    logger.stop_logger_queue()

    #-------------------------------------------------------------------------------------------------------------#
    #------------------------------------------------ Cluster TFBS -----------------------------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    clustering = RegionCluster(TF_overlaps)
    clustering.cluster()

    #Convert full ids to alt ids
    convert = {motif.prefix: motif.name for motif in motif_list}
    for cluster in clustering.clusters:
        for name in convert:
            clustering.clusters[cluster]["cluster_name"] = clustering.clusters[
                cluster]["cluster_name"].replace(name, convert[name])

    #Write out distance matrix
    matrix_out = os.path.join(args.outdir, args.prefix + "_distances.txt")
    clustering.write_distance_mat(matrix_out)

    #-------------------------------------------------------------------------------------------------------------#
    #----------------------------------------- Write all_bindetect file ------------------------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    logger.comment("")
    logger.info("Writing all_bindetect files")

    #Add columns of name / motif_id / prefix
    names = []
    ids = []
    for prefix in info_table.index:
        motif = [motif for motif in motif_list if motif.prefix == prefix]
        names.append(motif[0].name)
        ids.append(motif[0].id)

    info_table.insert(0, "output_prefix", info_table.index)
    info_table.insert(1, "name", names)
    info_table.insert(2, "motif_id", ids)

    #info_table.insert(3, "motif_logo", [os.path.join("motif_logos", os.path.basename(logo_filenames[prefix])) for prefix in info_table["output_prefix"]])	#add relative path to logo

    #Add cluster to info_table
    cluster_names = []
    for name in info_table.index:
        for cluster in clustering.clusters:
            if name in clustering.clusters[cluster]["member_names"]:
                cluster_names.append(
                    clustering.clusters[cluster]["cluster_name"])

    info_table.insert(3, "cluster", cluster_names)

    #Cluster table on motif clusters
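    #Each cluster row becomes the column-wise mean over its member motifs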
    info_table_clustered = info_table.groupby(
        "cluster").mean()  #mean of each column
    info_table_clustered.reset_index(inplace=True)

    #Map correct type
    info_table["total_tfbs"] = info_table["total_tfbs"].map(int)
    for condition in args.cond_names:
        info_table[condition + "_bound"] = info_table[condition +
                                                      "_bound"].map(int)

    #### Write excel ###
    bindetect_excel = os.path.join(args.outdir, args.prefix + "_results.xlsx")
    writer = pd.ExcelWriter(bindetect_excel, engine='xlsxwriter')

    #Tables
    info_table.to_excel(writer, index=False, sheet_name="Individual motifs")
    info_table_clustered.to_excel(writer,
                                  index=False,
                                  sheet_name="Motif clusters")

    for sheet in writer.sheets:
        worksheet = writer.sheets[sheet]
        n_rows = worksheet.dim_rowmax
        n_cols = worksheet.dim_colmax
        worksheet.autofilter(0, 0, n_rows, n_cols)
    writer.save()

    #Format comparisons
    for (cond1, cond2) in comparisons:
        base = cond1 + "_" + cond2
        info_table[base + "_change"] = info_table[base + "_change"].round(5)
        info_table[base + "_pvalue"] = info_table[base + "_pvalue"].map(
            "{:.5E}".format, na_action="ignore")

    #Write bindetect results tables
    #info_table.insert(0, "TF_name", info_table.index)	 #Set index as first column
    bindetect_out = os.path.join(args.outdir, args.prefix + "_results.txt")
    info_table.to_csv(bindetect_out,
                      sep="\t",
                      index=False,
                      header=True,
                      na_rep="NA")

    #-------------------------------------------------------------------------------------------------------------#
    #------------------------------------------- Make BINDetect plot ---------------------------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    if no_conditions > 1:
        logger.info("Creating BINDetect plot(s)")

        #Fill NAs from info_table to enable plotting of log2fcs (NA -> 0 change)
        change_cols = [col for col in info_table.columns if "_change" in col]
        pvalue_cols = [col for col in info_table.columns if "_pvalue" in col]
        info_table[change_cols] = info_table[change_cols].fillna(0)
        info_table[pvalue_cols] = info_table[pvalue_cols].fillna(1)

        #Plotting bindetect per comparison
        for (cond1, cond2) in comparisons:

            logger.info("- {0} / {1} (static plot)".format(cond1, cond2))
            base = cond1 + "_" + cond2

            #Define which motifs to show
            xvalues = info_table[base + "_change"].astype(float)
            yvalues = info_table[base + "_pvalue"].astype(float)
            y_min = np.percentile(yvalues[yvalues > 0],
                                  5)  #5% smallest pvalues
            x_min, x_max = np.percentile(
                xvalues, [5, 95])  #5% smallest and largest changes

            #Fill motifs with metadata (.change, .pvalue, .logpvalue etc.)
            for motif in motif_list:
                name = motif.prefix
                motif.change = float(
                    info_table.at[name, base +
                                  "_change"])  #change for this comparison
                motif.pvalue = float(
                    info_table.at[name, base +
                                  "_pvalue"])  #pvalue for this comparison
                motif.logpvalue = -np.log10(
                    motif.pvalue) if motif.pvalue > 0 else -np.log10(1e-308)

                #Assign each motif to group
                if motif.change < x_min or motif.change > x_max or motif.pvalue < y_min:
                    if motif.change < 0:
                        motif.group = cond2 + "_up"
                    if motif.change > 0:
                        motif.group = cond1 + "_up"
                else:
                    motif.group = "n.s."

            #Bindetect plot
            fig = plot_bindetect(motif_list, clustering, [cond1, cond2], args)
            figure_pdf.savefig(fig, bbox_inches='tight')
            plt.close(fig)

            #Interactive BINDetect plot
            logger.info("- {0} / {1} (interactive plot)".format(cond1, cond2))
            html_out = os.path.join(args.outdir, "bindetect_" + base + ".html")
            plot_interactive_bindetect(motif_list, [cond1, cond2], html_out)

    #-------------------------------------------------------------------------------------------------------------#
    #----------------------------- Make heatmap across conditions (for debugging)---------------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    if args.debug and len(args.signals) > 1:
        logger.info("Plotting heatmap across conditions for debugging")
        mean_columns = [cond + "_mean_score" for cond in args.cond_names]
        heatmap_table = info_table[mean_columns]
        heatmap_table.index = info_table["output_prefix"]

        #Decide fig size
        rows, cols = heatmap_table.shape
        figsize = (7 + cols, max(10, rows / 8.0))
        cm = sns.clustermap(
            heatmap_table,
            figsize=figsize,
            z_score=0,  #zscore for rows
            col_cluster=False,  #do not cluster condition columns
            yticklabels=True,  #show all row annotations
            xticklabels=True,
            cbar_pos=(0, 0, .4, .005),
            dendrogram_ratio=(0.3, 0.01),
            cbar_kws={
                "orientation": "horizontal",
                'label': 'Row z-score'
            },
            method="single")

        #Adjust width of columns
        #hm = cm.ax_heatmap.get_position()
        #cm.ax_heatmap.set_position([hm.x0, hm.y0, cols * 3 * hm.height / rows, hm.height]) 	#aspect should be equal

        plt.setp(cm.ax_heatmap.get_xticklabels(),
                 fontsize=8,
                 rotation=45,
                 ha="right")
        plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5)

        cm.ax_col_dendrogram.set_title('Mean scores across conditions',
                                       fontsize=20)
        cm.ax_heatmap.set_ylabel("Transcription factor motifs",
                                 fontsize=15,
                                 rotation=270)
        #cm.ax_heatmap.set_title('Conditions')
        #cm.fig.suptitle('Mean scores across conditions')
        #cm.cax.set_visible(False)

        #Save to output pdf
        plt.tight_layout()
        debug_pdf.savefig(cm.fig, bbox_inches='tight')
        plt.close(cm.fig)

    #-------------------------------------------------------------------------------------------------------------#
    #-------------------------------------------------- Wrap up---------------------------------------------------#
    #-------------------------------------------------------------------------------------------------------------#

    if args.debug:
        debug_pdf.close()

    figure_pdf.close()
    logger.end()
Ejemplo n.º 13
0
# Program file Pex18_2.py
import numpy as np
import pandas as pd
y = np.array([4.81, 4.8, 4.73, 4.7, 4.7, 4.73, 4.75, 4.75, 5.43, 5.78, 5.85])


def ExpMove(y, a):
    n = len(y)
    M = np.zeros(n)
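    # initialize the first smoothed value as the mean of the first two observations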
    M[0] = (y[0] + y[1]) / 2
    for i in range(1, len(y)):
        M[i] = a * y[i - 1] + (1 - a) * M[i - 1]
    return M


yt1 = ExpMove(y, 0.2)
yt2 = ExpMove(y, 0.5)
yt3 = ExpMove(y, 0.8)
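# s1-s3: root-mean-square error of the smoothed series for a = 0.2, 0.5 and 0.8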
s1 = np.sqrt(((y - yt1)**2).mean())
s2 = np.sqrt(((y - yt2)**2).mean())
s3 = np.sqrt(((y - yt3)**2).mean())
d = pd.DataFrame(np.c_[yt1, yt2, yt3])
f = pd.ExcelWriter("Pdata18_2.xlsx")
d.to_excel(f)
f.close()  # write the data to an Excel file, for easy tabulation
print("The standard errors of the forecasts are:", s1, s2, s3)  # print the standard errors of the forecasts
yh = 0.8 * y[-1] + 0.2 * yt3[-1]
print("下一期的预测值为:", yh)
Ejemplo n.º 14
0
        print("No se encuentra el fichero" + str(i))

# Create a single final table
f_table = list_of_dic[0]

for k in range(1, len(list_of_dic)):
    f_table = pd.merge(f_table, list_of_dic[k], on="otu_id", how="outer")

# Split the final table into time intervals and save each sub-table to an Excel sheet
df2 = f_table.iloc[:, 1:40]
df2.insert(0, "otu_id", value=f_table.iloc[:, 0])
df3 = f_table.iloc[:, 40:75]
df3.insert(0, "otu_id", value=f_table.iloc[:, 0])
df4 = f_table.iloc[:, 75:203]
df4.insert(0, "otu_id", value=f_table.iloc[:, 0])
df5 = f_table.iloc[:, 203:]
df5.insert(0, "otu_id", value=f_table.iloc[:, 0])

# Format output
pd.set_option("expand_frame_repr", False)

# Write each dataframe to a different Excel sheet
writer = pd.ExcelWriter('HostLifeStyle_SalivaA_absolute.xlsx')
f_table.to_excel(writer, sheet_name="SalivaA", index=True, na_rep=0)
df2.to_excel(writer, sheet_name="h_SalivaA_Day26to69", index=False, na_rep=0)
df3.to_excel(writer, sheet_name="SalivaA_Day72to122", index=False, na_rep=0)
df4.to_excel(writer, sheet_name="h_SalivaA_Day123to257", index=False, na_rep=0)
df5.to_excel(writer, sheet_name="h_SalivaA_Day258to364", index=False, na_rep=0)

# Close the Pandas Excel writer and save the file
writer.save()
Ejemplo n.º 15
0
p = Path(r'C:\Data\json\backregisterperson.json')
with p.open('r', encoding='utf-8') as f:
    data = json.loads(f.read())
df = pd.json_normalize(data['Persons'])

#df = df.sort_values(["Person.Index"], axis=0, ascending=[True])
#https://thispointer.com/pandas-sort-rows-or-columns-in-dataframe-based-on-values-using-dataframe-sort_values/

df = df.sort_values(by =['Person.Index', 'Timestamp'])

# sorts first by Person.Index, then by Timestamp

#print (df)

writer = pd.ExcelWriter('backregisterpersonchart.xlsx', engine='xlsxwriter')

df.to_excel(writer, sheet_name='Sheet1')

workbook = writer.book

worksheet = writer.sheets['Sheet1']

format1 = workbook.add_format({'num_format': '#,##0'}) # decimal place setting

worksheet.set_column('V:V', 18, format1)

worksheet.set_column('X:X', 18, format1)

worksheet.write_formula('H2', '=COUNTIF(B2:B2180,"<10000")') # Important: use a plain comma here instead of the semicolon shown in Excel
Ejemplo n.º 16
0
for i in range(row_size):
    row_num = ''
    for j in range(index[i]):
        row_num += str(predint[ptr])
        ptr = ptr + 1
    pre_num.append(row_num)

print(pre_num)
fp = open('./imagetxt/9.txt','r')
sourceInLine=fp.readlines()
dataset = []
for line in sourceInLine:
    temp = line.strip('\n')
    dataset.append(temp)

data_excel = pd.read_excel('image9.xlsx')
size = data_excel.shape[0]
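# the recognized digit strings come in groups of three per image: student ID, phone number and ID-card number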
for i in range(0, len(pre_num), 3):
    data_excel.loc[size, '图片名'] = dataset[0]
    data_excel.loc[size, '角点1'] = dataset[1]
    data_excel.loc[size, '角点2'] = dataset[2]
    data_excel.loc[size, '角点3'] = dataset[3]
    data_excel.loc[size, '角点4'] = dataset[4]
    data_excel.loc[size, '学号'] = pre_num[i]
    data_excel.loc[size, '手机号'] = pre_num[i+1]
    data_excel.loc[size, '身份证号'] = pre_num[i+2]
    size = size + 1
writer = pd.ExcelWriter('image9.xlsx')

data_excel.to_excel(writer)
writer.save()
Ejemplo n.º 17
0
                    break

                ax_curr = axs[row, col]
                plot_acf(residuals[bb_tkr], lags=np.arange(100)[1:], ax=ax_curr)

                ax_curr.set_xlabel('')
                ax_curr.set_ylabel('')
                ax_curr.set_title(f"Autocorr: {oot[oot.bb_tkr == bb_tkr].name.values[0]}")

        fig.delaxes(axs[7, 2])
        plt.savefig(f"Autocorr-{model_type_id}.png", dpi=100)  # './reports/figures/'+
        plt.show()


if __name__ == '__main__':
    cftcVariableName = 'cftc'  # * OR cftc_adj
    fcastVariableName = 'forecast'  # *OR 'forecast_adj'

    writer = pd.ExcelWriter(f'Autocorr.xlsx', engine='xlsxwriter')
    autoCorrelationPlots(model_type_ids=[153],
                         cftcVariableName='cftc',
                         fcastVariableName='forecast'
                         )
    cftcVar = 'cftc'  # * OR cftc_adj
    fcastVar = 'forecast'  # *OR 'forecast_adj'

    exclWriter = pd.ExcelWriter(f'Autocorr_.xlsx', engine='xlsxwriter')
    stats = autoCorrelationStatistics(model_type_ids=[153], cftcVariableName=cftcVariableName,
                                      fcastVariableName=fcastVariableName)
    exclWriter.save()
Ejemplo n.º 18
0
		price_df['Compare_'+str(col_idx+1)] = preval_price_df.iloc[:, col_loc + 2]

	except KeyError:

		print('{} is not existing on the dataframe'.format(column_nm))

	except Exception as e:

		print('Unknown error: ' + str(e))

#Write Header to Excel

wb_name = 'OTC_{}_Pricing_Pre_Validation_Report_{}.xlsx'.format(usage_type, keyword)

# Create a Pandas Excel writer using XlsxWriter as the engine.
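# strings_to_numbers=False keeps numeric-looking strings as text instead of converting them to numbers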
writer = pd.ExcelWriter(wb_name, engine='xlsxwriter', options={'strings_to_numbers': False})

# Get the xlsxwriter workbook and worksheet objects.
workbook  = writer.book

# Add custom formats on the workbook.
key_format = workbook.add_format({
		'bold': True,
		'align': 'center',
		'valign': 'vcenter',
		#'text_wrap': True,
		'fg_color': '#f09886',
		'border': 1})

base_format = workbook.add_format({
		'bold': True,
Ejemplo n.º 19
0
def trata_base_Einstein(df):

    ###################################################################################
    # load the spreadsheet
    excel_file = 'dataset.xlsx'

    # full dataframe
    df = pd.read_excel(excel_file, sheet_name="All")
    ###################################################################################

    df1 = df

    # check whether any columns mix strings and numbers
    for column in df1:
        a = list(df1[column].map(type) != str)
        if (len(set(a)) != 1):
            # convert mixed columns to string type
            df1[column] = df1[column].apply(str)

    # check that everything worked
    for column in df1:
        a = list(df1[column].map(type) != str)
        # if (len(set(a)) != 1):
        #     print("remaining mixed values in: " + column)

    ### since the timestamp does not matter in this case, it can be filled with any sequential values ###

    # get the number of rows
    qtde_linhas = len(df1.index)
    # create a column of sequential Timestamps in the first position
    df1.insert(0, "Timestamp",
               pd.date_range(start='1/1/2020', periods=qtde_linhas, freq='H'))

    # get the column names
    colunas = list(df1)

    # drop columns that contain no data at all
    df1 = df1.dropna(axis=1, how='all')

    # drop rows without blood tests (patients missing the Hematocrit test did not take any of the others)
    df1 = df1[df1['Hematocrit'].notna()]

    # fill the remaining empty cells (NaN) with zeros
    df1 = df1.fillna(0)

    # convert categorical data into numbers - example: [normal, absent, present] become [0, 1, 2]
    le = MultiColumnLabelEncoder()
    df1 = le.fit_transform2(df1)

    # define the output columns and convert them to numbers (neg=0 and pos=1)
    lista_out = [
        'Timestamp', 'SARS-Cov-2 exam result',
        'Patient addmited to regular ward (1=yes, 0=no)',
        'Patient addmited to semi-intensive unit (1=yes, 0=no)',
        'Patient addmited to intensive care unit (1=yes, 0=no)'
    ]
    df_out = df1[lista_out]
    # df_out = df_out.replace(['negative','positive'],[0,1])
    lista_out.remove('Timestamp')

    # drop columns that are not useful for the analysis, as well as the output columns
    lista_drop = lista_out
    lista_drop.append('Patient ID')
    df1 = df1.drop(lista_drop, axis=1)
    colunas = list(df1)

    # create the dataframe that will become the xls to upload to B-Zek
    # save it to a new results spreadsheet
    writer = pd.ExcelWriter('base_relevance.xlsx', engine='openpyxl')
    df1.to_excel(writer, sheet_name="INPUTS")
    df_out.to_excel(writer, sheet_name="OUTPUTS")
    writer.save()

    response = {'inputs': df1, 'outputs': df_out}

    return response
Ejemplo n.º 20
0
    tip.append(item.tipo)
    sta.append(item.status)
    lon.append(item.longitud)
    pre.append(item.precipitaciones)
    vien.append(item.viento)
    if item.dirViento == True:
        dirvien.append("A favor")
    else:
        dirvien.append("En contra")

df = pd.DataFrame({
    "Complejidad": comple,
    "Tiempo": tiem,
    "Tipo": tip,
    "status": sta,
    "Longitud": lon,
    "Precipitaciones": pre,
    "Viento": vien,
    "Dirección": dirvien,
    "velocidad": vel,
    "LLantas": stLLan,
    "Gas": gas,
    "Aceite": ace,
    "Eventos": eventos
})

writer = pd.ExcelWriter("demo.xlsx", engine="xlsxwriter")

df.to_excel(writer, sheet_name="Prueba", index=False)

writer.save()
Ejemplo n.º 21
0
attr = [
    'H2-2 length', 'Inlinks', 'Status Code', 'External Outlinks',
    'Crawl Depth', 'Outlinks', 'Unique Inlinks', 'Canonical Link Element 1',
    'Title 1 Length', 'Content', 'H2-1 length', 'Indexability', 'Hash',
    'HTTP rel="prev" 1', 'Meta Description 1', 'H2-1', 'H2-2',
    'URL Encoded Address', 'Last Modified', '% of Total', 'Meta Keyword 1',
    'H1-1', 'X-Robots-Tag 1', 'Unique External Outlinks',
    'Title 1 Pixel Width', 'Meta Robots 1', 'Meta Description 1 Pixel Width',
    'Size (bytes)', 'Text Ratio', 'Unique Outlinks',
    'Meta Description 1 Length', 'Word Count', 'H1-1 length', 'Meta Refresh 1',
    'Link Score', 'HTTP rel="next" 1', 'Response Time', 'rel="prev" 1',
    'Status', 'Redirect URL', 'Title 1', 'Indexability Status',
    'Redirect Type', 'rel="next" 1', 'Meta Keywords 1 Length'
]
print(df)
df = df.fillna(0)
for attribute in attr:
    print(attribute)
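    # keep only the rows where the mobile and desktop values of this attribute differ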
    dfcomp = df.loc[df[attribute + '_mobile'].eq(df[attribute +
                                                    '_desktop']) == False]
    #print(dfcomp)
    if bool(dfcomp.values.tolist()):
        dfcomp = dfcomp[[attribute + '_mobile', attribute + '_desktop']]
        print(dfcomp)
        writer = pd.ExcelWriter(
            'C:\\Users\\lukasz.girzycki\\Desktop\\mobile_desktop_inspiracje\\różnice\\comparison_mobile_desktop'
            + attribute + '.xlsx')
        dfcomp.to_excel(writer)
        writer.save()
    else:
        print('no differences found')
Ejemplo n.º 22
0
        print("welcome")
        occupationList1 = [
            m.group(0) for l in text1 for m in [regex1.search(l)] if m
        ]
        print("Words")
        print(occupationList1)
        if not occupationList1:
            occupationList1 = ["no_occupation"]

        occupationList.append(occupationList1[0])

#Output all fetched occupations

dd = df.iloc[0:, 0:3]
dd['Occupation'] = pd.Series(occupationList, index=df.index)
writer = pd.ExcelWriter('output.xlsx')
dd.to_excel(writer)
writer.save()

print("Final List")
print(occupationList)

#Print count of occupation
counts = collections.Counter(occupationList)
print(counts)

from pytagcloud import create_tag_image, make_tags
from pytagcloud.lang.counter import get_tag_counts
j = ""
for i in occupationList:
    if (i != 'no_occupation'):
Ejemplo n.º 23
0
                if "nofollow" in a['rel']:
                    track_links["Do/No-Follow?"].append("NoFollow")
                else:
                    track_links["Do/No-Follow?"].append("DoFollow")
                track_links["Live?"].append("Yes")
                track_links["Last Check"].append(now.strftime("%d-%m-%Y (%H:%M)"))

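        # pad every tracked column so this row has a value even when the backlink was not found on the page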
        for k, v in track_links.items():
            if k == "Anchor" and len(v) < (i + 1):
                v.append("-")
            if k == "Do/No-Follow?" and len(v) < (i + 1):
                v.append("-")
            if k == "Live?" and len(v) < (i + 1):
                v.append("No")
            if k == "Last Check" and len(v) < (i + 1):
                v.append(now.strftime("%d-%m-%Y (%H:%M)"))



df_final = pd.DataFrame.from_dict(track_links, orient='columns', dtype=None)

writer = pd.ExcelWriter(xl, engine="xlsxwriter")
df_final.to_excel(writer, sheet_name=sys.argv[1])

try:
    writer.save()
except PermissionError as e:
    print("\n",e)
    print("Please close the document before running this script!")
    sys.exit()
Ejemplo n.º 24
0
def start_Pandas_Auto():
    # Text strings used while the script runs
    progress_bar = 'Progress: '
    packing = 'Packing into file...'
    sheet_name = 'Данные рынка на '
    prom_now = "Текущая стоимость акций "
    prom_day_max = 'Дневной максимум '
    prom_day_min = 'Дневной минимум '

    print(progress_bar + '1/5')     # progress counter
    full_page = requests.get(Promotions_Tesla, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_tesla = soup.findAll("span", {"class": "arial_26 inlineblock pid-13994-last"})
    convert_min_tesla = soup.findAll("span", {"class": "inlineblock pid-13994-low"})
    convert_max_tesla = soup.findAll("span", {"class": "inlineblock pid-13994-high"})
    time.sleep(1)

    print(progress_bar + '2/5')
    full_page = requests.get(Promotions_Nissan, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_nissan = soup.findAll("span", {"class": "arial_26 inlineblock pid-44127-last"})
    convert_min_nissan = soup.findAll("span", {"class": "inlineblock pid-44127-low"})
    convert_max_nissan = soup.findAll("span", {"class": "inlineblock pid-44127-high"})
    time.sleep(1)

    print(progress_bar + '3/5')
    full_page = requests.get(Promotions_GM, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_gm = soup.findAll("span", {"class": "arial_26 inlineblock pid-239-last"})
    convert_min_gm = soup.findAll("span", {"class": "inlineblock pid-239-low"})
    convert_max_gm = soup.findAll("span", {"class": "inlineblock pid-239-high"})
    time.sleep(1)

    print(progress_bar + '4/5')
    full_page = requests.get(Promotions_Ford, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_ford = soup.findAll("span", {"class": "arial_26 inlineblock pid-255-last"})
    convert_min_ford = soup.findAll("span", {"class": "inlineblock pid-255-low"})
    convert_max_ford = soup.findAll("span", {"class": "inlineblock pid-255-high"})
    time.sleep(1)

    print(progress_bar + '5/5')
    full_page = requests.get(Promotions_Daimler, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_daimler = soup.findAll("span", {"class": "arial_26 inlineblock pid-355-last"})
    convert_min_daimler = soup.findAll("span", {"class": "inlineblock pid-355-low"})
    convert_max_daimler = soup.findAll("span", {"class": "inlineblock pid-355-high"})
    time.sleep(1)

    hms = datetime.datetime.today()     # date and time
    print(hms.hour, hms.minute, hms.second)
    time_flow = hms.hour, hms.minute, hms.second    # pack into a time tuple
    print(packing)
    # Data template
    data = [
        [prom_now + str("Tesla"), convert_tesla[0].text],   # current share price + latest value
        [prom_day_min, convert_min_tesla[0].text],          # daily low
        [prom_day_max, convert_max_tesla[0].text],          # daily high
        [" ", " "],                                         # spacer row
        [prom_now + str("Nissan"), convert_nissan[0].text],
        [prom_day_min, convert_min_nissan[0].text],
        [prom_day_max, convert_max_nissan[0].text],
        [" ", " "],
        [prom_now + str("General Motors"), convert_gm[0].text],
        [prom_day_min, convert_min_gm[0].text],
        [prom_day_max, convert_max_gm[0].text],
        [" ", " "],
        [prom_now + str("Ford"), convert_ford[0].text],
        [prom_day_min, convert_min_ford[0].text],
        [prom_day_max, convert_max_ford[0].text],
        [" ", " "],
        [prom_now + str("Daimler"), convert_daimler[0].text],
        [prom_day_min, convert_min_daimler[0].text],
        [prom_day_max, convert_max_daimler[0].text],
    ]

    today = date.today()    # current date
    direction = 'Carmakers_'    # file name prefix
    time_now = datetime.datetime.time(datetime.datetime.now())  # current time
    new_data = pd.DataFrame(data).rename_axis(None, axis=1)     # create the dataframe
    file_name = str(direction) + str(today) + '-' + str(time_flow)  # file name
    file_directory = file_name + '.xlsx'    # append the Excel extension to the file name
    new_data.style.hide_index()     # indexes will not be shown
    # Further customization via xlsxwriter
    writer = pd.ExcelWriter(file_directory, engine='xlsxwriter')
    new_data.to_excel(writer, sheet_name=str(sheet_name) + str(today), index=False)

    workbook = writer.book
    worksheet = writer.sheets[str(sheet_name) + str(today)]
    # Formatting parameters for the data in the file
    format_list = workbook.add_format({'border': 0, 'num_format': 'hh:mm:ss', 'size': 14, 'align': 'center'})   # number format, font size, centered alignment
    date_format = workbook.add_format({'num_format': 'mm.dd.yyyy'})     # date format
    worksheet.write('A1', time_now, format_list)    # write the current time to cell A1
    worksheet.write('B1', today, date_format)       # write the current date to cell B1
    format = workbook.add_format({'align': 'left'})

    worksheet.set_landscape()   # landscape page orientation
    # Column settings
    worksheet.set_column('A:A', 40, format)
    worksheet.set_column('B:B', 20, format)

    writer.save()   # save the resulting file
    print('\n''File ' + str(direction) + str(today) + '-' + str(time_flow) + ' has been saved')   # save confirmation message
Ejemplo n.º 25
0
def main():
    """ Main entry point of the app """
    logger.info("CMPC Wide Area Distrution Main Loop")
    Change_Working_Path('../Data')

    Station_filename = 'Station Location a375a0647.xlsx'
    Transformer_filename = 'Power Transformer Asset a7c07a1cb.xlsx'
    Breaker_filename = 'Breaker Asset a475fe18.xlsx'
    Relay_filename = 'Dist Locations w Relays 110620.xls'
    Circuit_Switcher_filename = 'Circuit Switcher Asset a93a3aebd.xlsx'
    Metalclad_Switchgear_filename = 'Metalclad Switchgear Asset aa554c63f.xlsx'
    Transformer_Risk_filename = 'Oncor Transformer Asset Health Export - Risk Matrix - System.csv'
    Summer_Load_Filename = '2021 Load Projections(4-10)Summer - Clean.xlsx'
    Winter_Load_Filename = '2021 Load Projections(4-10)Winter - Clean.xlsx'
    Fault_Reporting_Proiritization_filename = 'Fault Reporting Prioritization_EDOC.XLSX'
    Fault_Reporting_Proiritization_filename1 = 'WDOC Fault Recording Relay Feeder List with Priority v1.1.xlsx'
    Associated_Breaker_Details_filename = 'Transformer Health - Analysis.xlsx'

    Excel_Files = [Station_filename, Transformer_filename, Breaker_filename, Relay_filename,
                   Metalclad_Switchgear_filename, Summer_Load_Filename, Winter_Load_Filename,
                   Fault_Reporting_Proiritization_filename, Fault_Reporting_Proiritization_filename1]

    pool = Pool(processes=15)
    Associated_Breaker_DetailsDF = Excel_to_Pandas(Associated_Breaker_Details_filename, check_update=False,
                                                   SheetName='Associated Breaker Details')
    Associated_Breaker_DetailsDF = Associated_Breaker_DetailsDF[1]
    # Import Excel files
    df_list = pool.map(Excel_to_Pandas, Excel_Files)

    Transformer_RiskDF = Cleanup_Dataframe(pd.read_csv(Transformer_Risk_filename))

    # Data Cleanup
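    # Excel_to_Pandas appears to return (filename, dataframe) pairs; the lookups below pick each dataframe by its source file name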

    AIStationDF = station_df_cleanup(df_list[next(i for i, t in enumerate(df_list) if t[0] == Station_filename)][1],
                                     df_list[next(
                                         i for i, t in enumerate(df_list) if t[0] == Metalclad_Switchgear_filename)][1])

    PowerTransformerDF = transformer_df_cleanup(
        df_list[next(i for i, t in enumerate(df_list) if t[0] == Transformer_filename)][1])
    Outdoor_BreakerDF = breaker_df_cleanup(
        df_list[next(i for i, t in enumerate(df_list) if t[0] == Breaker_filename)][1])
    RelayDataDF = relay_df_cleanup(df_list[next(i for i, t in enumerate(df_list) if t[0] == Relay_filename)][1])
    Summer_LoadDF = summer_load_df_cleanup(
        df_list[next(i for i, t in enumerate(df_list) if t[0] == Summer_Load_Filename)][1])

    Winter_LoadDF = summer_load_df_cleanup(
        df_list[next(i for i, t in enumerate(df_list) if t[0] == Winter_Load_Filename)][1])

    Fault_Reporting_ProiritizationDF = FRP.Fault_Reporting_Proiritization_df_cleanup(
        df_list[next(i for i, t in enumerate(df_list) if t[0] == Fault_Reporting_Proiritization_filename)][1])

    # Create new date in the dataframes
    Fault_Reporting_ProiritizationDF = FRP.Fault_Reporting_Proiritization_df_create_data(
        Fault_Reporting_ProiritizationDF)
    Summer_LoadDF = summer_load_df_create_data(Summer_LoadDF, AIStationDF)
    Winter_LoadDF = summer_load_df_create_data(Winter_LoadDF, AIStationDF)
    AIStationDF = station_df_create_data(AIStationDF, PowerTransformerDF, Outdoor_BreakerDF)
    PowerTransformerDF = transformer_df_create_data(PowerTransformerDF, Transformer_RiskDF, Summer_LoadDF,
                                                    Winter_LoadDF, AIStationDF)
    Outdoor_BreakerDF = breaker_df_create_data(Outdoor_BreakerDF, PowerTransformerDF, Fault_Reporting_ProiritizationDF)
    Outdoor_BreakerDF = Add_Associated_XMR_Details(Outdoor_BreakerDF, Associated_Breaker_DetailsDF)
    RelayDataDF = relay_df_create_data(RelayDataDF)
    AIStationDF = add_Risk_to_Stationdf(AIStationDF, PowerTransformerDF)
    AIStationDF = add_MVA_Exceeded_Stationdf(AIStationDF, PowerTransformerDF)
    Outdoor_BreakerDF = add_Relay_Stationdf(AIStationDF, RelayDataDF, Outdoor_BreakerDF)

    # Select columns to keep
    AIStationDF = AIStationDF[
        ['Region', 'Work_Center', 'Maximo_Code', 'Station_Name', 'STATION_STR_TYPE', 'Age', 'Single_Phase_Station',
         'XFMER_Count', 'Max_Risk_Index_at_Station', 'Max_MVA_Exceeded', 'Mean_Feeder_Age'
         ]]

    PowerTransformerDF = PowerTransformerDF[['Region', 'Work_Center', 'Station_Name', 'Maximo_Code',
                                             'Age', 'MAXIMUM_MVA', 'LV_NOM_KV', 'Risk_Index_(Normalized)',
                                             'Max_Projected_Summer_Load', 'Max_Projected_Winter_Load',
                                             'Max_MVA_Exceeded']]

    Outdoor_BreakerDF = Outdoor_BreakerDF[['Region', 'Work_Center', 'Station_Name', 'Maximo_Code', 'Age',
                                           'BKR_SERVICE', 'SELF_CONTAINED', 'Manufacturer', 'BKR_MECH_MOD',
                                           'BKR_INTERR', 'Associated_XFMR', 'DOC_Fault_Reporting_Prioritization',
                                           'SUB_4_Protection']]

    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer = pd.ExcelWriter('../CMPC_WideArea_AIS.xlsx', engine='xlsxwriter')

    # Convert the dataframe to an XlsxWriter Excel object.
    AIStationDF.to_excel(writer, sheet_name='Stations', index=False)
    PowerTransformerDF.to_excel(writer, sheet_name='Transformers', index=False)
    Outdoor_BreakerDF.to_excel(writer, sheet_name='Outdoor Breakers', index=False)
    RelayDataDF.to_excel(writer, sheet_name='Relay', index=False)
    Summer_LoadDF.to_excel(writer, sheet_name='Summer Load', index=False)
    Winter_LoadDF.to_excel(writer, sheet_name='Winter Load', index=False)
    # Close the Pandas Excel writer and output the Excel file.
    writer.save()
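    # Note: on pandas 2.0+ ExcelWriter.save() no longer exists; the equivalent there is
    # writer.close(), or a context manager, roughly:
    #
    #     with pd.ExcelWriter('../CMPC_WideArea_AIS.xlsx', engine='xlsxwriter') as writer:
    #         AIStationDF.to_excel(writer, sheet_name='Stations', index=False)
    #         # ... remaining sheets ...
    #     # the workbook is written when the with-block exits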
Ejemplo n.º 26
0
def start_Pandas_IT():

    progress_bar = 'Progress: '
    packing = 'Packing into file...'
    sheet_name = 'Данные рынка на '         # "Market data as of "
    prom_now = "Текущая стоимость акций "   # "Current share price of "
    prom_day_max = 'Дневной максимум '      # "Daily high "
    prom_day_min = 'Дневной минимум '       # "Daily low "

    check_AMD(convert_amd)
    print(progress_bar + '1/8')
    time.sleep(1)

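    # The same scrape pattern repeats for each ticker below: fetch the quote page and
    # read the <span> elements whose CSS classes end in -last / -low / -high, which
    # hold the current price and the daily low/high for that instrument.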
    full_page = requests.get(Promotions_Intel, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_intel = soup.findAll("span", {"class": "arial_26 inlineblock pid-251-last"})
    convert_min_intel = soup.findAll("span", {"class": "inlineblock pid-251-low"})
    convert_max_intel = soup.findAll("span", {"class": "inlineblock pid-251-high"})
    print(progress_bar + '2/8')
    time.sleep(1)

    full_page = requests.get(Promotions_Apple, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_apple = soup.findAll("span", {"class": "arial_26 inlineblock pid-6408-last"})
    convert_min_apple = soup.findAll("span", {"class": "inlineblock pid-6408-low"})
    convert_max_apple = soup.findAll("span", {"class": "inlineblock pid-6408-high"})
    print(progress_bar + '3/8')
    time.sleep(1)

    full_page = requests.get(Promotions_IBM, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_ibm = soup.findAll("span", {"class": "arial_26 inlineblock pid-8082-last"})
    convert_min_ibm = soup.findAll("span", {"class": "inlineblock pid-8082-low"})
    convert_max_ibm = soup.findAll("span", {"class": "inlineblock pid-8082-high"})
    print(progress_bar + '4/8')
    time.sleep(1)

    full_page = requests.get(Promotions_Microsoft, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_microsoft = soup.findAll("span", {"class": "arial_26 inlineblock pid-252-last"})
    convert_min_microsoft = soup.findAll("span", {"class": "inlineblock pid-252-low"})
    convert_max_microsoft = soup.findAll("span", {"class": "inlineblock pid-252-high"})
    print(progress_bar + '5/8')
    time.sleep(1)

    full_page = requests.get(Promotions_Google, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_google = soup.findAll("span", {"class": "arial_26 inlineblock pid-6369-last"})
    convert_min_google = soup.findAll("span", {"class": "inlineblock pid-6369-low"})
    convert_max_google = soup.findAll("span", {"class": "inlineblock pid-6369-high"})
    print(progress_bar + '6/8')
    time.sleep(1)

    full_page = requests.get(Promotions_Facebook, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_facebook = soup.findAll("span", {"class": "arial_26 inlineblock pid-26490-last"})
    convert_min_facebook = soup.findAll("span", {"class": "inlineblock pid-26490-low"})
    convert_max_facebook = soup.findAll("span", {"class": "inlineblock pid-26490-high"})
    print(progress_bar + '7/8')
    time.sleep(1)

    full_page = requests.get(Promotions_Yandex, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    convert_yandex = soup.findAll("span", {"class": "arial_26 inlineblock pid-13999-last"})
    convert_min_yandex = soup.findAll("span", {"class": "inlineblock pid-13999-low"})
    convert_max_yandex = soup.findAll("span", {"class": "inlineblock pid-13999-high"})
    print(progress_bar + '8/8')
    time.sleep(.5)

    hms = datetime.datetime.today()
    print(hms.hour, hms.minute, hms.second)
    time_flow = hms.hour, hms.minute, hms.second
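    # time_flow is an (hour, minute, second) tuple; its str() form is embedded in the
    # output file name built below.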
    print(packing)

    data = [
        [prom_now + str("AMD"), convert_amd[0].text],
        [prom_day_min, convert_min_amd[0].text],
        [prom_day_max, convert_max_amd[0].text],
        [" ", " "],
        [prom_now + str("Intel"), convert_intel[0].text],
        [prom_day_min, convert_min_intel[0].text],
        [prom_day_max, convert_max_intel[0].text],
        [" ", " "],
        [prom_now + str("Apple"), convert_apple[0].text],
        [prom_day_min, convert_min_apple[0].text],
        [prom_day_max, convert_max_apple[0].text],
        [" ", " "],
        [prom_now + str("IBM"), convert_ibm[0].text],
        [prom_day_min, convert_min_ibm[0].text],
        [prom_day_max, convert_max_ibm[0].text],
        [" ", " "],
        [prom_now + str("Microsoft"), convert_microsoft[0].text],
        [prom_day_min, convert_min_microsoft[0].text],
        [prom_day_max, convert_max_microsoft[0].text],
        [" ", " "],
        [prom_now + str("Google"), convert_google[0].text],
        [prom_day_min, convert_min_google[0].text],
        [prom_day_max, convert_max_google[0].text],
        [" ", " "],
        [prom_now + str("Facebook"), convert_facebook[0].text],
        [prom_day_min, convert_min_facebook[0].text],
        [prom_day_max, convert_max_facebook[0].text],
        [" ", " "],
        [prom_now + str("Yandex"), convert_yandex[0].text],
        [prom_day_min, convert_min_yandex[0].text],
        [prom_day_max, convert_max_yandex[0].text],
    ]

    today = date.today()
    time_now = datetime.datetime.time(datetime.datetime.now())
    direction = 'IT_'
    new_data = pd.DataFrame(data).rename_axis(None, axis=1)
    file_name = str(direction) + str(today) + '-' + str(time_flow)
    file_directory = file_name + '.xlsx'
    new_data.style.hide_index()
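    # Note: .style.hide_index() returns a new Styler that is not assigned here, so it
    # has no effect on the DataFrame written via to_excel() below.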
    writer = pd.ExcelWriter(file_directory, engine='xlsxwriter')
    new_data.to_excel(writer, sheet_name=str(sheet_name) + str(today), index=False)

    workbook = writer.book
    worksheet = writer.sheets[str(sheet_name) + str(today)]

    format_list = workbook.add_format({'border': 0, 'num_format': 'hh:mm:ss', 'size': 14, 'align': 'center'})
    date_format = workbook.add_format({'num_format': 'mm.dd.yyyy'})
    worksheet.write('A1', time_now, format_list)
    worksheet.write('B1', today, date_format)
    format = workbook.add_format({'align': 'left'})

    worksheet.set_landscape()
    worksheet.set_column('A:A', 40, format)
    worksheet.set_column('B:B', 20, format)

    writer.save()
    print('\n''Файл с названием ' + str(direction) + str(today) + '-' + str(time_flow) + ' сохранен')
Ejemplo n.º 27
0
# P32
# Fisher stats
# Save to Excel
#########################
myC13, myfisherDF, myCDFAngles, myCDFRanges = calcPlotC13(data2, 'kmeans')

dataBeni = groupByDensity(dataBeni)
beniC13, bfisherDF, beniCDFAngles, beniCDFRanges = calcPlotC13(dataBeni, 'Beni')

data2 = calcP32(data2, myC13)
dataBeni = calcP32(dataBeni, beniC13)

data2 = data2.drop_duplicates()
dataBeni = dataBeni.drop_duplicates()

writer = pd.ExcelWriter(outfp + '\\' + 'C13.xlsx')
myC13.to_excel(writer, 'Kmeans_C13')
beniC13.to_excel(writer, 'HardSectoring_C13')
data2.to_excel(writer, 'HardSectoring_origdata')
dataBeni.to_excel(writer, 'HardSectoring_groupByDensity')
myfisherDF.to_excel(writer, 'Kmeans_Fisherstats')
bfisherDF.to_excel(writer, 'HardSectoring_Fisherstats')
myCDFAngles.to_excel(writer, 'Kmeans_CDF_angles')
myCDFRanges.to_excel(writer, 'Kmeans_CDF_range')
beniCDFAngles.to_excel(writer, 'HardSectoring_CDF_angles')
beniCDFRanges.to_excel(writer, 'HardSectoring_CDF_range')
writer.save()

grouped = data2.groupby('population')
for idx, group in grouped:
    fname = outfp + '\\' + 'Cluster' + str(idx) + '.csv'
Ejemplo n.º 28
0
            #             bd_subnet_type = data["polUni"]["children"][x]["fvTenant"]["children"][y]["fvBD"]["children"][0]["fvSubnet"]["attributes"]["scope"]
            #         bd.append(BD(bd_tenant,bd_vrf,bd_name,bd_des,bd_mac,bd_gateway_ip,bd_subnet_type))

for x in range(len(bd)):
    list_bd_name.append(bd[x].name)
    list_bd_vrf.append(bd[x].vrf)
    list_bd_des.append(bd[x].des)
    list_bd_tenant.append(bd[x].tn)
    list_bd_gateway_ip.append(bd[x].ip)
    list_bd_mac.append(bd[x].mac)
    list_bd_subnet_type.append(bd[x].subnet_type)

now = datetime.now()
# current date and time
date_time = now.strftime("%d%m%Y-%H%M")
# Create some Pandas dataframes from some data.
sheet1 = pd.DataFrame({
    'Tenant': list_bd_tenant,
    'VRF': list_bd_vrf,
    'Description': list_bd_des,
    'bd_mac': list_bd_mac,
    'bd_gateway_ip': list_bd_gateway_ip,
    'subnet_type': list_bd_subnet_type
})
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter(f"BD v1.5 {date_time}.xlsx", engine='xlsxwriter')
# Write each dataframe to a different worksheet.
sheet1.to_excel(writer, sheet_name='BD')
# Close the Pandas Excel writer and output the Excel file.
writer.save()
Ejemplo n.º 29
0
#READ THE INPUT FILE TO CREATE DATAFRAMES USING PANDAS
dg = pd.read_csv('FULL PATH TO  alpha.csv')

#CREATE DATAFRAME TO CHANGE DATE TIME FORMATS FOR SUBMISSION DUE DATE, SUBMITTED AND GRADED DATE USING STRING FORMATTING
dg['SUBMISSION DUE DATE'] = pd.to_datetime(
    dg['SUBMISSION DUE DATE']).dt.strftime('%d/%m/%Y')
dg['SUBMISSION SUBMITTED AT'] = pd.to_datetime(
    dg['SUBMISSION SUBMITTED AT']).dt.strftime('%d/%m/%Y')
dg['SUBMISSION GRADED DATE'] = pd.to_datetime(
    dg['SUBMISSION GRADED DATE']).dt.strftime('%d/%m/%Y')
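#NOTE: strftime RETURNS STRINGS, SO THESE COLUMNS ARE WRITTEN TO EXCEL AS dd/mm/YYYY TEXT RATHER THAN AS NATIVE EXCEL DATES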

##Prints a confirmation for checking
print(dg)

#CREATE WRITER OBJECT AND DEFINE OUTPUT FILE USING XLSXWRITER
writer = pd.ExcelWriter('FULL PATH TO first assignment worksheet 123456.xlsx',
                        engine='xlsxwriter')
dg.to_excel(writer, sheet_name='123456')

workbook = writer.book

#worksheet = workbook.add_worksheet('Data')
#worksheet = writer.sheets['123456']
#worksheet.write_formula()

writer.save()

print("123456 complete *****************************")

#READ THE INPUT FILE
df = pd.read_csv('FULL PATH TO  beta.csv')
Ejemplo n.º 30
0
def construct_year_chart(node_names_list: List[str], write_to_excel: bool = False) -> Dict[str, pd.DataFrame]:
    ''' Builds a per-node DataFrame of 2016 real-time hourly data and returns a dict keyed by node name; optionally writes each node to its own Excel file. '''

    # Make sure we are inside the dataset directory before listing files
    if not os.getcwd().endswith('2016_realtime_hourly_dataset'):
        os.chdir(os.getcwd() + '/2016_realtime_hourly_dataset')

    curr_working_dir = os.getcwd()

    output_df = None
    infer_headers_flag = False
    headers = []
    df_name = None
    desired_key = 'Location Name'

    node_pd_dict = {}

    # assume infer_headers = ['H', 'Date', 'Hour Ending', 'Location ID', 'Location Name', 'Location Type', 'Locational Marginal Price', 'Energy Component', 'Congestion Component', 'Marginal Loss Component']

    files = os.listdir(curr_working_dir)
    number_files = len(files) - 1
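    # The -1 presumably discounts the cached 'node_pd_dict_pickle.p' file that sits in
    # the data directory and is skipped in the loop below.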

    for node_name in node_names_list:
        frames = []

        if f'{node_name}_2016.xlsx' in os.listdir('../individual_nodes'):
            log.debug(f'{node_name} excel already exists, so skipping write to excel')
            node_pd_dict[node_name] = load_pd('../individual_nodes/' + f'{node_name}_2016.xlsx')

            continue

        for index, filename in enumerate(files):
            if (index + 1) % 20 == 0:
                log.info(f'On file {index + 1} out of {number_files}')

            if filename == 'node_pd_dict_pickle.p': continue    # Skip the cached pickle file

            df = load_pd(filename)

            if not infer_headers_flag:
                headers = list(df)
                infer_headers_flag = True

            try:
                df_name = df.loc[df[desired_key] == node_name]
                frames.append(df_name)

            except ValueError:
                raise ValueError(f"{node_name} doesn't exist!")

            except KeyError:
                raise KeyError(f"Can't find {desired_key} in {headers}")

        concatenated_df = pd.concat(frames)
        log.debug(concatenated_df)

        if write_to_excel:

            if f'{node_name}_2016.xlsx' in os.listdir('../individual_nodes'):
                log.debug(f'{node_name} excel already exists, so skipping write to excel')
                continue

            final_file_name = f'/{node_name}_2016.xlsx'

            file_path = '../individual_nodes' + final_file_name

            writer = pd.ExcelWriter(file_path)
            concatenated_df.to_excel(writer)
            writer.save()

            log.debug(f'Wrote {final_file_name} to excel in {file_path}')

        node_pd_dict[node_name] = concatenated_df

    log.debug('Done!, returning pd_dict')
    return node_pd_dict