conn_oracle.close()

# Wrangle data into an Access-acceptable form using numpy and pandas
res = [list(elem) for elem in res]
res = np.array(res)
column_names = []
for i in range(len(cursor.description)):
    column_names.append(cursor.description[i][0])
res = pd.DataFrame(res, columns=column_names)
'''The shortcut to the entire data wrangling process is:
res = pd.read_sql_query(sql, conn_oracle)'''

# write the dataframe to MS Excel
writer = pd.ExcelWriter(r"H:\oracle_data.xlsx", engine="xlsxwriter")
res.to_excel(writer, sheet_name="Oracle", index=False)
writer.save()

# write the dataframe to a sqlite database
conn_sqlit = sqlite3.connect(r"H:\testdata.db")
cursor_sq = conn_sqlit.cursor()
res.to_sql("First_table", conn_sqlit, if_exists='replace', index=False)
data = pd.read_sql_query("Select * from First_table", conn_sqlit)
print(data)
conn_sqlit.close()
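# A minimal sketch of the pd.read_sql_query shortcut named in the docstring above,
# assuming the same query string `sql` and a still-open `conn_oracle` (i.e. before the
# close() call), combined with a context-managed writer so no explicit save() is needed.
res = pd.read_sql_query(sql, conn_oracle)  # pandas handles the cursor/column wrangling

with pd.ExcelWriter(r"H:\oracle_data.xlsx", engine="xlsxwriter") as writer:
    res.to_excel(writer, sheet_name="Oracle", index=False)  # saved and closed on exit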
beg = datetime.now()
order_list = []
workpieces_in_time(8 * 60 * 60)
data = np.array(data)
print('工件完成数', len(data))  # number of finished workpieces
print(data[:, 0])

if err_rate > 0:  # ATTENTION: if this raises an error, it means no faults occurred during the run
    is_err_str = 'err'
    data_err_df = pd.DataFrame(err_data)
    # columns: faulty CNC id, fault start time, fault end time
    data_err_df.columns = ['故障cnc编号', '故障开始时间', '故障结束时间']
    data_err_df.index += 1
    # create and write pd.DataFrame to excel
    writer = pd.ExcelWriter('Save_Excel_case1_' + is_err_str + '_how.xlsx')
    data_err_df.to_excel(writer, 'page_1', float_format='%.5f')  # float_format controls the precision
    writer.save()
else:
    is_err_str = 'no_err'

en = datetime.now()

# generate the Excel sheet (mind the group number)
data_df = pd.DataFrame(data)
# change the index and column names: machining id, loading time, unloading time
data_df.columns = ['加工编号', '上料时间', '下料时间']
data_df.index += 1
# create and write pd.DataFrame to excel
writer = pd.ExcelWriter('Save_Excel_case1_' + is_err_str + '.xlsx')
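# A side note on the writer.save() calls used in these snippets: ExcelWriter.save()
# was deprecated in pandas 1.5 and removed in 2.0, so on newer pandas the same write
# is usually expressed with a context manager. A sketch reusing data_df and the
# file-name pattern above (the sheet name is an assumption):
with pd.ExcelWriter('Save_Excel_case1_' + is_err_str + '.xlsx') as writer:
    data_df.to_excel(writer, sheet_name='page_1', float_format='%.5f')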
def main():
    # import xls / case logic; columns in the source workbook:
    '''
    1 ID  2 GeoMarket  3 Country  4 Region  5 Product Line  6 IncidentType
    7 FormStatus  8 Description  9 IncidentDate  10 EmploymentType  11 InjuryNature
    12 RiskRanking  13 RiskRating  14 Root Cause (5 Why's)  15 Created By
    16 FormSubmittedBy  17 QHSE Report Workflow  18 InjuryLocation
    19 InjuryNatureMechanism  20 Primary Root Cause  21 NonProductiveTime
    22 Test XML  23 PINType  24 Cost of Poor Quality (USD)  25 Job Number
    26 Item Type  27 Path
    '''
    # import xls file
    data = pd.read_excel(
        r'C:\Users\stacy\My WrWx\00_projects\reservoirGroup\Adam\Oil and Gas PIN System Summary Dashboard.xlsx',
        sheet_name='PIN Data')
    print(data)  # print a summary table of the xlsx contents
    print('Col Headers:\n', data.columns)  # print a list of the headers
    print(data['Region'])  # print all rows within a column as a list

    # iterate over the region list from above using a loop
    for i in data.index:
        print(data['Region'][i])

    # take entire columns from the sheet and put them into lists
    region = data['Region']
    company = data['Company']
    raisedBy = data['RaisedBy']
    nu_region = []
    nu_company = []
    nu_raisedBy = []
    i = 0
    for item in region:
        if i == 10:
            nu_region.append('Bananas')
            nu_company.append('Apples')
            nu_raisedBy.append('Oranges')
        else:
            nu_region.append(region[i])
            nu_company.append(company[i])
            nu_raisedBy.append(raisedBy[i])
        i += 1

    # show a subtable of the imported excel file
    df = pd.DataFrame(data, columns=['PinID', 'Risk', 'Region', 'Company'])
    print(df)

    # pandas output
    pandas_file = 'C:/Users/stacy/My WrWx/00_projects/reservoirGroup/Adam/pandas_test.xlsx'
    pandas_file_2 = 'C:/Users/stacy/My WrWx/00_projects/reservoirGroup/Adam/pandas_test_apples.xlsx'
    pandas_file_3 = 'C:/Users/stacy/My WrWx/00_projects/reservoirGroup/Adam/historical_pin_hse.xlsx'
    writer = pd.ExcelWriter(pandas_file)
    df.to_excel(writer, 'PIN_Data', index=False)
    writer.save()

    df2 = pd.DataFrame({
        'Region': nu_region,
        'Company': nu_company,
        'RaisedBy': nu_raisedBy
    })
    writer2 = pd.ExcelWriter(pandas_file_2)
    df2.to_excel(writer2, 'Historical PIN and HSE', index=False)
    writer2.save()
    print('Done.')
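# The function above opens one writer per output file; if both tables were wanted in a
# single workbook, one writer with two sheets does it. A sketch reusing df and df2 from
# above; the combined path is hypothetical:
combined_path = 'pandas_combined.xlsx'  # hypothetical output file
with pd.ExcelWriter(combined_path) as writer:
    df.to_excel(writer, sheet_name='PIN_Data', index=False)
    df2.to_excel(writer, sheet_name='Historical PIN and HSE', index=False)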
value = df.loc[i][3]
if value[-1] != ',':
    value = value + ','
value = value.replace(',,', ',')
value = value.replace(',', ',')  # no-op as written; presumably meant to normalize full-width '，' to ','
value = value.replace('\n', '')
text = str(value).split(',')
# print(text)
valuelist = text[2:-4]
# print(valuelist)

pointlist = []
x = 0
while x < len(valuelist):
    pointlist.append(valuelist[x + 1] + '#' + valuelist[x + 2])
    x += 3
# print(pointlist)
df.loc[i, '坐标1'] = str(pointlist).replace('[', '').replace('\'', '').replace(']', '')

del df['坐标']  # drop the raw coordinate column
# one row per coordinate pair: split the joined string, stack, and rename
df2 = df.drop('坐标1', axis=1).join(
    df['坐标1'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('坐标2'))
# print(df2)
df2['经度'], df2['纬度'] = df2['坐标2'].str.split('#', 1).str  # longitude / latitude
del df2['坐标2']
print(df2)

outputroute = 'F:\\测试数据\\酉阳\\cxy探矿权处理\\test.xls'
writer = pd.ExcelWriter(outputroute)
df2.to_excel(writer, float_format='%.5f')
writer.save()
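# The stack/reset_index/join chain above predates Series.explode(); on pandas 0.25+ the
# same one-row-per-coordinate expansion can be written more directly. A sketch over the
# same column names:
df2 = (df.assign(坐标2=df['坐标1'].str.split(','))
         .explode('坐标2')          # one row per coordinate pair
         .drop(columns='坐标1'))
df2[['经度', '纬度']] = df2['坐标2'].str.split('#', n=1, expand=True)  # longitude / latitude
df2 = df2.drop(columns='坐标2')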
def merge_converter(filename): df = pandas.read_excel(filename) #df.drop_duplicates(subset=['one'], keep=False, inplace=True) #df.loc['Total'] = pandas.Series(df.sum()) df.insert(7, 'seven', '') # PERCENTAGE CALCULATIONS df['two_percentage'] = df['two'].apply(lambda a: (a / df['two'].sum()) * 100) df['five_percentage'] = df['five'].apply(lambda a: (a / df['five'].sum()) * 100) df['three_percentage'] = df['three'].apply(lambda a: (a / df['three'].sum()) * 100) df['six_percentage'] = df['six'].apply(lambda a: (a / df['six'].sum()) * 100) df.insert(12, 'twelve', '') # DIFFERENCE df['two_p_diff'] = df.two_percentage.diff() df['five_p_diff'] = df.five_percentage.diff() df.insert(15, 'fifteen', '') # DIFFERENCE ENDS # SAME SAME df['two_p_same'] = df.apply( lambda x: x['two_p_diff'] if x['two_p_diff'] * x['five_p_diff'] > 0 else np.NaN, axis=1) df['five_p_same'] = df.apply( lambda x: x['five_p_diff'] if x['five_p_diff'] * x['two_p_diff'] > 0 else np.NaN, axis=1) #Changing p_diff to p_same and deleting same df['two_p_diff'] = df['two_p_same'] df['five_p_diff'] = df['five_p_same'] del df['two_p_same'] del df['five_p_same'] ## CHECK IF ALL FOUR COLUMNS HAVE DATA IN IT OTHERWISE ENTER NULL DATA #GET VALUE OF ABOVE ROW FOR TWO PERCENTAGE index_of_not_null_two_p_diff = df[~df.two_p_diff.isnull()].index.tolist() print(index_of_not_null_two_p_diff) total_rows = df.shape[0] - 3 blank_list = [np.NaN] * (total_rows) #SHOW ROWS TWO THREE FIVE SIX ABOVE AND SAME two_p_values = [] three_p_values = [] five_p_values = [] six_p_values = [] two_p_v_d = [] three_p_v_d = [] five_p_v_d = [] six_p_v_d = [] for j in index_of_not_null_two_p_diff: two_p_values.append('') two_p_values.append(df._get_value(j - 1, 'two_percentage')) two_p_values.append(df._get_value(j, 'two_percentage')) two_p_v_d.append( df._get_value(j, 'two_percentage') - df._get_value(j - 1, 'two_percentage')) three_p_values.append('') three_p_values.append(df._get_value(j - 1, 'three_percentage')) three_p_values.append(df._get_value(j, 'three_percentage')) # three_p_v_d.append(df._get_value(j, 'three_percentage')) five_p_values.append('') five_p_values.append(df._get_value(j - 1, 'five_percentage')) five_p_values.append(df._get_value(j, 'five_percentage')) # five_p_v_d.append( df._get_value(j, 'five_percentage') - df._get_value(j - 1, 'five_percentage')) six_p_values.append('') six_p_values.append(df._get_value(j - 1, 'six_percentage')) six_p_values.append(df._get_value(j, 'six_percentage')) six_p_v_d.append(df._get_value(j, 'six_percentage')) #df['two_p_values'] = two_p_values + another_blank_list*(total_rows_another - len(two_p_values)) #df['three_p_values'] = three_p_values + another_blank_list*(total_rows_another - len(three_p_values)) #df['five_p_values'] = five_p_values + another_blank_list*(total_rows_another - len(five_p_values)) #df['six_p_values'] = six_p_values + another_blank_list*(total_rows_another - len(six_p_values)) additional1 = pandas.DataFrame({'two_p_values': two_p_values}) additional2 = pandas.DataFrame({'three_p_values': three_p_values}) additional3 = pandas.DataFrame({'five_p_values': five_p_values}) additional4 = pandas.DataFrame({'six_p_values': six_p_values}) df = pandas.concat([df, additional1], axis=1) df = pandas.concat([df, additional2], axis=1) df = pandas.concat([df, additional3], axis=1) df = pandas.concat([df, additional4], axis=1) df.insert(20, 'twenty', '') total_rows_another = df.shape[0] another_blank_list = [np.NaN] #two_p_v_d = sorted(two_p_v_d) #three_p_v_d = sorted(three_p_v_d) #five_p_v_d = sorted(five_p_v_d) 
#six_p_v_d = sorted(six_p_v_d) two_p_v_d_neg = [abs(x) for x in two_p_v_d if x < 0] three_p_v_d_neg = [] for ind in range(0, len(two_p_v_d_neg)): three_p_v_d_neg.append(three_p_v_d[ind]) #del three_p_v_d[0] five_p_v_d_neg = [abs(x) for x in five_p_v_d if x < 0] six_p_v_d_neg = [] for ind in range(0, len(five_p_v_d_neg)): six_p_v_d_neg.append(six_p_v_d[ind]) #del six_p_v_d[0] two_p_v_d_pos = [x for x in two_p_v_d if x > 0] three_p_v_d_pos = [] for indi in range(len(two_p_v_d_neg), len(two_p_v_d)): three_p_v_d_pos.append(three_p_v_d[ind]) five_p_v_d_pos = [x for x in five_p_v_d if x > 0] six_p_v_d_pos = [] for indi in range(len(five_p_v_d_neg), len(five_p_v_d)): six_p_v_d_pos.append(six_p_v_d[ind]) print("####################") print(f"length of two_p_v_d {len(two_p_v_d)}") print(f"length of two_p_v_d_pos {len(two_p_v_d_pos)}") print(f"length of two_p_v_d_neg {len(two_p_v_d_neg)}") print("####################") print("####################") print(f"length of three_p_v_d {len(three_p_v_d)}") print(f"length of three_p_v_d_pos {len(three_p_v_d_pos)}") print(f"length of three_p_v_d_neg {len(three_p_v_d_neg)}") print("####################") print("####################") print(f"length of five_p_v_d {len(five_p_v_d)}") print(f"length of five_p_v_d_pos {len(five_p_v_d_pos)}") print(f"length of five_p_v_d_neg {len(five_p_v_d_neg)}") print("####################") print("####################") print(f"length of six_p_v_d {len(six_p_v_d)}") print(f"length of six_p_v_d_pos {len(six_p_v_d_pos)}") print(f"length of six _p_v_d_neg {len(six_p_v_d_neg)}") print("####################") minus_two_three_neg = list( map(operator.sub, two_p_v_d_neg, three_p_v_d_neg)) minus_two_three_pos = list( map(operator.sub, two_p_v_d_pos, three_p_v_d_pos)) minus_five_six_neg = list(map(operator.sub, five_p_v_d_neg, six_p_v_d_neg)) minus_five_six_pos = list(map(operator.sub, five_p_v_d_pos, six_p_v_d_pos)) df_sort = pandas.DataFrame() df_sort['two_p_v_d'] = two_p_v_d + another_blank_list * ( total_rows_another - len(two_p_v_d)) df_sort['three_p_v_d'] = three_p_v_d + another_blank_list * ( total_rows_another - len(three_p_v_d)) df_sort['five_p_v_d'] = five_p_v_d + another_blank_list * ( total_rows_another - len(five_p_v_d)) df_sort['six_p_v_d'] = six_p_v_d + another_blank_list * ( total_rows_another - len(six_p_v_d)) df['two_p_v_d'] = df_sort['two_p_v_d'] df['three_p_v_d'] = df_sort['three_p_v_d'] df['five_p_v_d'] = df_sort['five_p_v_d'] df['six_p_v_d'] = df_sort['six_p_v_d'] df['two_p_v_d_neg'] = df.apply(lambda x: x['two_p_v_d'] if x['two_p_v_d'] < 0 else np.NaN, axis=1) df['three_p_v_d_neg'] = df.apply(lambda x: x['three_p_v_d'] if x['two_p_v_d'] < 0 else np.NaN, axis=1) df['five_p_v_d_neg'] = df.apply(lambda x: x['five_p_v_d'] if x['two_p_v_d'] < 0 else np.NaN, axis=1) df['six_p_v_d_neg'] = df.apply(lambda x: x['six_p_v_d'] if x['two_p_v_d'] < 0 else np.NaN, axis=1) df['two_p_v_d_neg'] = df['two_p_v_d_neg'].abs() df['three_p_v_d_neg'] = df['three_p_v_d_neg'].abs() df['five_p_v_d_neg'] = df['five_p_v_d_neg'].abs() df['six_p_v_d_neg'] = df['six_p_v_d_neg'].abs() df_sort['two_p_v_d_pos'] = df.apply(lambda x: x['two_p_v_d'] if x['two_p_v_d'] > 0 else np.NaN, axis=1) df_sort['three_p_v_d_pos'] = df.apply(lambda x: x['three_p_v_d'] if x['two_p_v_d'] > 0 else np.NaN, axis=1) df_sort['five_p_v_d_pos'] = df.apply(lambda x: x['five_p_v_d'] if x['two_p_v_d'] > 0 else np.NaN, axis=1) df_sort['six_p_v_d_pos'] = df.apply(lambda x: x['six_p_v_d'] if x['two_p_v_d'] > 0 else np.NaN, axis=1) df_sort = 
df_sort[df_sort['two_p_v_d_pos'].notna()].reset_index() df['two_p_v_d_pos'] = df_sort['two_p_v_d_pos'] df['three_p_v_d_pos'] = df_sort['three_p_v_d_pos'] df['five_p_v_d_pos'] = df_sort['five_p_v_d_pos'] df['six_p_v_d_pos'] = df_sort['six_p_v_d_pos'] df['2_p_v_d_pos_percentage'] = df['two_p_v_d_pos'].apply( lambda a: (a / df['two_p_v_d_pos'].sum()) * 100) df['3_p_v_d_pos_percentage'] = df['three_p_v_d_pos'].apply( lambda a: (a / df['three_p_v_d_pos'].sum()) * 100) df['5_p_v_d_pos_percentage'] = df['five_p_v_d_pos'].apply( lambda a: (a / df['five_p_v_d_pos'].sum()) * 100) df['6_p_v_d_pos_percentage'] = df['six_p_v_d_pos'].apply( lambda a: (a / df['six_p_v_d_pos'].sum()) * 100) ## DF2 FOR DROPPING ROWS WITH NAN VALUES IN NEGRATIVE PERCENTAGE OF TWO THREE FIVE SIX VAUES df2 = pandas.DataFrame() df2['2_p_v_d_neg_percentage'] = df['two_p_v_d_neg'].apply( lambda a: (a / df['two_p_v_d_neg'].sum()) * 100) df2['3_p_v_d_neg_percentage'] = df['three_p_v_d_neg'].apply( lambda a: (a / df['three_p_v_d_neg'].sum()) * 100) df2['5_p_v_d_neg_percentage'] = df['five_p_v_d_neg'].apply( lambda a: (a / df['five_p_v_d_neg'].sum()) * 100) df2['6_p_v_d_neg_percentage'] = df['six_p_v_d_neg'].apply( lambda a: (a / df['six_p_v_d_neg'].sum()) * 100) print("THIS IS DF2 WITHOUT DROP NA") print(df2) df2 = df2.dropna().reset_index() print("THIS IS DF2 WITH DROP NA") print(df2) df['2_p_v_d_neg_percentage'] = df2['2_p_v_d_neg_percentage'] df['3_p_v_d_neg_percentage'] = df2['3_p_v_d_neg_percentage'] df['5_p_v_d_neg_percentage'] = df2['5_p_v_d_neg_percentage'] df['6_p_v_d_neg_percentage'] = df2['6_p_v_d_neg_percentage'] df.insert(25, 'twenty_fifth', '') df.insert(30, '29', '') df.insert(35, '35', '') #AB 3 or 5 ko dekhna hai agar dono chota chota hai toh poori row rakhni hai warna nahi rakhni hai df_sort['2_p_v_d_pos_p_c'] = df.apply( lambda x: x['2_p_v_d_pos_percentage'] if x['3_p_v_d_pos_percentage'] < x['2_p_v_d_pos_percentage'] and x['6_p_v_d_pos_percentage'] < x['5_p_v_d_pos_percentage'] else np.NaN, axis=1) df_sort['3_p_v_d_pos_p_c'] = df.apply( lambda x: x['3_p_v_d_pos_percentage'] if x['3_p_v_d_pos_percentage'] < x['2_p_v_d_pos_percentage'] and x['6_p_v_d_pos_percentage'] < x['5_p_v_d_pos_percentage'] else np.NaN, axis=1) df_sort['5_p_v_d_pos_p_c'] = df.apply( lambda x: x['5_p_v_d_pos_percentage'] if x['3_p_v_d_pos_percentage'] < x['2_p_v_d_pos_percentage'] and x['6_p_v_d_pos_percentage'] < x['5_p_v_d_pos_percentage'] else np.NaN, axis=1) df_sort['6_p_v_d_pos_p_c'] = df.apply( lambda x: x['6_p_v_d_pos_percentage'] if x['3_p_v_d_pos_percentage'] < x['2_p_v_d_pos_percentage'] and x['6_p_v_d_pos_percentage'] < x['5_p_v_d_pos_percentage'] else np.NaN, axis=1) df_sort['2_p_v_d_neg_p_c'] = df.apply( lambda x: x['2_p_v_d_neg_percentage'] if x['3_p_v_d_neg_percentage'] < x['2_p_v_d_neg_percentage'] and x['6_p_v_d_neg_percentage'] < x['5_p_v_d_neg_percentage'] else np.NaN, axis=1) df_sort['3_p_v_d_neg_p_c'] = df.apply( lambda x: x['3_p_v_d_neg_percentage'] if x['3_p_v_d_neg_percentage'] < x['2_p_v_d_neg_percentage'] and x['6_p_v_d_neg_percentage'] < x['5_p_v_d_neg_percentage'] else np.NaN, axis=1) df_sort['5_p_v_d_neg_p_c'] = df.apply( lambda x: x['5_p_v_d_neg_percentage'] if x['3_p_v_d_neg_percentage'] < x['2_p_v_d_neg_percentage'] and x['6_p_v_d_neg_percentage'] < x['5_p_v_d_neg_percentage'] else np.NaN, axis=1) df_sort['6_p_v_d_neg_p_c'] = df.apply( lambda x: x['6_p_v_d_neg_percentage'] if x['3_p_v_d_neg_percentage'] < x['2_p_v_d_neg_percentage'] and x['6_p_v_d_neg_percentage'] < x['5_p_v_d_neg_percentage'] else np.NaN, 
axis=1) list_2_p_v_d_pos_p_c = df_sort['2_p_v_d_pos_p_c'].values.tolist() list_3_p_v_d_pos_p_c = df_sort['3_p_v_d_pos_p_c'].values.tolist() list_5_p_v_d_pos_p_c = df_sort['5_p_v_d_pos_p_c'].values.tolist() list_6_p_v_d_pos_p_c = df_sort['6_p_v_d_pos_p_c'].values.tolist() list_2_p_v_d_neg_p_c = df_sort['2_p_v_d_neg_p_c'].values.tolist() list_3_p_v_d_neg_p_c = df_sort['3_p_v_d_neg_p_c'].values.tolist() list_5_p_v_d_neg_p_c = df_sort['5_p_v_d_neg_p_c'].values.tolist() list_6_p_v_d_neg_p_c = df_sort['6_p_v_d_neg_p_c'].values.tolist() list_2_p_v_d_pos_p_c.append(df_sort['2_p_v_d_pos_p_c'].mean()) list_3_p_v_d_pos_p_c.append(df_sort['3_p_v_d_pos_p_c'].mean()) list_5_p_v_d_pos_p_c.append(df_sort['5_p_v_d_pos_p_c'].mean()) list_6_p_v_d_pos_p_c.append(df_sort['6_p_v_d_pos_p_c'].mean()) list_2_p_v_d_neg_p_c.append(df_sort['2_p_v_d_neg_p_c'].mean()) list_3_p_v_d_neg_p_c.append(df_sort['3_p_v_d_neg_p_c'].mean()) list_5_p_v_d_neg_p_c.append(df_sort['5_p_v_d_neg_p_c'].mean()) list_6_p_v_d_neg_p_c.append(df_sort['6_p_v_d_neg_p_c'].mean()) list_2_p_v_d_pos_p_c.append(df_sort['2_p_v_d_pos_p_c'].count()) list_3_p_v_d_pos_p_c.append(df_sort['3_p_v_d_pos_p_c'].count()) list_5_p_v_d_pos_p_c.append(df_sort['5_p_v_d_pos_p_c'].count()) list_6_p_v_d_pos_p_c.append(df_sort['6_p_v_d_pos_p_c'].count()) list_2_p_v_d_neg_p_c.append(df_sort['2_p_v_d_neg_p_c'].count()) list_3_p_v_d_neg_p_c.append(df_sort['3_p_v_d_neg_p_c'].count()) list_5_p_v_d_neg_p_c.append(df_sort['5_p_v_d_neg_p_c'].count()) list_6_p_v_d_neg_p_c.append(df_sort['6_p_v_d_neg_p_c'].count()) df.insert(40, '40', '') print("THIS IS DF SORT NEWWWW") del df_sort['index'] del df_sort['two_p_v_d'] del df_sort['three_p_v_d'] del df_sort['five_p_v_d'] del df_sort['six_p_v_d'] del df_sort['two_p_v_d_pos'] del df_sort['three_p_v_d_pos'] del df_sort['five_p_v_d_pos'] del df_sort['six_p_v_d_pos'] #df_sort.loc[df_sort.shape[0]] = df_sort.mean() #df_sort.loc[df_sort.shape[0]] = df_sort.count() #### END WALA PART HAI JISME COUNT KRNA THA PHIR PERCETAGE NIKALNI THI PHIR MINUS KRNA THA twopospercentage = (df_sort['2_p_v_d_pos_p_c'].mean() * 100) / ( df_sort['2_p_v_d_pos_p_c'].mean() + df_sort['2_p_v_d_neg_p_c'].mean()) threepospercentage = (df_sort['3_p_v_d_pos_p_c'].mean() * 100) / ( df_sort['3_p_v_d_pos_p_c'].mean() + df_sort['3_p_v_d_neg_p_c'].mean()) fivepospercentage = (df_sort['5_p_v_d_pos_p_c'].mean() * 100) / ( df_sort['5_p_v_d_pos_p_c'].mean() + df_sort['5_p_v_d_neg_p_c'].mean()) sixpospercentage = (df_sort['6_p_v_d_pos_p_c'].mean() * 100) / ( df_sort['6_p_v_d_pos_p_c'].mean() + df_sort['6_p_v_d_neg_p_c'].mean()) twonegpercentage = (df_sort['2_p_v_d_neg_p_c'].mean() * 100) / ( df_sort['2_p_v_d_pos_p_c'].mean() + df_sort['2_p_v_d_neg_p_c'].mean()) threenegpercentage = (df_sort['3_p_v_d_neg_p_c'].mean() * 100) / ( df_sort['3_p_v_d_pos_p_c'].mean() + df_sort['3_p_v_d_neg_p_c'].mean()) fivenegpercentage = (df_sort['5_p_v_d_neg_p_c'].mean() * 100) / ( df_sort['5_p_v_d_pos_p_c'].mean() + df_sort['5_p_v_d_neg_p_c'].mean()) sixnegpercentage = (df_sort['6_p_v_d_neg_p_c'].mean() * 100) / ( df_sort['6_p_v_d_pos_p_c'].mean() + df_sort['6_p_v_d_neg_p_c'].mean()) list_2_p_v_d_pos_p_c.append(twopospercentage) list_3_p_v_d_pos_p_c.append(threepospercentage) list_3_p_v_d_pos_p_c.append(threepospercentage / twopospercentage) list_5_p_v_d_pos_p_c.append(fivepospercentage) list_6_p_v_d_pos_p_c.append(sixpospercentage) list_6_p_v_d_pos_p_c.append(sixpospercentage / fivepospercentage) list_2_p_v_d_neg_p_c.append(twonegpercentage) list_3_p_v_d_neg_p_c.append(threenegpercentage) 
list_3_p_v_d_neg_p_c.append(threenegpercentage / twonegpercentage) list_5_p_v_d_neg_p_c.append(fivenegpercentage) list_6_p_v_d_neg_p_c.append(sixnegpercentage) list_6_p_v_d_neg_p_c.append(sixnegpercentage / fivenegpercentage) total_rows = df.shape[0] blank_list = [np.NaN] df['2_p_v_d_pos_p_c'] = list_2_p_v_d_pos_p_c + blank_list * ( total_rows - len(list_2_p_v_d_pos_p_c)) df['3_p_v_d_pos_p_c'] = list_3_p_v_d_pos_p_c + blank_list * ( total_rows - len(list_3_p_v_d_pos_p_c)) df['5_p_v_d_pos_p_c'] = list_5_p_v_d_pos_p_c + blank_list * ( total_rows - len(list_5_p_v_d_pos_p_c)) df['6_p_v_d_pos_p_c'] = list_6_p_v_d_pos_p_c + blank_list * ( total_rows - len(list_6_p_v_d_pos_p_c)) df['2_p_v_d_neg_p_c'] = list_2_p_v_d_neg_p_c + blank_list * ( total_rows - len(list_2_p_v_d_neg_p_c)) df['3_p_v_d_neg_p_c'] = list_3_p_v_d_neg_p_c + blank_list * ( total_rows - len(list_3_p_v_d_neg_p_c)) df['5_p_v_d_neg_p_c'] = list_5_p_v_d_neg_p_c + blank_list * ( total_rows - len(list_5_p_v_d_neg_p_c)) df['6_p_v_d_neg_p_c'] = list_6_p_v_d_neg_p_c + blank_list * ( total_rows - len(list_6_p_v_d_neg_p_c)) #LAST COLUMN ADDED df2 = df2.iloc[0:0] df2['AV DIVIDE BY AU'] = df_sort.apply( lambda x: x['3_p_v_d_pos_p_c'] / x['2_p_v_d_pos_p_c'], axis=1) df2['AX DIVIDE BY AW'] = df_sort.apply( lambda x: x['6_p_v_d_pos_p_c'] / x['5_p_v_d_pos_p_c'], axis=1) df2['BA DIVIDE BY AZ'] = df_sort.apply( lambda x: x['3_p_v_d_neg_p_c'] / x['2_p_v_d_neg_p_c'], axis=1) df2['BC DIVIDE BY BB'] = df_sort.apply( lambda x: x['6_p_v_d_neg_p_c'] / x['5_p_v_d_neg_p_c'], axis=1) #df2.loc[df2.shape[0]] = df2.mean() AV_DIVIDE_BY_AU = df2['AV DIVIDE BY AU'].values.tolist() AX_DIVIDE_BY_AW = df2['AX DIVIDE BY AW'].values.tolist() BA_DIVIDE_BY_AZ = df2['BA DIVIDE BY AZ'].values.tolist() BC_DIVIDE_BY_BB = df2['BC DIVIDE BY BB'].values.tolist() print(df_sort) df.insert(45, '45', '') df.insert(50, '50', '') df.insert(55, '55', '') AV_DIVIDE_BY_AU.append(df2['AV DIVIDE BY AU'].mean()) AX_DIVIDE_BY_AW.append(df2['AX DIVIDE BY AW'].mean()) BA_DIVIDE_BY_AZ.append(df2['BA DIVIDE BY AZ'].mean()) BC_DIVIDE_BY_BB.append(df2['BC DIVIDE BY BB'].mean()) AV_DIVIDE_BY_AU.append( df2['AV DIVIDE BY AU'].mean() * 100 / (df2['BA DIVIDE BY AZ'].mean() + df2['AV DIVIDE BY AU'].mean())) BA_DIVIDE_BY_AZ.append( df2['BA DIVIDE BY AZ'].mean() * 100 / (df2['BA DIVIDE BY AZ'].mean() + df2['AV DIVIDE BY AU'].mean())) AX_DIVIDE_BY_AW.append( df2['AX DIVIDE BY AW'].mean() * 100 / (df2['AX DIVIDE BY AW'].mean() + df2['BC DIVIDE BY BB'].mean())) BC_DIVIDE_BY_BB.append( df2['BC DIVIDE BY BB'].mean() * 100 / (df2['AX DIVIDE BY AW'].mean() + df2['BC DIVIDE BY BB'].mean())) AV_DIVIDE_BY_AU.append( abs(df2['AV DIVIDE BY AU'].mean() - df2['BA DIVIDE BY AZ'].mean())) BA_DIVIDE_BY_AZ.append( abs(df2['AX DIVIDE BY AW'].mean() - df2['BC DIVIDE BY BB'].mean())) total_rows = df.shape[0] blank_list = [np.NaN] df['AV DIVIDE BY AU'] = AV_DIVIDE_BY_AU + blank_list * ( total_rows - len(AV_DIVIDE_BY_AU)) df['AX DIVIDE BY AW'] = AX_DIVIDE_BY_AW + blank_list * ( total_rows - len(AX_DIVIDE_BY_AW)) df['BA DIVIDE BY AZ'] = BA_DIVIDE_BY_AZ + blank_list * ( total_rows - len(BA_DIVIDE_BY_AZ)) df['BC DIVIDE BY BB'] = BC_DIVIDE_BY_BB + blank_list * ( total_rows - len(BC_DIVIDE_BY_BB)) #df['two_p_v_d_abs'] = df['two_p_v_d'].abs() #df['three_p_v_d_abs'] = df['three_p_v_d'].abs() #df['five_p_v_d_abs'] = df['five_p_v_d'].abs() #df['six_p_v_d_abs'] = df['six_p_v_d'].abs() #df.insert(30, 'thirty', '') #df['2-3abs'] = df['two_p_v_d_abs'] - df['three_p_v_d_abs'] #df['5-6abs'] = df['five_p_v_d_abs'] - df['six_p_v_d_abs'] 
#df['2-3abs'] = df['2-3abs'].abs() #df['5-6abs'] = df['5-6abs'].abs() #df.insert(33, '33', '') #two_p_v_d_pos_list = df.apply(lambda x :x['two_p_v_d'] if x['two_p_v_d'] > 0 else np.NaN ,axis = 1).values.tolist() #two_p_v_d_pos_list = [x for x in two_p_v_d_pos_list if math.isnan(x) == False] #df['two_p_v_d_pos'] = two_p_v_d_pos_list + another_blank_list*(total_rows_another - len(two_p_v_d_pos_list)) #three_p_v_d_pos_list = df.apply(lambda x :x['three_p_v_d'] if x['two_p_v_d'] > 0 else np.NaN ,axis = 1).values.tolist() #three_p_v_d_pos_list = [x for x in three_p_v_d_pos_list if math.isnan(x) == False] #df['three_p_v_d_pos'] = three_p_v_d_pos_list + another_blank_list*(total_rows_another - len(three_p_v_d_pos_list)) #five_p_v_d_pos_list = df.apply(lambda x :x['five_p_v_d'] if x['five_p_v_d'] > 0 else np.NaN ,axis = 1).values.tolist() #five_p_v_d_pos_list = [x for x in five_p_v_d_pos_list if math.isnan(x) == False] #df['five_p_v_d_pos'] = five_p_v_d_pos_list + another_blank_list*(total_rows_another - len(five_p_v_d_pos_list)) #six_p_v_d_pos_list = df.apply(lambda x :x['six_p_v_d'] if x['five_p_v_d'] > 0 else np.NaN ,axis = 1).values.tolist() #six_p_v_d_pos_list = [x for x in six_p_v_d_pos_list if math.isnan(x) == False] #df['six_p_v_d_pos'] = six_p_v_d_pos_list + another_blank_list*(total_rows_another - len(six_p_v_d_pos_list)) #df.insert(38, '38', '') #two_p_v_d_neg_list = df.apply(lambda x :x['two_p_v_d'] if x['two_p_v_d'] < 0 else np.NaN ,axis = 1).values.tolist() #two_p_v_d_neg_list = [x for x in two_p_v_d_neg_list if math.isnan(x) == False] #df['two_p_v_d_neg'] = two_p_v_d_neg_list + another_blank_list*(total_rows_another - len(two_p_v_d_neg_list)) #three_p_v_d_neg_list = df.apply(lambda x :x['three_p_v_d'] if x['two_p_v_d'] < 0 else np.NaN ,axis = 1).values.tolist() #three_p_v_d_neg_list = [x for x in three_p_v_d_neg_list if math.isnan(x) == False] #df['three_p_v_d_neg'] = three_p_v_d_neg_list + another_blank_list*(total_rows_another - len(three_p_v_d_neg_list)) #five_p_v_d_neg_list = df.apply(lambda x :x['five_p_v_d'] if x['five_p_v_d'] < 0 else np.NaN ,axis = 1).values.tolist() #five_p_v_d_neg_list = [x for x in five_p_v_d_neg_list if math.isnan(x) == False] #df['five_p_v_d_neg'] = five_p_v_d_neg_list + another_blank_list*(total_rows_another - len(five_p_v_d_neg_list)) #six_p_v_d_neg_list = df.apply(lambda x :x['six_p_v_d'] if x['five_p_v_d'] < 0 else np.NaN ,axis = 1).values.tolist() #six_p_v_d_neg_list = [x for x in six_p_v_d_neg_list if math.isnan(x) == False] #df['six_p_v_d_neg'] = six_p_v_d_neg_list + another_blank_list*(total_rows_another - len(six_p_v_d_neg_list)) #df['two_p_v_d_neg'] = df['two_p_v_d_neg'].abs() #df['three_p_v_d_neg'] = df['three_p_v_d_neg'].abs() #df['five_p_v_d_neg'] = df['five_p_v_d_neg'].abs() #df['six_p_v_d_neg'] = df['six_p_v_d_neg'].abs() #df.insert(43, '43', '') #df['2-3pos'] = df['two_p_v_d_pos'] - df['three_p_v_d_pos'] #df['5-6pos'] = df['five_p_v_d_pos'] - df['six_p_v_d_pos'] #df['2-3pos'] = df['2-3pos'].abs() #df['5-6pos'] = df['5-6pos'].abs() #df.insert(46, '46', '') #df['2-3neg'] = df['two_p_v_d_neg'] - df['three_p_v_d_neg'] #df['5-6neg'] = df['five_p_v_d_neg'] - df['six_p_v_d_neg'] #df['2-3neg'] = df['2-3neg'].abs() #df['5-6neg'] = df['5-6neg'].abs() #CHOTA WALA RAKHNA HAI BADA WALA HATANA HAI #df.insert(49, '49', '') #df['2-3pos_a'] = df.apply(lambda x : x['2-3pos'] if x['2-3pos'] < x['5-6pos'] else np.NaN, axis = 1) #df['5-6pos_a'] = df.apply(lambda x : x['5-6pos'] if x['5-6pos'] < x['2-3pos'] else np.NaN, axis = 1) #df['2-3neg_a'] = 
df.apply(lambda x : x['2-3neg'] if x['2-3neg'] < x['5-6neg'] else np.NaN, axis = 1) #df['5-6neg_a'] = df.apply(lambda x : x['5-6neg'] if x['5-6neg'] < x['2-3neg'] else np.NaN, axis = 1) #av_23_pos_a = df['2-3pos_a'].mean() #av_56_pos_a = df['5-6pos_a'].mean() #av_23_neg_a = df['2-3neg_a'].mean() #av_56_neg_a = df['5-6neg_a'].mean() #av_23_pos_a_percent = av_23_pos_a*100/(av_23_pos_a + av_23_neg_a) #av_23_neg_a_percent = av_23_neg_a*100/(av_23_pos_a + av_23_neg_a) #av_56_pos_a_percent = av_56_pos_a*100/(av_56_pos_a+av_56_neg_a) #av_56_neg_a_percent = av_56_neg_a*100/(av_56_pos_a+av_56_neg_a) #a = df['2-3pos_a'].values.tolist() #a = [x for x in a if math.isnan(x) == False] #a.append(av_23_pos_a) #a.append(av_23_pos_a_percent) #b = df['5-6pos_a'].values.tolist() #b = [x for x in b if math.isnan(x) == False] #b.append(av_56_pos_a) #b.append(av_56_pos_a_percent) #c = df['2-3neg_a'].values.tolist() #c = [x for x in c if math.isnan(x) == False] #c.append(av_23_neg_a) #c.append(av_23_neg_a_percent) #d = df['5-6neg_a'].values.tolist() #d = [x for x in d if math.isnan(x) == False] #d.append(av_56_neg_a) #d.append(av_56_neg_a_percent) # total_rows_another = df.shape[0] # print(f'length of a {len(a)}') # print(f'total rows {total_rows_another}') # print(f'remainder {total_rows_another - len(a)}') # df['2-3pos_a'] = a + another_blank_list*(total_rows_another - len(a)) # df['5-6pos_a'] = b + another_blank_list*(total_rows_another - len(b)) # df['2-3neg_a'] = c + another_blank_list*(total_rows_another - len(c)) # df['5-6neg_a'] = d + another_blank_list*(total_rows_another - len(d)) writer = pandas.ExcelWriter(path + "merge_converter.xlsx", engine='xlsxwriter') df.to_excel(writer, sheet_name='Sheet1', index=False) writer.save() print(df)
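# A note on the percentage columns at the top of merge_converter: each apply(lambda ...)
# recomputes the column sum once per row. The same columns fall out of one vectorized
# expression per column, sketched here over the same column names:
for col in ['two', 'three', 'five', 'six']:
    df[col + '_percentage'] = df[col] / df[col].sum() * 100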
def recolectarrep(self):
    localtime = str(date.today())
    urllib3.disable_warnings()
    if self.capturaCarrera.get() == "Computacion":
        self.carrera.set('10')
    elif self.capturaCarrera.get() == "Electrica":
        self.carrera.set('9')
    elif self.capturaCarrera.get() == "Telecom":
        self.carrera.set('11')
    rutadeldirectorio = askdirectory()

    http = urllib3.PoolManager()
    url = ('https://www.siass.unam.mx/consulta?numero_cuenta=' + str(self.numCta.get()) +
           '&sistema_pertenece=dgae&facultad_id=11&carrera_id=' + str(self.carrera.get()))
    r = http.request('GET', url)
    r.status
    soup = bs.BeautifulSoup(r.data, 'html.parser')
    link = soup.find_all('a')  # all <a> tags, to extract the links
    arrlinks = []  # every link found in the tags
    for i in link:
        arrlinks.append(i['href'])  # store each <a> tag's href in arrlinks
    linkstemp = []  # links that do NOT point to the description of a social-service programme
    linkstemporales = []  # links of the pagination tabs (pages 1 2 3 4 ...)
    numerosConsulta = []  # only the numeric part of the links
    for i in arrlinks:
        if "&page=" in i:
            linkstemp.append(i)
    for x in linkstemp:
        # strip the URL prefix so only the page number the link points to remains
        linkstemporales.append(
            x.replace(
                "http://www.siass.unam.mx/consulta?numero_cuenta=" + self.numCta.get() +
                "&sistema_pertenece=dgae&facultad_id=11&carrera_id=" + self.carrera.get() +
                "&page=", ""))

    # the next lines determine the number of pages by taking the largest value in linkstemporales
    z = int(linkstemporales[1])
    linkstemporales[0] = 0
    max = 0
    for j in linkstemporales:
        if int(j) > z:
            max = int(j)
        if int(j) > max:
            max = int(j)
    # max now holds the highest page number

    numerosConsulta = []  # the consulta numbers (trailing numbers of the links that hold each service description)
    arregloDic = []  # the dictionaries produced by the web scraping
    contenedorxl = pd.ExcelWriter('pruebaexcelxlsx', engine='xlsxwriter')  # nothing is written to it before the final save()

    for e in range(2, max):  # walk every pagination tab of the SIASS page
        # overwrite the variables; we already have what we need to walk the pages
        url = ('https://www.siass.unam.mx/consulta?numero_cuenta=' + self.numCta.get() +
               '&sistema_pertenece=dgae&facultad_id=11&carrera_id=' + self.carrera.get() +
               '&page=' + str(e))
        r = http.request('GET', url)
        r.status
        soup = bs.BeautifulSoup(r.data, 'html.parser')
        link = soup.find_all('a')  # every <a> tag in the HTML
        arrlinks = []
        for i in link:
            arrlinks.append(i['href'])  # collect the href of every <a> tag
        for i in arrlinks:
            if "https://www.siass.unam.mx/consulta/" in i:
                numerosConsulta.append(
                    i.replace("https://www.siass.unam.mx/consulta/", ""))  # keep only the numbers

        for r in numerosConsulta:  # visit the description of every social-service programme visible to this user
            url2 = 'https://www.siass.unam.mx/consulta/' + str(r)
            r = http.request('GET', url2)
            r.status
            diccionario = {}
            soup = bs.BeautifulSoup(r.data, 'html.parser')
            tabla = soup.find_all('tr')
            # open a file to store the dictionaries that will later be loaded into the database
            f = open("diccionario.txt", "a", encoding="utf8")
            links = soup.find_all('li')  # find all <li> tags
            divs = soup.find_all("div", {"id": re.compile('carrera_*')})  # all tags whose id starts with 'carrera'
            dias = soup.find_all('label', {"class": "btn btn-success disabled"})  # days and shifts of the service
            tablaActividades = soup.find_all(
                'table', {"class": "table table-striped table-bordered"})  # social-service activities

            for a in dias:
                # days worked in the service, with spaces and newlines stripped
                diccionario[a.text.replace(" ", "").replace("\n", "")] = "x"

            for i in links:
                for a in i.find_all('a', href=re.compile('#carrera_*')):  # all tags for the degree programme
                    for j in divs:
                        for b in j.find_all('p', {"class": "alert alert-info"}):  # text of the tags that hold the degree programme
                            # dictionary entry: degree-programme label -> tag that lists the interns
                            diccionario[a.text.replace(" ", "").replace("\n", "")] = \
                                b.text.replace(" ", "").replace("\n", "")
                    for b in tablaActividades:  # start reading the activities
                        # extract only the activities table of each degree programme
                        columna = b.find_all('td', {"style": "padding-left: 20px;"})
                        texto = ""
                        for c in columna:
                            texto = texto + c.text.replace(" ", "").replace("\n\n", "")
                        diccionario["Actividad " + a.text.replace(" ", "").replace("\n", "")] = texto

            for i in tabla:  # build and fill a dictionary with the table contents
                for a in i.find_all('td'):
                    for z in i.find_all('th'):
                        llave = " ".join(z.text.split())
                        valor = " ".join(a.text.split())
                        if llave in diccionario:
                            diccionario[llave + " jefe directo"] = valor
                        else:
                            diccionario[llave] = valor
            arregloDic.append(diccionario)
            f.write(str(diccionario) + "\n")
            f.close()

    toJson = json.dumps(arregloDic)
    dfPrueba = pd.read_json(toJson)
    # generate one output file per degree programme
    if self.carrera.get() == "10":
        dfPrueba.to_excel(rutadeldirectorio + '/programascompu' + localtime + '.xls', index=False)
    elif self.carrera.get() == '11':
        dfPrueba.to_excel(rutadeldirectorio + '/programastelecom' + localtime + '.xls', index=False)
    elif self.carrera.get() == '9':
        dfPrueba.to_excel(rutadeldirectorio + '/programaselectr' + localtime + '.xls', index=False)
    contenedorxl.save()
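# The json.dumps / pd.read_json round trip at the end of recolectarrep can be skipped:
# pandas builds a frame directly from a list of dictionaries (missing keys become NaN).
dfPrueba = pd.DataFrame(arregloDic)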
def feature_importance_Einstein(base):
    df1 = base['inputs']
    df_out = base['outputs']
    try:
        df_out = df_out.drop(columns=['Timestamp'])
        df1 = df1.drop(columns=['Timestamp'])
    except:
        pass

    # Find the variables most relevant to COVID-19 incidence
    model = ExtraTreesClassifier(verbose=False)
    a = model.fit(df1, df_out)
    lista_importances = pd.DataFrame([model.feature_importances_])
    lista_importances.columns = list(df1.columns)
    lista_importances = lista_importances * 100
    lista_importances = lista_importances.sort_values(by=0, axis=1, ascending=False)
    top15 = list(lista_importances.columns[0:15])
    top15_values = []
    # print("Variáveis mais impactantes:")
    for l in lista_importances.columns[0:15]:
        # print("Nome: " + str(l) + " - " + str(lista_importances[l][0]) + " %")
        top15_values.append(lista_importances[l][0])
    # print(top15)

    # build the dataset for prediction
    df_in = df1[top15]
    df_out = df_out

    # take the list of the most relevant variables and build another spreadsheet for the neural network
    lista_neural_in = df_in
    lista_neural_out = df_out

    ### since the timestamp does not matter here, any sequential values will do ###
    # number of rows
    qtde_linhas = len(lista_neural_in.index)
    # insert a column of sequential Timestamps in the first position
    lista_neural_in.insert(
        0, "Timestamp", pd.date_range(start='1/1/2020', periods=qtde_linhas, freq='H'))
    lista_neural_out.insert(
        0, "Timestamp", pd.date_range(start='1/1/2020', periods=qtde_linhas, freq='H'))
    df2_in = lista_neural_in.copy()
    df2_out = lista_neural_out.copy()

    writer = pd.ExcelWriter('base_simulate.xlsx', engine='openpyxl')
    lista_neural_in.to_excel(writer, sheet_name="INPUTS")
    lista_neural_out.to_excel(writer, sheet_name="OUTPUTS")
    writer.save()

    top15_aws = zip(top15, top15_values)
    response = {
        'top15': top15_aws,
        'top15_names': top15,
        'df_in': df2_in,
        'df_out': df2_out,
        'model': model.get_params(),
    }
    return response
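# The top-15 selection in feature_importance_Einstein goes through a one-row DataFrame;
# a Series keyed by feature name gives the same ranking in a few lines. A sketch assuming
# the fitted `model` and input frame `df1` from above:
importances = pd.Series(model.feature_importances_, index=df1.columns) * 100
top15 = importances.nlargest(15)       # largest importances first
top15_names = top15.index.tolist()
top15_values = top15.tolist()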
endsi = []
for index, tt in enumerate(starts):
    startsi.append(find_nearest(data[1], starts[index]))
    endsi.append(find_nearest(data[1], ends[index]))

mm = [dirr]
for index, tt in enumerate(startsi):
    mm.append(max(data[0][startsi[index]:endsi[index]]))

current_frame = pd.DataFrame([mm], columns=columns)
llist.append(current_frame)

new_data_frame = pd.concat(llist, ignore_index=True)
writer = pd.ExcelWriter('{}/results/{}'.format(cwd, "results.xlsx"), engine='xlsxwriter')
new_data_frame.to_excel(writer, 'Sheet1', index=False)
writer.save()
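# find_nearest is called above but not shown; a common minimal implementation, assumed
# here to return the index of the closest value (which is how the result is used to
# slice data[0]):
import numpy as np

def find_nearest(array, value):
    """Index of the element of `array` closest to `value` (assumed behaviour)."""
    array = np.asarray(array)
    return int(np.abs(array - value).argmin())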
today.day) + "_" + month + "_" + str(today.year) + ".xlsx"

# Reorder the columns; if a column is not listed, it will not be displayed.
# myColumns = ['RHSA', 'released_packages', 'severity', 'released_on', 'resource_url', 'package']
# myColumns = ['version']
myColumns = [
    'RHSA', 'ADVISORY_TITLE_TRIMMED', 'SEVERITY', 'VULNERABILITY_IMPACT',
    'RESTART', 'AFFECTED'
]

# Create a pandas DataFrame from the data pulled from the API.
advisoryDF = pandas.DataFrame(daddy_list, columns=myColumns)  # used to be "data", not daddy_list

# Create the Excel spreadsheet
writer = pandas.ExcelWriter(workbook_name, engine='xlsxwriter')

# Add the data frame to the Excel workbook on a sheet named after the month
advisoryDF.to_excel(writer, index=False, sheet_name=month + " RHEL_Analysis")
workbook = writer.book
rhel_analysis_worksheet = writer.sheets[month + " RHEL_Analysis"]
bold_format = workbook.add_format({
    'bold': True,
    'bg_color': '#A6A6A6',
    'font_name': 'Verdana',
    'font_size': '10',
    'text_wrap': True,
    'align': 'center',
    'valign': 'vcenter',
def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
                       truncate_sheet=False, **to_excel_kwargs):
    """
    Append a DataFrame [df] to existing Excel file [filename]
    into [sheet_name] Sheet.
    If [filename] doesn't exist, then this function will create it.

    Parameters:
      filename : File path or existing ExcelWriter
                 (Example: '/path/to/file.xlsx')
      df : dataframe to save to workbook
      sheet_name : Name of sheet which will contain DataFrame.
                   (default: 'Sheet1')
      startrow : upper left cell row to dump data frame.
                 Per default (startrow=None) calculate the last row
                 in the existing DF and write to the next row...
      truncate_sheet : truncate (remove and recreate) [sheet_name]
                       before writing DataFrame to Excel file
      to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()`
                        [can be dictionary]

    Returns: None

    (c) [MaxU](https://stackoverflow.com/users/5741205/maxu?tab=profile)
    """
    from openpyxl import load_workbook

    # ignore [engine] parameter if it was passed
    if 'engine' in to_excel_kwargs:
        to_excel_kwargs.pop('engine')

    writer = pd.ExcelWriter(filename, engine='openpyxl')

    # Python 2.x: define [FileNotFoundError] exception if it doesn't exist
    try:
        FileNotFoundError
    except NameError:
        FileNotFoundError = IOError

    try:
        # try to open an existing workbook
        writer.book = load_workbook(filename)

        # get the last row in the existing Excel sheet
        # if it was not specified explicitly
        if startrow is None and sheet_name in writer.book.sheetnames:
            startrow = writer.book[sheet_name].max_row

        # truncate sheet
        if truncate_sheet and sheet_name in writer.book.sheetnames:
            # index of [sheet_name] sheet
            idx = writer.book.sheetnames.index(sheet_name)
            # remove [sheet_name]
            writer.book.remove(writer.book.worksheets[idx])
            # create an empty sheet [sheet_name] using old index
            writer.book.create_sheet(sheet_name, idx)

        # copy existing sheets
        writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
    except FileNotFoundError:
        # file does not exist yet, we will create it
        pass

    if startrow is None:
        startrow = 0

    # write out the new sheet
    df.to_excel(writer, sheet_name, startrow=startrow, header=False, **to_excel_kwargs)

    # save the workbook
    writer.save()
    writer.close()
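# A short usage example for the helper above (file name and frame are made up for
# illustration):
log = pd.DataFrame({'run': [1, 2], 'status': ['ok', 'ok']})
append_df_to_excel('report.xlsx', log)                      # creates the file if it does not exist
append_df_to_excel('report.xlsx', log, sheet_name='Runs')   # appends below the last row of that sheet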
def calc_imp_vols(self): vcs_pairs = self.vcs_pairs vcs_pairs['validQ'] = False print('YYYYUUUUUPPPP!') for i in range(len(vcs_pairs.index)): for j in [1, 2]: if np.isnan(vcs_pairs.loc[i, 'current_strike' + str(j)]): continue call_option_ticker_string = vcs_pairs.loc[ i, 'ticker' + str(j)] + '_C_' + str( vcs_pairs.loc[i, 'current_strike' + str(j)]) put_option_ticker_string = vcs_pairs.loc[ i, 'ticker' + str(j)] + '_P_' + str( vcs_pairs.loc[i, 'current_strike' + str(j)]) ib_underlying_multiplier = ib_contract.ib_underlying_multiplier_dictionary.get( vcs_pairs.loc[i, 'tickerHead'], 1) if (self.bid_price_dictionary[call_option_ticker_string] > 0 ) and (self.ask_price_dictionary[call_option_ticker_string] > 0): vcs_pairs.loc[i, 'call_mid_price' + str(j)] = ( self.bid_price_dictionary[call_option_ticker_string] + self.ask_price_dictionary[call_option_ticker_string] ) / (2 * ib_underlying_multiplier) option_greeks = qom.get_option_greeks( underlying=vcs_pairs.loc[i, 'underlying_mid_price' + str(j)], option_price=vcs_pairs.loc[i, 'call_mid_price' + str(j)], strike=vcs_pairs.loc[i, 'current_strike' + str(j)], risk_free_rate=vcs_pairs.loc[i, 'interest_date' + str(j)], expiration_date=vcs_pairs.loc[i, 'expiration_date' + str(j)], calculation_date=self.todays_date, option_type='C', exercise_type=vcs_pairs.loc[i, 'exercise_type']) vcs_pairs.loc[i, 'call_iv' + str(j)] = 100 * option_greeks['implied_vol'] if (self.bid_price_dictionary[put_option_ticker_string] > 0 ) and (self.ask_price_dictionary[put_option_ticker_string] > 0): vcs_pairs.loc[i, 'put_mid_price' + str(j)] = ( self.bid_price_dictionary[put_option_ticker_string] + self.ask_price_dictionary[put_option_ticker_string] ) / (2 * ib_underlying_multiplier) option_greeks = qom.get_option_greeks( underlying=vcs_pairs.loc[i, 'underlying_mid_price' + str(j)], option_price=vcs_pairs.loc[i, 'put_mid_price' + str(j)], strike=vcs_pairs.loc[i, 'current_strike' + str(j)], risk_free_rate=vcs_pairs.loc[i, 'interest_date' + str(j)], expiration_date=vcs_pairs.loc[i, 'expiration_date' + str(j)], calculation_date=self.todays_date, option_type='P', exercise_type=vcs_pairs.loc[i, 'exercise_type']) vcs_pairs.loc[i, 'put_iv' + str(j)] = 100 * option_greeks['implied_vol'] for j in [1, 2]: vcs_pairs['straddle_iv' + str(j)] = (vcs_pairs['put_iv' + str(j)] + vcs_pairs['call_iv' + str(j)]) / 2 vcs_pairs['straddle_price' + str(j)] = (vcs_pairs['call_mid_price' + str(j)] + vcs_pairs['put_mid_price' + str(j)]) vcs_pairs['current_atm_vol_ratio'] = vcs_pairs[ 'straddle_iv1'] / vcs_pairs['straddle_iv2'] for i in range(len(vcs_pairs.index)): if np.isnan(vcs_pairs.loc[i, 'current_atm_vol_ratio']): continue intraday_vcs_output = ic.get_intraday_vcs( report_date=self.report_date, ticker1=vcs_pairs.loc[i, 'ticker1'], ticker2=vcs_pairs.loc[i, 'ticker2'], atm_vol_ratio=vcs_pairs.loc[i, 'current_atm_vol_ratio']) vcs_pairs.loc[i, 'QC'] = intraday_vcs_output['Q'] vcs_pairs.loc[i, 'Q1C'] = intraday_vcs_output['Q1'] vcs_pairs.loc[i, 'validQ'] = intraday_vcs_output['validQ'] writer = pd.ExcelWriter('C:\Research\daily\kuzu.xlsx') vcs_pairs.to_excel(writer, 'Sheet1') writer.save() self.vcs_pairs = vcs_pairs self.prepare_orders()
def run_bindetect(args): """ Main function to run bindetect algorithm with input files and parameters given in args """ #Checking input and setting cond_names check_required(args, ["signals", "motifs", "genome", "peaks"]) args.cond_names = [ os.path.basename(os.path.splitext(bw)[0]) for bw in args.signals ] if args.cond_names is None else args.cond_names args.outdir = os.path.abspath(args.outdir) #Set output files states = ["bound", "unbound"] outfiles = [ os.path.abspath( os.path.join(args.outdir, "*", "beds", "*_{0}_{1}.bed".format(condition, state))) for (condition, state) in itertools.product(args.cond_names, states) ] outfiles.append( os.path.abspath(os.path.join(args.outdir, "*", "beds", "*_all.bed"))) outfiles.append( os.path.abspath( os.path.join(args.outdir, "*", "plots", "*_log2fcs.pdf"))) outfiles.append( os.path.abspath(os.path.join(args.outdir, "*", "*_overview.txt"))) outfiles.append( os.path.abspath(os.path.join(args.outdir, "*", "*_overview.xlsx"))) outfiles.append( os.path.abspath( os.path.join(args.outdir, args.prefix + "_distances.txt"))) outfiles.append( os.path.abspath(os.path.join(args.outdir, args.prefix + "_results.txt"))) outfiles.append( os.path.abspath( os.path.join(args.outdir, args.prefix + "_results.xlsx"))) outfiles.append( os.path.abspath(os.path.join(args.outdir, args.prefix + "_figures.pdf"))) #-------------------------------------------------------------------------------------------------------------# #-------------------------------------------- Setup logger and pool ------------------------------------------# #-------------------------------------------------------------------------------------------------------------# logger = TobiasLogger("BINDetect", args.verbosity) logger.begin() parser = add_bindetect_arguments(argparse.ArgumentParser()) logger.arguments_overview(parser, args) logger.output_files(outfiles) # Setup pool args.cores = check_cores(args.cores, logger) writer_cores = max(1, int(args.cores * 0.1)) worker_cores = max(1, args.cores - writer_cores) logger.debug("Worker cores: {0}".format(worker_cores)) logger.debug("Writer cores: {0}".format(writer_cores)) pool = mp.Pool(processes=worker_cores) writer_pool = mp.Pool(processes=writer_cores) #-------------------------------------------------------------------------------------------------------------# #-------------------------- Pre-processing data: Reading motifs, sequences, peaks ----------------------------# #-------------------------------------------------------------------------------------------------------------# logger.info("----- Processing input data -----") #Check that cond_names are the right length and are unique: if len(args.cond_names) != len(args.signals): logger.error( "The given number of given '--cond-names' ({0}) differ from the given input '--signals' ({1}). Please enter one condition name per signal." .format(len(args.cond_names), len(args.signals))) sys.exit(1) if len(args.cond_names) != len(set(args.cond_names)): logger.error( "The condition names are not unique ({0}). Please use --cond-names to set a unique set of condition names." 
.format(args.cond_names)) sys.exit(1) #Check opening/writing of files logger.info("Checking reading/writing of files") check_files([args.signals, args.motifs, args.genome, args.peaks], action="r") check_files(outfiles[-3:], action="w") make_directory(args.outdir) #Comparisons between conditions no_conditions = len(args.signals) if args.time_series: comparisons = list(zip(args.cond_names[:-1], args.cond_names[1:])) args.comparisons = comparisons else: comparisons = list(itertools.combinations(args.cond_names, 2)) #all-against-all args.comparisons = comparisons #Pdf for debug output if args.debug: debug_out = os.path.abspath( os.path.join(args.outdir, args.prefix + "_debug.pdf")) debug_pdf = PdfPages(debug_out, keep_empty=True) #Open figure pdf and write overview fig_out = os.path.abspath( os.path.join(args.outdir, args.prefix + "_figures.pdf")) figure_pdf = PdfPages(fig_out, keep_empty=True) plt.figure() plt.axis('off') plt.text(0.5, 0.8, "BINDETECT FIGURES", ha="center", va="center", fontsize=20) #output and order titles = [] titles.append("Raw score distributions") if no_conditions > 1 and args.norm_off == False: titles.append("Normalized score distributions") if args.debug: for (cond1, cond2) in comparisons: titles.append("Background log2FCs ({0} / {1})".format( cond1, cond2)) for (cond1, cond2) in comparisons: titles.append("BINDetect plot ({0} / {1})".format(cond1, cond2)) plt.text(0.1, 0.6, "\n".join([ "Page {0}) {1}".format(i + 2, titles[i]) for i in range(len(titles)) ]) + "\n\n", va="top") figure_pdf.savefig(bbox_inches='tight') plt.close() ################# Read peaks ################ #Read peak and peak_header logger.info("Reading peaks") peaks = RegionList().from_bed(args.peaks) logger.info("- Found {0} regions in input peaks".format(len(peaks))) #Check number of columns in peaks n_cols = len(peaks[0]) for i, peak in enumerate(peaks): if len(peak) != n_cols: logger.error( "The lines in --peaks have a varying number of columns. Line 1 has {0} columns, but line {1} has {2} columns! Please adjust the format of this file to run TOBIAS BINDetect." .format(n_cols, i + 1, len(peak))) sys.exit(1) #Merge overlapping peaks peaks = peaks.merge() logger.info("- Merged to {0} regions".format(len(peaks))) if len(peaks) == 0: logger.error("Input --peaks file is empty!") sys.exit(1) #Read header and check match with number of peak columns peak_columns = len(peaks[0]) #number of columns logger.debug("--peaks have {0} columns".format(peak_columns)) if args.peak_header != None: content = open(args.peak_header, "r").read() args.peak_header_list = content.split() logger.debug("Peak header: {0}".format(args.peak_header_list)) #Check whether peak header fits with number of peak columns if len(args.peak_header_list) != peak_columns: logger.error( "Length of --peak_header ({0}) does not fit number of columns in --peaks ({1})." 
.format(len(args.peak_header_list), peak_columns)) sys.exit(1) else: args.peak_header_list = ["peak_chr", "peak_start", "peak_end"] + [ "additional_" + str(num + 1) for num in range(peak_columns - 3) ] logger.debug("Peak header list: {0}".format(args.peak_header_list)) ################# Check for match between peaks and fasta/bigwig ################# logger.info( "Checking for match between --peaks and --fasta/--signals boundaries") logger.info("- Comparing peaks to {0}".format(args.genome)) fasta_obj = pysam.FastaFile(args.genome) fasta_boundaries = dict(zip(fasta_obj.references, fasta_obj.lengths)) fasta_obj.close() logger.debug("Fasta boundaries: {0}".format(fasta_boundaries)) peaks = peaks.apply_method(OneRegion.check_boundary, fasta_boundaries, "exit") #will exit if peaks are outside borders #Check boundaries of each bigwig signal individually for signal in args.signals: logger.info("- Comparing peaks to {0}".format(signal)) pybw_obj = pybw.open(signal) pybw_header = pybw_obj.chroms() pybw_obj.close() logger.debug("Signal boundaries: {0}".format(pybw_header)) peaks = peaks.apply_method(OneRegion.check_boundary, pybw_header, "exit") ##### GC content for motif scanning ###### #Make chunks of regions for multiprocessing logger.info("Estimating GC content from peak sequences") peak_chunks = peaks.chunks(args.split) gc_content_pool = pool.starmap( get_gc_content, itertools.product(peak_chunks, [args.genome])) gc_content = np.mean(gc_content_pool) #fraction args.gc = gc_content bg = np.array([(1 - args.gc) / 2.0, args.gc / 2.0, args.gc / 2.0, (1 - args.gc) / 2.0]) logger.info("- GC content estimated at {0:.2f}%".format(gc_content * 100)) ################ Get motifs ################ logger.info("Reading motifs from file") motif_list = MotifList() args.motifs = expand_dirs(args.motifs) for f in args.motifs: motif_list += MotifList().from_file(f) #List of OneMotif objects no_pfms = len(motif_list) logger.info("- Read {0} motifs".format(no_pfms)) logger.debug("Getting motifs ready") motif_list.bg = bg #Set prefixes for motif in motif_list: motif.set_prefix(args.naming) motif.bg = bg logger.spam("Getting pssm for motif {0}".format(motif.name)) motif.get_pssm() #Check that prefixes are unique regardless of upper/lower case name motif_prefixes = [motif.prefix.upper() for motif in motif_list] name_count = Counter(motif_prefixes) if max(name_count.values()) > 1: duplicated = [key for key, value in name_count.items() if value > 1] logger.warning( "The motif output names (as given by --naming) are not unique.") logger.warning( "The following names occur more than once: {0}".format(duplicated)) logger.warning( "These motifs will be renamed with '_1', '_2' etc. 
To prevent this renaming, please make the names of the input --motifs unique" ) motif_count = {dup_motif: 1 for dup_motif in duplicated} for i, motif in enumerate(motif_list): if motif.prefix.upper() in duplicated: original_name = motif.prefix motif.prefix = motif.prefix + "_{0}".format( motif_count[motif.prefix.upper()] ) #Add number to make prefix unique logger.debug("Renamed motif {0}: {1} -> {2}".format( i + 1, original_name, motif.prefix)) motif_count[original_name.upper()] += 1 motif_names = [motif.prefix for motif in motif_list] #Get threshold for motifs logger.debug("Getting match threshold per motif") outlist = pool.starmap(OneMotif.get_threshold, itertools.product(motif_list, [args.motif_pvalue])) logger.spam(motif_list) motif_list = MotifList(outlist) for motif in motif_list: logger.debug("Motif {0}: threshold {1}".format(motif.name, motif.threshold)) logger.info("Creating folder structure for each TF") for TF in motif_names: logger.spam("Creating directories for {0}".format(TF)) make_directory(os.path.join(args.outdir, TF)) make_directory(os.path.join(args.outdir, TF, "beds")) make_directory(os.path.join(args.outdir, TF, "plots")) #-------------------------------------------------------------------------------------------------------------# #----------------------------------------- Plot logos for all motifs -----------------------------------------# #-------------------------------------------------------------------------------------------------------------# logo_filenames = { motif.prefix: os.path.join(args.outdir, motif.prefix, motif.prefix + ".png") for motif in motif_list } logger.info("Plotting sequence logos for each motif") task_list = [ pool.apply_async(OneMotif.logo_to_file, ( motif, logo_filenames[motif.prefix], )) for motif in motif_list ] monitor_progress(task_list, logger) results = [task.get() for task in task_list] logger.comment("") logger.debug("Getting base64 strings per motif") for motif in motif_list: #motif.get_base() with open(logo_filenames[motif.prefix], "rb") as png: motif.base = base64.b64encode(png.read()).decode("utf-8") #-------------------------------------------------------------------------------------------------------------# #--------------------- Motif scanning: Find binding sites and match to footprint scores ----------------------# #-------------------------------------------------------------------------------------------------------------# logger.comment("") logger.start_logger_queue( ) #start process for listening and handling through the main logger queue args.log_q = logger.queue #queue for multiprocessing logging manager = mp.Manager() logger.info("Scanning for motifs and matching to signals...") #Create writer queues for bed-file output logger.debug("Setting up writer queues") qs_list = [] writer_qs = {} #writer_queue = create_writer_queue(key2file, writer_cores) #writer_queue.stop() #wait until all are done manager = mp.Manager() TF_names_chunks = [ motif_names[i::writer_cores] for i in range(writer_cores) ] writer_tasks = [] for TF_names_sub in TF_names_chunks: logger.debug("Creating writer queue for {0}".format(TF_names_sub)) files = [ os.path.join(args.outdir, TF, "beds", TF + ".tmp") for TF in TF_names_sub ] q = manager.Queue() qs_list.append(q) writer_tasks.append( writer_pool.apply_async(file_writer, args=(q, dict(zip(TF_names_sub, files)), args)) ) #, callback = lambda x: finished.append(x) print("Writing time: {0}".format(x))) for TF in TF_names_sub: writer_qs[TF] = q writer_pool.close() #no more jobs applied to writer_pool 
#todo: use run_parallel #Start working on data if worker_cores == 1: logger.debug("Running with cores = 1") results = [] for chunk in peak_chunks: results.append( scan_and_score(chunk, motif_list, args, args.log_q, writer_qs)) else: logger.debug("Sending jobs to worker pool") task_list = [ pool.apply_async(scan_and_score, ( chunk, motif_list, args, args.log_q, writer_qs, )) for chunk in peak_chunks ] monitor_progress(task_list, logger) results = [task.get() for task in task_list] logger.info("Done scanning for TFBS across regions!") #logger.stop_logger_queue() #stop the listening process (wait until all was written) #--------------------------------------# logger.info("Waiting for bedfiles to write") #Stop all queues for writing logger.debug("Stop all queues by inserting None") for q in qs_list: q.put((None, None)) #Wait for all writer tasks to finish finished = 0 while finished == 0: logger.debug("Writer task return status: {0}".format( [task.get() if task.ready() else "NA" for task in writer_tasks])) if sum([task.ready() for task in writer_tasks]) == len(writer_tasks): finished = 1 return_codes = [task.get() for task in writer_tasks] if sum(return_codes) != 0: logger.error( "Bedfile writer finished with an error ({0})".format()) else: logger.debug("Bedfile writer(s) finished!") time.sleep(0.5) logger.debug("Joining bed_writer queues") for i, q in enumerate(qs_list): logger.debug("- Queue {0} (size {1})".format(i, q.qsize())) #Waits until all queues are closed writer_pool.join() #-------------------------------------------------------------------------------------------------------------# #---------------------------------- Process information on background scores --------------------------------# #-------------------------------------------------------------------------------------------------------------# logger.info("Merging results from subsets") background = merge_dicts([result[0] for result in results]) TF_overlaps = merge_dicts([result[1] for result in results]) results = None #Add missing TF overlaps (if some TFs had zero sites) for TF1 in motif_list: if TF1.prefix not in TF_overlaps: TF_overlaps[TF1.prefix] = 0 for TF2 in motif_list: tup = (TF1.prefix, TF2.prefix) if tup not in TF_overlaps: TF_overlaps[tup] = 0 #Collect sampled background values for bigwig in args.cond_names: background["signal"][bigwig] = np.array(background["signal"][bigwig]) #Check how many values were fetched from background n_bg_values = len(background["signal"][args.cond_names[0]]) logger.debug("Collected {0} values from background".format(n_bg_values)) if n_bg_values < 1000: err_str = "Number of background values collected from peaks is low (={0}) ".format( n_bg_values) err_str += "- this affects estimation of the bound/unbound threshold and the normalization between conditions. " err_str += "To improve this estimation, please run BINDetect with --peaks = the full peak set across all conditions." 
logger.warning(err_str) #Plot score distribution fig = plot_score_distribution( [background["signal"][bigwig] for bigwig in args.cond_names], labels=args.cond_names, title="Raw scores per condition") figure_pdf.savefig(fig, bbox_inches='tight') plt.close() #Normalize arrays args.norm_objects = {} if args.norm_off == True or len( args.cond_names ) == 1: #if norm_off or length of cond is 1 - create constant normalization for bigwig in args.cond_names: args.norm_objects[bigwig] = ArrayNorm( "constant", popt=1.0, value_min=0, value_max=1 ) #no normalization; min/max don't matter for constant norm else: logger.comment("") logger.info("Normalizing scores across conditions") list_of_vals = [ background["signal"][bigwig] for bigwig in args.cond_names ] if args.debug: args.norm_objects = quantile_normalization(list_of_vals, args.cond_names, pdfpages=debug_pdf, logger=logger) else: args.norm_objects = quantile_normalization(list_of_vals, args.cond_names, logger=logger) #Normalize background and visualize score distribution for bigwig in args.cond_names: original = background["signal"][bigwig] #Check for nan logger.debug("Background nans ({0}): {1}".format( bigwig, sum(np.isnan(original)))) normalized = args.norm_objects[bigwig].normalize(original) #Replace negative values with 0 negatives = normalized < 0 normalized[negatives] = 0 background["signal"][bigwig] = normalized logger.debug( "Background nans after normalization ({0}): {1}".format( bigwig, sum(np.isnan(background["signal"][bigwig])))) fig = plot_score_distribution( [background["signal"][bigwig] for bigwig in args.cond_names], labels=args.cond_names, title="Normalized scores per condition") figure_pdf.savefig(fig, bbox_inches='tight') plt.close() #-------------------------------------------------------------------------------------------------------------# #-------------------------------------- Estimate bound/unbound threshold -------------------------------------# #-------------------------------------------------------------------------------------------------------------# logger.info("Estimating bound/unbound threshold") #Prepare scores (remove 0's etc.) bg_values = np.array([ background["signal"][bigwig] for bigwig in args.cond_names ]).flatten() #scores from all conditions logger.debug("Size of background array collected: {0}".format( bg_values.size)) bg_values = bg_values[np.logical_not(np.isclose( bg_values, 0.0))] #only non-zero counts logger.debug("Size of background array after filtering > 0: {0}".format( bg_values.size)) if len(bg_values) == 0: logger.error( "Error processing bigwig scores from background. It could be that there are no scores in the bigwig (= all scores are 0) assigned for the peaks. Please check your input files." 
) sys.exit(1) x_max = np.percentile(bg_values, [99]) bg_values = bg_values[bg_values < x_max] logger.debug( "Size of background array after filtering < x_max ({0}): {1}".format( x_max, bg_values.size)) #Fit mixture of normals log_vals = np.log(bg_values).reshape(-1, 1) lowest_bic = np.inf for n_components in [2 ]: #2 components; one for 0's and one for true signal gmm = sklearn.mixture.GaussianMixture(n_components=n_components, random_state=1) gmm.fit(log_vals) bic = gmm.bic(log_vals) logger.debug("n_compontents: {0} | bic: {1}".format(n_components, bic)) if bic < lowest_bic: lowest_bic = bic best_gmm = gmm gmm = best_gmm #Obtain parameters for each component means = gmm.means_.flatten() stds = np.sqrt(gmm.covariances_).flatten() #Plot components for debugging if args.debug: fig, ax = plt.subplots(nrows=2, ncols=1, constrained_layout=True) #Plot background distribution ax[0].hist(log_vals, bins='auto', density=True, color="grey") #log space ax[1].hist(bg_values, bins='auto', density=True, color="grey") #normal space #Plot components x_log = np.linspace(np.min(log_vals), np.max(log_vals), 1000) x_norm = np.exp(x_log) for i in range(len(means)): pdf = scipy.stats.norm.pdf(x_log, loc=means[i], scale=stds[i]) ax[0].plot(x_log, pdf, label="Component {0}".format(i + 1)) #Plot component in normal space log_params = scipy.stats.lognorm.fit(bg_values, f0=stds[i], fscale=np.exp(means[i])) pdf = scipy.stats.lognorm.pdf(x_norm, *log_params) ax[1].plot(x_norm, pdf, label="Component {0}".format(i + 1)) ax[0].set_title("Background score distribution") ax[0].set_xlabel("log(background score)") ax[0].set_ylabel("Density") ax[0].legend() ax[1].set_xlabel("Background score") ax[1].set_ylabel("Density") ax[1].legend() debug_pdf.savefig(fig) plt.close() #Extract most-right gaussian chosen_i = np.argmax(means) #Mixture with largest mean log_params = scipy.stats.lognorm.fit(bg_values, f0=stds[chosen_i], fscale=np.exp(means[chosen_i])) #Mode of distribution mode = scipy.optimize.fmin( lambda x: -scipy.stats.lognorm.pdf(x, *log_params), 0, disp=False)[0] logger.debug("- Mode estimated at: {0}".format(mode)) pseudo = mode / 2.0 #pseudo is half the mode args.pseudo = pseudo logger.debug("Pseudocount estimated at: {0}".format(round(args.pseudo, 5))) # Estimate theoretical normal for threshold leftside_x = np.linspace( scipy.stats.lognorm(*log_params).ppf([0.01]), mode, 100) leftside_pdf = scipy.stats.lognorm.pdf(leftside_x, *log_params) #Flip over leftside_x_scale = leftside_x - np.min(leftside_x) #scale to min 0 mirrored_x = np.concatenate( [leftside_x, np.max(leftside_x) + leftside_x_scale]).flatten() mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]]).flatten() popt, cov = scipy.optimize.curve_fit( lambda x, std, sc: sc * scipy.stats.norm.pdf(x, mode, std), mirrored_x, mirrored_pdf) norm_params = (mode, popt[0]) logger.debug("Theoretical normal parameters: {0}".format(norm_params)) #Set threshold for bound/unbound threshold = round( scipy.stats.norm.ppf(1 - args.bound_pvalue, *norm_params), 5) args.thresholds = {bigwig: threshold for bigwig in args.cond_names} logger.stats("- Threshold estimated at: {0}".format(threshold)) #Only plot if args.debug is True if args.debug: #Plot mirrored data fig, ax = plt.subplots(1, 1) ax.hist(bg_values[bg_values < x_max], bins='auto', density=True, label="Observed score distribution") ax.plot(mirrored_x, mirrored_pdf, color="black") plt.xlabel("Bigwig score") plt.title("Theoretical normal") debug_pdf.savefig(fig) plt.close(fig) #Plot fit and threshold fig, ax = 
plt.subplots(1, 1) ax.hist(bg_values[bg_values < x_max], bins='auto', density=True, label="Observed score distribution") xvals = np.linspace(0, x_max, 1000) log_probas = scipy.stats.lognorm.pdf(xvals, *log_params) ax.plot(xvals, log_probas, label="Log-normal fit", color="orange") #Theoretical normal norm_probas = scipy.stats.norm.pdf(xvals, *norm_params) ax.plot(xvals, norm_probas * (np.max(log_probas) / np.max(norm_probas)), color="grey", linestyle="--", label="Theoretical normal") ax.axvline(threshold, color="black", label="Bound/unbound threshold") ymax = plt.ylim()[1] ax.text(threshold, ymax, "\n {0:.3f}".format(threshold), va="top") #Decorate plot plt.title("Score distribution") plt.xlabel("Bigwig score") plt.ylabel("Density") plt.legend(fontsize=8) plt.xlim((0, x_max)) debug_pdf.savefig(fig) plt.close(fig) #-------------------------------------------------------------------------------------------------------------# #--------------------------------------- Foldchanges between conditions --------------------------------------# #-------------------------------------------------------------------------------------------------------------# logger.comment("") log2fc_params = {} if len(args.signals) > 1: logger.info( "Calculating background log2 fold-changes between conditions") for (bigwig1, bigwig2) in comparisons: #cond1, cond2 logger.info("- {0} / {1}".format(bigwig1, bigwig2)) #Estimate background log2fc scores1 = np.copy(background["signal"][bigwig1]) scores2 = np.copy(background["signal"][bigwig2]) included = np.logical_or(scores1 > 0, scores2 > 0) scores1 = scores1[included] scores2 = scores2[included] #Calculate background log2fc normal disitribution log2fcs = np.log2( np.true_divide(scores1 + args.pseudo, scores2 + args.pseudo)) lower, upper = np.percentile(log2fcs, [1, 99]) log2fcs_fit = log2fcs[np.logical_and(log2fcs >= lower, log2fcs <= upper)] #Decide on diff_dist diff_dist = scipy.stats.norm norm_params = diff_dist.fit(log2fcs_fit) logger.debug( "({0} / {1}) Background log2fc distribution: {2}".format( bigwig1, bigwig2, norm_params)) log2fc_params[(bigwig1, bigwig2)] = norm_params #If debug: plot background log2fc to figures if args.debug: fig, ax = plt.subplots(1, 1) plt.hist(log2fcs, density=True, bins='auto', label="Background log2fc ({0} / {1})".format( bigwig1, bigwig2)) xvals = np.linspace(plt.xlim()[0], plt.xlim()[1], 100) pdf = diff_dist.pdf(xvals, *log2fc_params[(bigwig1, bigwig2)]) plt.plot(xvals, pdf, label="Distribution fit") plt.title("Background log2FCs ({0} / {1})".format( bigwig1, bigwig2)) plt.xlabel("Log2 fold change") plt.ylabel("Density") debug_pdf.savefig(fig, bbox_inches='tight') plt.close() #f = open(os.path.join(args.outdir, "{0}_{1}_log2fcs.txt".format(bigwig1, bigwig2)), "w") #f.write("\n".join([str(val) for val in log2fcs])) #f.close() background = None #free up space #-------------------------------------------------------------------------------------------------------------# #----------------------------- Read total sites per TF to estimate bound/unbound -----------------------------# #-------------------------------------------------------------------------------------------------------------# logger.comment("") logger.info("Processing scanned TFBS individually") #Getting bindetect table ready info_columns = ["total_tfbs"] info_columns.extend([ "{0}_{1}".format(cond, metric) for (cond, metric ) in itertools.product(args.cond_names, ["threshold", "bound"]) ]) info_columns.extend([ "{0}_{1}_{2}".format(comparison[0], comparison[1], metric) for 
(comparison, metric) in itertools.product(comparisons, ["change", "pvalue"]) ]) cols = len(info_columns) rows = len(motif_names) info_table = pd.DataFrame(np.zeros((rows, cols)), columns=info_columns, index=motif_names) #Starting calculations results = [] if args.cores == 1: for name in motif_names: logger.info("- {0}".format(name)) results.append(process_tfbs(name, args, log2fc_params)) else: logger.debug("Sending jobs to worker pool") task_list = [ pool.apply_async(process_tfbs, ( name, args, log2fc_params, )) for name in motif_names ] monitor_progress(task_list, logger) #will not exit before all jobs are done results = [task.get() for task in task_list] logger.info("Concatenating results from subsets") info_table = pd.concat(results) #pandas tables pool.terminate() pool.join() logger.stop_logger_queue() #-------------------------------------------------------------------------------------------------------------# #------------------------------------------------ Cluster TFBS -----------------------------------------------# #-------------------------------------------------------------------------------------------------------------# clustering = RegionCluster(TF_overlaps) clustering.cluster() #Convert full ids to alt ids convert = {motif.prefix: motif.name for motif in motif_list} for cluster in clustering.clusters: for name in convert: clustering.clusters[cluster]["cluster_name"] = clustering.clusters[ cluster]["cluster_name"].replace(name, convert[name]) #Write out distance matrix matrix_out = os.path.join(args.outdir, args.prefix + "_distances.txt") clustering.write_distance_mat(matrix_out) #-------------------------------------------------------------------------------------------------------------# #----------------------------------------- Write all_bindetect file ------------------------------------------# #-------------------------------------------------------------------------------------------------------------# logger.comment("") logger.info("Writing all_bindetect files") #Add columns of name / motif_id / prefix names = [] ids = [] for prefix in info_table.index: motif = [motif for motif in motif_list if motif.prefix == prefix] names.append(motif[0].name) ids.append(motif[0].id) info_table.insert(0, "output_prefix", info_table.index) info_table.insert(1, "name", names) info_table.insert(2, "motif_id", ids) #info_table.insert(3, "motif_logo", [os.path.join("motif_logos", os.path.basename(logo_filenames[prefix])) for prefix in info_table["output_prefix"]]) #add relative path to logo #Add cluster to info_table cluster_names = [] for name in info_table.index: for cluster in clustering.clusters: if name in clustering.clusters[cluster]["member_names"]: cluster_names.append( clustering.clusters[cluster]["cluster_name"]) info_table.insert(3, "cluster", cluster_names) #Cluster table on motif clusters info_table_clustered = info_table.groupby( "cluster").mean() #mean of each column info_table_clustered.reset_index(inplace=True) #Map correct type info_table["total_tfbs"] = info_table["total_tfbs"].map(int) for condition in args.cond_names: info_table[condition + "_bound"] = info_table[condition + "_bound"].map(int) #### Write excel ### bindetect_excel = os.path.join(args.outdir, args.prefix + "_results.xlsx") writer = pd.ExcelWriter(bindetect_excel, engine='xlsxwriter') #Tables info_table.to_excel(writer, index=False, sheet_name="Individual motifs") info_table_clustered.to_excel(writer, index=False, sheet_name="Motif clusters") for sheet in writer.sheets: worksheet = 
writer.sheets[sheet] n_rows = worksheet.dim_rowmax n_cols = worksheet.dim_colmax worksheet.autofilter(0, 0, n_rows, n_cols) writer.save() #Format comparisons for (cond1, cond2) in comparisons: base = cond1 + "_" + cond2 info_table[base + "_change"] = info_table[base + "_change"].round(5) info_table[base + "_pvalue"] = info_table[base + "_pvalue"].map( "{:.5E}".format, na_action="ignore") #Write bindetect results tables #info_table.insert(0, "TF_name", info_table.index) #Set index as first column bindetect_out = os.path.join(args.outdir, args.prefix + "_results.txt") info_table.to_csv(bindetect_out, sep="\t", index=False, header=True, na_rep="NA") #-------------------------------------------------------------------------------------------------------------# #------------------------------------------- Make BINDetect plot ---------------------------------------------# #-------------------------------------------------------------------------------------------------------------# if no_conditions > 1: logger.info("Creating BINDetect plot(s)") #Fill NAs from info_table to enable plotting of log2fcs (NA -> 0 change) change_cols = [col for col in info_table.columns if "_change" in col] pvalue_cols = [col for col in info_table.columns if "_pvalue" in col] info_table[change_cols] = info_table[change_cols].fillna(0) info_table[pvalue_cols] = info_table[pvalue_cols].fillna(1) #Plotting bindetect per comparison for (cond1, cond2) in comparisons: logger.info("- {0} / {1} (static plot)".format(cond1, cond2)) base = cond1 + "_" + cond2 #Define which motifs to show xvalues = info_table[base + "_change"].astype(float) yvalues = info_table[base + "_pvalue"].astype(float) y_min = np.percentile(yvalues[yvalues > 0], 5) #5% smallest pvalues x_min, x_max = np.percentile( xvalues, [5, 95]) #5% smallest and largest changes #Fill motifs with metadata (.change, .pvalue, .logpvalue etc.) for motif in motif_list: name = motif.prefix motif.change = float( info_table.at[name, base + "_change"]) #change for this comparison motif.pvalue = float( info_table.at[name, base + "_pvalue"]) #pvalue for this comparison motif.logpvalue = -np.log10( motif.pvalue) if motif.pvalue > 0 else -np.log10(1e-308) #Assign each motif to group if motif.change < x_min or motif.change > x_max or motif.pvalue < y_min: if motif.change < 0: motif.group = cond2 + "_up" if motif.change > 0: motif.group = cond1 + "_up" else: motif.group = "n.s." 
#Bindetect plot fig = plot_bindetect(motif_list, clustering, [cond1, cond2], args) figure_pdf.savefig(fig, bbox_inches='tight') plt.close(fig) #Interactive BINDetect plot logger.info("- {0} / {1} (interactive plot)".format(cond1, cond2)) html_out = os.path.join(args.outdir, "bindetect_" + base + ".html") plot_interactive_bindetect(motif_list, [cond1, cond2], html_out) #-------------------------------------------------------------------------------------------------------------# #----------------------------- Make heatmap across conditions (for debugging)---------------------------------# #-------------------------------------------------------------------------------------------------------------# if args.debug and len(args.signals) > 1: logger.info("Plotting heatmap across conditions for debugging") mean_columns = [cond + "_mean_score" for cond in args.cond_names] heatmap_table = info_table[mean_columns] heatmap_table.index = info_table["output_prefix"] #Decide fig size rows, cols = heatmap_table.shape figsize = (7 + cols, max(10, rows / 8.0)) cm = sns.clustermap( heatmap_table, figsize=figsize, z_score=0, #zscore for rows col_cluster=False, #do not cluster condition columns yticklabels=True, #show all row annotations xticklabels=True, cbar_pos=(0, 0, .4, .005), dendrogram_ratio=(0.3, 0.01), cbar_kws={ "orientation": "horizontal", 'label': 'Row z-score' }, method="single") #Adjust width of columns #hm = cm.ax_heatmap.get_position() #cm.ax_heatmap.set_position([hm.x0, hm.y0, cols * 3 * hm.height / rows, hm.height]) #aspect should be equal plt.setp(cm.ax_heatmap.get_xticklabels(), fontsize=8, rotation=45, ha="right") plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5) cm.ax_col_dendrogram.set_title('Mean scores across conditions', fontsize=20) cm.ax_heatmap.set_ylabel("Transcription factor motifs", fontsize=15, rotation=270) #cm.ax_heatmap.set_title('Conditions') #cm.fig.suptitle('Mean scores across conditions') #cm.cax.set_visible(False) #Save to output pdf plt.tight_layout() debug_pdf.savefig(cm.fig, bbox_inches='tight') plt.close(cm.fig) #-------------------------------------------------------------------------------------------------------------# #-------------------------------------------------- Wrap up---------------------------------------------------# #-------------------------------------------------------------------------------------------------------------# if args.debug: debug_pdf.close() figure_pdf.close() logger.end()
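# The scan/write stages above hand bed-file output to dedicated writer processes
# through manager queues and shut them down with a (None, None) sentinel. Below is a
# minimal, generic sketch of that sentinel-terminated writer-queue pattern; it is
# independent of the TOBIAS-specific file_writer/scan_and_score functions, and the
# file name and payload are illustrative only.
import multiprocessing as mp

def sketch_file_writer(q, path):
    """Drain (key, line) tuples from q into path until the (None, None) sentinel."""
    with open(path, "w") as fh:
        while True:
            key, line = q.get()
            if key is None:  # sentinel: no more data will arrive
                break
            fh.write("{0}\t{1}\n".format(key, line))
    return 0  # return code checked by the parent, as in the writer_tasks loop above

if __name__ == "__main__":
    manager = mp.Manager()
    q = manager.Queue()
    pool = mp.Pool(processes=1)
    task = pool.apply_async(sketch_file_writer, (q, "writer_sketch.tmp"))
    q.put(("TF1", "chr1:100-110"))   # workers would normally put many of these
    q.put((None, None))              # stop the writer
    pool.close()
    pool.join()
    print("writer exited with code", task.get())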
# Program file Pex18_2.py import numpy as np import pandas as pd y = np.array([4.81, 4.8, 4.73, 4.7, 4.7, 4.73, 4.75, 4.75, 5.43, 5.78, 5.85]) def ExpMove(y, a): n = len(y) M = np.zeros(n) M[0] = (y[0] + y[1]) / 2 for i in range(1, len(y)): M[i] = a * y[i - 1] + (1 - a) * M[i - 1] return M yt1 = ExpMove(y, 0.2) yt2 = ExpMove(y, 0.5) yt3 = ExpMove(y, 0.8) s1 = np.sqrt(((y - yt1)**2).mean()) s2 = np.sqrt(((y - yt2)**2).mean()) s3 = np.sqrt(((y - yt3)**2).mean()) d = pd.DataFrame(np.c_[yt1, yt2, yt3]) f = pd.ExcelWriter("Pdata18_2.xlsx") d.to_excel(f) f.close() # Write the data to an Excel file for easy tabulation print("Standard errors of the three forecasts:", s1, s2, s3) # Print the standard error of each forecast yh = 0.8 * y[-1] + 0.2 * yt3[-1] print("Forecast for the next period:", yh)
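# The block above compares three hand-picked smoothing constants; the same RMSE
# criterion can also be searched over a grid. A minimal sketch reusing the ExpMove
# function and series y defined above (the grid of alphas is an arbitrary choice):
def best_alpha(y, alphas=np.arange(0.1, 1.0, 0.1)):
    """Return the smoothing constant with the lowest in-sample RMSE for ExpMove."""
    rmse = {a: np.sqrt(((y - ExpMove(y, a)) ** 2).mean()) for a in alphas}
    return min(rmse, key=rmse.get)

print("Best alpha on the series above:", best_alpha(y))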
print("No se encuentra el fichero" + str(i)) # Crear una tabla única final f_table = list_of_dic[0] for k in range(1, len(list_of_dic)): f_table = pd.merge(f_table, list_of_dic[k], on="otu_id", how="outer") # Dividir tabla final en intervalos de tiempo y guardar cada subtabla en una hoja excel df2 = f_table.iloc[:, 1:40] df2.insert(0, "otu_id", value=f_table.iloc[:, 0]) df3 = f_table.iloc[:, 40:75] df3.insert(0, "otu_id", value=f_table.iloc[:, 0]) df4 = f_table.iloc[:, 75:203] df4.insert(0, "otu_id", value=f_table.iloc[:, 0]) df5 = f_table.iloc[:, 203:] df5.insert(0, "otu_id", value=f_table.iloc[:, 0]) # Formatear salida pd.set_option("expand_frame_repr", False) # Escribir cada dataframe en una hoja Excel diferente writer = pd.ExcelWriter('HostLifeStyle_SalivaA_absolute.xlsx') f_table.to_excel(writer, sheet_name="SalivaA", index=True, na_rep=0) df2.to_excel(writer, sheet_name="h_SalivaA_Day26to69", index=False, na_rep=0) df3.to_excel(writer, sheet_name="SalivaA_Day72to122", index=False, na_rep=0) df4.to_excel(writer, sheet_name="h_SalivaA_Day123to257", index=False, na_rep=0) df5.to_excel(writer, sheet_name="h_SalivaA_Day258to364", index=False, na_rep=0) # Cerrar el escritor Pandas Excel y guardar el fichero writer.save()
p = Path(r'C:\Data\json\backregisterperson.json') with p.open('r', encoding='utf-8') as f: data = json.loads(f.read()) df = pd.json_normalize(data['Persons']) #df = df.sort_values(["Person.Index"], axis=0, ascending=[True]) #https://thispointer.com/pandas-sort-rows-or-columns-in-dataframe-based-on-values-using-dataframe-sort_values/ df = df.sort_values(by=['Person.Index', 'Timestamp']) # sort first by Person.Index, then by Timestamp #print (df) writer = pd.ExcelWriter('backregisterpersonchart.xlsx', engine='xlsxwriter') df.to_excel(writer, sheet_name='Sheet1') workbook = writer.book worksheet = writer.sheets['Sheet1'] format1 = workbook.add_format({'num_format': '#,##0'}) # number (decimal) format setting worksheet.set_column('V:V', 18, format1) worksheet.set_column('X:X', 18, format1) worksheet.write_formula('H2', '=COUNTIF(B2:B2180,"<10000")') # Important: use a plain comma here instead of the semicolon shown as the separator in Excel
for i in range(row_size): row_num = '' for j in range(index[i]): row_num += str(predint[ptr]) ptr = ptr + 1 pre_num.append(row_num) print(pre_num) fp = open('./imagetxt/9.txt','r') sourceInLine=fp.readlines() dataset = [] for line in sourceInLine: temp = line.strip('\n') dataset.append(temp) data_excel = pd.read_excel('image9.xlsx') size = data_excel.shape[0] for i in range(0, len(pre_num), 3): data_excel.loc[size, '图片名'] = dataset[0] data_excel.loc[size, '角点1'] = dataset[1] data_excel.loc[size, '角点2'] = dataset[2] data_excel.loc[size, '角点3'] = dataset[3] data_excel.loc[size, '角点4'] = dataset[4] data_excel.loc[size, '学号'] = pre_num[i] data_excel.loc[size, '手机号'] = pre_num[i+1] data_excel.loc[size, '身份证号'] = pre_num[i+2] size = size + 1 writer = pd.ExcelWriter('image9.xlsx') data_excel.to_excel(writer) writer.save()
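# Growing a DataFrame one .loc row at a time is quadratic in the number of rows;
# collecting the rows first and concatenating once is the usual idiom. A sketch of
# the same fill loop under that approach, assuming the dataset/pre_num lists and the
# column names used in the snippet above:
rows = [
    {'图片名': dataset[0], '角点1': dataset[1], '角点2': dataset[2],
     '角点3': dataset[3], '角点4': dataset[4],
     '学号': pre_num[i], '手机号': pre_num[i + 1], '身份证号': pre_num[i + 2]}
    for i in range(0, len(pre_num), 3)
]
data_excel = pd.concat([data_excel, pd.DataFrame(rows)], ignore_index=True)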
break ax_curr = axs[row, col] plot_acf(residuals[bb_tkr], lags=np.arange(100)[1:], ax=ax_curr) ax_curr.set_xlabel('') ax_curr.set_ylabel('') ax_curr.set_title(f"Autocorr: {oot[oot.bb_tkr == bb_tkr].name.values[0]}") fig.delaxes(axs[7, 2]) plt.savefig(f"Autocorr-{model_type_id}.png", dpi=100) # './reports/figures/'+ plt.show() if __name__ == '__main__': cftcVariableName = 'cftc' # * OR cftc_adj fcastVariableName = 'forecast' # *OR 'forecast_adj' writer = pd.ExcelWriter(f'Autocorr.xlsx', engine='xlsxwriter') autoCorrelationPlots(model_type_ids=[153], cftcVariableName='cftc', fcastVariableName='forecast' ) cftcVar = 'cftc' # * OR cftc_adj fcastVar = 'forecast' # *OR 'forecast_adj' exclWriter = pd.ExcelWriter(f'Autocorr_.xlsx', engine='xlsxwriter') stats = autoCorrelationStatistics(model_type_ids=[153], cftcVariableName=cftcVariableName, fcastVariableName=fcastVariableName) exclWriter.save()
price_df['Compare_'+str(col_idx+1)] = preval_price_df.iloc[:, col_loc + 2] except KeyError: print('{} is not existing on the dataframe'.format(column_nm)) except Exception as e: print('Unknown error: ' + str(e)) #Write Header to Excel wb_name = 'OTC_{}_Pricing_Pre_Validation_Report_{}.xlsx'.format(usage_type, keyword) # Create a Pandas Excel writer using XlsxWriter as the engine. writer = pd.ExcelWriter(wb_name, engine='xlsxwriter', options={'strings_to_numbers': False}) # Get the xlsxwriter workbook and worksheet objects. workbook = writer.book # Add custom formats on the workbook. key_format = workbook.add_format({ 'bold': True, 'align': 'center', 'valign': 'vcenter', #'text_wrap': True, 'fg_color': '#f09886', 'border': 1}) base_format = workbook.add_format({ 'bold': True,
def trata_base_Einstein(df): ##################################################################################3 # load the spreadsheet excel_file = 'dataset.xlsx' # full dataframe df = pd.read_excel(excel_file, sheet_name="All") ##################################################################################3 df1 = df # check whether any column mixes strings and numbers for column in df1: a = list(df1[column].map(type) != str) if (len(set(a)) != 1): # convert mixed columns to string type df1[column] = df1[column].apply(str) # check that everything worked for column in df1: a = list(df1[column].map(type) != str) # if (len(set(a)) != 1): # print("remaining mixed values in: " + column) ### since the timestamp does not matter here, it can be filled with any sequential value ### # get the number of rows qtde_linhas = len(df1.index) # create a column of sequential Timestamps in the first position df1.insert(0, "Timestamp", pd.date_range(start='1/1/2020', periods=qtde_linhas, freq='H')) # get the column names colunas = list(df1) # drop columns that contain no data at all df1 = df1.dropna(axis=1, how='all') # drop rows without blood tests (patients missing Hematocrit did not take any of the others) df1 = df1[df1['Hematocrit'].notna()] # fill the remaining empty values (NaN) with zeros df1 = df1.fillna(0) # convert categorical data to numbers - e.g. [normal, absent, present] become [0, 1, 2] le = MultiColumnLabelEncoder() df1 = le.fit_transform2(df1) # define the output columns and convert them to numbers (neg=0, pos=1) lista_out = [ 'Timestamp', 'SARS-Cov-2 exam result', 'Patient addmited to regular ward (1=yes, 0=no)', 'Patient addmited to semi-intensive unit (1=yes, 0=no)', 'Patient addmited to intensive care unit (1=yes, 0=no)' ] df_out = df1[lista_out] # df_out = df_out.replace(['negative','positive'],[0,1]) lista_out.remove('Timestamp') # drop columns that are not useful for the analysis, plus the output columns lista_drop = lista_out lista_drop.append('Patient ID') df1 = df1.drop(lista_drop, axis=1) colunas = list(df1) # build the dataframe that will become the xls uploaded to B-Zek # save to a new results spreadsheet writer = pd.ExcelWriter('base_relevance.xlsx', engine='openpyxl') df1.to_excel(writer, sheet_name="INPUTS") df_out.to_excel(writer, sheet_name="OUTPUTS") writer.save() response = {'inputs': df1, 'outputs': df_out} return response
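# MultiColumnLabelEncoder is defined elsewhere in this script; a hypothetical sketch
# of what such a wrapper around sklearn's LabelEncoder could look like (the
# fit_transform2 name follows the call above, the body is an assumption):
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    """Label-encode every object-typed column of a DataFrame, leaving the rest as-is."""
    def fit_transform2(self, df):
        out = df.copy()
        for col in out.select_dtypes(include='object').columns:
            out[col] = LabelEncoder().fit_transform(out[col].astype(str))
        return out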
tip.append(item.tipo) sta.append(item.status) lon.append(item.longitud) pre.append(item.precipitaciones) vien.append(item.viento) if item.dirViento == True: dirvien.append("A favor") else: dirvien.append("En contra") df = pd.DataFrame({ "Complejidad": comple, "Tiempo": tiem, "Tipo": tip, "status": sta, "Longitud": lon, "Precipitaciones": pre, "Viento": vien, "Dirección": dirvien, "velocidad": vel, "LLantas": stLLan, "Gas": gas, "Aceite": ace, "Eventos": eventos }) writer = pd.ExcelWriter("demo.xlsx", engine="xlsxwriter") df.to_excel(writer, sheet_name="Prueba", index=False) writer.save()
attr = [ 'H2-2 length', 'Inlinks', 'Status Code', 'External Outlinks', 'Crawl Depth', 'Outlinks', 'Unique Inlinks', 'Canonical Link Element 1', 'Title 1 Length', 'Content', 'H2-1 length', 'Indexability', 'Hash', 'HTTP rel="prev" 1', 'Meta Description 1', 'H2-1', 'H2-2', 'URL Encoded Address', 'Last Modified', '% of Total', 'Meta Keyword 1', 'H1-1', 'X-Robots-Tag 1', 'Unique External Outlinks', 'Title 1 Pixel Width', 'Meta Robots 1', 'Meta Description 1 Pixel Width', 'Size (bytes)', 'Text Ratio', 'Unique Outlinks', 'Meta Description 1 Length', 'Word Count', 'H1-1 length', 'Meta Refresh 1', 'Link Score', 'HTTP rel="next" 1', 'Response Time', 'rel="prev" 1', 'Status', 'Redirect URL', 'Title 1', 'Indexability Status', 'Redirect Type', 'rel="next" 1', 'Meta Keywords 1 Length' ] print(df) df = df.fillna(0) for attribute in attr: print(attribute) dfcomp = df.loc[df[attribute + '_mobile'].eq(df[attribute + '_desktop']) == False] #print(dfcomp) if not dfcomp.empty: dfcomp = dfcomp[[attribute + '_mobile', attribute + '_desktop']] print(dfcomp) writer = pd.ExcelWriter( 'C:\\Users\\lukasz.girzycki\\Desktop\\mobile_desktop_inspiracje\\różnice\\comparison_mobile_desktop' + attribute + '.xlsx') dfcomp.to_excel(writer) writer.save() else: print('no differences')
print("welcome") occupationList1 = [ m.group(0) for l in text1 for m in [regex1.search(l)] if m ] print("Words") print(occupationList1) if not occupationList1: occupationList1 = ["no_occupation"] occupationList.append(occupationList1[0]) #Output all fetched occupation in Output dd = df.ix[0:, 0:3] dd['Occupation'] = pd.Series(occupationList, index=df.index) writer = pd.ExcelWriter('output.xlsx') dd.to_excel(writer) writer.save() print("Final List") print(occupationList) #Print count of occupation counts = collections.Counter(occupationList) print(counts) from pytagcloud import create_tag_image, make_tags from pytagcloud.lang.counter import get_tag_counts j = "" for i in occupationList: if (i != 'no_occupation'):
if "nofollow" in a['rel']: track_links["Do/No-Follow?"].append("NoFollow") else: track_links["Do/No-Follow?"].append("DoFollow") track_links["Live?"].append("Yes") track_links["Last Check"].append(now.strftime("%d-%m-%Y (%H:%M)")) for k, v in track_links.items(): if k == "Anchor" and len(v) < (i + 1): v.append("-") if k == "Do/No-Follow?" and len(v) < (i + 1): v.append("-") if k == "Live?" and len(v) < (i + 1): v.append("No") if k == "Last Check" and len(v) < (i + 1): v.append(now.strftime("%d-%m-%Y (%H:%M)")) df_final = pd.DataFrame.from_dict(track_links, orient='columns', dtype=None) writer = pd.ExcelWriter(xl, engine="xlsxwriter") df_final.to_excel(writer, sheet_name=sys.argv[1]) try: writer.save() except PermissionError as e: print("\n",e) print("Please close the document before running this script!") sys.exit()
def start_Pandas_Auto(): # Text used while the script runs progress_bar = 'Progress: ' packing = 'Packing into file...' sheet_name = 'Данные рынка на ' prom_now = "Текущая стоимость акций " prom_day_max = 'Дневной максимум ' prom_day_min = 'Дневной минимум ' print(progress_bar + '1/5') # Progress counter full_page = requests.get(Promotions_Tesla, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_tesla = soup.findAll("span", {"class": "arial_26 inlineblock pid-13994-last"}) convert_min_tesla = soup.findAll("span", {"class": "inlineblock pid-13994-low"}) convert_max_tesla = soup.findAll("span", {"class": "inlineblock pid-13994-high"}) time.sleep(1) print(progress_bar + '2/5') full_page = requests.get(Promotions_Nissan, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_nissan = soup.findAll("span", {"class": "arial_26 inlineblock pid-44127-last"}) convert_min_nissan = soup.findAll("span", {"class": "inlineblock pid-44127-low"}) convert_max_nissan = soup.findAll("span", {"class": "inlineblock pid-44127-high"}) time.sleep(1) print(progress_bar + '3/5') full_page = requests.get(Promotions_GM, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_gm = soup.findAll("span", {"class": "arial_26 inlineblock pid-239-last"}) convert_min_gm = soup.findAll("span", {"class": "inlineblock pid-239-low"}) convert_max_gm = soup.findAll("span", {"class": "inlineblock pid-239-high"}) time.sleep(1) print(progress_bar + '4/5') full_page = requests.get(Promotions_Ford, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_ford = soup.findAll("span", {"class": "arial_26 inlineblock pid-255-last"}) convert_min_ford = soup.findAll("span", {"class": "inlineblock pid-255-low"}) convert_max_ford = soup.findAll("span", {"class": "inlineblock pid-255-high"}) time.sleep(1) print(progress_bar + '5/5') full_page = requests.get(Promotions_Daimler, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_daimler = soup.findAll("span", {"class": "arial_26 inlineblock pid-355-last"}) convert_min_daimler = soup.findAll("span", {"class": "inlineblock pid-355-low"}) convert_max_daimler = soup.findAll("span", {"class": "inlineblock pid-355-high"}) time.sleep(1) hms = datetime.datetime.today() # Date and time print(hms.hour, hms.minute, hms.second) time_flow = hms.hour, hms.minute, hms.second # Format as a time value print(packing) # Data template data = [ [prom_now + str("Tesla"), convert_tesla[0].text], # Current share price + latest value [prom_day_min, convert_min_tesla[0].text], # Daily low [prom_day_max, convert_max_tesla[0].text], # Daily high [" ", " "], # Spacer row [prom_now + str("Nissan"), convert_nissan[0].text], [prom_day_min, convert_min_nissan[0].text], [prom_day_max, convert_max_nissan[0].text], [" ", " "], [prom_now + str("General Motors"), convert_gm[0].text], [prom_day_min, convert_min_gm[0].text], [prom_day_max, convert_max_gm[0].text], [" ", " "], [prom_now + str("Ford"), convert_ford[0].text], [prom_day_min, convert_min_ford[0].text], [prom_day_max, convert_max_ford[0].text], [" ", " "], [prom_now + str("Daimler"), convert_daimler[0].text], [prom_day_min, convert_min_daimler[0].text], [prom_day_max, convert_max_daimler[0].text], ] today = date.today() # Current date direction = 'Carmakers_' # File name prefix time_now = datetime.datetime.time(datetime.datetime.now()) # Current time new_data = pd.DataFrame(data).rename_axis(None, axis=1) # Create the dataframe
file_name = str(direction) + str(today) + '-' + str(time_flow) # File name file_directory = file_name + '.xlsx' # Append the Excel extension new_data.style.hide_index() # Do not show the index # Further customization via xlsxwriter writer = pd.ExcelWriter(file_directory, engine='xlsxwriter') new_data.to_excel(writer, sheet_name=str(sheet_name) + str(today), index=False) workbook = writer.book worksheet = writer.sheets[str(sheet_name) + str(today)] # Formatting settings for the file format_list = workbook.add_format({'border': 0, 'num_format': 'hh:mm:ss', 'size': 14, 'align': 'center'}) # Number format, font size, centered alignment date_format = workbook.add_format({'num_format': 'mm.dd.yyyy'}) # Date format worksheet.write('A1', time_now, format_list) # Write the current time to cell A1 worksheet.write('B1', today, date_format) # Write the current date to cell B1 format = workbook.add_format({'align': 'left'}) worksheet.set_landscape() # Landscape page orientation # Column settings worksheet.set_column('A:A', 40, format) worksheet.set_column('B:B', 20, format) writer.save() # Save the resulting file print('\n''Файл с названием ' + str(direction) + str(today) + '-' + str(time_flow) + ' сохранен') # Message confirming the save
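# The five near-identical request/parse blocks above differ only in the URL and the
# site-specific pid embedded in the span classes. A hedged helper that factors the
# pattern out; it assumes the same investing.com-style markup and the module-level
# headers dict used above, and pid values such as 13994 come from the page itself:
import requests
from bs4 import BeautifulSoup

def fetch_quote(url, pid, headers):
    """Return the (last, low, high) text values scraped from one instrument page."""
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    last = soup.findAll("span", {"class": "arial_26 inlineblock pid-{0}-last".format(pid)})
    low = soup.findAll("span", {"class": "inlineblock pid-{0}-low".format(pid)})
    high = soup.findAll("span", {"class": "inlineblock pid-{0}-high".format(pid)})
    return last[0].text, low[0].text, high[0].text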
def main(): """ Main entry point of the app """ logger.info("CMPC Wide Area Distrution Main Loop") Change_Working_Path('../Data') Station_filename = 'Station Location a375a0647.xlsx' Transformer_filename = 'Power Transformer Asset a7c07a1cb.xlsx' Breaker_filename = 'Breaker Asset a475fe18.xlsx' Relay_filename = 'Dist Locations w Relays 110620.xls' Circuit_Switcher_filename = 'Circuit Switcher Asset a93a3aebd.xlsx' Metalclad_Switchgear_filename = 'Metalclad Switchgear Asset aa554c63f.xlsx' Transformer_Risk_filename = 'Oncor Transformer Asset Health Export - Risk Matrix - System.csv' Summer_Load_Filename = '2021 Load Projections(4-10)Summer - Clean.xlsx' Winter_Load_Filename = '2021 Load Projections(4-10)Winter - Clean.xlsx' Fault_Reporting_Proiritization_filename = 'Fault Reporting Prioritization_EDOC.XLSX' Fault_Reporting_Proiritization_filename1 = 'WDOC Fault Recording Relay Feeder List with Priority v1.1.xlsx' Associated_Breaker_Details_filename = 'Transformer Health - Analysis.xlsx' Excel_Files = [Station_filename, Transformer_filename, Breaker_filename, Relay_filename, Metalclad_Switchgear_filename, Summer_Load_Filename, Winter_Load_Filename, Fault_Reporting_Proiritization_filename, Fault_Reporting_Proiritization_filename1] pool = Pool(processes=15) Associated_Breaker_DetailsDF = Excel_to_Pandas(Associated_Breaker_Details_filename, check_update=False, SheetName='Associated Breaker Details') Associated_Breaker_DetailsDF = Associated_Breaker_DetailsDF[1] # Import Excel files df_list = pool.map(Excel_to_Pandas, Excel_Files) Transformer_RiskDF = Cleanup_Dataframe(pd.read_csv(Transformer_Risk_filename)) # Data Cleanup AIStationDF = station_df_cleanup(df_list[next(i for i, t in enumerate(df_list) if t[0] == Station_filename)][1], df_list[next( i for i, t in enumerate(df_list) if t[0] == Metalclad_Switchgear_filename)][1]) PowerTransformerDF = transformer_df_cleanup( df_list[next(i for i, t in enumerate(df_list) if t[0] == Transformer_filename)][1]) Outdoor_BreakerDF = breaker_df_cleanup( df_list[next(i for i, t in enumerate(df_list) if t[0] == Breaker_filename)][1]) RelayDataDF = relay_df_cleanup(df_list[next(i for i, t in enumerate(df_list) if t[0] == Relay_filename)][1]) Summer_LoadDF = summer_load_df_cleanup( df_list[next(i for i, t in enumerate(df_list) if t[0] == Summer_Load_Filename)][1]) Winter_LoadDF = summer_load_df_cleanup( df_list[next(i for i, t in enumerate(df_list) if t[0] == Winter_Load_Filename)][1]) Fault_Reporting_ProiritizationDF = FRP.Fault_Reporting_Proiritization_df_cleanup( df_list[next(i for i, t in enumerate(df_list) if t[0] == Fault_Reporting_Proiritization_filename)][1]) # Create new date in the dataframes Fault_Reporting_ProiritizationDF = FRP.Fault_Reporting_Proiritization_df_create_data( Fault_Reporting_ProiritizationDF) Summer_LoadDF = summer_load_df_create_data(Summer_LoadDF, AIStationDF) Winter_LoadDF = summer_load_df_create_data(Winter_LoadDF, AIStationDF) AIStationDF = station_df_create_data(AIStationDF, PowerTransformerDF, Outdoor_BreakerDF) PowerTransformerDF = transformer_df_create_data(PowerTransformerDF, Transformer_RiskDF, Summer_LoadDF, Winter_LoadDF, AIStationDF) Outdoor_BreakerDF = breaker_df_create_data(Outdoor_BreakerDF, PowerTransformerDF, Fault_Reporting_ProiritizationDF) Outdoor_BreakerDF = Add_Associated_XMR_Details(Outdoor_BreakerDF, Associated_Breaker_DetailsDF) RelayDataDF = relay_df_create_data(RelayDataDF) AIStationDF = add_Risk_to_Stationdf(AIStationDF, PowerTransformerDF) AIStationDF = add_MVA_Exceeded_Stationdf(AIStationDF, 
PowerTransformerDF) Outdoor_BreakerDF = add_Relay_Stationdf(AIStationDF, RelayDataDF, Outdoor_BreakerDF) # Select columns to keep AIStationDF = AIStationDF[ ['Region', 'Work_Center', 'Maximo_Code', 'Station_Name', 'STATION_STR_TYPE', 'Age', 'Single_Phase_Station', 'XFMER_Count', 'Max_Risk_Index_at_Station', 'Max_MVA_Exceeded', 'Mean_Feeder_Age' ]] PowerTransformerDF = PowerTransformerDF[['Region', 'Work_Center', 'Station_Name', 'Maximo_Code', 'Age', 'MAXIMUM_MVA', 'LV_NOM_KV', 'Risk_Index_(Normalized)', 'Max_Projected_Summer_Load', 'Max_Projected_Winter_Load', 'Max_MVA_Exceeded']] Outdoor_BreakerDF = Outdoor_BreakerDF[['Region', 'Work_Center', 'Station_Name', 'Maximo_Code', 'Age', 'BKR_SERVICE', 'SELF_CONTAINED', 'Manufacturer', 'BKR_MECH_MOD', 'BKR_INTERR', 'Associated_XFMR', 'DOC_Fault_Reporting_Prioritization', 'SUB_4_Protection']] # Create a Pandas Excel writer using XlsxWriter as the engine. writer = pd.ExcelWriter('../CMPC_WideArea_AIS.xlsx', engine='xlsxwriter') # Convert the dataframe to an XlsxWriter Excel object. AIStationDF.to_excel(writer, sheet_name='Stations', index=False) PowerTransformerDF.to_excel(writer, sheet_name='Transformers', index=False) Outdoor_BreakerDF.to_excel(writer, sheet_name='Outdoor Breakers', index=False) RelayDataDF.to_excel(writer, sheet_name='Relay', index=False) Summer_LoadDF.to_excel(writer, sheet_name='Summer Load', index=False) Winter_LoadDF.to_excel(writer, sheet_name='Winter Load', index=False) # Close the Pandas Excel writer and output the Excel file. writer.save()
def start_Pandas_IT(): progress_bar = 'Progress: ' packing = 'Packing into file...' sheet_name = 'Данные рынка на ' prom_now = "Текущая стоимость акций " prom_day_max = 'Дневной максимум ' prom_day_min = 'Дневной минимум ' check_AMD(convert_amd) print(progress_bar + '1/8') time.sleep(1) full_page = requests.get(Promotions_Intel, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_intel = soup.findAll("span", {"class": "arial_26 inlineblock pid-251-last"}) convert_min_intel = soup.findAll("span", {"class": "inlineblock pid-251-low"}) convert_max_intel = soup.findAll("span", {"class": "inlineblock pid-251-high"}) print(progress_bar + '2/8') time.sleep(1) full_page = requests.get(Promotions_Apple, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_apple = soup.findAll("span", {"class": "arial_26 inlineblock pid-6408-last"}) convert_min_apple = soup.findAll("span", {"class": "inlineblock pid-6408-low"}) convert_max_apple = soup.findAll("span", {"class": "inlineblock pid-6408-high"}) print(progress_bar + '3/8') time.sleep(1) full_page = requests.get(Promotions_IBM, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_ibm = soup.findAll("span", {"class": "arial_26 inlineblock pid-8082-last"}) convert_min_ibm = soup.findAll("span", {"class": "inlineblock pid-8082-low"}) convert_max_ibm = soup.findAll("span", {"class": "inlineblock pid-8082-high"}) print(progress_bar + '4/8') time.sleep(1) full_page = requests.get(Promotions_Microsoft, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_microsoft = soup.findAll("span", {"class": "arial_26 inlineblock pid-252-last"}) convert_min_microsoft = soup.findAll("span", {"class": "inlineblock pid-252-low"}) convert_max_microsoft = soup.findAll("span", {"class": "inlineblock pid-252-high"}) print(progress_bar + '5/8') time.sleep(1) full_page = requests.get(Promotions_Google, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_google = soup.findAll("span", {"class": "arial_26 inlineblock pid-6369-last"}) convert_min_google = soup.findAll("span", {"class": "inlineblock pid-6369-low"}) convert_max_google = soup.findAll("span", {"class": "inlineblock pid-6369-high"}) print(progress_bar + '6/8') time.sleep(1) full_page = requests.get(Promotions_Facebook, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_facebook = soup.findAll("span", {"class": "arial_26 inlineblock pid-26490-last"}) convert_min_facebook = soup.findAll("span", {"class": "inlineblock pid-26490-low"}) convert_max_facebook = soup.findAll("span", {"class": "inlineblock pid-26490-high"}) print(progress_bar + '7/8') time.sleep(1) full_page = requests.get(Promotions_Yandex, headers=headers) soup = BeautifulSoup(full_page.content, 'html.parser') convert_yandex = soup.findAll("span", {"class": "arial_26 inlineblock pid-13999-last"}) convert_min_yandex = soup.findAll("span", {"class": "inlineblock pid-13999-low"}) convert_max_yandex = soup.findAll("span", {"class": "inlineblock pid-13999-high"}) print(progress_bar + '8/8') time.sleep(.5) hms = datetime.datetime.today() print(hms.hour, hms.minute, hms.second) time_flow = hms.hour, hms.minute, hms.second print(packing) data = [ [prom_now + str("AMD"), convert_amd[0].text], [prom_day_min, convert_min_amd[0].text], [prom_day_max, convert_max_amd[0].text], [" ", " "], [prom_now + str("Intel"), convert_intel[0].text], [prom_day_min, convert_min_intel[0].text], [prom_day_max, 
convert_max_intel[0].text], [" ", " "], [prom_now + str("Apple"), convert_apple[0].text], [prom_day_min, convert_min_apple[0].text], [prom_day_max, convert_max_apple[0].text], [" ", " "], [prom_now + str("IBM"), convert_ibm[0].text], [prom_day_min, convert_min_ibm[0].text], [prom_day_max, convert_max_ibm[0].text], [" ", " "], [prom_now + str("Microsoft"), convert_microsoft[0].text], [prom_day_min, convert_min_microsoft[0].text], [prom_day_max, convert_max_microsoft[0].text], [" ", " "], [prom_now + str("Google"), convert_google[0].text], [prom_day_min, convert_min_google[0].text], [prom_day_max, convert_max_google[0].text], [" ", " "], [prom_now + str("Facebook"), convert_facebook[0].text], [prom_day_min, convert_min_facebook[0].text], [prom_day_max, convert_max_facebook[0].text], [" ", " "], [prom_now + str("Yandex"), convert_yandex[0].text], [prom_day_min, convert_min_yandex[0].text], [prom_day_max, convert_max_yandex[0].text], ] today = date.today() time_now = datetime.datetime.time(datetime.datetime.now()) direction = 'IT_' new_data = pd.DataFrame(data).rename_axis(None, axis=1) file_name = str(direction) + str(today) + '-' + str(time_flow) file_directory = file_name + '.xlsx' new_data.style.hide_index() writer = pd.ExcelWriter(file_directory, engine='xlsxwriter') new_data.to_excel(writer, sheet_name=str(sheet_name) + str(today), index=False) workbook = writer.book worksheet = writer.sheets[str(sheet_name) + str(today)] format_list = workbook.add_format({'border': 0, 'num_format': 'hh:mm:ss', 'size': 14, 'align': 'center'}) date_format = workbook.add_format({'num_format': 'mm.dd.yyyy'}) worksheet.write('A1', time_now, format_list) worksheet.write('B1', today, date_format) format = workbook.add_format({'align': 'left'}) worksheet.set_landscape() worksheet.set_column('A:A', 40, format) worksheet.set_column('B:B', 20, format) writer.save() print('\n''Файл с названием ' + str(direction) + str(today) + '-' + str(time_flow) + ' сохранен')
# P32 # Fisher stats # Save to Excel ######################### myC13, myfisherDF, myCDFAngles, myCDFRanges= calcPlotC13(data2, 'kmeans') dataBeni = groupByDensity(dataBeni) beniC13, bfisherDF, aaa, bbbb= calcPlotC13(dataBeni, 'Beni') data2 = calcP32(data2, myC13) dataBeni = calcP32(dataBeni, beniC13) data2 = data2.drop_duplicates() dataBeni = dataBeni.drop_duplicates() writer = pd.ExcelWriter(outfp + '\\' +'C13.xlsx') myC13.to_excel(writer, 'Kmeans_C13') beniC13.to_excel(writer, 'HardSectoring_C13') data2.to_excel(writer, 'HardSectoring_origdata') dataBeni.to_excel(writer, 'HardSectoring_groupByDensity') myfisherDF.to_excel(writer, 'Kmeans_Fisherstats') bfisherDF.to_excel(writer, 'HardSectoring_Fisherstats') myCDFAngles.to_excel(writer,'Kmeans_CDF_angles') myCDFRanges.to_excel(writer,'Kmeans_CDF_range') aaa.to_excel(writer,'HardSectoring_CDF_angles') bbbb.to_excel(writer,'HardSectoring_CDF_range') writer.save() grouped = data2.groupby('population') for idx, group in grouped: fname = outfp + '\\' + 'Cluster' + str(idx) + '.csv'
# bd_subnet_type = data["polUni"]["children"][x]["fvTenant"]["children"][y]["fvBD"]["children"][0]["fvSubnet"]["attributes"]["scope"] # bd.append(BD(bd_tenant,bd_vrf,bd_name,bd_des,bd_mac,bd_gateway_ip,bd_subnet_type)) for x in range(len(bd)): list_bd_name.append(bd[x].name) list_bd_vrf.append(bd[x].vrf) list_bd_des.append(bd[x].des) list_bd_tenant.append(bd[x].tn) list_bd_gateway_ip.append(bd[x].ip) list_bd_mac.append(bd[x].mac) list_bd_subnet_type.append(bd[x].subnet_type) now = datetime.now() # current date and time date_time = now.strftime("%d%m%Y-%H%M") # Create some Pandas dataframes from some data. sheet1 = pd.DataFrame({ 'Tenant': list_bd_tenant, 'VRF': list_bd_vrf, 'Description': list_bd_des, 'bd_mac': list_bd_mac, 'bd_gateway_ip': list_bd_gateway_ip, 'subnet_type': list_bd_subnet_type }) # Create a Pandas Excel writer using XlsxWriter as the engine. writer = pd.ExcelWriter(f"BD v1.5 {date_time}.xlsx", engine='xlsxwriter') # Write each dataframe to a different worksheet. sheet1.to_excel(writer, sheet_name='BD') # Close the Pandas Excel writer and output the Excel file. writer.save()
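# ExcelWriter.save() was deprecated in pandas 1.5 and removed in pandas 2.0; using
# the writer as a context manager (or calling close()) is the forward-compatible
# form. A minimal sketch of the same BD export, assuming the sheet1 dataframe and
# date_time string built above:
with pd.ExcelWriter(f"BD v1.5 {date_time}.xlsx", engine='xlsxwriter') as writer:
    sheet1.to_excel(writer, sheet_name='BD')  # the file is written when the block exits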
#READ THE INPUT FILE TO CREATE DATAFRAMES USING PANDAS dg = pd.read_csv('FULL PATH TO alpha.csv') #CREATE DATAFRAME TO CHANGE DATE TIME FORMATS FOR SUBMISSION DUE DATE, SUBMITTED AND GRADED DATE USING STRING FORMATTING dg['SUBMISSION DUE DATE'] = pd.to_datetime( dg['SUBMISSION DUE DATE']).dt.strftime('%d/%m/%Y') dg['SUBMISSION SUBMITTED AT'] = pd.to_datetime( dg['SUBMISSION SUBMITTED AT']).dt.strftime('%d/%m/%Y') dg['SUBMISSION GRADED DATE'] = pd.to_datetime( dg['SUBMISSION GRADED DATE']).dt.strftime('%d/%m/%Y') ##Prints a confirmation for checking print(dg) #CREATE WRITER OBJECT AND DEFINE OUTPUT FILE USING XLSXWRITER writer = pd.ExcelWriter('FULL PATH TO first assignment worksheet 123456.xlsx', engine='xlsxwriter') dg.to_excel(writer, sheet_name='123456') workbook = writer.book #worksheet = workbook.add_worksheet('Data') #worksheet = writer.sheets['123456'] #worksheet.write_formula() writer.save() print("123456 complete *****************************") #READ THE INPUT FILE df = pd.read_csv('FULL PATH TO beta.csv')
def construct_year_chart(node_names_list: List[str], write_to_excel: bool = False) -> Dict[str, pd.DataFrame]: ''' Return a dict mapping each node name to its 2016 hourly DataFrame; optionally write each node's data to an Excel file. ''' if os.getcwd().endswith('2016_realtime_hourly_dataset'): pass else: os.chdir(os.getcwd() + '/2016_realtime_hourly_dataset') curr_working_dir = os.getcwd() output_df = None infer_headers_flag = False headers = [] df_name = None desired_key = 'Location Name' node_pd_dict = {} # assume infer_headers = ['H', 'Date', 'Hour Ending', 'Location ID', 'Location Name', 'Location Type', 'Locational Marginal Price', 'Energy Component', 'Congestion Component', 'Marginal Loss Component'] files = os.listdir(curr_working_dir) number_files = len(files) - 1 for node_name in node_names_list: frames = [] if f'{node_name}_2016.xlsx' in os.listdir('../individual_nodes'): log.debug(f'{node_name} excel already exists, so skipping write to excel') node_pd_dict[node_name] = load_pd('../individual_nodes/' + f'{node_name}_2016.xlsx') continue for index, filename in enumerate(files): if (index + 1) % 20 == 0: log.info((f'On file {index+1} out of {number_files}')) if filename == 'node_pd_dict_pickle.p': continue # Skip the pickle file df = load_pd(filename) if not infer_headers_flag: headers = list(df) infer_headers_flag = True try: df_name = df.loc[df[desired_key] == node_name] frames.append(df_name) except ValueError: raise ValueError(f"{node_name} doesn't exist!") except KeyError: raise KeyError(f"Can't find {desired_key} in {headers}") concatenated_df = pd.concat(frames) log.debug(concatenated_df) if write_to_excel: if f'{node_name}_2016.xlsx' in os.listdir('../individual_nodes'): log.debug(f'{node_name} excel already exists, so skipping write to excel') continue final_file_name = f'/{node_name}_2016.xlsx' file_path = '../individual_nodes' + final_file_name writer = pd.ExcelWriter(file_path) concatenated_df.to_excel(writer) writer.save() log.debug(f'Wrote {final_file_name} to excel in {file_path}') node_pd_dict[node_name] = concatenated_df log.debug('Done!, returning pd_dict') return node_pd_dict
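# Example call for the function above; 'UN.Z.NODE_A' is a hypothetical Location Name
# that would have to exist in the 2016 hourly files for this to run:
if __name__ == '__main__':
    node_frames = construct_year_chart(['UN.Z.NODE_A'], write_to_excel=True)
    print(node_frames['UN.Z.NODE_A'].head())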