def test_parse_orbital(self): self.mam1.parse_orbital() self.assertEqual(self.mam1.orbital.shape[0], 28) self.assertTrue(np.all(pd.notnull(self.mam1.orbital))) self.mam2.parse_orbital() self.assertEqual(self.mam2.orbital.shape[0], 91) self.assertTrue(np.all(pd.notnull(self.mam2.orbital)))
def logistic_test_using_cosine(score_feature=False): logger.info('using cosine features in logistic regression') if score_feature: logger.info('also use score feature') Cs = [2**t for t in range(0, 10, 1)] Cs.extend([3**t for t in range(1, 10, 1)]) snli2cosine = SNLI2Cosine('/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin') logger.info('loading snli data ...') train_df = pd.read_csv('./snli/snli_1.0/snli_1.0_train.txt', delimiter='\t') train_df = train_df[pd.notnull(train_df.sentence2)] train_df = train_df[train_df.gold_label != '-'] train_df = train_df[:(len(train_df) / 3)] train_df.reset_index(inplace=True) test_df = pd.read_csv('./snli/snli_1.0/snli_1.0_test.txt', delimiter='\t') test_df = test_df[pd.notnull(test_df.sentence2)] test_df = test_df[test_df.gold_label != '-'] test_df.reset_index(inplace=True) X_train, train_labels, X_test, test_labels = snli2cosine.calculate_cosine_features(train_df, test_df) if score_feature: y_train_proba, y_test_proba = joblib.load('./snli/logistic_score_snli.pkl') # y_train_proba = y_train_proba.flatten() # y_test_proba = y_test_proba.flatten() X_train = np.concatenate([X_train, y_train_proba.reshape((-1, 1))], axis=1) X_test = np.concatenate([X_test, y_test_proba.reshape((-1, 1))], axis=1) logger.info('X_train.shape: {0}'.format(X_train.shape)) logger.info('X_test.shape: {0}'.format(X_test.shape)) logreg = LogisticRegressionCV(Cs=Cs, cv=3, n_jobs=10, random_state=919) logreg.fit(X_train, train_labels) logger.info('best C is {0}'.format(logreg.C_)) y_test_predicted = logreg.predict(X_test) acc = accuracy_score(test_labels, y_test_predicted) logger.info('test data predicted accuracy: {0}'.format(acc))
def read_tcr(filename, organism, chains, epitope_col): tcr_col = ['cdr1.alpha', 'cdr2.alpha', 'cdr2.5.alpha', 'cdr3.alpha', 'cdr1.beta', 'cdr2.beta', 'cdr2.5.beta', 'cdr3.beta', 'v.alpha', 'v.beta', epitope_col, 'species'] all_tcrs = pd.read_table(filename, usecols=tcr_col) all_tcrs = all_tcrs[all_tcrs['species'] == organism] for chain in chains: if chain == 'A': all_tcrs = all_tcrs[pd.notnull(all_tcrs['v.alpha'])] all_tcrs = all_tcrs[pd.notnull(all_tcrs['cdr3.alpha'])] all_tcrs = all_tcrs[all_tcrs['cdr3.alpha'].str.len() > 5] elif chain == 'B': all_tcrs = all_tcrs[pd.notnull(all_tcrs['v.beta'])] all_tcrs = all_tcrs[pd.notnull(all_tcrs['cdr3.beta'])] all_tcrs = all_tcrs[all_tcrs['cdr3.beta'].str.len() > 5] all_tcrs['v_alpha_rep'] = all_tcrs.loc[:, 'v.alpha'].map( ch_cdr3s_human.all_loopseq_representative[organism.lower()]) all_tcrs['v_beta_rep'] = all_tcrs.loc[:, 'v.beta'].map(ch_cdr3s_human.all_loopseq_representative[organism.lower()]) all_tcrs['tcr_info'] = list(zip(all_tcrs.v_alpha_rep.str.split(','), all_tcrs.v_beta_rep.str.split(','), all_tcrs['cdr3.alpha'], all_tcrs['cdr3.beta'])) all_tcrs = all_tcrs.drop_duplicates(subset=['v_alpha_rep', 'v_beta_rep', 'cdr3.alpha', 'cdr3.beta', epitope_col], keep='first') #remove duplicates all_tcrs = all_tcrs.drop_duplicates(subset=['v_alpha_rep', 'v_beta_rep', 'cdr3.alpha', 'cdr3.beta'], keep=False) #remove crossreactivity all_tcrs = all_tcrs.reset_index(drop=True) return all_tcrs
def test_parse_momatrix(self): self.mam1.parse_momatrix() self.assertEqual(self.mam1.momatrix.shape[0], 784) self.assertTrue(np.all(pd.notnull(self.mam1.momatrix))) self.mam2.parse_momatrix() self.assertEqual(self.mam2.momatrix.shape[0], 8281) self.assertTrue(np.all(pd.notnull(self.mam2.momatrix)))
def test_update_info_intercept_norm_value(self): calc = self.calculated_res['intercept_norm_value'] calc = calc[pd.notnull(calc)] exp = self.expected_res['intercept_norm_value'] exp = exp[pd.notnull(exp)] assert_equal(calc.to_dict(), exp.to_dict())
def get_prediction_summary(data_pred_df, pred_cols=None, do_print=True, transpose=True, percentiles=None): data_pred_df = load_if_str(data_pred_df) if pred_cols is None: pred_cols = data_pred_df.columns if percentiles is None: percentiles = [] pred_summary = data_pred_df.describe(percentiles=percentiles) data_all_out = load_data('data_all_out') data_all_out = data_all_out[pd.notnull(data_all_out[TARGET_COL])] data_pred_df_actual = pd.merge(left=data_all_out, right=data_pred_df, left_index=True, right_index=True) if len(data_pred_df_actual) > 0: score_ix = len(pred_summary) for pred_col in pred_cols: try: pred_sel = pd.notnull(data_pred_df_actual[pred_col]) score = auc(actual=data_pred_df_actual.ix[pred_sel, TARGET_COL], pred=data_pred_df_actual.ix[pred_sel, pred_col].round(decimals=ROUND_PRED)) except ValueError: score = np.nan pred_summary.loc[score_ix, pred_col] = score pred_summary.index = list(pred_summary.index[:-1]) + ['auc'] if transpose: pred_summary = pred_summary.transpose() if do_print: get_log().info('\nPrediction summary:\n%s' % pred_summary.to_string()) else: return pred_summary
def put_rainfall_to_dataframe(df_10min, lrf_var_all, infd_lst): for dfidx in df_10min.index: # Put first site in site list for slidx in xrange(len(infd_lst)): if pd.Timestamp(dfidx) in lrf_var_all[slidx]: df_10min[infd_lst[slidx]][dfidx] = \ lrf_var_all[slidx][pd.Timestamp(dfidx)] # Fill AS1 if there is missing data from AS2. # With more sites, AD, ALG, the missing data number are similar. if pd.isnull(df_10min[infd_lst[0]][dfidx]): if pd.notnull(df_10min[infd_lst[1]][dfidx]): df_10min[infd_lst[0]][dfidx] = float(df_10min[infd_lst[1]][dfidx]) elif pd.notnull(df_10min[infd_lst[2]][dfidx]): df_10min[infd_lst[0]][dfidx] = float(df_10min[infd_lst[2]][dfidx]) elif pd.notnull(df_10min[infd_lst[3]][dfidx]): df_10min[infd_lst[0]][dfidx] = float(df_10min[infd_lst[3]][dfidx]) elif pd.notnull(df_10min[infd_lst[4]][dfidx]): df_10min[infd_lst[0]][dfidx] = float(df_10min[infd_lst[4]][dfidx]) else: df_10min[infd_lst[0]][dfidx] = df_10min[infd_lst[0]][dfidx] return df_10min
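# A minimal alternative sketch (not from the source above): the nested if/elif chain that
# back-fills the first site from the other sites can be expressed with a row-wise bfill.
# The column names below are hypothetical stand-ins for the entries of infd_lst.
import numpy as np
import pandas as pd

rain = pd.DataFrame({
    "AS1": [1.2, np.nan, np.nan],
    "AS2": [1.0, 2.5, np.nan],
    "AD": [0.9, 2.4, 3.1],
})

# For rows where AS1 is missing, take the first non-null value among the fallback columns.
fallback = rain[["AS2", "AD"]].bfill(axis=1).iloc[:, 0]
rain["AS1"] = rain["AS1"].fillna(fallback)
print(rain)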
def delete_empty_rows(df, column_list):
    """
    The data might contain some rows with empty (NaN) values.
    This function deletes those rows.
    Input:
        df: pandas DataFrame
        column_list: string or list of strings, each item is a column name in the DataFrame
    Output:
        df: pandas DataFrame
    """
    empty_rows = []
    len_1 = len(df)
    if isinstance(column_list, str):
        df = df[pd.notnull(df[column_list])]
    elif isinstance(column_list, list):
        for column in column_list:
            # record the rows about to be dropped *before* filtering,
            # otherwise the collected frames are always empty
            empty_rows.append(df[pd.isnull(df[column])])
            df = df[pd.notnull(df[column])]
    else:
        raise ValueError("Unsupported input!")
    if len_1 - len(df) > 0:
        note = "{0} rows deleted because they contain empty values in column {1}."
        print(note.format(len_1 - len(df), str(column_list)))
        print(empty_rows)
    return df
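# For reference, a small sketch of the same row-dropping behaviour using the built-in
# dropna(subset=...); this is an assumed equivalent usage, not part of the source above.
import pandas as pd

toy = pd.DataFrame({"a": [1, None, 3], "b": ["x", "y", None]})
cleaned = toy.dropna(subset=["a", "b"])
print(cleaned)                          # only the first row has no missing values
print(len(toy) - len(cleaned), "rows deleted")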
def test_basic(self): # array or list or dates N = 50 rng = date_range('1/1/1990', periods=N, freq='53s') ts = Series(np.random.randn(N), index=rng) ts[15:30] = np.nan dates = date_range('1/1/1990', periods=N * 3, freq='25s') result = ts.asof(dates) assert notnull(result).all() lb = ts.index[14] ub = ts.index[30] result = ts.asof(list(dates)) assert notnull(result).all() lb = ts.index[14] ub = ts.index[30] mask = (result.index >= lb) & (result.index < ub) rs = result[mask] assert (rs == ts[lb]).all() val = result[result.index[result.index >= ub][0]] assert ts[ub] == val
def snap_largest_volume(self, product, sdate, nn=10, midcurves=True): """Snap options grid and get the 10 most traded by volume for each month. """ self.reset(product, sdate) if self.livedata: ffields = ['BID', 'ASK', 'VOLUME'] _columns = ['BID', 'BID_VOL', 'ASK', 'ASK_VOL', 'VOLUME'] else: ffields = ['PX_SETTLE', 'VOLUME'] _columns = ['SETTLE', 'SETTLE_VOL', 'VOLUME'] fdata = pd.DataFrame.from_records(self.futures_chain(), columns=['mon', 'last_trade'], index='mon') fdata.insert(0, 'ticker', [self.product+mm+' '+self.suffix for mm in fdata.index]) fdata = pd.merge(fdata, self.pb.fetch(fdata.ticker, ffields, self.date), left_on='ticker', \ right_index=True, how='outer') if not 'VOLUME' in fdata: raise Exception("No futures volume on " + sdate.strftime("%Y-%m-%d")) optr = self.options_chain() # discard months with no volume optr = {k:v for k,v in optr.iteritems() if pd.notnull(fdata.VOLUME[v.undl])} if self.livedata: self.calc_atm_vols(optr, 0.5*(fdata.BID+fdata.ASK)) else: self.calc_atm_vols(optr, fdata.PX_SETTLE) self.get_options_by_volume(optr, nn) if pd.notnull(self.xs.midcurves) and midcurves: mcoptr = self.midcurves_chain() mcre = re.compile('^(?P<mm>\d' + self.xs.midcurves[1] + \ '[FGHJKMNQUVXZ]\d{1})(?P<type>[PC])\s(?P<k>\d+(\.\d+)?) ' + self.suffix + '$') if self.livedata: mcoptr = {k:v for k,v in mcoptr.iteritems() if pd.notnull(fdata.BID[v.undl]) and pd.notnull(fdata.ASK[v.undl])} self.calc_atm_vols(mcoptr, 0.5*(fdata.BID+fdata.ASK), '', mcre) else: mcoptr = {k:v for k,v in mcoptr.iteritems() if pd.notnull(fdata.VOLUME[v.undl])} self.calc_atm_vols(mcoptr, fdata.PX_SETTLE, '', mcre) self.get_options_by_volume(mcoptr, nn, 0.05, '', mcre) else: mcoptr = [] idx = pd.MultiIndex.from_tuples( \ [(mm,kk[0]) for mm in sorted(optr) for kk in optr[mm].data] \ + [(mc,kk[0]) for mc in sorted(mcoptr) for kk in mcoptr[mc].data], \ names=['month', 'strike']) odata = pd.DataFrame( \ [list(x[2:]) for mm in sorted(optr) for x in optr[mm].data] + [list(x[2:]) for mc in sorted(mcoptr) for x in mcoptr[mc].data], \ columns = _columns, \ index = idx) return fdata, odata
def cleanupforanalysis(self, df_orig, col_sample_an, col_background_an, col_background_int): ''' remove NaNs, remove duplicates, split protein groups, remove splice variant appendix create 2 DataFrames self.df_all: columns = [sample_ans, background_ans] --> contains all AccessionNumbers regardless if intensity values present or not self.df_int: columns = [sample_ans, background_ans, intensity] --> only if intensity value given :return: None ''' self.sample_ser = df_orig[col_sample_an] self.background_df = df_orig[[col_background_an, col_background_int]] # remove duplicate AccessionNumbers and NaNs from samplefrequency and backgroundfrequency AN-cols cond = pd.notnull(self.sample_ser) self.sample_ser = self.sample_ser.loc[cond, ].drop_duplicates() cond = pd.notnull(self.background_df[col_background_an]) self.background_df = self.background_df.loc[cond, [col_background_an, col_background_int]].drop_duplicates(subset=col_background_an) # split AccessionNumber column into mulitple rows P63261;I3L4N8;I3L1U9;I3L3I0 --> 4 rows of values # remove splice variant appendix from AccessionNumbers (if present) P04406-2 --> P04406 self.sample_ser = self.removeSpliceVariants_takeFirstEntryProteinGroups_Series(self.sample_ser) self.background_df = self.removeSpliceVariants_takeFirstEntryProteinGrous_DataFrame(self.background_df, col_background_an, col_background_int) # remove duplicate AccessionNumbers and NaNs from samplefrequency and backgroundfrequency AN-cols cond = pd.notnull(self.sample_ser) self.sample_ser = self.sample_ser.loc[cond, ].drop_duplicates() cond = pd.notnull(self.background_df[col_background_an]) self.background_df = self.background_df.loc[cond, [col_background_an, col_background_int]].drop_duplicates(subset=col_background_an) # concatenate data self.df_all = self.concat_and_align_sample_and_background(self.sample_ser, self.background_df) # remove AccessionNumbers from sample and background-frequency without intensity values self.df_int = self.df_all.loc[pd.notnull(self.df_all[col_background_int]), ]
def line(self, x, y, label='', alpha=1.0, add_legend=True, color_from=None, color=None, dashed=False): """ Add a line to the chart object. Input: x: pandas.Series or list, containing datetime.date objects or strings of the form: 'YYYY-mm_dd'. y: pandas.Series or list, containing numerical values. label: string, to be used in the legend and tooltip. alpha: float, opacity: [0.0, 1.0]. add_legend: boolean, either adds or removes this line from the legend. color_from: string, using the label from another line you can copy its color onto this line. """ # pandas.Series to list if isinstance(x, s.Series): x = x.where((notnull(x)), None) x = x.tolist() # datetime.date to str if isinstance(x[0], date): x = [str(dt) for dt in x] # pandas.Series to list if isinstance(y, s.Series): y = y.where((notnull(y)), None) y = y.tolist() if not label: add_legend = False kwargs = {'alpha':alpha, 'add_legend':add_legend, 'color':color, 'dashed':dashed} if color_from: kwargs['color_from'] = color_from curr_line = """ch.line({x}, {y}, '{label}', {kwargs});""".format(x=json.dumps(x), y=json.dumps(y), label=label, kwargs=json.dumps(kwargs)) self.lines.append(curr_line) return self.render_js()
def process_missing_data(self, missing='drop'): """ Process rows in item array that contain missing values. Args: missing (string): Method for dealing with missing values. Options: 'drop': Drop any subjects with at least one missing items 'impute': Impute the mean for that item across all subjects """ if missing == 'drop': inds = pd.notnull(self.X).all(1).nonzero()[0] if self.y is not None: inds = np.intersect1d(inds, pd.notnull(self.y).all(1).nonzero()[0]) n_missing = len(self.X) - len(inds) if n_missing: # Slice and reindex X and y self.X = self.X.ix[inds] if self.y is not None: self.y = self.y.ix[inds] logger.info('Found and deleted %d subjects with missing data.' % n_missing) # Imputation. Note that we don't impute the y values, because these should really be # inspected and validated by the user before abbreviating. elif missing == 'impute': self.X = self.X.apply(lambda x: x.fillna(x.mean()), axis=0) # self.y = self.y.apply(lambda x: x.fillna(x.mean()), axis=0) self.n_subjects = len(self.X)
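# A toy sketch of the two strategies described in the docstring above ('drop' vs 'impute'),
# using a hypothetical item matrix X; it mirrors the pd.notnull/fillna calls in the method.
import numpy as np
import pandas as pd

X = pd.DataFrame({"item1": [1.0, np.nan, 3.0], "item2": [4.0, 5.0, np.nan]})

complete_rows = pd.notnull(X).all(axis=1)                # 'drop': keep fully observed rows
X_drop = X[complete_rows]

X_impute = X.apply(lambda col: col.fillna(col.mean()))   # 'impute': per-item means
print(X_drop, X_impute, sep="\n")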
def load_markers(): df_cities = pd.read_csv("static/data/cities.csv", encoding="cp1255") df_acc = pd.concat(pd.read_csv(filename, encoding="cp1255") for filename in glob("static/data/lms/Accidents Type */*/*AccData.csv")) df_acc = df_acc[df_acc.SEMEL_YISHUV > 0] groups = df_acc.groupby(["SEMEL_YISHUV", "HUMRAT_TEUNA"], as_index=False) df_size = groups.size() df_size_total = df_acc.groupby("SEMEL_YISHUV", as_index=False).size() max_size = df_size_total.max() df = groups.mean() df = pd.merge(df, df_cities, left_on="SEMEL_YISHUV", right_on="SEMEL") df = df[pd.notnull(df.X) & pd.notnull(df.Y) & (df_size_total > 1)] app.markers = [] for index, row in df.iterrows(): lng, lat = coordinates_converter.convert(row.X, row.Y) size = 30 * np.log(1.25 + df_size_total[row.SEMEL_YISHUV] / float(max_size)) size_per_severity = df_size[row.SEMEL_YISHUV] color = max(0, 200 - 200 * (size_per_severity.get(1, 0) + size_per_severity.get(2, 0)) / size_per_severity.get(3, 1)) print size app.markers.append({ "lat": lat, "lng": lng, "title": row.NAME, "size": size, "color": color })
def get_Gallup_country_lookups(verbose=True): """ Kosovo is the only GWP country not matched to a 3-letter ISO code. Let's ignore it. """ dfr = pd.read_table(__local_input_path__+'GallupWorldPoll-region-country.tsv').rename(columns={'country':'rcountry'}) dfr['lccountry'] = dfr.rcountry.str.lower() dfr = dfr.set_index('lccountry') dfw = pd.read_table(__local_input_path__+'GallupWorldPoll-WP5-defs-2016.tsv').rename(columns={'country':'wcountry'}) dfw['lccountry'] = dfw.wcountry.str.lower() dfw = dfw.set_index('lccountry') wp5s = pd.read_table(__local_input_path__ +'countrycode_main.tsv', skiprows=3).set_index('country_GWP3_wp5') wp5s = wp5s[['countryCode_GWP3_wp5', 'countryCode_ISO3','country_bestShortName','country_bestName','twoletter_AlexShultz_svg']] df= wp5s.join(dfr).join(dfw).rename(columns = {'countryCode_ISO3':'ISO',}) df.index.name = 'country' assert 'South Africa'.lower() in dfr.rcountry assert 'South Africa'.lower() in df.index # Now several checks: # Did regions get their ISO? problems = { ' Published WHR country lacks an ISO: ': df[pd.notnull(df.rcountry) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','rcountry']], ' Published WHR country lacks a WP5: ': df[pd.notnull(df.rcountry) & pd.isnull(df.WP5)], ' Published WHR country lacks a map code: ': df[pd.notnull(df.rcountry) & pd.isnull(df.twoletter_AlexShultz_svg)], ' Old Gallup micro country lacks an ISO in my master lookup: ': df[pd.notnull(df.countryCode_GWP3_wp5) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','wcountry']], ' 2016 Gallup micro country lacks an ISO in my master lookup: ': df[pd.notnull(df.WP5) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','wcountry']], } if verbose: for tt,dd in problems.items(): if not dd.empty: print('\n\n -- country_tools WARNING: '+tt) print dd return df.reset_index()
def process_df(df, file, ax): duration_val = [] for index, row in enumerate(df['days_to_death']): if row == '[Not Applicable]': duration_val.append(df['days_to_last_followup'][index]) else: duration_val.append(row) df['duration'] = duration_val df['duration'] = df['duration'].convert_objects(convert_numeric = True).dropna() vital_status = [] for row in df['vital_status']: if row not in ['Alive', 'Dead']: vital_status.append(None) else: vital_status.append(row) df['vital_status'] = vital_status df['SARS'] = df['SARS'].dropna() df = df[pd.notnull(df['duration'])] df = df[pd.notnull(df['SARS'])] df = df[pd.notnull(df['vital_status'])] lst = df['SARS'].tolist() q1 = np.percentile(lst, 33.33) q2 = np.percentile(lst,66.66) df1 = df[df['SARS']<=q1] df2 = df[(df['SARS']>q1) & (df['SARS'] <= q2)] df3 = df[df['SARS']>q2] plot_km(df, ax, '', file, "q1") ax.get_figure().savefig(result_dir+file+'_kmplot(samples='+str(len(df.index))+').png')
def prepare_data(subdata): subdata.ix[notnull(subdata['share']),'cost']=subdata.ix[notnull(subdata['share']),'Pop'] subdata.ix[notnull(subdata['share']),'costlog']=np.log(subdata.ix[notnull(subdata['share']),'cost']) ### predicts missing water level data points formula=varin1+" ~ return_period + wetland_loss + climate_change + runoff + bndconditions" olsmodel=sm.ols(formula,data=subdata).fit() predictions=olsmodel.predict(subdata) subdata.loc[subdata[varin1].isnull().values,varin1]=predictions[subdata[varin1].isnull().values] formula=varin2+" ~ return_period + wetland_loss + climate_change + runoff + bndconditions" olsmodel2=sm.ols(formula,data=subdata).fit() res2=olsmodel2.params predictions2=olsmodel2.predict(subdata) subdata.loc[subdata[varin2].isnull().values,varin2]=predictions2[subdata[varin2].isnull().values] ### predicts damages based on a few points using water level subdata['log{}'.format(varin1)]=np.log(subdata[varin1]) subdata['log{}'.format(varin2)]=np.log(subdata[varin2]) formula="costlog ~ log{}".format(varin1) damagemodel=sm.ols(formula,data=subdata).fit() predicted_damages=damagemodel.predict(subdata) subdata.loc[subdata['costlog'].isnull().values,'costlog']=predicted_damages[subdata['costlog'].isnull().values] subdata['popestimated']=np.exp(subdata['costlog']) return subdata
def combine_basic_detailed(basic, detailed): """ Combine the basic and detailed player information from BBR Input: basic - dict of basic player info (identified by bbr_id) detailed - dict of detailed player info (also identified by bbr_id) Output: A dict that has parsed certain keys and removed redundant keys """ combined = pd.merge(pd.DataFrame(basic), pd.DataFrame(detailed), how='outer', on='bbr_id') # Parse birth date and location tmp_born = pd.DataFrame([str(x).split(' in ') for x in combined['born_dets']]) # TO DO: Make this less hard-coded combined['birth_date'], combined['birth_loc'] = tmp_born.ix[:,0].str.strip(), tmp_born.ix[:,1].str.strip() # Convert height to inches combined['height_in'] = combined['height_dets'].str.split('-').apply(lambda x: int(x[0])*12 + int(x[1])) # Parse draft details tmp_draft = combined.draft_dets[pd.notnull(combined.draft_dets)].apply(parse_nba_draft_dets).apply(pd.Series) tmp_draft.columns = ['draft_team','draft_round','draft_pick_round','draft_pick_overall', 'draft_year'] combined = combined.join(tmp_draft) # Parse Hall of Fame details tmp_hof = combined.hall_of_fame_dets[pd.notnull(combined.hall_of_fame_dets)].apply(parse_nba_hof_dets).apply(pd.Series) tmp_hof.rename(columns={'Coach':'hof_coach','Contributor':'hof_contributor','Player':'hof_player'}, inplace=True) combined = combined.join(tmp_hof) # Return parsed/non-redundant columns combined.rename(columns={'pos':'position','wt':'weight_lbs','high_school_dets':'high_school','nba_debut_dets':'nba_debut','shoots_dets':'shoots'}, inplace=True) combined = combined[BBR_NBA_PLAYER_COLS] # Scrub missing values to None combined = combined.where(pd.notnull(combined), None) return [list(row) for idx, row in combined.iterrows()]
def ImportCleanData(self): """ Import and clean the data by removing ratings observations for restaurants that aren't yet rated or have a z or p score. """ # Import the restaurant grades data. try: self.grades = pd.read_csv('grades.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode') except: raise InvalidInputError("Problem in reading in the restaurant data.") # Just Select the variables we need self.grades = self.grades[['GRADE','CAMIS','INSPECTION DATE', 'BORO']] # Convert date to date time variable. self.grades = self.grades.loc[~self.grades['INSPECTION DATE'].isin(['01/01/1900'])] self.grades['INSPECTION DATE'] = pd.to_datetime(self.grades['INSPECTION DATE']) # Drop rows that have a missing values. self.grades = self.grades[pd.notnull(self.grades['GRADE'])] self.grades = self.grades[pd.notnull(self.grades['BORO'])] self.grades = self.grades[pd.notnull(self.grades['INSPECTION DATE'])] # Drop row where the grade has not been given yet. self.grades = self.grades.loc[~self.grades['GRADE'].isin(['Not Yet Graded', 'P', 'Z'])] # Drop row where the borough info is missing has not been given yet. self.grades = self.grades.loc[~self.grades['BORO'].isin(['Missing'])] # Drop duplicated (same restaurant and same date) inspection records. self.grades.drop_duplicates(['CAMIS','INSPECTION DATE','GRADE'], take_last=True, inplace=True) # Sort the data self.grades = self.grades.sort(['BORO','CAMIS','INSPECTION DATE'], ascending=[1,1,1]) return self.grades
def createUniqueId(longData):
    import numpy as np
    # both ids must be present; note the check is on longData['course_id'],
    # not on the literal list ['course_id'] as in the original
    if pd.notnull(longData['user_id']) and pd.notnull(longData['course_id']):
        uid = str(longData['user_id']) + '__' + str(longData['course_id'])
    else:
        uid = 'NULL'
    return uid
def get_taps_mazs(self, maz, attribute=None, filter=None): # we return multiple tap rows for each maz, so we add an 'idx' row to tell caller # which maz-taz rows belong to which row in the original maz list # i.e. idx contains the index of the original maz series so we know which # rows belong together # if maz is a series, then idx has the original maz series index values # otherwise it has the 0-based integer offset of the original maz if filter: maz2tap_df = self.maz2tap_df[pd.notnull(self.maz2tap_df[filter])] else: maz2tap_df = self.maz2tap_df if attribute: # FIXME - not sure anyone needs this feature maz2tap_df = maz2tap_df[['MAZ', 'TAP', attribute]] # filter out null attribute rows maz2tap_df = maz2tap_df[pd.notnull(self.maz2tap_df[attribute])] else: maz2tap_df = maz2tap_df[['MAZ', 'TAP']] if isinstance(maz, pd.Series): # idx based on index of original maz series maz_df = pd.DataFrame({'MAZ': maz, 'idx': maz.index}) else: # 0-based index of original maz maz_df = pd.DataFrame({'MAZ': maz, 'idx': range(len(maz))}) df = pd.merge(maz_df, maz2tap_df, how="inner", sort=False) return df
def phonetic(s, method): """ Phonetically encode the values in the Series. :param method: The algorithm that is used to phonetically encode the values. The possible options are 'soundex' (`wikipedia <https://en.wikipedia.org/wiki/Soundex>`_) and 'nysiis' (`wikipedia <https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System>`_). :type method: str :return: A Series with phonetic encoded values. :rtype: pandas.Series .. note:: The 'soundex' and 'nysiis' algorithms use the package 'jellyfish'. It can be installed with pip (``pip install jellyfish``). """ try: import jellyfish except ImportError: print ("Install jellyfish to use string encoding.") s = clean(s, replace_by_none='[^\-\_A-Za-z0-9]+') if method == 'soundex': return s.str.upper().apply(lambda x: jellyfish.soundex(x) if pandas.notnull(x) else np.nan) elif method == 'nysiis': return s.str.upper().apply(lambda x: jellyfish.nysiis(x) if pandas.notnull(x) else np.nan) else: raise Exception("Phonetic encoding method not found")
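# A minimal usage sketch of the same jellyfish-based encoding on a raw Series, assuming
# jellyfish is installed; missing values pass through as NaN, as in the function above.
import numpy as np
import pandas as pd
import jellyfish

names = pd.Series(["Jonathan", "Jonothan", None])
soundex_codes = names.str.upper().apply(
    lambda x: jellyfish.soundex(x) if pd.notnull(x) else np.nan)
print(soundex_codes)   # both spellings collapse to the same Soundex code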
def GetJoinedDf(self): run_id = 12332 #dstore.Get("current_run") # DB df #[u'state', u'id', u'run_id', u'user_id', u'cell', u'time_raw', u'time1', u'lap1', u'time2', u'lap2', u'time3', u'lap3', u'un1', u'un2', u'un3', u'us1'] self.joinedDf = psql.read_sql(\ "SELECT * FROM times" +\ " WHERE (times.run_id = "+ str(run_id ) +")"\ , self.db) #set index = id self.joinedDf.set_index('id', drop=False, inplace = True) #replace nan with None self.joinedDf = self.joinedDf.where(pd.notnull(self.joinedDf), None) if(self.dstore.GetItem("racesettings-app", ['rfid']) == 2): tDf = psql.read_sql("SELECT * FROM tags", self.db, index_col = "id") tDf = tDf[["user_nr", "tag_id"]] self.joinedDf = pd.merge(self.joinedDf, tDf, left_on='user_id', right_on='tag_id', how="left") self.joinedDf = pd.merge(self.joinedDf, self.ucDf, left_on='user_nr', right_on='nr', how="left") self.joinedDf.set_index('id', drop=False, inplace = True) else: self.joinedDf = pd.merge(self.joinedDf, self.ucDf, left_on='user_id', right_index=True, how="left") self.joinedDf.sort("time_raw", inplace=True) #replace nan with None self.joinedDf = self.joinedDf.where(pd.notnull(self.joinedDf), None) return self.joinedDf
def _get_project_data(project_df, project_attribute_keys_data): """ Yields a project record and project attribute records grouped by project """ for i, row in project_df.iterrows(): project_data = { "project_id": row.project_id, "zipcode": str(row.zipcode) if pd.notnull(row.zipcode) else None, "weather_station": str(row.weather_station) if pd.notnull(row.weather_station) else None, "latitude": row.latitude if pd.notnull(row.latitude) else None, "longitude": row.longitude if pd.notnull(row.longitude) else None, "baseline_period_start": None, "reporting_period_end": None, } assert pd.notnull(project_data["project_id"]) baseline_period_end_localized = pytz.UTC.localize(row.baseline_period_end) if pd.isnull(baseline_period_end_localized): project_data["baseline_period_end"] = None else: project_data["baseline_period_end"] = baseline_period_end_localized.strftime("%Y-%m-%dT%H:%M:%S%z") reporting_period_start_localized = pytz.UTC.localize(row.reporting_period_start) if pd.isnull(reporting_period_start_localized): project_data["reporting_period_start"] = None else: project_data["reporting_period_start"] = reporting_period_start_localized.strftime("%Y-%m-%dT%H:%M:%S%z") project_attributes_data = [] for project_attribute_key_data in project_attribute_keys_data: project_attribute_data = _get_project_attribute_data(row, project_attribute_key_data) project_attributes_data.append(project_attribute_data) yield project_data, project_attributes_data
def load_data(input_file): unii = pd.read_csv(input_file, sep='\t', low_memory=False, dtype=str) unii.rename(columns={'MF': 'molecular_formula', 'PT': 'preferred_term', 'RN': 'registry_number'}, inplace=True) unii.columns = unii.columns.str.lower() # half of them don't have inchikeys # set the primary key to inchikey and fill in missing ones with unii unii['_id'] = unii.inchikey unii['_id'].fillna(unii.unii, inplace=True) dupes = set(unii._id) - set(unii._id.drop_duplicates(False)) records = [{k:v for k,v in record.items() if pd.notnull(v)} for record in unii.to_dict("records") if record['_id'] not in dupes] records = [{'_id': record['_id'], 'unii': record} for record in records] # take care of a couple cases with identical inchikeys for dupe in dupes: dr = unii.query("_id == @dupe").to_dict("records") dr = [{k:v for k,v in record.items() if pd.notnull(v)} for record in dr] records.append({'_id': dupe, 'unii': dr}) for record in records: if isinstance(record['unii'], dict): del record['unii']['_id'] else: for subr in record['unii']: del subr['_id'] yield record
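# The per-record NaN stripping used above, shown on a toy frame; this is a generic
# illustration of the pattern, not tied to the UNII input file.
import pandas as pd

toy = pd.DataFrame({"_id": ["a", "b"], "inchikey": ["X", None], "unii": ["U1", "U2"]})
records = [{k: v for k, v in rec.items() if pd.notnull(v)}
           for rec in toy.to_dict("records")]
print(records)   # the NaN inchikey is dropped from the second record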
def test_parse_frame(self): self.mam1.parse_frame() self.assertEqual(self.mam1.frame.shape[0], 1) self.assertTrue(np.all(pd.notnull(self.mam1.frame))) self.mam2.parse_frame() self.assertEqual(self.mam2.frame.shape[0], 1) self.assertTrue(np.all(pd.notnull(self.mam2.frame)))
def parse_concepts_from_sheet(graph, vocabulary_name, sheet_data): """Parse vocabulary concepts from spreadsheet and into a graph.""" base_uri = uri_prefix + vocabulary_name for index, row in sheet_data.iterrows(): concept = URIRef(base_uri + uri_common_part + str(index)) graph.add((concept, RDF.type, SKOS.Concept)) graph.add((concept, SKOS.inScheme, URIRef(base_uri))) graph.add((concept, SKOS.topConceptOf, URIRef(base_uri))) graph.add((URIRef(base_uri), SKOS.hasTopConcept, concept)) graph.add((concept, SKOS.prefLabel, Literal(row['Suomeksi'].rstrip(), lang='fi'))) graph.add((concept, SKOS.prefLabel, Literal(row['Englanniksi'].rstrip(), lang='en'))) graph.add((concept, SKOS.prefLabel, Literal(row['Ruotsiksi'].rstrip(), lang='sv'))) if pandas.notnull(row[u'Synonyymi (YSO)']): graph.add((concept, SKOS.exactMatch, URIRef(str(row['Synonyymi (YSO)'])))) if pandas.notnull(row[u'Läheinen käsite']): graph.add((concept, SKOS.closeMatch, URIRef(str(row[u'Läheinen käsite'])))) return
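# A self-contained sketch of the rdflib calls used above on a single hypothetical row;
# the concept URI and row values are made up for illustration.
import pandas as pd
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import RDF, SKOS

graph = Graph()
row = pd.Series({"Suomeksi": "kissa ", "Englanniksi": "cat",
                 "Synonyymi (YSO)": None})
concept = URIRef("http://example.org/vocab/concept_0")
graph.add((concept, RDF.type, SKOS.Concept))
graph.add((concept, SKOS.prefLabel, Literal(row["Suomeksi"].rstrip(), lang="fi")))
graph.add((concept, SKOS.prefLabel, Literal(row["Englanniksi"].rstrip(), lang="en")))
if pd.notnull(row["Synonyymi (YSO)"]):
    graph.add((concept, SKOS.exactMatch, URIRef(str(row["Synonyymi (YSO)"]))))
print(graph.serialize(format="turtle"))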
def format_json_tweets(unprocessed_tweets): """ This function accepts a list of json-formatted tweets. It stores them as a Pandas' DataFrame preserving only the content of the tweet, i.e. the text, and it's time of creation. It will apply a cleanup to the text in order to remove potentially harmful characters, such as unicode characters or escape characters. Parameters ---------- unprocessed_tweets : list It is a list of json-formatted tweets with many data such as text, created_at, geo, is_translator Each line contains one tweet. Returns ------- df : pandas.DataFrame df is a DataFrame that contains the content and timestamp of each tweet. """ df = pd.DataFrame(json.loads(line) for line in unprocessed_tweets) # Remove unwanted data df = df[['text', 'created_at']] df = df[ pd.notnull(df['text']) ] df = df[ pd.notnull(df['created_at']) ] # Remove unicode characters df['text'] = df['text'].apply(removeUnicode) # Replace escape characters df['text'] = df['text'].apply(replaceEscape) return df
def test_parse_basis_set_order(self): self.mam1.parse_basis_set_order() self.assertEqual(self.mam1.basis_set_order.shape[0], 28) self.assertTrue(np.all(pd.notnull(self.mam1.basis_set_order))) self.mam2.parse_basis_set_order() self.assertEqual(self.mam2.basis_set_order.shape[0], 91) self.assertTrue(np.all(pd.notnull(self.mam2.basis_set_order)))
def merge(listenings, artists): dataset = pd.merge(listenings, artists, left_on='artist', right_on='artist') dataset = dataset[pd.notnull(dataset['artist'])] dataset = dataset[pd.notnull(dataset['genre'])] print dataset.describe() save_csv(dataset, "../OUTPUT/listenings_genre_merged.csv")
def get_relevant_and_reformatted_prescs(prescriptions, druglists, pt_features, window): ''' Filter prescriptions to only include ones which are for relevant drugs and within the exposure window, and create 'amount' and 'unit' columns (necessary for calculating PDD) ''' prescs = pd.merge(prescriptions, pt_features[['patid', 'index_date']], how='left', on='patid') prescs = prescs.loc[pd.notnull(prescs['qty'])].copy( ) #remove the relatively small number of prescriptions where the quantity is NaN pegprod = pd.read_csv('dicts/proc_pegasus_prod.csv') prescs = pd.merge( prescs, pegprod[['prodcode', 'strength', 'route', 'drugsubstance']], how='left') #Only use prescriptions belonging to the main exposure window (not the ones used in sensitivity analysis) start_year = timedelta(days=(365 * abs(sd.exposure_windows[1]['start_year']))) end_year = timedelta(days=( 365 * abs(sd.exposure_windows[1]['start_year'] + sd.window_length_in_years))) timely_presc_mask = (prescs['eventdate'] >= (prescs['index_date'] - start_year)) & ( prescs['eventdate'] <= (prescs['index_date'] - end_year)) timely_prescs = prescs.loc[timely_presc_mask].copy() all_drugs = [drug for druglist in druglists for drug in druglist['drugs']] prodcodes = get_prodcodes_from_drug_name(all_drugs) relev_prescs = timely_prescs.loc[timely_prescs['prodcode'].isin( prodcodes)].copy() # Create new columns ('amount' and 'unit', extracted from the 'substrance strength' string) amount_and_unit = relev_prescs['strength'].str.extract( '([\d\.]+)([\d\.\+ \w\/]*)', expand=True) amount_and_unit.columns = ['amount', 'unit'] amount_and_unit.amount = amount_and_unit.amount.astype('float') reformatted_prescs = pd.concat( [relev_prescs, amount_and_unit], axis=1).drop(['numpacks', 'numdays', 'packtype', 'issueseq'], axis=1) # Convert micrograms to mg micro_mask = reformatted_prescs['unit'].str.contains('microgram', na=False, case=False) reformatted_prescs.loc[micro_mask, 'amount'] /= 1000 reformatted_prescs.loc[micro_mask, 'unit'] = 'mg' #Convert mg/Xml to mg for simplicity micro_mask = reformatted_prescs['unit'].str.contains('mg/', na=False, case=False) reformatted_prescs.loc[micro_mask, 'unit'] = 'mg' #Remove the small number of prescriptions where there is no amount reformatted_prescs = reformatted_prescs[pd.notnull( reformatted_prescs['amount'])].copy() # Create a 'total_amount' column - used to calculate each pt's PDDs for a given drug. reformatted_prescs['total_amount'] = reformatted_prescs[ 'qty'] * reformatted_prescs['amount'] #Change all 'numeric daily doses' (NDD) from 0 (this appears to be the default in the CPRD data) to 1. #Note that an NDD of 2 means 'twice daily' reformatted_prescs.loc[reformatted_prescs['ndd'] == 0, 'ndd'] = 1 return reformatted_prescs
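# The strength-parsing step above in isolation: the same regex split of a strength string
# into 'amount' and 'unit', shown on a few made-up example values.
import pandas as pd

strengths = pd.Series(["500mg", "2.5mg/5ml", "250microgram", None])
amount_and_unit = strengths.str.extract(r'([\d\.]+)([\d\.\+ \w\/]*)', expand=True)
amount_and_unit.columns = ["amount", "unit"]
amount_and_unit["amount"] = amount_and_unit["amount"].astype("float")
print(amount_and_unit)   # e.g. 500 / 'mg', 2.5 / 'mg/5ml'; the None row stays NaN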
def test(): df = pd.read_csv("./result/kko_regex.csv") df.Date = pd.to_datetime(df.Date) df["year"] = df['Date'].dt.strftime('%Y') df["month"] = df['Date'].dt.strftime('%m') df["day"] = df['Date'].dt.strftime('%d') df["weekday"] = df['Date'].dt.strftime('%A') df["24time"] = df["timetype"] + " " + df["time"] df.time = pd.to_datetime(df.time) temp = [] transform_time = [] for i in range(len(df)): time = df["24time"][i] #print(time) temp.append(dt.datetime.strptime(time, "%p %I:%M:%S")) transform_time.append(temp[i].time()) df["24time"] = transform_time df["hh"] = df["24time"].apply(lambda x: x.strftime("%H") if pd.notnull(x) else '') df["mm"] = df["24time"].apply(lambda x: x.strftime("%M") if pd.notnull(x) else '') df.head() print(username) plt.rc('font', family='NanumGothic') plt.figure(2) sns.countplot(x="weekday", data=df) plt.title("요일 별 대화 수") plt.legend() # plt.savefig('./static/days.png') plt.savefig('./static/' + name2 + '.png') plt.figure(3) sns.countplot(x="Speaker", data=df) plt.title("사용자 별 대화 수") plt.legend() # plt.savefig('./static/users.png') plt.savefig('./static/' + name3 + '.png') plt.figure(4) sns.countplot(x="emotions", data=df) plt.title("감정의 분포") plt.legend() # plt.savefig('./static/emotion.png') plt.savefig('./static/' + name4 + '.png') # 추가하고 싶은 부분 # plt.figure(4) # for i in username: # g = sns.kdeplot(df["hh"][(df['Speaker'] == i) & (df["hh"].notnull())], bw=1.5) # g.set_xlabel("viewCount") # g.set_ylabel("Frequency") # plt.title("Chat Rate by Hour") # plt.legend() # plt.show() # plt.savefig('./static/'+name5+'.png') file_path = './static' file_list = os.listdir(file_path) for file_name in file_list: old_name = file_path + '/' + file_name new_name = file_path + '/' + random.choice( string.ascii_letters) + random.choice( string.ascii_letters) + random.choice( string.ascii_letters) + random.choice( string.ascii_letters) + random.choice( string.ascii_letters) + random.choice( string.ascii_letters) + '.png' os.rename(old_name, new_name)
candidate_tickets # Nice, we got some candidates! Let's verify with one of them. # In[17]: df[df.Ticket == '113781'] # Great! We can complete some Cabins! # In[18]: shared_tickets = candidate_tickets.index.tolist() find_cabin_given_ticket = lambda ticket: df[ (df.Ticket == ticket) & (pd.notnull(df.Cabin))].Cabin.values[0] def assign_cabin(row): if pd.isnull(row.Cabin) and row.Ticket in shared_tickets: return find_cabin_given_ticket(row.Ticket) return row.Cabin df['Cabin'] = df[['Cabin', 'Ticket']].apply(assign_cabin, axis=1) df['cabin_letter'] = df['Cabin'].apply(lambda c: c[0] if not pd.isnull(c) else 'N') # N=none df[df.Ticket == '113781'] # In[19]:
def calculate_bbh(blast_results_1, blast_results_2, r_name=None, g_name=None, outdir=''): """Calculate the best bidirectional BLAST hits (BBH) and save a dataframe of results. Args: blast_results_1 (str): BLAST results for reference vs. other genome blast_results_2 (str): BLAST results for other vs. reference genome r_name: Name of reference genome g_name: Name of other genome outdir: Directory where BLAST results are stored. Returns: Path to Pandas DataFrame of the BBH results. """ # TODO: add force_rerun option cols = [ 'gene', 'subject', 'PID', 'alnLength', 'mismatchCount', 'gapOpenCount', 'queryStart', 'queryEnd', 'subjectStart', 'subjectEnd', 'eVal', 'bitScore' ] if not r_name and not g_name: r_name = op.basename(blast_results_1).split('_vs_')[0] g_name = op.basename(blast_results_1).split('_vs_')[1].replace( '_blast.out', '') r_name2 = op.basename(blast_results_2).split('_vs_')[1].replace( '_blast.out', '') if r_name != r_name2: log.warning('{} != {}'.format(r_name, r_name2)) outfile = op.join(outdir, '{}_vs_{}_bbh.csv'.format(r_name, g_name)) if op.exists(outfile) and os.stat(outfile).st_size != 0: log.debug('{} vs {} BLAST BBHs already found at {}'.format( r_name, g_name, outfile)) return outfile bbh1 = pd.read_csv(blast_results_1, sep='\t', names=cols) bbh2 = pd.read_csv(blast_results_2, sep='\t', names=cols) out = pd.DataFrame() log.debug('Finding BBHs for {} vs. {}'.format(r_name, g_name)) for g in bbh1[pd.notnull(bbh1.gene)].gene.unique(): res = bbh1[bbh1.gene == g] if len(res) == 0: continue best_hit = res.ix[res.PID.idxmax()].copy() best_gene = best_hit.subject res2 = bbh2[bbh2.gene == best_gene] if len(res2) == 0: continue best_hit2 = res2.ix[res2.PID.idxmax()] best_gene2 = best_hit2.subject if g == best_gene2: best_hit['BBH'] = '<=>' else: best_hit['BBH'] = '->' out = pd.concat([out, pd.DataFrame(best_hit).transpose()]) out.to_csv(outfile) log.debug('{} vs {} BLAST BBHs saved at {}'.format(r_name, g_name, outfile)) return outfile
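# The "best hit per gene" selection above, reduced to a toy frame; groupby().idxmax() is
# the vectorised form of looping over genes and taking res.PID.idxmax() (.loc replaces .ix).
import pandas as pd

hits = pd.DataFrame({"gene": ["g1", "g1", "g2"],
                     "subject": ["s1", "s2", "s3"],
                     "PID": [88.0, 95.5, 99.0]})
best_hits = hits.loc[hits.groupby("gene")["PID"].idxmax()]
print(best_hits)   # s2 for g1, s3 for g2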
# -*- coding: utf-8 -*-
# Peishichao
import pandas as pd
from apriori import find_rule

inputfile = '../data/menu_orders.xls'
outputfile = '../data/apriori_rules.xls'
data = pd.read_excel(inputfile, index_col=None)

print(u'\nConverting raw data to a 0-1 matrix...')
ct = lambda x: pd.Series(1, index=x[pd.notnull(x)])  # helper that turns one transaction row into a 0-1 indicator Series
b = map(ct, data.values)  # .values replaces the removed DataFrame.as_matrix()
data = pd.DataFrame(list(b)).fillna(0)
print(u'\nConversion finished')
del b

support = 0.2
confidence = 0.5
ms = '---'
find_rule(data, support, confidence, ms).to_excel(outputfile)
def get_series_for_label(mgra_dataframe, label, multiplier): # grab the column series = mgra_dataframe[label].copy() # set all non-null values equal to the multiplier series[pandas.notnull(series)] = multiplier return series
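# The boolean-mask assignment used above, on a standalone Series: non-null entries are
# overwritten with the multiplier and nulls are left untouched.
import numpy as np
import pandas as pd

series = pd.Series([10.0, np.nan, 30.0])
series[pd.notnull(series)] = 0.5
print(series)   # 0.5, NaN, 0.5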
def remove_random_nan(pd_obj): return pd_obj.where((pd.notnull(pd_obj)), None)
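# Usage sketch for the NaN-to-None conversion above; where() keeps values where the mask is
# True and substitutes None elsewhere, which is what most database drivers expect.
import numpy as np
import pandas as pd

toy = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]})
print(remove_random_nan(toy).to_dict("records"))   # NaN appears as None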
def get_condition_status(pt_features, entries, prescriptions, window, condition):
    '''
    Searches a patient's history (i.e. the list of medcoded entries) for any one of a list of related Read codes
    (e.g. 'clinically significant alcohol use', or 'insomnia') during a given exposure window
    (e.g. 5-10 years prior to index date).
    According to the 'count_or_boolean' parameter, will return either a count of the Read codes (i.e. insomnia)
    or a simple boolean (all other conditions).
    '''
    new_colname = condition['name']
    if new_colname in pt_features.columns:
        # delete column if it already exists (otherwise this causes problems with the 'fillna' command below)
        pt_features.drop(new_colname, axis=1, inplace=True)

    # If we're using all the patient's history from the exposure window back to birth
    # (e.g. for intellectual disability), overwrite the predefined exposure windows with a single window
    if condition['record_exposure_in_window_period_only'] == True:
        start_year = timedelta(days=(365 * abs(window['start_year'])))
    else:
        # for all other conditions, record exposure from end of window period back to start of their records
        start_year = timedelta(days=(365 * 100))

    medcount_colname = new_colname + '_Read_code_count'
    medcodes = get_medcodes_from_readcodes(condition['codes'])
    medcode_events = entries[entries['medcode'].isin(medcodes)]
    medcode_events = medcode_events[pd.notnull(medcode_events['eventdate'])]  # drops a small number of rows with NaN eventdates
    # display(medcode_events.head(10))
    # print('\tTotal {0} events in all medcoded_events dataframe: {1}'.format(condition['name'], len(medcode_events)))
    medcode_events = pd.merge(medcode_events[['patid', 'eventdate']],
                              pt_features[['patid', 'index_date']],
                              how='inner', on='patid')

    # Restrict event counts to those that occur during pt's exposure window
    relevant_event_mask = (medcode_events['eventdate'] >= (medcode_events['index_date'] - start_year)) & (
        medcode_events['eventdate'] <= (medcode_events['index_date'] - timedelta(days=(365 * sd.window_length_in_years))))
    window_medcode_events = medcode_events.loc[relevant_event_mask]
    window_medcode_events = window_medcode_events.groupby('patid')['eventdate'].count().reset_index()
    window_medcode_events.columns = ['patid', medcount_colname]
    # print('\t{0} events in this window for our patients: {1}'.format(new_colname, len(window_medcode_events)))

    # delete zero counts
    window_medcode_events = window_medcode_events.loc[window_medcode_events[medcount_colname] > 0]

    pt_features = pd.merge(pt_features, window_medcode_events, how='left')
    pt_features[medcount_colname].fillna(0, inplace=True)
    pt_features.loc[pt_features[medcount_colname] > 0, new_colname] = 1
    pt_features.loc[pt_features[medcount_colname] == 0, new_colname] = 0

    if len(condition['medications']) > 0:
        presc_count_colname = new_colname + '_prescription_count'
        prodcodes = get_prodcodes_from_drug_name(condition['medications'])
        prescriptions = prescriptions.loc[prescriptions['prodcode'].isin(prodcodes)].copy()
        prescriptions = prescriptions.loc[pd.notnull(prescriptions['qty'])].copy()  # remove the relatively small number of prescriptions where the quantity is NaN
        # Some conditions (e.g. insomnia) are also defined by whether or not certain medications are prescribed
        prescriptions = pd.merge(prescriptions[['patid', 'eventdate']],
                                 pt_features[['patid', 'index_date']],
                                 how='inner', on='patid')
        start_year = timedelta(days=(365 * abs(window['start_year'])))
        end_year = timedelta(days=(365 * abs(window['start_year'] + sd.window_length_in_years)))
        timely_presc_mask = (prescriptions['eventdate'] >= (prescriptions['index_date'] - start_year)) & (
            prescriptions['eventdate'] <= (prescriptions['index_date'] - end_year))
        prescriptions = prescriptions.loc[timely_presc_mask].copy()
        prescriptions = prescriptions.groupby('patid')['eventdate'].count().reset_index()
        prescriptions.columns = ['patid', presc_count_colname]
        prescriptions = prescriptions.loc[prescriptions[presc_count_colname] > 0]
        pt_features = pd.merge(pt_features, prescriptions, how='left')
        pt_features[presc_count_colname].fillna(0, inplace=True)
        # convert condition from a count to a boolean
        pt_features.loc[(pt_features[medcount_colname] > 0) |
                        (pt_features[presc_count_colname] > 0), new_colname] = 1
        pt_features.drop(presc_count_colname, axis=1, inplace=True)

    pt_features.drop(medcount_colname, axis=1, inplace=True)
    pt_features[new_colname] = pt_features[new_colname].astype(int)
    return pt_features
left_on=['PROPERTYADDRESS', 'PROPERTYHOUSENUM'], right_on=['street', 'number']) # making the fire column with all type 100s as fires pcafire['fire'] = pcafire['full.code'].astype(str).str[0] pcafire.loc[pcafire.fire == '1', 'fire'] = 'fire' pcafire.loc[pcafire.fire != 'fire', 'fire'] = 'No fire' pcafire['full.code'][pcafire['fire'] == 'fire'] = None #Removing vacant commerical land pcafire = pcafire[pcafire.USEDESC != 'VACANT COMMERCIAL LAND'] #Fire occured after inspection pcafire1 = pcafire[(pcafire.CALL_CREATED_DATE >= pcafire.INSPECTION_DATE)] pcafire1 = pcafire[(pcafire.CALL_CREATED_DATE >= pcafire.INSPECTION_DATE)] pcafire1 = pcafire1[pd.notnull(pcafire1.INSPECTION_DATE)] #checking if violation is in the same year as the fire and keeping only those pcafire2 = pcafire1[(pcafire1.violation_year == pcafire1.fire_year)] #joining all rows with no pli violations fire_nopli = pd.concat([ fire_new, pcafire2[[ 'number', 'street', 'CALL_CREATED_DATE', 'full.code', 'response_time', 'fire_year' ]], pcafire2[[ 'number', 'street', 'CALL_CREATED_DATE', 'full.code', 'response_time', 'fire_year' ]] ]).drop_duplicates(keep=False) pcafire_nopli = pd.merge(pcafinal,
def get_index_date_and_caseness_and_add_final_dementia_subtype( all_entries, pt_features): ''' Calculates index date and establishes caseness by looking for first dementia diagnoses. Also looks for final dementia diagnosis (e.g. 'vascular dementia'), as this is likely to be our best guess as to the dementia subtype ''' pegmed = pd.read_csv('dicts/proc_pegasus_medical.csv', delimiter=',') pegprod = pd.read_csv('dicts/proc_pegasus_prod.csv', delimiter=',') medcodes = get_medcodes_from_readcodes( codelists.alzheimer_vascular_and_non_specific_dementias['codes']) prodcodes = get_prodcodes_from_drug_name( codelists.alzheimer_vascular_and_non_specific_dementias['medications']) entries_with_antidementia_presc_mask = all_entries['prodcode'].isin( prodcodes) entries_with_dementia_dx_mask = all_entries['medcode'].isin(medcodes) #For the purpose of my paper's flow chart of patient selection, #get number of cases where there is an antidementia prescription but not a dementia diagnosis patids_prescribed_antidementia_drugs = set( all_entries.loc[entries_with_antidementia_presc_mask, 'patid']) patids_with_dementia_dx = set( all_entries.loc[entries_with_dementia_dx_mask, 'patid']) total_pts_prescribed_antidementia_drugs_but_no_dementia_dx = len( pt_features[ (pt_features['patid'].isin(patids_prescribed_antidementia_drugs)) & ~(pt_features['patid'].isin(patids_with_dementia_dx))]) print( 'Number of patients prescribed antidementia drugs but not diagnosed with dementia:', total_pts_prescribed_antidementia_drugs_but_no_dementia_dx) # from the all_entries df, get just those which contain a dementia dx or an antidementia drug prescription all_dementia_entries = all_entries[entries_with_antidementia_presc_mask | entries_with_dementia_dx_mask] # for clarity, look up the Read terms all_dem_labelled = pd.merge(all_dementia_entries, pegmed, how='left')[[ 'patid', 'prodcode', 'medcode', 'sysdate', 'eventdate', 'type' ]] # for clarity, look up the drug names all_dem_labelled = pd.merge(all_dem_labelled, pegprod, how='left')[[ 'patid', 'medcode', 'prodcode', 'sysdate', 'eventdate', 'type', 'drugsubstance' ]] all_dem_labelled.loc[:, 'eventdate'] = pd.to_datetime( all_dem_labelled.loc[:, 'eventdate']) #Get the date of earliest dementia diagnosis / antidementia drug prescription - this will be the revised index date, and will also determine revised caseness earliest_dementia_dates = all_dem_labelled.groupby( 'patid')['eventdate'].min().reset_index() earliest_dementia_dates.rename(columns={'eventdate': 'index_date'}, inplace=True) pt_features = pd.merge(pt_features, earliest_dementia_dates, how='left') pt_features['isCase'] = np.where(pd.notnull(pt_features['index_date']), True, False) # Get the final dementia diagnosis just_dementia_diagnoses = all_dem_labelled[pd.isnull( all_dem_labelled['prodcode'])] final_dementia_dx = just_dementia_diagnoses.loc[ just_dementia_diagnoses.groupby('patid')['eventdate'].idxmax()][[ 'patid', 'medcode' ]] final_dementia_dx.rename(columns={'medcode': 'final dementia medcode'}, inplace=True) pt_features = pd.merge(pt_features, final_dementia_dx, how='left') return pt_features
import pandas as pd

df = pd.read_csv('train.csv')
dfield = df[df['Survived'] == 1]
print(dfield)

array_sobreviventes = df['PassengerId'].unique()
all_number = array_sobreviventes[-1]          # PassengerId is sequential, so the last id equals the passenger count
survived_number = dfield.sum()['Survived']
percent = (survived_number / all_number) * 100
print('Percentage of survivors: ' + str(percent))

df = df[pd.notnull(df['Embarked'])]
print(df)
# get_dummies expects a list of column names, not the column's values
df = pd.get_dummies(data=df, columns=['Embarked'])
plt.legend(bbox_to_anchor=(1.1, 1.05)) plt.savefig("../reports/figures/club_01_bar_{0}.PNG".format(txt), bbox_inches='tight') plt.show() # In[12]: atts = ["Country", "Tier"] for col in [ "Manager", "ManagerOpp", "Stadium", "Referee", "Latitude", "Longitude" ]: # for att in ["Country", "Tier"]: print("\n#######\n") print("Sample size and means with {0} by {1}".format(col, ", ".join(atts))) print(df.loc[pd.notnull(df[col]), ].groupby(atts).TotalGoals.agg( ["size", "mean"]).sort_index()) # ## Mapping Goals # In[13]: # mapdata=df.dropna(subset=['Latitude', 'Longitude']) mapdata = df.dropna(subset=['Latitude', 'Longitude']).groupby( ['Latitude', 'Longitude', 'Country']).TotalGoals.mean().reset_index() # fg = sns.FacetGrid(data=mapdata, hue='Country', height=6, aspect=.9) # fg.map(plt.scatter, 'Longitude', 'Latitude').add_legend() # sns.lmplot(x='Longitude', y='Latitude', s='TotalGoals', hue='Country', data=mapdata, fit_reg=False, # x_jitter=0.1, y_jitter=0.1, markers="o", palette="viridis", height=7)
def make_df(self, exclude_stmts=None, complex_members=3): """Create a dataframe containing information extracted from assembler's list of statements necessary to build an IndraNet. Parameters ---------- exclude_stmts : list[str] A list of statement type names to not include in the dataframe. complex_members : int Maximum allowed size of a complex to be included in the data frame. All complexes larger than complex_members will be rejected. For accepted complexes, all permutations of their members will be added as dataframe records. Default is `3`. Returns ------- df : pd.DataFrame Pandas DataFrame object containing information extracted from statements. It contains the following columns: *agA_name* The first Agent's name. *agA_ns* The first Agent's identifier namespace as per `db_refs`. *agA_id* The first Agent's identifier as per `db_refs` *ags_ns, agB_name, agB_id* As above for the second agent. Note that the Agent may be None (and these fields left empty) if the Statement consists only of a single Agent (e.g., SelfModification, ActiveForm, or Translocation statement). *stmt_type* Statement type, given by the name of the class in indra.statements. *evidence_count* Number of evidences for the statement. *stmt_hash* An unique long integer hash identifying the content of the statement. *belief* The belief score associated with the statement. *source_counts* The number of evidences per input source for the statement. *initial_sign* The default sign (polarity) associated with the given statement if the statement type has implied polarity. To facilitate weighted path finding, the sign is represented as 0 for positive polarity and 1 for negative polarity. """ rows = [] if exclude_stmts: exclude_types = tuple( get_statement_by_name(st_type) for st_type in exclude_stmts) else: exclude_types = () for stmt in self.statements: # Exclude statements from given exclude list if isinstance(stmt, exclude_types): logger.debug('Skipping a statement of a type %s.' % type(stmt).__name__) continue agents = stmt.agent_list() not_none_agents = [a for a in agents if a is not None] # Exclude statements with less than 2 agents if len(not_none_agents) < 2: continue # Special handling for Influences and Associations if isinstance(stmt, (Influence, Association)): stmt_pol = stmt.overall_polarity() if stmt_pol == 1: sign = 0 elif stmt_pol == -1: sign = 1 else: sign = None if isinstance(stmt, Influence): edges = [(stmt.subj.concept, stmt.obj.concept, sign)] else: edges = [(a, b, sign) for a, b in permutations(not_none_agents, 2)] # Handle complexes by creating pairs of their # not-none-agents. elif isinstance(stmt, Complex): # Do not add complexes with more members than complex_members if len(not_none_agents) > complex_members: logger.debug('Skipping a complex with %d members.' 
% len(not_none_agents)) continue else: # add every permutation with a neutral polarity edges = [(a, b, None) for a, b in permutations(not_none_agents, 2)] elif isinstance(stmt, Conversion): edges = [] if stmt.subj: for obj in stmt.obj_from: edges.append((stmt.subj, obj, 1)) for obj in stmt.obj_to: edges.append((stmt.subj, obj, 0)) # This is for any remaining statement type that may not be # handled above explicitly but somehow has more than two # not-none-agents at this point elif len(not_none_agents) > 2: continue else: edges = [(not_none_agents[0], not_none_agents[1], None)] for (agA, agB, sign) in edges: agA_ns, agA_id = get_ag_ns_id(agA) agB_ns, agB_id = get_ag_ns_id(agB) stmt_type = type(stmt).__name__ row = OrderedDict([ ('agA_name', agA.name), ('agB_name', agB.name), ('agA_ns', agA_ns), ('agA_id', agA_id), ('agB_ns', agB_ns), ('agB_id', agB_id), ('stmt_type', stmt_type), ('evidence_count', len(stmt.evidence)), ('stmt_hash', stmt.get_hash(refresh=True)), ('belief', stmt.belief), ('source_counts', _get_source_counts(stmt)), ('initial_sign', sign)]) rows.append(row) df = pd.DataFrame.from_dict(rows) df = df.where((pd.notnull(df)), None) return df
def load_template_to_dataframe(fn, index='sample_name'): """Load a sample/prep template or a QIIME mapping file into a data frame Parameters ---------- fn : str or file-like object filename of the template to load, or an already open template file index : str, optional Defaults to 'sample_name'. The index to use in the loaded information Returns ------- DataFrame Pandas dataframe with the loaded information Raises ------ ValueError Empty file passed QiitaDBColumnError If the sample_name column is not present in the template. QiitaDBWarning When columns are dropped because they have no content for any sample. QiitaDBError When non UTF-8 characters are found in the file. QiitaDBDuplicateHeaderError If duplicate columns are present in the template Notes ----- The index attribute of the DataFrame will be forced to be 'sample_name' and will be cast to a string. Additionally rows that start with a '\t' character will be ignored and columns that are empty will be removed. Empty sample names will be removed from the DataFrame. Column names are case-insensitive but will be lowercased on addition to the database Everything in the DataFrame will be read and managed as string """ # Load in file lines holdfile = None with qdb.util.open_file(fn, mode='U') as f: errors = defaultdict(list) holdfile = f.readlines() # here we are checking for non UTF-8 chars for row, line in enumerate(holdfile): for col, block in enumerate(line.split('\t')): try: tblock = block.encode('utf-8') except UnicodeDecodeError: tblock = unicode(block, errors='replace') tblock = tblock.replace(u'\ufffd', '🐾') errors[tblock].append('(%d, %d)' % (row, col)) if bool(errors): raise ValueError( "There are invalid (non UTF-8) characters in your information " "file. The offending fields and their location (row, column) " "are listed below, invalid characters are represented using " "🐾: %s" % '; '.join([ '"%s" = %s' % (k, ', '.join(v)) for k, v in viewitems(errors) ])) if not holdfile: raise ValueError('Empty file passed!') if index == "#SampleID": # We're going to parse a QIIME mapping file. We are going to first # parse it with the QIIME function so we can remove the comments # easily and make sure that QIIME will accept this as a mapping file data, headers, comments = _parse_mapping_file(holdfile) holdfile = ["%s\n" % '\t'.join(d) for d in data] holdfile.insert(0, "%s\n" % '\t'.join(headers)) # The QIIME parser fixes the index and removes the # index = 'SampleID' # Strip all values in the cells in the input file for pos, line in enumerate(holdfile): cols = line.split('\t') if pos == 0 and index != 'SampleID': # get and clean the controlled columns ccols = {'sample_name'} ccols.update(qdb.metadata_template.constants.CONTROLLED_COLS) newcols = [ c.lower().strip() if c.lower().strip() in ccols else c.strip() for c in cols ] # while we are here, let's check for duplicate columns headers if len(set(newcols)) != len(newcols): raise qdb.exceptions.QiitaDBDuplicateHeaderError( find_duplicates(newcols)) else: # .strip will remove odd chars, newlines, tabs and multiple # spaces but we need to read a new line at the end of the # line(+'\n') newcols = [d.strip(" \r\n") for d in cols] holdfile[pos] = '\t'.join(newcols) + '\n' # index_col: # is set as False, otherwise it is cast as a float and we want a string # keep_default: # is set as False, to avoid inferring empty/NA values with the defaults # that Pandas has. # comment: # using the tab character as "comment" we remove rows that are # constituted only by delimiters i. e. empty rows. 
template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t', dtype=str, encoding='utf-8', infer_datetime_format=False, keep_default_na=False, index_col=False, comment='\t', converters={index: lambda x: str(x).strip()}) # remove newlines and tabs from fields template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='', regex=True, inplace=True) initial_columns = set(template.columns) if index not in template.columns: raise qdb.exceptions.QiitaDBColumnError( "The '%s' column is missing from your template, this file cannot " "be parsed." % index) # remove rows that have no sample identifier but that may have other data # in the rest of the columns template.dropna(subset=[index], how='all', inplace=True) # set the sample name as the index template.set_index(index, inplace=True) # it is not uncommon to find templates that have empty columns so let's # find the columns that are all '' columns = np.where(np.all(template.applymap(lambda x: x == ''), axis=0)) template.drop(template.columns[columns], axis=1, inplace=True) initial_columns.remove(index) dropped_cols = initial_columns - set(template.columns) if dropped_cols: warnings.warn( 'The following column(s) were removed from the template because ' 'all their values are empty: %s' % ', '.join(dropped_cols), qdb.exceptions.QiitaDBWarning) # Pandas represents data with np.nan rather than Nones, change it to None # because psycopg2 knows that a None is a Null in SQL, while it doesn't # know what to do with NaN template = template.where((pd.notnull(template)), None) return template
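# A hypothetical minimal call of load_template_to_dataframe, assuming the surrounding Qiita
# modules are importable; the file name, sample ids and columns are made up for illustration.
with open('minimal_template.txt', 'w') as f:
    f.write("sample_name\tseason\tph\n1.S1\twinter\t6.8\n1.S2\tsummer\t7.1\n")

template_df = load_template_to_dataframe('minimal_template.txt')
# Every cell is read as a string, the frame is indexed by sample_name, columns that are
# empty for all samples are dropped, and any remaining NaN cells become None on return.
print(template_df)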
mRests = rests[rests['BORO'] == "MANHATTAN"] ## Look at only Manhattan Data # In[4]: list(mRests.columns.values) # In[5]: mRests = mRests[ mRests['GRADE'] != "Not Yet Graded"] ## Remove stores that have not been graded yet # In[6]: mRests = mRests[pd.notnull( mRests["GRADE"])] ## Remove Stores that have no grade # In[7]: mRests = mRests[pd.notnull(mRests["SCORE"])] ## Remove stores with no score # In[8]: mRests["SCORE"].describe() # In[9]: mRests["GRADE"] = mRests["GRADE"].astype( "category", categories=["A", "B", "C", "P", "Z"], ordered=True) ## redefine score levels
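The .astype("category", categories=..., ordered=True) signature above only exists in older pandas; newer releases removed it in favour of CategoricalDtype. A small equivalent sketch on a hypothetical toy frame (not the restaurant data):

import pandas as pd
from pandas.api.types import CategoricalDtype

grades = pd.DataFrame({"GRADE": ["A", "C", "B", "A", "Z"]})        # made-up grades
grade_levels = CategoricalDtype(categories=["A", "B", "C", "P", "Z"], ordered=True)
grades["GRADE"] = grades["GRADE"].astype(grade_levels)

print(grades["GRADE"].cat.ordered)          # True
print((grades["GRADE"] < "C").tolist())     # [True, False, True, True, False]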
def prepare_compare(df): df = df.sort_values(orderby).reset_index(drop=True) df = df.where((pd.notnull(df)), None) return df
for i in range(len(df)): u = df['User'][i].lower() r = df['RoomName'][i] mention = df['Mentions'][i] if "P:" in r: u1, u2 = r.replace("P:", "").split('|') if u1 == u: r = u2 else: r = u1 r = r.lower() dtset['nodes']['people'][r]['size'] += 1 else: dtset['nodes']['projects'][r]['size'] += 1 l = "{},{}".format(r, u) if l not in dtset['links']: dtset['links'][l] = {'weight': 0, 'mention_weight': 0} dtset['links'][l]['weight'] += 1 dtset['nodes']['people'][u]['size'] += 1 if pd.notnull(mention): users = mention.split(',') regex = re.compile('[^a-zA-Z]') users = [regex.sub('', k).lower() for k in users] users = [k for k in users if k in dtset['nodes']['people'].keys()] for user in users: l = "{},{}".format(u, user) if l not in dtset['links']: dtset['links'][l] = {'weight': 0, 'mention_weight': 0} dtset['links'][l]['mention_weight'] += 1 json.dump(dtset, open("leo.json", 'w'))
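The if pd.notnull(mention) guard above matters because an empty 'Mentions' cell comes back from pandas as NaN (a float), and calling .split(',') on it would raise AttributeError. A tiny sketch with made-up rows:

import numpy as np
import pandas as pd

df = pd.DataFrame({'User': ['Ana', 'Bob'], 'Mentions': ['bob,carol', np.nan]})  # toy data

for mention in df['Mentions']:
    if pd.notnull(mention):                       # NaN rows are skipped entirely
        users = [m.strip().lower() for m in mention.split(',')]
        print(users)                              # ['bob', 'carol']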
agg.sort_index(ascending=asc, inplace=True) agg.columns = [ 'Matches played', 'Points per game', '% correct result', '% correct goal diff', '% correct score', 'Goals per game (predicted)', 'Goals per game (actual)', '% games won (predicted)', '% games won (actual)' ] # print(agg.columns) return agg overall = pd.DataFrame( { "Matches played": output[pd.notnull(output.Actual_result)].shape[0], "Points per game": output[pd.notnull(output.Actual_result)].Points.mean(), "% correct result": output[pd.notnull(output.Actual_result)].Correct_result.mean(), "% correct goal diff": output[pd.notnull(output.Actual_result)].Correct_goal_diff.mean(), "% correct score": output[pd.notnull(output.Actual_result)].Correct_score.mean(), "Goals per game (predicted)": output[pd.notnull(output.Actual_result)].Predicted_goal_total.mean(), "Goals per game (actual)": output[pd.notnull(output.Actual_result)].Actual_goal_total.mean(), "% games won (predicted)": output[pd.notnull(output.Actual_result) & (output.Predicted_result != "Draw")].shape[0] /
def _clean_features(struct): """Cleans up the features collected in parse_play_details. :struct: Pandas Series of features parsed from details string. :returns: the same dict, but with cleaner features (e.g., convert bools, ints, etc.) """ struct = dict(struct) # First, clean up play type bools ptypes = [ 'isKickoff', 'isTimeout', 'isFieldGoal', 'isPunt', 'isKneel', 'isSpike', 'isXP', 'isTwoPoint', 'isPresnapPenalty', 'isPass', 'isRun' ] for pt in ptypes: struct[pt] = struct[pt] if pd.notnull(struct.get(pt)) else False # Second, clean up other existing variables on a one-off basis struct['callUpheld'] = struct.get('callUpheld') == 'upheld' struct['fgGood'] = struct.get('fgGood') == 'good' struct['isBlocked'] = struct.get('isBlocked') == 'blocked' struct['isComplete'] = struct.get('isComplete') == 'complete' struct['isFairCatch'] = struct.get('isFairCatch') == 'fair catch' struct['isMuffedCatch'] = pd.notnull(struct.get('isMuffedCatch')) struct['isNoPlay'] = ( ' (no play)' in struct['detail'] and 'penalty enforced in end zone' not in struct['detail'] if struct.get('detail') else False) struct['isOnside'] = struct.get('isOnside') == 'onside' struct['isSack'] = pd.notnull(struct.get('sackYds')) struct['isSafety'] = (struct.get('isSafety') == ', safety' or (struct.get('detail') and 'enforced in end zone, safety' in struct['detail'])) struct['isTD'] = struct.get('isTD') == ', touchdown' struct['isTouchback'] = struct.get('isTouchback') == ', touchback' struct['oob'] = pd.notnull(struct.get('oob')) struct['passLoc'] = PASS_OPTS.get(struct.get('passLoc'), np.nan) if struct['isPass']: pyds = struct['passYds'] struct['passYds'] = pyds if pd.notnull(pyds) else 0 if pd.notnull(struct['penalty']): struct['penalty'] = struct['penalty'].strip() struct['penDeclined'] = struct.get('penDeclined') == 'Declined' if struct['quarter'] == 'OT': struct['quarter'] = 5 struct['rushDir'] = RUSH_OPTS.get(struct.get('rushDir'), np.nan) if struct['isRun']: ryds = struct['rushYds'] struct['rushYds'] = ryds if pd.notnull(ryds) else 0 year = struct.get('season', np.nan) struct['timeoutTeam'] = sportsref.nfl.teams.team_ids(year).get( struct.get('timeoutTeam'), np.nan) struct['twoPointSuccess'] = struct.get('twoPointSuccess') == 'succeeds' struct['xpGood'] = struct.get('xpGood') == 'good' # Third, ensure types are correct bool_vars = [ 'fgGood', 'isBlocked', 'isChallenge', 'isComplete', 'isFairCatch', 'isFieldGoal', 'isKickoff', 'isKneel', 'isLateral', 'isNoPlay', 'isPass', 'isPresnapPenalty', 'isPunt', 'isRun', 'isSack', 'isSafety', 'isSpike', 'isTD', 'isTimeout', 'isTouchback', 'isTwoPoint', 'isXP', 'isMuffedCatch', 'oob', 'penDeclined', 'twoPointSuccess', 'xpGood' ] int_vars = [ 'down', 'fgBlockRetYds', 'fgDist', 'fumbRecYdLine', 'fumbRetYds', 'intRetYds', 'intYdLine', 'koRetYds', 'koYds', 'muffRetYds', 'pbp_score_aw', 'pbp_score_hm', 'passYds', 'penYds', 'puntBlockRetYds', 'puntRetYds', 'puntYds', 'quarter', 'rushYds', 'sackYds', 'timeoutNum', 'ydLine', 'yds_to_go' ] float_vars = ['exp_pts_after', 'exp_pts_before', 'home_wp'] string_vars = [ 'challenger', 'detail', 'fairCatcher', 'fgBlockRecoverer', 'fgBlocker', 'fgKicker', 'fieldSide', 'fumbForcer', 'fumbRecFieldSide', 'fumbRecoverer', 'fumbler', 'intFieldSide', 'interceptor', 'kneelQB', 'koKicker', 'koReturner', 'muffRecoverer', 'muffedBy', 'passLoc', 'passer', 'penOn', 'penalty', 'puntBlockRecoverer', 'puntBlocker', 'puntReturner', 'punter', 'qtr_time_remain', 'rushDir', 'rusher', 'sacker1', 'sacker2', 'spikeQB', 'tackler1', 'tackler2', 'target', 'timeoutTeam', 
'xpKicker' ] for var in bool_vars: struct[var] = struct.get(var) is True for var in int_vars: try: struct[var] = int(struct.get(var)) except (ValueError, TypeError): struct[var] = np.nan for var in float_vars: try: struct[var] = float(struct.get(var)) except (ValueError, TypeError): struct[var] = np.nan for var in string_vars: if var not in struct or pd.isnull(struct[var]) or struct[var] == '': struct[var] = np.nan # Fourth, create new helper variables based on parsed variables # creating fieldSide and ydline from location if struct['isXP']: struct['fieldSide'] = struct['ydLine'] = np.nan else: fieldSide, ydline = _loc_to_features(struct.get('location')) struct['fieldSide'] = fieldSide struct['ydLine'] = ydline # creating secsElapsed (in entire game) from qtr_time_remain and quarter if pd.notnull(struct.get('qtr_time_remain')): qtr = struct['quarter'] mins, secs = map(int, struct['qtr_time_remain'].split(':')) struct['secsElapsed'] = qtr * 900 - mins * 60 - secs # creating columns for turnovers struct['isInt'] = pd.notnull(struct.get('interceptor')) struct['isFumble'] = pd.notnull(struct.get('fumbler')) # create column for isPenalty struct['isPenalty'] = pd.notnull(struct.get('penalty')) # create columns for EPA struct['team_epa'] = struct['exp_pts_after'] - struct['exp_pts_before'] struct['opp_epa'] = struct['exp_pts_before'] - struct['exp_pts_after'] return pd.Series(struct)
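Several of the flags above (isSack, isInt, isFumble, isPenalty, oob, ...) follow the same pattern: a detail is parsed into a column only when it happened, so pd.notnull on that column is the boolean flag. A stripped-down sketch on an illustrative play record, not the real parsed struct:

import numpy as np
import pandas as pd

# toy parsed play; only fields that occurred carry a value (names mimic the struct above)
play = pd.Series({'sackYds': np.nan, 'interceptor': 'J. Smith', 'penalty': np.nan})

flags = {
    'isSack': pd.notnull(play.get('sackYds')),
    'isInt': pd.notnull(play.get('interceptor')),
    'isPenalty': pd.notnull(play.get('penalty')),
}
print(flags)   # {'isSack': False, 'isInt': True, 'isPenalty': False}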
!wget -O weather-stations20140101-20141231.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/weather-stations20140101-20141231.csv import csv import pandas as pd import numpy as np filename='weather-stations20140101-20141231.csv' #Read csv pdf = pd.read_csv(filename) pdf.head(5) # DATA CLEANING pdf = pdf[pd.notnull(pdf["Tm"])] pdf = pdf.reset_index(drop=True) pdf.head(5) # VISUALIZATION from mpl_toolkits.basemap import Basemap import matplotlib.pyplot as plt from pylab import rcParams %matplotlib inline rcParams['figure.figsize'] = (14,10) llon=-140 ulon=-50 llat=40 ulat=65
def funds_as_dict(country=None, columns=None, as_json=False): """ This function retrieves all the available funds on Investing.com and returns them as a :obj:`dict` containing the country, name, symbol, tag, id, issuer, isin, asset_class, currency and underlying data. All the available funds can be found at: https://www.investing.com/funds/ Args: country (:obj:`str`, optional): name of the country to retrieve all its available funds from. columns (:obj:`list` of :obj:`str`, optional): description a :obj:`list` containing the column names from which the data is going to be retrieved. as_json (:obj:`bool`, optional): description value to determine the format of the output data (:obj:`dict` or :obj:`json`). Returns: :obj:`dict` or :obj:`json` - funds_dict: The resulting :obj:`dict` contains the retrieved data if found, if not, the corresponding fields are filled with `None` values. In case the information was successfully retrieved, the :obj:`dict` will look like:: { 'country': country, 'name': name, 'symbol': symbol, 'issuer': issuer, 'isin': isin, 'asset_class': asset_class, 'currency': currency, 'underlying': underlying } Raises: ValueError: raised whenever any of the introduced arguments is not valid or errored. FileNotFoundError: raised when the `funds.csv` file was not found. IOError: raised if the `funds.csv` file is missing or errored. """ if country is not None and not isinstance(country, str): raise ValueError("ERR#0025: specified country value not valid.") if not isinstance(as_json, bool): raise ValueError( "ERR#0002: as_json argument can just be True or False, bool type.") resource_package = 'investpy' resource_path = '/'.join(('resources', 'funds.csv')) if pkg_resources.resource_exists(resource_package, resource_path): funds = pd.read_csv( pkg_resources.resource_filename(resource_package, resource_path)) else: raise FileNotFoundError("ERR#0057: funds file not found or errored.") if funds is None: raise IOError("ERR#0005: funds not found or unable to retrieve.") funds.drop(columns=['tag', 'id'], inplace=True) funds = funds.where(pd.notnull(funds), None) if columns is None: columns = funds.columns.tolist() else: if not isinstance(columns, list): raise ValueError( "ERR#0020: specified columns argument is not a list, it can just be list type." ) if not all(column in funds.columns.tolist() for column in columns): raise ValueError( "ERR#0023: specified columns does not exist, available columns are " "<country, name, symbol, issuer, isin, asset_class, currency, underlying>" ) if country is None: if as_json: return json.dumps(funds[columns].to_dict(orient='records')) else: return funds[columns].to_dict(orient='records') else: country = unidecode(country.strip().lower()) if country not in fund_countries_as_list(): raise ValueError("ERR#0034: country " + country + " not found, check if it is correct.") if as_json: return json.dumps( funds[funds['country'] == country][columns].to_dict( orient='records')) else: return funds[funds['country'] == country][columns].to_dict( orient='records')
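A hedged usage sketch of the function above; it assumes the packaged funds.csv resource is installed, and the country and column names are only illustrative:

# minimal usage sketch (requires the investpy resources described above)
records = funds_as_dict(country='spain', columns=['name', 'symbol', 'currency'])
print(records[:2])   # list of dicts; fields that are empty in funds.csv come back as None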
def remove_roman_characters_column(df): column_mandarin_keyword = df['Keyword in Chinese'] column_no_roman_characters = column_mandarin_keyword.apply( lambda x: remove_roman_characters(x) if pd.notnull(x) else x) df['Keyword in Chinese'] = column_no_roman_characters return df
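For illustration only, a possible stand-in for the remove_roman_characters helper (which is not shown above) plus a call to the column-level wrapper; the real helper may well differ:

import re
import numpy as np
import pandas as pd

def remove_roman_characters(text):
    # hypothetical helper: strip latin letters, keep Chinese characters and punctuation
    return re.sub(r'[A-Za-z]', '', text)

df = pd.DataFrame({'Keyword in Chinese': ['你好hello', np.nan, '猫cat']})
print(remove_roman_characters_column(df))   # NaN rows pass through thanks to pd.notnull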
def message_df(fitfile=None, msgtype='record', outfile=None, appendunits=True, missing='drop', addlasttimestamp=False, fromR=False): # serial number has numpy/pandas conversion problems baddevinfovars = ['serial_number'] if fitfile is None: print("No fitfile given") sys.exit(1) lasttimestamp = pd.to_datetime(float("NaN")) msgdf = pd.DataFrame() with fitdecode.FitReader(fitfile) as fit: for frame in fit: # The yielded frame object is of one of the following types: # * fitdecode.FitHeader # * fitdecode.FitDefinitionMessage # * fitdecode.FitDataMessage # * fitdecode.FitCRC if isinstance(frame, fitdecode.FitDataMessage): # Here, frame is a FitDataMessage object. # A FitDataMessage object contains decoded values that # are directly usable in your script logic. if frame.has_field('timestamp'): lasttimestamp = frame.get_value('timestamp') if frame.name == msgtype: msgdict = {} if addlasttimestamp and not frame.has_field('timestamp'): msgdict['timestamp'] = lasttimestamp # Go through all the data entries in this msg for fld in frame.fields: if fld.units and appendunits: keyname = fld.name + "." + fld.units.replace( "/", ".") else: keyname = fld.name if (msgtype == 'device_info') and (fld.name in baddevinfovars): msgdict[keyname] = force_to_int( frame.get_value(fld.name, fallback=float('NaN'))) else: msgdict[keyname] = frame.get_value( fld.name, fallback=float('NaN')) msgdf = msgdf.append(msgdict, ignore_index=True) msgdf = msgdf.where((pd.notnull(msgdf)), None) if missing == 'drop': msgdf.dropna(axis=1, how='all', inplace=True) if not fromR: print("variables extracted:") print("\n".join(str(x) for x in msgdf.columns)) print("dtypes: ") print(msgdf.dtypes) if outfile is None: return msgdf else: msgdf.to_json(path_or_buf=outfile, date_format='iso', date_unit='s')
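Running the function above needs an actual .fit file, so here is only a sketch of its final cleanup steps (NaN to None, then dropping all-empty columns) on a toy frame; the cast to object is added so the None substitution is unambiguous:

import numpy as np
import pandas as pd

# stand-in for the parsed FIT records; column names are illustrative
msgdf = pd.DataFrame({'heart_rate.bpm': [142.0, np.nan], 'cadence.rpm': [np.nan, np.nan]})

msgdf = msgdf.astype(object).where(pd.notnull(msgdf), None)   # NaN -> None
msgdf.dropna(axis=1, how='all', inplace=True)                 # the missing='drop' branch
print(list(msgdf.columns))   # ['heart_rate.bpm']  (the all-empty cadence column is dropped)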
def main(): # import data filename = "data.csv" raw = pd.read_csv(filename) originalFrame = raw.copy() ################## PREPROCESSING ########################### raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw[ 'seconds_remaining'] raw["last_5_sec_in_period"] = raw["remaining_time"] < 5 drops = ["minutes_remaining", "seconds_remaining","team_id", "shot_zone_area", \ 'shot_zone_range', 'shot_zone_basic', "game_date", "team_name", "matchup", "lat", "lon", 'game_event_id'] raw["home_play"] = raw["matchup"].str.contains("vs").astype("int") for drop in drops: raw = raw.drop(drop, 1) raw = randomForestStrToNum(raw) nona = raw[pd.notnull(raw['shot_made_flag'])] #splitting explantory and response variables train = nona.drop('shot_made_flag', 1) train_y = nona['shot_made_flag'] # setting up KFolds seed = 24 num_folds = 3 num_rounds = 10 folds = KFold(len(train), n_folds=num_folds, random_state=seed, shuffle=True) model = RandomForestClassifier(n_estimators=200, max_depth=10, max_features=0.25, random_state=seed) # model = model.fit(train, train_y) ################################################################# #Looking at specific shots and their predicted probability layupFrame = nona.loc[nona["Layup"] == 1] fadeawayShotFrame = nona.loc[nona["Fadeaway Jump Shot"] == 1] dunkFrame = nona.loc[nona["Dunk"] == 1] ################# LAYUPS ######################### layup_train = train.loc[train["Layup"] == 1] layup_train_y = layupFrame['shot_made_flag'] print("LayupFrame shape: " + str(layupFrame.shape)) layupScore = testSubset(model, train, train_y, layup_train, layup_train_y, num_rounds, num_folds, seed) print(layupScore) ################## DUNKS ######################## dunk_train = train.loc[train["Dunk"] == 1] dunk_train_y = dunkFrame['shot_made_flag'] print("DunkFrame shape: " + str(dunkFrame.shape)) dunkScore = testSubset(model, train, train_y, dunk_train, dunk_train_y, num_rounds, num_folds, seed) print(dunkScore) ################## FADEAWAY SHOTS ######################## fade_train = train.loc[train["Fadeaway Jump Shot"] == 1] fade_train_y = fadeawayShotFrame['shot_made_flag'] print("FadeawayShotFrame shape: " + str(fadeawayShotFrame.shape)) fadeScore = testSubset(model, train, train_y, fade_train, fade_train_y, num_rounds, num_folds, seed) print(fadeScore)
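The pd.notnull(raw['shot_made_flag']) filter above is the usual split between labelled rows and the rows whose outcome must be predicted. A toy sketch of that split, with made-up data:

import numpy as np
import pandas as pd

# made-up shot log: shot_made_flag is NaN for the shots to be predicted
raw = pd.DataFrame({'shot_distance': [1, 24, 3],
                    'shot_made_flag': [1.0, np.nan, 0.0]})

nona = raw[pd.notnull(raw['shot_made_flag'])]      # training rows
to_predict = raw[pd.isnull(raw['shot_made_flag'])] # rows with no label
print(len(nona), len(to_predict))                  # 2 1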
def import_data(): t2 = time.process_time() data = pd.ExcelFile('C://Users//mima//Documents//price_freight_assay_data.xlsx') raw_rates = pd.ExcelFile('C://Users//mima//Documents//flat_rates.xlsx') trader_assessed = pd.ExcelFile('L://TRADING//ANALYSIS//GLOBAL//Arb Models//Pecking Order 2018.xlsm') assay = pd.read_excel(data, 'assay', index_col = 'Database_Name').to_dict('index') ws = pd.read_excel(data, 'ws') expiry_table = pd.read_excel(data, 'expiry', index_col = 'Month') ports = pd.read_excel(data, 'ports') sub_to_ws = pd.read_excel(data, 'sub_to_ws', header = None) sub_to_ws = sub_to_ws.set_index([0]).to_dict() """table containing the basrah base worldscale that they fix their freight against""" basrah_ws_base = pd.read_excel(data, 'basrah_ws_base', index_col = 'YEAR') """Take in the crude prices and codes and convert to a dataframe. We need to take the first 2 rows of the prices with no headers as this will give us the cude name and the code ascociated Then transpose from rows to columns and rename the columns. This will be for later when we determine crude prices basis desired comaprison""" #prices_reference = (pd.read_excel(data, 'paper prices', header = None).iloc[0:2,1:]).transpose().rename(columns={0:'Name', 1: 'Code'}) """Merge the WS table with the prices table, slice df so 2016 onwards (Flat rates last date is 2015). We don't drop rows now as dropping would be dependent on any nans in any column""" #total = prices.merge(ws_table, how = 'inner', left_index = True, right_index = True) #total = total.merge(paper_prices, how = 'inner', left_index = True, right_index = True) #total = total.iloc[total.index > dt(2015,12,31)] """this new total table generates all the prices in one place for us""" total = pd.read_excel(data, 'price_warehouse', header = 4).drop(['Timestamp']) total.index = pd.to_datetime(total.index) total.sort_index(inplace=True) total.fillna(method='ffill', inplace=True) total = total[total.index > dt(2015,1,1)] """We know there are some perculiarities in the data, such as the OSPs. So create this table here to handle. 
Found out need to shift the prices back a month but in order to identify which ones, needed the list of OSP crudes""" exceptions = { 'Arab Extra Light': {'ROTTERDAM':{'Code':'AAIQQ00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAWQK00','Index':'BWAVE'}, 'HOUSTON':{'Code':'AAIQZ00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQV00','Index':'OMAN/DUBAI'}}, 'Arab Light': {'ROTTERDAM':{'Code':'AAIQR00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAWQL00','Index':'BWAVE'}, 'HOUSTON':{'Code':'AAIRA00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQW00','Index':'OMAN/DUBAI'}}, 'Arab Medium': {'ROTTERDAM':{'Code':'AAIQS00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAWQM00','Index':'BWAVE'}, 'HOUSTON':{'Code':'AAIRB00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQX00','Index':'OMAN/DUBAI'}}, 'Arab Heavy': {'ROTTERDAM':{'Code':'AAIQT00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAWQN00','Index':'BWAVE'}, 'HOUSTON':{'Code':'AAIRC00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQY00','Index':'OMAN/DUBAI'}}, 'Basrah Light': {'ROTTERDAM':{'Code':'AAIPH00','Index':'Dated'}, 'AUGUSTA':{'Code':'AAIPH00','Index':'Dated'}, 'HOUSTON':{'Code':'AAIPG00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIPE00','Index':'OMAN/DUBAI'}}, 'Basrah Heavy': {'ROTTERDAM':{'Code':'AAXUC00','Index':'Dated'}, 'AUGUSTA':{'Code':'AAXUC00','Index':'Dated'}, 'HOUSTON':{'Code':'AAXUE00','Index':'Mars'}, 'SINGAPORE':{'Code':'AAXUA00','Index':'OMAN/DUBAI'}}, 'Iranian Heavy': {'ROTTERDAM':{'Code':'AAIPB00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAUCH00','Index':'BWAVE'}, #'Iranian Heavy':{'HOUSTON':{'Code':abcde,'Index':'WTI'}}, 'SINGAPORE':{'Code':'AAIOY00','Index':'OMAN/DUBAI'}}, 'Iranian Light': {'ROTTERDAM':{'Code':'AAIPA00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAUCJ00','Index':'BWAVE'}, 'SINGAPORE':{'Code':'AAIOX00','Index':'OMAN/DUBAI'}}, 'Forozan': {'ROTTERDAM':{'Code':'AAIPC00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAUCF00','Index':'BWAVE'}, 'SINGAPORE':{'Code':'AAIOZ00','Index':'OMAN/DUBAI'}}, 'Isthmus':{'ROTTERDAM':{'Code':'AAIQC00','Index':'Dated'}, 'AUGUSTA':{'Code':'AAIQC00','Index':'Dated'}, 'HOUSTON':{'Code':'AAIPZ00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQE00','Index':'OMAN/DUBAI'}}, 'Maya':{'ROTTERDAM':{'Code':'AAIQB00','Index':'Dated'}, 'AUGUSTA':{'Code':'AAIQB00','Index':'Dated'}, 'HOUSTON':{'Code':'AAIPY00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQD00','Index':'OMAN/DUBAI'}} } crudes_to_shift = pd.DataFrame.from_dict({(crude,destination): exceptions[crude][destination] for crude in exceptions.keys() for destination in exceptions[crude].keys()}, orient='index') """convert the dataseries to a list, then use setr to get the unique items, then convert back to a list""" crudes_to_shift = list(set(list(crudes_to_shift['Code']))) """Fopr the crudes in the list, I want to resample the series at the month start so there is a common value for the start of each month, I then want shift these values by 1 backwards, in this case because we resampled, this automatically means shift abck one month, I then want to re-index the new dataframe to conform to where we are putting it back into, and finally I assign the total dataframe where the column headers are equal to the crude list, the new shifted and filled forward values to make sure everything lines up""" total[crudes_to_shift] = total[crudes_to_shift].resample('MS').mean().shift(-1, freq='MS').reindex(total.index).fillna(method='ffill') #total['AAXUC00'] """This will help with the date error. 
Turn the index into a numpy array and then assign the value""" if total.index[-1] - total.index[-2] > pd.Timedelta(days=2): total.index.values[-1] = total.index[-2] + pd.Timedelta(days=1) """Clean the column hedaers so no white spcaes - use simple list comprehension and set headers equal to cleaned""" cleaned_column_headers = [i.strip() for i in total.columns.values] total.columns = cleaned_column_headers """The below was get rid of the row in the index that hax NaT against it and then expand to daily and fill backwards""" crude_diffs = pd.read_excel(trader_assessed, 'Crude Diffs Traders', header = 0) crude_diffs = crude_diffs.loc[pd.notnull(crude_diffs.index)] crude_diffs = crude_diffs.drop([name for name in crude_diffs.columns if 'Unnamed' in name], axis=1) #crude_diffs.index = crude_diffs.index.map(lambda x : x + 1*BDay()) crude_diffs = crude_diffs.reindex(total.index).fillna(method='bfill').fillna(method='ffill') """Slice the crude diffs where the dates in the index are the same as the dates in the total dataframe""" #crude_diffs = crude_diffs[crude_diffs.index.isin(total.index)] crudes_diff_against_osp = ['Basrah Light','Basrah Heavy'] codes_list = [x for x in crude_diffs.columns if x not in crudes_diff_against_osp] """Apply the values in crude diffs to the correct codes and dates in the total dataframe""" total.update(crude_diffs[codes_list]) """We have to convert the prices that are in absolutes into a diff vs a local index, and if there are, set to zero. This is LOOP Sour""" total['AALSM01'].loc[total['AALSM01'] > 30] = total['AALSM01'].loc[total['AALSM01'] > 30] - total['CLc1'] #total.loc[total.index.isin(crude_diffs.index), codes_list] = crude_diffs[codes_list] #total[codes_list] #total.update(crude_diffs[codes_list]) """ Need this for the sulphur table""" forties_sulphur = pd.read_excel(trader_assessed, 'Forties de-esc', header = [22], parse_cols="H:I").set_index('week ending') forties_sulphur = forties_sulphur.loc[pd.notnull(forties_sulphur.index)] forties_sulphur = forties_sulphur.reindex(total.index).fillna(method='ffill') """Also need to adjust the cfds to take into account the inter month BFOE spread""" cfd_list = ['PCAKA00','PCAKC00','PCAKE00','PCAKG00','AAGLU00','AAGLV00','AALCZ00','AALDA00'] temp = total[cfd_list].sub(pd.Series(total['PCAAQ00'] - total['PCAAR00']), axis=0) temp = temp[temp.index > dt(2017,6,30)] total.loc[total.index.isin(temp.index), list(temp.columns)] = temp[list(temp.columns)] """This turns the 5 years of rate matricies into a table for use to reference - 12/04/2018""" rates = [] for x,y in enumerate([name.split()[2] for name in raw_rates.sheet_names]): f = pd.read_excel(raw_rates, sheetname = x, header = None).iloc[1:47,1:] lplen = len(f.iloc[:,1]) dplen = len(f.iloc[1,:]) for j in range(1, dplen): for i in range(1,lplen): LoadPort = f.iloc[i,0] DischargePort = f.iloc[0,j] Year = y Rate = f.iloc[i,j] rates.append({'LoadPort':LoadPort, 'DischargePort': DischargePort, 'Year':Year,'Rate':Rate}) rate_data = pd.DataFrame(rates) """Also initialise the temp df with index of total. 
Temp df is to hold the dataseries needed to calculate the freight""" df = pd.DataFrame(index=total.index) df['Date'] = df.index """This function allows us to apply the expiration date for the WTI futures used to determine what structure we apply to the CMA. Have tried timing; the below gives a slight improvement of about 0.2 seconds.""" t = time.process_time() for_dates = lambda x: (expiry_table.loc[(expiry_table.index.month == x.month)&(expiry_table.index.year == x.year)]['Expiry']).iat[0] df['Expiry'] = df['Date'].apply(for_dates) df.drop(['Date'], inplace=True, axis=1) print("df['Expiry'] created successfully: Time was {}".format(time.process_time() - t)) print("Temp DataFrame created successfully") print("import_data() created successfully: Time was {}".format(time.process_time() - t2)) return assay, ws, ports, total, rate_data, sub_to_ws, df, basrah_ws_base, crude_diffs, forties_sulphur, exceptions, crudes_to_shift
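A self-contained sketch of the month-shift trick described in the comment above (resample to month start, shift back one month, re-expand to daily and forward-fill), using a made-up daily OSP series rather than the Platts codes:

import pandas as pd

idx = pd.date_range('2018-01-01', '2018-03-31', freq='D')            # toy daily index
osp = pd.Series([60.0] * 31 + [61.0] * 28 + [62.0] * 31, index=idx)  # one price per month

shifted = (osp.resample('MS').mean()      # one value per month start
              .shift(-1, freq='MS')       # pull next month's value back a month
              .reindex(osp.index)         # back onto the daily index
              .fillna(method='ffill'))    # fill the rest of each month forward
print(shifted['2018-01-15'], shifted['2018-02-15'])   # 61.0 62.0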
'b' in obj2   # a Series can be used with functions that expect a dict, e.g. key-membership tests
'e' in obj2
# create a Series from a dict: passing only the dict makes its keys the index
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)   # the resulting Series comes back in key order
obj3
# pass a dict together with an explicit index: matching keys are picked up,
# labels with no match get NaN
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4
# pandas' isnull and notnull detect missing data
pd.isnull(obj4)    # returns booleans
pd.notnull(obj4)
obj4.isnull()      # Series has its own isnull method
# in arithmetic, Series automatically aligns data on their (different) indexes
obj3
obj4
obj3 + obj4
# both the Series itself and its index have a 'name' attribute, similar to a label
obj4.name = 'population'        # set the Series' own 'name' attribute
obj4.index.name = 'state'
obj4
# a Series' index can also be modified in place by assignment
def prep_pbp_data(df): df = df[df['qb_dropback'].notna()] df['epa'] = pd.to_numeric(df['epa']) df['wpa'] = pd.to_numeric(df['wpa']) # Passing pass_df = df[(df['qb_dropback'] == "1") & (df['passer_player_name'].notna())] cols_pass = [ # "airEPA_Result", "airWPA_Result", "yacEPA_Result", "yacWPA_Result", "passer_id", "receiver_id", "passer_player_name", # "Passer_salary", "Passer_capHit", "PasserName", "receiver_player_name" # , "Receiver_salary", "Receiver_capHit", "Receiver_position"] ] for i in cols_pass: pass_df = pass_df[pd.notnull(pass_df[i])] pass_df = pass_df[(pass_df['receiver_id'] != "None") & (pass_df['passer_id'] != "None")] # Rushing rush_df = df[(df['qb_dropback'] == "0")] cols_rush = [ "epa", "wpa", "rusher_player_name", "rusher_id", # "Rusher_capHit", "Rusher_salary"] ] for k in cols_rush: rush_df = rush_df[pd.notnull(rush_df[k])] rush_df = rush_df[(rush_df['rusher_id'] != "None")] # Receiving rec_df = pass_df cols_rec = [ # "airEPA_Result", "airWPA_Result", "yacEPA_Result", "yacWPA_Result", "passer_id", "receiver_id", "receiver_player_name", # "Passer_salary", "Passer_capHit", "passer_player_name" # , "Receiver_salary", "Receiver_capHit", "Receiver_position"] ] for g in cols_rec: rec_df = rec_df[pd.notnull(rec_df[g])] rec_df = rec_df[(rec_df['receiver_id'] != "None") & (rec_df['passer_id'] != "None")] # Team Passing team_passing = pass_df.groupby(['Season', 'posteam']).agg({ 'epa': sum, 'wpa': sum, 'play_id': 'count' }).reset_index() team_passing.rename(columns={ 'play_id': 'Pass_Attempts', 'epa': 'Pass_EPA', 'wpa': 'Pass_WPA' }, inplace=True) team_passing['Pass_EPA_Att'] = team_passing['Pass_EPA'] / team_passing[ 'Pass_Attempts'] team_passing['Pass_WPA_Att'] = team_passing['Pass_WPA'] / team_passing[ 'Pass_Attempts'] # Team Rushing team_rushing = rush_df.groupby(['Season', 'posteam']).agg({ 'epa': sum, 'wpa': sum, 'play_id': 'count' }).reset_index() team_rushing.rename(columns={ 'play_id': 'Rush_Attempts', 'epa': 'Rush_EPA', 'wpa': 'Rush_WPA' }, inplace=True) team_rushing['Rush_EPA_Att'] = team_rushing['Rush_EPA'] / team_rushing[ 'Rush_Attempts'] team_rushing['Rush_WPA_Att'] = team_rushing['Rush_WPA'] / team_rushing[ 'Rush_Attempts'] # Ind Passing ind_passing = pass_df.groupby( ['Season', 'passer_player_name', 'posteam']).agg({ 'epa': sum, 'wpa': sum, # 'airEPA_Result': sum, 'play_id': 'count', # 'Passer_salary': 'max', # 'Passer_capHit': 'max' }).reset_index() ind_passing.rename(columns={ 'play_id': 'Pass_Attempts', 'epa': 'Pass_EPA', 'wpa': 'Pass_WPA' }, inplace=True) ind_passing['Pass_EPA_Att'] = ind_passing['Pass_EPA'] / ind_passing[ 'Pass_Attempts'] ind_passing['Pass_WPA_Att'] = ind_passing['Pass_WPA'] / ind_passing[ 'Pass_Attempts'] # ind_passing['airEPA_Att'] = ind_passing['airEPA_Result'] / ind_passing['Pass_Attempts'] ind_passing = ind_passing[(ind_passing['Pass_Attempts'] > 10)] # Ind Rushing ind_rushing = rush_df.groupby( ['Season', 'rusher_player_name', 'posteam']).agg({ 'epa': sum, 'wpa': sum, 'play_id': 'count', # 'Rusher_salary': 'max', # 'Rusher_capHit': 'max' }).reset_index() ind_rushing.rename(columns={ 'play_id': 'Rush_Attempts', 'epa': 'Rush_EPA', 'wpa': 'Rush_WPA', 'rusher_player_name': 'Player' }, inplace=True) ind_rushing['Rush_EPA_Att'] = ind_rushing['Rush_EPA'] / ind_rushing[ 'Rush_Attempts'] ind_rushing['Rush_WPA_Att'] = ind_rushing['Rush_WPA'] / ind_rushing[ 'Rush_Attempts'] # ind_rushing = ind_rushing[(ind_rushing['Rush_Attempts'] > 25)] # filter out QBs from Rush df qbs = ind_passing[(ind_passing['Pass_Attempts'] > 15)]['passer_player_name'] 
ind_rushing = ind_rushing[~ind_rushing['Player'].isin(qbs)] # Ind Receiving ind_receiving = rec_df.groupby( ['Season', 'receiver_player_name', 'posteam']).agg({ 'epa': sum, 'wpa': sum, 'play_id': 'count', # 'Receiver_salary': 'max', # 'Receiver_capHit': 'max' }).reset_index() ind_receiving.rename(columns={ 'play_id': 'Targets', 'epa': 'Rec_EPA', 'wpa': 'Rec_WPA', 'Receiver': 'Player' }, inplace=True) ind_receiving['Rec_EPA_Target'] = ind_receiving[ 'Rec_EPA'] / ind_receiving['Targets'] ind_receiving['Rec_WPA_Target'] = ind_receiving[ 'Rec_WPA'] / ind_receiving['Targets'] # ind_receiving = ind_receiving[(ind_receiving['Targets'] > 25)] # Combine ind_rushing and ind_receiving merged_ind = pd.merge( ind_rushing, ind_receiving, left_on=["Season", "Player", "posteam"], right_on=["Season", "receiver_player_name", "posteam"]) merged_ind['Opportunities'] = merged_ind['Rush_Attempts'] + merged_ind[ 'Targets'] merged_ind['Weighted_Rush_EPA'] = merged_ind[ 'Rush_Attempts'] * merged_ind['Rush_EPA_Att'] merged_ind['Weighted_Rush_WPA'] = merged_ind[ 'Rush_Attempts'] * merged_ind['Rush_WPA_Att'] merged_ind['Weighted_Target_EPA'] = merged_ind['Targets'] * merged_ind[ 'Rec_EPA_Target'] merged_ind['Weighted_Target_WPA'] = merged_ind['Targets'] * merged_ind[ 'Rec_WPA_Target'] merged_ind['Weighted_EPA_Opps'] = (merged_ind['Weighted_Rush_EPA'] + merged_ind['Weighted_Target_EPA']) \ / merged_ind['Opportunities'] merged_ind['Weighted_WPA_Opps'] = (merged_ind['Weighted_Rush_WPA'] + merged_ind['Weighted_Target_WPA']) \ / merged_ind['Opportunities'] merged_team = pd.merge(team_passing, team_rushing, on=["posteam", "Season"]) return merged_team, team_passing, team_rushing, ind_passing, ind_rushing, ind_receiving, merged_ind
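A compact sketch of the groupby/rename/per-attempt pattern used repeatedly above, on a few made-up pass plays (column names follow the frame above):

import pandas as pd

pass_df = pd.DataFrame({'Season': [2019, 2019, 2019],
                        'posteam': ['KC', 'KC', 'NE'],
                        'epa': [0.5, -0.2, 0.1],
                        'wpa': [0.02, -0.01, 0.00],
                        'play_id': [1, 2, 3]})

team_passing = (pass_df.groupby(['Season', 'posteam'])
                       .agg({'epa': sum, 'wpa': sum, 'play_id': 'count'})
                       .reset_index()
                       .rename(columns={'play_id': 'Pass_Attempts',
                                        'epa': 'Pass_EPA', 'wpa': 'Pass_WPA'}))
team_passing['Pass_EPA_Att'] = team_passing['Pass_EPA'] / team_passing['Pass_Attempts']
print(team_passing[['posteam', 'Pass_Attempts', 'Pass_EPA_Att']])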
def create_price_table(): total.head() where(pd.notnull(assay_input),None).to_dict('records') total.index = pd.to_datetime(total.index) total.sort_index(inplace=True) total.fillna(method='ffill', inplace=True) total = total[total.index > dt(2015,1,1)] def import_prices_prepare_for_sql(): total = pd.read_excel(data, 'price_warehouse', header=4).drop(['Timestamp']).iloc[:10] total_descriptions = pd.read_excel(data, 'price_warehouse', header=3).columns total.columns = [total_descriptions,total.columns] total.index = pd.to_datetime(total.index) total.columns.names = ('Series','Code') total.index.name = 'Date' total = total.unstack().reset_index().rename(columns={0:'Value'}) total['Value'] = pd.to_numeric(total['Value'], errors='coerce') total = total.dropna(subset=['Value']) prices_input = total.to_dict('records') return prices_input def basrah_data_records(): basrah_ws_base = pd.read_excel(data, 'basrah_ws_base', index_col = 'Date') basrah_ws_base_input = pd.DataFrame(basrah_ws_base).unstack().reset_index().rename(columns={'level_0':'Series', 0:'Value'}).to_dict('records') return basrah_ws_base_input def freight_rates_records(): raw_rates = pd.ExcelFile('C://Users//mima//Documents//flat_rates.xlsx') rates = [] for x,y in enumerate([name.split()[2] for name in raw_rates.sheet_names]): f = pd.read_excel(raw_rates, sheetname = x, header = None).iloc[1:47,1:] lplen = len(f.iloc[:,1]) dplen = len(f.iloc[1,:]) for j in range(1, dplen): for i in range(1,lplen): LoadPort = f.iloc[i,0] DischargePort = f.iloc[0,j] Year = y Rate = f.iloc[i,j] rates.append({'LoadPort':LoadPort, 'DischargePort': DischargePort, 'Year':Year,'Rate':Rate}) rate_data_input = pd.DataFrame(rates).dropna(axis=0).to_dict('records') return rate_data_input def assay_records(): assay_input = pd.read_excel(data, 'assay', index_col='Database_Name') assay_input.dtypes assay_input['RESIDUE_v40'] = assay_input['RESIDUE_v40'].astype(float) assay_input['GradesId'] = assay_input['GradesId'].astype(float) assay_input = assay_input.reset_index() assay_input = assay_input.where(pd.notnull(assay_input),None).to_dict('records') return assay_input def world_scale_records(): ws_input = pd.read_excel(data, 'ws').to_dict('records') return ws_input def world_scale_mappings_data(): world_scale_mappings_input = pd.read_excel(data, 'sub_to_ws').to_dict('records') return world_scale_mappings_input def exceptions_data(): exceptions = { 'Arab Extra Light': {'ROTTERDAM':{'Code':'AAIQQ00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAWQK00','Index':'BWAVE'}, 'HOUSTON':{'Code':'AAIQZ00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQV00','Index':'OMAN/DUBAI'}}, 'Arab Light': {'ROTTERDAM':{'Code':'AAIQR00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAWQL00','Index':'BWAVE'}, 'HOUSTON':{'Code':'AAIRA00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQW00','Index':'OMAN/DUBAI'}}, 'Arab Medium': {'ROTTERDAM':{'Code':'AAIQS00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAWQM00','Index':'BWAVE'}, 'HOUSTON':{'Code':'AAIRB00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQX00','Index':'OMAN/DUBAI'}}, 'Arab Heavy': {'ROTTERDAM':{'Code':'AAIQT00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAWQN00','Index':'BWAVE'}, 'HOUSTON':{'Code':'AAIRC00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQY00','Index':'OMAN/DUBAI'}}, 'Basrah Light': {'ROTTERDAM':{'Code':'AAIPH00','Index':'Dated'}, 'AUGUSTA':{'Code':'AAIPH00','Index':'Dated'}, 'HOUSTON':{'Code':'AAIPG00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIPE00','Index':'OMAN/DUBAI'}}, 'Basrah Heavy': {'ROTTERDAM':{'Code':'AAXUC00','Index':'Dated'}, 
'AUGUSTA':{'Code':'AAXUC00','Index':'Dated'}, 'HOUSTON':{'Code':'AAXUE00','Index':'Mars'}, 'SINGAPORE':{'Code':'AAXUA00','Index':'OMAN/DUBAI'}}, 'Iranian Heavy': {'ROTTERDAM':{'Code':'AAIPB00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAUCH00','Index':'BWAVE'}, #'Iranian Heavy':{'HOUSTON':{'Code':abcde,'Index':'WTI'}}, 'SINGAPORE':{'Code':'AAIOY00','Index':'OMAN/DUBAI'}}, 'Iranian Light': {'ROTTERDAM':{'Code':'AAIPA00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAUCJ00','Index':'BWAVE'}, 'SINGAPORE':{'Code':'AAIOX00','Index':'OMAN/DUBAI'}}, 'Forozan': {'ROTTERDAM':{'Code':'AAIPC00','Index':'BWAVE'}, 'AUGUSTA':{'Code':'AAUCF00','Index':'BWAVE'}, 'SINGAPORE':{'Code':'AAIOZ00','Index':'OMAN/DUBAI'}}, 'Isthmus':{'ROTTERDAM':{'Code':'AAIQC00','Index':'Dated'}, 'AUGUSTA':{'Code':'AAIQC00','Index':'Dated'}, 'HOUSTON':{'Code':'AAIPZ00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQE00','Index':'OMAN/DUBAI'}}, 'Maya':{'ROTTERDAM':{'Code':'AAIQB00','Index':'Dated'}, 'AUGUSTA':{'Code':'AAIQB00','Index':'Dated'}, 'HOUSTON':{'Code':'AAIPY00','Index':'WTI'}, 'SINGAPORE':{'Code':'AAIQD00','Index':'OMAN/DUBAI'}} } exceptions_input = pd.DataFrame.from_dict({(crude,destination): exceptions[crude][destination] for crude in exceptions.keys() for destination in exceptions[crude].keys()}, orient='index') exceptions_input = exceptions_input.unstack().unstack().reset_index().rename(columns={'level_0':'Series','level_1':'Destination','level_2':'Crude', 0:'Value'}).to_dict('records') return exceptions_input def forties_sulphur_records(): trader_assessed = pd.ExcelFile('L://TRADING//ANALYSIS//GLOBAL//Arb Models//Pecking Order 2018.xlsm') forties_sulphur = pd.read_excel(trader_assessed, 'Forties de-esc', header = [22], parse_cols="H:I").set_index('week ending') forties_sulphur = forties_sulphur.loc[pd.notnull(forties_sulphur.index)] forties_sulphur_input = forties_sulphur.reset_index().rename(columns={'buzzard content':'BuzzardContent','week ending':'Date'}).to_dict('records') return forties_sulphur_input params = urllib.parse.quote("DRIVER={SQL Server Native Client 11.0};SERVER=STCHGS112;DATABASE=MIMAWorkSpace;UID=mima;Trusted_Connection=Yes") eng = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params, echo=True) Base = declarative_base() session = Session(bind=eng) class Basrah_WS_Base(Base): """this tells SQLAlchemy that rows of Basrah_WS_Base table must be mapped to this class""" __tablename__ = 'Basrah_WS_Base' Id = Column(Integer, primary_key=True) Date = Column(Date) Series = Column(String(32)) Value = Column(Float) class Global_Flat_Rates(Base): __tablename__ = 'Global_Flat_Rates' __table_args__ = {'extend_existing': True} Id = Column(Integer, primary_key=True) DischargePort = Column(String(32)) LoadPort = Column(String(32)) Rate = Column(Float) Year = Column(Integer) class Crude_Assay(Base): __tablename__ = 'Crude_Assay' __table_args__ = {'extend_existing': True} Id = Column(Integer, primary_key=True) Database_Name = Column(String(32)) H_Comet_Name = Column(String(32)) Crude_Manager_Name = Column(String(32)) Gravity = Column(Float) API = Column(Float) Sulphur = Column(Float) Conversion = Column(Float) LPG = Column(Float) LVN = Column(Float) HVN = Column(Float) KERO = Column(Float) LGO = Column(Float) HGO = Column(Float) VGO = Column(Float) RESIDUE = Column(Float) LGO_density = Column(Float) HGO_desnsity = Column(Float) VGO_density = Column(Float) RESIDUE_density = Column(Float) LGO_sulphur = Column(Float) HGO_sulphur = Column(Float) VGO_sulphur = Column(Float) RESIDUE_sulphur = Column(Float) RESIDUE_v50 = Column(Float) 
RESIDUE_v40 = Column(Float) RESIDUE_v100 = Column(Float) GradesId = Column(Float) Code = Column(String(32)) Index = Column(String(32)) Basis = Column(String(32)) LoadPort = Column(String(32)) FOBLoadPort = Column(String(32)) FOBCode = Column(String(32)) class World_Scale_Table(Base): __tablename__ = 'World_Scale_Table' __table_args__ = {'extend_existing': True} Id = Column(Integer, primary_key=True) Name = Column(String(64)) Origin = Column(String(32)) Destination = Column(String(32)) Size = Column(String(32)) Volume = Column(Integer) Terms = Column(String(32)) Code = Column(String(32)) bbls = Column(Integer) class World_Scale_Mappings(Base): __tablename__ = 'World_Scale_Mappings' Id = Column(Integer, primary_key=True) Port_SubRegion = Column(String(32)) WS_Region = Column(String(32)) local_index = Column(String(32)) Price_Set = Column(String(32)) class Exceptions(Base): __tablename__ = 'Exceptions' Id = Column(Integer, primary_key=True) Crude = Column(String(32)) Destination = Column(String(32)) Series = Column(String(32)) Value = Column(String(32)) class Forties_Sulphur(Base): __tablename__ = 'Forties_Sulphur' Id = Column(Integer, primary_key=True) BuzzardContent = Column(Float) Date = Column(Date) class Prices(Base): __tablename__ = 'Prices' Id = Column(Integer, primary_key=True) Series = Column(String) Code = Column(String(32)) Date = Column(Date) Value = Column(Float) """Create the tables""" Base.metadata.create_all(eng) """Commit the data to a database""" session.bulk_insert_mappings(Basrah_WS_Base, basrah_ws_base_input) session.bulk_insert_mappings(Global_Flat_Rates, rate_data_input) session.bulk_insert_mappings(Crude_Assay, assay_input) session.bulk_insert_mappings(World_Scale_Table, ws_input) session.bulk_insert_mappings(World_Scale_Mappings, world_scale_mappings_input) session.bulk_insert_mappings(Exceptions, exceptions_input) session.bulk_insert_mappings(Forties_Sulphur, forties_sulphur_input) session.bulk_insert_mappings(Prices, prices_input) session.commit() session.rollback() Base.metadata.bind = eng DBSession = sessionmaker(bind=eng) """EXTRACTION: Create the connection to the database to be able to upload data""" with eng.connect() as con: """Load definition of the Basrah_WS_Base table and the connection metadata""" meta = MetaData(eng) basrah_ws_base = Table('Basrah_WS_Base', meta, autoload=True) """Create the SQL select statement and execute and print - use table.c.column name for individual columns""" stm = select([basrah_ws_base]) rs = con.execute(stm) #You can also put a standard query in here data = pd.DataFrame(rs.fetchall(), columns=rs.keys()) df = data.pivot(columns='Series', index = 'Date', values='Value') print(df) """Import data from local files if needed""" raw_rates = pd.ExcelFile('C://Users//mima//Documents//flat_rates.xlsx') trader_assessed = pd.ExcelFile('L://TRADING//ANALYSIS//GLOBAL//Arb Models//Pecking Order 2018.xlsm') assay = pd.read_excel(data, 'assay', index_col = 'Database_Name').to_dict('index') ws = pd.read_excel(data, 'ws') expiry_table = pd.read_excel(data, 'expiry', index_col = 'Month') ports = pd.read_excel(data, 'ports') sub_to_ws = pd.read_excel(data, 'sub_to_ws', header = None) sub_to_ws = sub_to_ws.set_index([0]).to_dict() engine.execute("SELECT Series, Date, Value FROM Basrah_WS_Base").fetchall() pd.DataFrame(engine.execute("SELECT Series, Date, Value FROM Basrah_WS_Base").fetchall()) basrah_ws_base = basrah_ws_base_flat.pivot(columns='Series', index='Date', values='Value') test = 
pd.DataFrame(refinery_configurations).unstack().reset_index().rename(columns={'index':'configuration'}) pd.DataFrame(refinery_configurations).T.reset_index() t = time.process_time() test.to_sql('Refinery_Configs_2', con =engine, index_label = 'Id', if_exists = 'replace') print("Uploaded successfully: Time was {}".format(time.process_time() - t)) test.to_csv('L:/TRADING/ANALYSIS/Python/test.csv', sep='\t') def retrieve_prices_model(): df1 = database_prices.pivot(columns='Code', index = 'Date', values='Value') import pandas as pd import numpy as np from datetime import datetime as dt import time from pandas.tseries.offsets import BDay
cxn = pyodbc.connect('Driver=SQL Server Native Client 11.0;' 'Server=STCHGS112;' 'Database=MIMAWorkSpace;' 'uid=mima;' 'Trusted_Connection=Yes;') query = '''CREATE TABLE Global_Arbs_GPWs (
def read_bpp_from_input_file( filename: str, expand_orient: bool = False, expand_strand: bool = False, expand_svtype: bool = False, integer_columns: Set[str] = INTEGER_COLUMNS, float_columns: Set[str] = FLOAT_COLUMNS, required_columns: Set[str] = set(), add_default: Dict[str, Any] = {}, summary: bool = False, apply: Dict[str, Callable] = {}, overwrite: Dict[str, Any] = {}, ) -> List[BreakpointPair]: """ reads a file using the tab module. Each row is converted to a breakpoint pair and other column data is stored in the data attribute Args: filename: path to the input file expand_ns: expand not specified orient/strand settings to all specific version (for strand this is only applied if the bam itself is stranded) explicit_strand: used to stop unstranded breakpoint pairs from losing input strand information summary: the input is post-summary so some float/int columns have been merged and delimited with semi-colons overwrite: set column values for all breakpoints, if the column exists overwrite its current value Returns: a list of pairs """ def soft_null_cast(value): try: cast_null(value) except TypeError: return value if summary: integer_columns = integer_columns - SUMMARY_LIST_COLUMNS float_columns = float_columns - SUMMARY_LIST_COLUMNS try: df = pd.read_csv( filename, dtype={ **{col: pd.Int64Dtype() for col in integer_columns}, **{col: float for col in float_columns}, **{ col: str for col in COLUMNS.keys() if col not in (float_columns | integer_columns) }, }, sep='\t', comment='#', na_values=[ 'None', 'none', 'N/A', 'n/a', 'null', 'NULL', 'Null', 'nan', '<NA>', 'NaN' ], ) df = df.where(pd.notnull(df), None) except pd.errors.EmptyDataError: return [] for col in required_columns: if col not in df and col not in add_default: raise KeyError(f'missing required column: {col}') if COLUMNS.opposing_strands in df: df[COLUMNS.opposing_strands] = df[COLUMNS.opposing_strands].apply( lambda x: None if x == '?' 
else soft_cast(x, cast_type=bool)) else: df[COLUMNS.opposing_strands] = None if COLUMNS.stranded in df: df[COLUMNS.stranded] = df[COLUMNS.stranded].apply(cast_boolean) else: df[COLUMNS.stranded] = None if COLUMNS.untemplated_seq in df: df[COLUMNS.untemplated_seq] = df[COLUMNS.untemplated_seq].apply( soft_null_cast) else: df[COLUMNS.untemplated_seq] = None for col in [COLUMNS.break1_chromosome, COLUMNS.break2_chromosome]: df[col] = df[col].apply(lambda v: re.sub(r'^chr', '', v)) if COLUMNS.tracking_id not in df: df[COLUMNS.tracking_id] = '' else: df[COLUMNS.tracking_id] = df[COLUMNS.tracking_id].fillna(str(uuid())) # add default values for col, default_value in add_default.items(): if col in df: df[col] = df[col].fillna(default_value) else: df[col] = default_value # run the custom functions for col, func in apply.items(): df[col] = df[col].apply(func) # set overwriting defaults for col, value in overwrite.items(): df[col] = value # enforce controlled vocabulary for vocab, cols in [ (ORIENT, [COLUMNS.break1_orientation, COLUMNS.break2_orientation]), (STRAND, [COLUMNS.break1_strand, COLUMNS.break2_strand]), (PROTOCOL, [COLUMNS.protocol]), ]: for col in cols: if col in df: df[col].apply(lambda c: vocab.enforce(c)) # type: ignore elif hasattr(vocab, 'NS'): df[col] = vocab.NS # type: ignore def validate_pipeline_id(value): if not re.match(r'^([A-Za-z0-9-]+|)(;[A-Za-z0-9-]+)*$', value): raise AssertionError( 'All mavis pipeline step ids must satisfy the regex:', '^([A-Za-z0-9-]+|)(;[A-Za-z0-9-]+)*$', value, ) for col in [ COLUMNS.cluster_id, COLUMNS.annotation_id, COLUMNS.validation_id ]: if col in df: try: df[col].apply(validate_pipeline_id) except AssertionError as err: raise AssertionError(f'error in column ({col}): {err}') rows = df.where(df.notnull(), None).to_dict('records') non_data_columns = { COLUMNS.break1_chromosome, COLUMNS.break1_position_start, COLUMNS.break1_position_end, COLUMNS.break1_strand, COLUMNS.break1_orientation, COLUMNS.break2_chromosome, COLUMNS.break2_position_start, COLUMNS.break2_position_end, COLUMNS.break2_strand, COLUMNS.break2_orientation, COLUMNS.stranded, COLUMNS.opposing_strands, COLUMNS.untemplated_seq, } pairs: List[BreakpointPair] = [] for line_index, row in enumerate(rows): row['line_no'] = line_index + 1 if '_index' in row: del row['_index'] for attr, val in row.items(): row[attr] = soft_null_cast(val) stranded = row[COLUMNS.stranded] strand1 = row[COLUMNS.break1_strand] if stranded else STRAND.NS strand2 = row[COLUMNS.break2_strand] if stranded else STRAND.NS temp = [] expand_strand = stranded and expand_strand event_type = [None] if not pd.isnull(row.get(COLUMNS.event_type)): try: event_type = row[COLUMNS.event_type].split(';') for putative_event_type in event_type: SVTYPE.enforce(putative_event_type) except KeyError: pass for orient1, orient2, strand1, strand2, putative_event_type in itertools.product( ORIENT.expand(row[COLUMNS.break1_orientation]) if expand_orient else [row[COLUMNS.break1_orientation]], ORIENT.expand(row[COLUMNS.break2_orientation]) if expand_orient else [row[COLUMNS.break2_orientation]], STRAND.expand(strand1) if expand_strand and stranded else [strand1], STRAND.expand(strand2) if expand_strand and stranded else [strand2], event_type, ): try: break1 = Breakpoint( row[COLUMNS.break1_chromosome], row[COLUMNS.break1_position_start], row[COLUMNS.break1_position_end], strand=strand1, orient=orient1, ) break2 = Breakpoint( row[COLUMNS.break2_chromosome], row[COLUMNS.break2_position_start], row[COLUMNS.break2_position_end], strand=strand2, 
orient=orient2, ) data = { k: v for k, v in row.items() if k not in non_data_columns } bpp = BreakpointPair( break1, break2, opposing_strands=row[COLUMNS.opposing_strands], untemplated_seq=row[COLUMNS.untemplated_seq], stranded=row[COLUMNS.stranded], ) bpp.data.update(data) if putative_event_type: bpp.data[COLUMNS.event_type] = putative_event_type if putative_event_type not in BreakpointPair.classify(bpp): raise InvalidRearrangement( 'error: expected one of', BreakpointPair.classify(bpp), 'but found', putative_event_type, str(bpp), row, ) if expand_svtype and not putative_event_type: for svtype in BreakpointPair.classify( bpp, distance=lambda x, y: Interval(y - x)): new_bpp = bpp.copy() new_bpp.data[COLUMNS.event_type] = svtype temp.append(new_bpp) else: temp.append(bpp) except InvalidRearrangement as err: if not any([expand_strand, expand_svtype, expand_orient]): raise err except AssertionError as err: if not expand_strand: raise err if not temp: raise InvalidRearrangement( 'could not produce a valid rearrangement', row) else: pairs.extend(temp) return pairs
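A reduced sketch of just the I/O portion above: read a tab-separated file with explicit dtypes and null markers, then swap NA for None before converting to records. The column names follow the COLUMNS constants used above, but the data and dtype subset are illustrative:

from io import StringIO

import pandas as pd

tsv = "break1_chromosome\tbreak1_position_start\tuntemplated_seq\nchr1\t100\tNone\n"

df = pd.read_csv(
    StringIO(tsv), sep='\t', comment='#',
    dtype={'break1_position_start': pd.Int64Dtype(),
           'break1_chromosome': str, 'untemplated_seq': str},
    na_values=['None', 'none', 'null', 'NULL', 'NaN'],
)
df = df.where(pd.notnull(df), None)   # NA -> None, as in the function above
print(df.to_dict('records'))
# [{'break1_chromosome': 'chr1', 'break1_position_start': 100, 'untemplated_seq': None}]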