def get_plots_for_thresholds(ds,
                             thresholds,
                             leaky_threshold,
                             n_scripts_range,
                             filename_suffix='dye_snippets',
                             y_range=(0, 1),
                             recall_color='black',
                             n_scripts_color='firebrick',
                             **extra_plot_opts):
    """Return a dict mapping each threshold to the plot built from its CSV.

    For every entry of ``thresholds`` the plot-data CSV located under the
    ``DYESCORE_RESULTS_DIR`` configured on ``ds`` is validated, loaded and
    handed to ``get_pr_plot`` together with the styling arguments.

    Args:
        ds: object exposing ``config``, ``file_in_validation`` and ``s3``.
        thresholds: iterable of threshold values used in the file names.
        leaky_threshold: value interpolated into the file names.
        n_scripts_range: forwarded to ``get_pr_plot``.
        filename_suffix: middle part of the plot-data file names.
        y_range: forwarded to ``get_pr_plot``.
        recall_color: forwarded to ``get_pr_plot``.
        n_scripts_color: forwarded to ``get_pr_plot``.
        **extra_plot_opts: forwarded verbatim to ``get_pr_plot``.

    Returns:
        dict: ``{threshold: plot}`` in the order of ``thresholds``.
    """
    results_root = ds.config('DYESCORE_RESULTS_DIR')
    csv_paths = [
        os.path.join(
            results_root,
            f'dye_score_plot_data_from_{filename_suffix}_{threshold}_leak_{leaky_threshold}.csv'
        )
        for threshold in thresholds
    ]

    # Infile validation: every input is checked before any of them is read.
    for csv_path in csv_paths:
        ds.file_in_validation(csv_path)

    plots = {}
    for threshold, csv_path in zip(thresholds, csv_paths):
        if ds.s3:
            with ds.s3.open(csv_path, 'r') as handle:
                plot_data = pd_read_csv(handle)
        else:
            plot_data = pd_read_csv(csv_path)
        plots[threshold] = get_pr_plot(plot_data, f'{threshold}',
                                       n_scripts_range, y_range,
                                       recall_color, n_scripts_color,
                                       **extra_plot_opts)
    return plots
def loading_query(
        test_data_file: str,
        test_score_file: str
):
    """Load the query data and score CSV files and report their sizes.

    Args:
        test_data_file: path of the CSV whose rows are counted as reactions.
        test_score_file: path of the CSV whose rows are counted as pathways.

    Returns:
        tuple: ``(data_frame, score_frame)`` in that order.
    """
    reactions = pd_read_csv(test_data_file)
    pathway_scores = pd_read_csv(test_score_file)
    summary = (
        ("Number of pathways:", pathway_scores),
        ("Total number of reactions:", reactions),
    )
    for label, frame in summary:
        print(label, len(frame))
    return reactions, pathway_scores
def get_threshold_summary_plot(ds):
    """Build the summary figure from ``recall_summary_plot_data.csv``.

    The CSV is looked up under the ``DYESCORE_RESULTS_DIR`` configured on
    ``ds``. Its rows are grouped by ``recall_threshold`` and, for each group,
    two multi-lines are drawn against ``distance_threshold``: the absolute
    ``n_over_threshold`` on the left axis and ``percent`` on an extra right
    axis.

    Args:
        ds: object exposing ``config``, ``file_in_validation`` and ``s3``.

    Returns:
        The configured figure.
    """
    resultsdir = ds.config('DYESCORE_RESULTS_DIR')
    summary_csv = os.path.join(resultsdir, f'recall_summary_plot_data.csv')
    ds.file_in_validation(summary_csv)
    if ds.s3:
        with ds.s3.open(summary_csv, 'r') as handle:
            summary = pd_read_csv(handle)
    else:
        summary = pd_read_csv(summary_csv)

    recall_levels = sorted(summary.recall_threshold.unique())
    per_level = summary.groupby('recall_threshold').agg(lambda x: list(x))
    # One colour more than needed: the yellow is often a little light
    colours = inferno(len(recall_levels) + 1)
    source = ColumnDataSource(per_level)

    counts = summary.n_over_threshold
    plot = figure(
        title=
        f'Scripts captured by distance threshold for {len(recall_levels)} recall thresholds (colored)',
        width=800,
        toolbar_location=None,
        tools='',
        y_range=Range1d(counts.min(), counts.max()),
    )
    plot.xaxis.axis_label = 'distance threshold'
    plot.yaxis.axis_label = 'minimum n_scripts'
    plot.yaxis.formatter = NumeralTickFormatter(format="0a")

    # The secondary range must exist before the axis that refers to it.
    plot.extra_y_ranges = {
        'percent': Range1d(summary.percent.min(), summary.percent.max())
    }
    percent_axis = LinearAxis(
        y_range_name='percent',
        axis_label='minimum n_scripts (percent of total)',
        formatter=NumeralTickFormatter(format='0%'))
    plot.add_layout(percent_axis, 'right')

    for position, recall_level in enumerate(recall_levels):
        # Restrict the shared source to the single grouped row of this level.
        row_view = CDSView(source=source, filters=[IndexFilter([position])])
        line_opts = {
            'source': source,
            'view': row_view,
            'legend': str(recall_level),
            'color': colours[position],
            'line_width': 5,
            'line_alpha': 0.6,
        }
        plot.multi_line(xs='distance_threshold', ys='n_over_threshold',
                        **line_opts)
        plot.multi_line(xs='distance_threshold', ys='percent',
                        y_range_name='percent', **line_opts)

    plot.legend.click_policy = 'hide'
    return plot
def updateScore(csvfile, score):
    """Add or update the ``Score`` column of a CSV file and re-rank its rows.

    The score of each row is the weighted sum of the columns selected in
    ``score``. Rows are then sorted by descending score, the MSA is updated
    with the new ``Seq. ID`` order and the file is rewritten in place.

    Parameters
    ----------
    csvfile : str
        Path of the CSV file; it is read and then overwritten.
    score : iterable
        Entries indexable as ``sc[0]`` (spreadsheet-style column letter
        ``A``..``Z``), ``sc[2]`` (weight) and ``sc[3]`` (truthy when the
        column is enabled). Entries that cannot be applied are skipped.

    Returns
    -------
    DataFrame
        The sorted table with a 1-based index.
    """
    import string

    # NOTE(review): neither value is used below; the call is kept because
    # read_csv is defined elsewhere and may validate the file -- confirm.
    head, rows = read_csv(csvfile)
    data = pd_read_csv(csvfile)
    data.index = data.index + 1
    cols = data.columns.tolist()
    sco = pd_Series(np_zeros(len(data[cols[0]])), index=data.index)
    if 'Score' not in cols:
        # New files get the score as their first column.
        data['Score'] = sco
        cols = ['Score'] + cols
        data = data[cols]
    # A list (not the raw string) so that multi-character keys raise
    # ValueError instead of matching as substrings.
    colk = list(string.ascii_uppercase)
    for sc in score:
        try:
            coln = colk.index(sc[0])
            val = sc[2]
            checked = sc[3]
            if checked:
                sco += val * data.iloc[:, coln]
        except Exception:
            # Best effort: unknown letters, short entries, out-of-range or
            # non-numeric columns are ignored. KeyboardInterrupt and
            # SystemExit are deliberately no longer swallowed.
            continue
    data['Score'] = sco
    data = data.sort_values('Score', ascending=False)
    updateMSA(os_path.dirname(csvfile), [[v] for v in data['Seq. ID']])
    data = data.reset_index(drop=True)
    data.index = data.index + 1
    data.to_csv(csvfile, quoting=csv_QUOTE_ALL, index=False)
    return data
def _predict_score(
        test_data_file: str,
        test_score_file: str,
        models_path: str,
        features_dset_train,
        no_of_rxns_thres: int
) -> list:
    """Run the prediction on the query files and return the mean scores.

    The query is loaded with ``loading_query``, ``encode_and_predict`` writes
    its result into a temporary CSV file, and the ``Prob1_mean`` column of
    that file is returned.

    Args:
        test_data_file: path of the query data CSV.
        test_score_file: path of the query score CSV.
        models_path: forwarded to ``encode_and_predict``.
        features_dset_train: forwarded to ``encode_and_predict``.
        no_of_rxns_thres: forwarded to ``encode_and_predict``.

    Returns:
        list: values of the ``Prob1_mean`` column, in row order.
    """
    data_test, scores_test = loading_query(
        test_data_file,
        test_score_file
    )

    # Only the file name is needed: create the file and close it right away
    # so the predictor can reopen it by name on every platform.
    with NamedTemporaryFile(delete=False) as out_f:
        out_path = out_f.name

    try:
        encode_and_predict(
            data_test,
            scores_test,
            models_path,
            features_dset_train,
            no_of_rxns_thres,
            out_path
        )
        score_df = pd_read_csv(out_path)
    finally:
        # Remove the temporary file even when prediction or parsing fails.
        remove(out_path)

    return list(score_df.to_dict()['Prob1_mean'].values())
def _build_plot_data_for_score_df(self, s3, inpath, outpath, compare_list):
    """Compute recall and script counts over a grid of dye-score cut-offs.

    The scores are read from ``inpath``. 1000 evenly spaced cut-offs between
    0 and the largest ``dye_score`` are evaluated: ``recall`` comes from
    ``self._get_recall`` and ``n_over_threshold`` is the number of rows whose
    ``dye_score`` is strictly above the cut-off. The table is written to
    ``outpath`` without its index.

    Args:
        s3: file-system object with ``open``; falsy to use local paths.
        inpath: location of the score CSV.
        outpath: location of the CSV to write.
        compare_list: forwarded to ``self._get_recall``.

    Returns:
        ``outpath``.
    """
    if s3:
        with s3.open(inpath, 'r') as handle:
            scores = pd_read_csv(handle)
    else:
        scores = pd_read_csv(inpath)

    cutoffs = np.linspace(0, scores.dye_score.max(), 1000)
    curve = pd_DataFrame({'dye_score_threshold': cutoffs})

    def count_above(cutoff):
        return (scores.dye_score > cutoff).sum()

    curve['recall'] = curve.dye_score_threshold.apply(
        self._get_recall, score_df=scores, compare_list=compare_list)
    curve['n_over_threshold'] = curve.dye_score_threshold.apply(count_above)

    if s3:
        with s3.open(outpath, 'w') as handle:
            curve.to_csv(handle, index=False)
    else:
        curve.to_csv(outpath, index=False)
    return outpath
def read_from_csv(fileName):
    """Load a labelled CSV and split it into positive and negative parts.

    The ``complex`` column is the label. The rows before the first row
    labelled 0 form the positive part and the remaining rows the negative
    part, so the file is expected to list its positives first.

    Args:
        fileName: path of the CSV file.

    Returns:
        tuple: ``(y, X, X_pos, y_pos, X_neg, y_neg)`` where ``y`` is the
        label column and ``X`` the remaining columns. When no row is
        labelled 0 the negative parts are empty and the positive parts
        cover the whole file.
    """
    df_full = pd_read_csv(fileName)
    y = df_full.pop('complex')
    X = df_full

    negative_labels = y[y == 0].index
    # read_csv without index_col yields a 0..n-1 index, so the first label
    # with y == 0 is also its row position. Without any negative row the
    # split point is the end of the data instead of an IndexError.
    neg_start_ind = negative_labels[0] if len(negative_labels) else len(y)

    X_pos = X.iloc[0:neg_start_ind]
    y_pos = y[0:neg_start_ind]
    X_neg = X.iloc[neg_start_ind:]
    y_neg = y[neg_start_ind:]
    return y, X, X_pos, y_pos, X_neg, y_neg
def semesters(ha_df, core_courses, conval_dict, population_IDs=[], program='Computer Science'):
    """Return the semester features, using the module and CSV caches.

    Lookup order: the module-level ``se_df`` when it is not empty, then the
    per-program CSV cache, and finally a fresh computation with
    ``semesters_features_calc`` whose result is written to the CSV cache.
    The result is always stored back in ``se_df``.

    Args:
        ha_df: forwarded to ``semesters_features_calc``.
        core_courses: forwarded to ``semesters_features_calc``.
        conval_dict: forwarded to ``semesters_features_calc``.
        population_IDs: forwarded to ``semesters_features_calc`` (never
            modified here).
        program: name whose hash selects the cache file.

    Returns:
        The semester features frame.
    """
    global se_df
    # NOTE(review): hash() of a str is salted per process on Python 3 unless
    # PYTHONHASHSEED is fixed, so this file name may differ between runs.
    _h_program = hash( program )
    cache_path = './data/kuleuven/se_df_%i.csv'%( _h_program )
    try:
        if se_df.empty:
            se_df = pd_read_csv( cache_path )
    except Exception:
        # Cache miss (global not defined yet, file missing or unreadable):
        # recompute and refresh the CSV. KeyboardInterrupt/SystemExit are
        # deliberately not caught so an abort does not start a recompute.
        _se_df = semesters_features_calc( ha_df, core_courses, conval_dict, population_IDs )
        _se_df.to_csv( cache_path )
        se_df = _se_df
    return se_df
def read_csv(filepath, sep=',', header='infer', names=None, usecols=None, dtype=None,
             converters=None, skiprows=None, nrows=None):
    """Load a CSV file into a DataFrame.

    The file is parsed eagerly with pandas, so all of it is read by the time
    this function returns. Only a handful of common parsing options are
    exposed; for anything else, parse with pandas directly and convert the
    result to baloo.

    Parameters
    ----------
    filepath : str
    sep : str, optional
        Separator placed between values.
    header : 'infer' or None, optional
        Whether the column names are inferred from the first row.
    names : list of str, optional
        Column names to use; takes precedence over an inferred header.
    usecols : list of (int or str), optional
        Columns to parse.
    dtype : dict, optional
        Mapping of column -> type to parse as.
    converters : dict, optional
        Mapping of column -> function converting its values.
    skiprows : int, optional
        Number of lines skipped at the start of the file.
    nrows : int, optional
        Number of rows to read.

    Returns
    -------
    DataFrame

    See Also
    --------
    pandas.read_csv : https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

    """
    parse_options = dict(
        sep=sep,
        header=header,
        names=names,
        usecols=usecols,
        dtype=dtype,
        converters=converters,
        skiprows=skiprows,
        nrows=nrows,
    )
    return DataFrame.from_pandas(pd_read_csv(filepath, **parse_options))
def alpha_beta_skewness(ha_df, population_IDs=[], program='Computer Science', overwrite=False):
    """Return the course features, using the module and CSV caches.

    When the module-level ``abs_df`` is empty it is filled from the
    per-program CSV cache, or recomputed with ``courses_features_calc`` if
    the cache cannot be read. When it is not empty it is returned as is
    unless ``overwrite`` is set, in which case it is recomputed and the CSV
    cache is rewritten.

    Args:
        ha_df: forwarded to ``courses_features_calc``.
        population_IDs: forwarded to ``courses_features_calc`` (never
            modified here).
        program: name whose hash selects the cache file.
        overwrite: force a recomputation of a non-empty cache.

    Returns:
        The course features frame.
    """
    global abs_df
    # NOTE(review): hash() of a str is salted per process on Python 3 unless
    # PYTHONHASHSEED is fixed, so this file name may differ between runs.
    _h_program = hash( program )
    cache_path = './data/kuleuven/abs_df_%i.csv'%( _h_program )

    def calc_n_save():
        """Compute the features and refresh the CSV cache."""
        _abs_df = courses_features_calc( ha_df, population_IDs )
        _abs_df.to_csv( cache_path )
        return _abs_df

    if abs_df.empty:
        try:
            abs_df = pd_read_csv( cache_path, index_col=0 )
        except Exception:
            # Missing or unreadable cache: fall back to computing it.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            abs_df = calc_n_save()
    elif overwrite:
        abs_df = calc_n_save()
    return abs_df
def get_GPA_by_student(ha_df=kuleuven_loader.ha_df):
    """Return the per-student GPA frame, using the module and CSV caches.

    Lookup order: the module-level ``gpa_df`` when it is not empty, then
    ``./data/kuleuven/gpa_df.csv``, and finally a fresh computation with
    ``GPA_calc`` whose result is written to that CSV. Every path returns the
    frame and leaves it stored in ``gpa_df`` so later calls reuse it.

    Args:
        ha_df: academic history forwarded to ``GPA_calc`` when the caches
            miss; defaults to the frame exposed by ``kuleuven_loader``.

    Returns:
        The GPA frame.
    """
    global gpa_df
    try:
        if not gpa_df.empty:
            return gpa_df
        gpa_df = pd_read_csv('./data/kuleuven/gpa_df.csv',
                             index_col=0,
                             dtype={'GPA': float32,
                                    'ap_GPA': float32,
                                    'cod_estudiante': int32,
                                    'performance': float32})
    except Exception:
        # Cache miss (global not defined yet, file missing or unreadable):
        # compute, persist, and remember the result.
        _gpa_df = GPA_calc(ha_df)
        _gpa_df.to_csv('./data/kuleuven/gpa_df.csv')
        gpa_df = _gpa_df
    return gpa_df
def read_coordinates_of_class(in_situ_crop_directory, directory_as_class):
    """Read the bounding boxes listed in a class directory.

    ``<in_situ_crop_directory>/<directory_as_class>/coordinates.txt`` is a
    tab-separated file without header. Per row, columns 0 and 1 are read as
    shelf and frame, columns 2 and 3 as the left and upper edge, and columns
    4 and 5 are added to them to obtain the right and lower edge.

    Args:
        in_situ_crop_directory: directory holding one folder per class.
        directory_as_class: folder name; its integer value minus one is the
            class id stored in every tuple.

    Returns:
        list of tuple: ``(cls, shelf, frame, xleft, yupper, xright, ylower)``
        per row, all integers.
    """
    coordinates_path = os.path.join(in_situ_crop_directory,
                                    directory_as_class, 'coordinates.txt')
    with open(coordinates_path) as file:
        table = pd_read_csv(file, sep='\t', header=None)

    class_id = int(directory_as_class) - 1
    return [
        (class_id,
         int(row[0]),
         int(row[1]),
         int(row[2]),
         int(row[3]),
         int(row[2] + row[4]),
         int(row[3] + row[5]))
        for row in table.itertuples(index=False, name=None)
    ]
from numpy import int32, float32

pd.options.mode.chained_assignment = None
# NOTE(review): newer pandas releases renamed and later removed this option;
# confirm it against the pandas version this project pins.
pd.options.mode.use_inf_as_null = True


def side_strip(_str):
    """Return the part of ``_str`` that precedes its first space.

    Values without a space, and values that do not support the lookup (for
    example missing values), are returned unchanged.
    """
    try:
        return _str[ :_str.index(' ') ]
    except Exception:
        return _str


# Students Academic History
start = time()
ha_df = pd_read_csv('./data/espol/ha_df.csv', index_col=0)
ha_df['cod_materia_acad'] = ha_df['cod_materia_acad'].apply( side_strip )

# Compact dtypes, applied in this order.
_HA_DF_DTYPES = (
    ('cod_estudiante', int32),
    ('promedio', float32),
    ('anio', int32),
    ('paralelo', int32),
    ('GPA', float32),
    ('ap_GPA', float32),
    ('performance', float32),
    ('promedio_GPA', float32),
)
try:
    for _column, _dtype in _HA_DF_DTYPES:
        ha_df[_column] = ha_df[_column].values.astype(_dtype)
except Exception:
    # Best effort: the first column that is missing or cannot be cast stops
    # the conversion and the remaining columns keep their parsed dtype.
    # KeyboardInterrupt/SystemExit are deliberately not swallowed.
    pass
end = time()

print('Exe time: %.2f'%(end - start))
print('loaded dataframe from CSV as DataFrame. records: %d'%len(ha_df))
from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, concatenate, Dropout from keras.models import Model from deepgs.parser import load_plink_text from deepgs.transform import transform import deepgs.model as models # if executed from within the test directory, go one level up the tree. if os.path.split(os.getcwd())[1] == "tests": os.chdir("..") file_path = os.path.join("data", "sample_100-10000") if os.path.exists(file_path + ".pkl"): G = pickle.load(open(file_path + ".pkl", "rb")) map_table = pd_read_csv(file_path + ".map", delim_whitespace=True, header=None, names=["chr", "snp", "cm", "bp"]) else: g_df, map_table = load_plink_text(file_path) G = transform(g_df) model = models.create_architecture_small(map_table) model = models.compile_model(model) GG = models.format_as_model_input(G, map_table) model.predict(GG) def create_dummy_architecture(map_table, output_dim=1, output_activation=None): """ A small dummy architecture to be tested on the small 15-by-10 dataset.
def get_recall_summary_plot_data(self,
                                 thresholds,
                                 recall_thresholds,
                                 leaky_threshold,
                                 filename_suffix='dye_snippets',
                                 override=True):
    """Summarise, per distance threshold, the scripts needed for each recall.

    For every distance threshold the matching plot-data CSV is read and, for
    every recall threshold, the ``n_over_threshold`` of the row with the
    smallest ``recall`` strictly above that recall threshold is recorded.
    The counts are also expressed as a fraction of the number of rows in the
    dye-score file of the first distance threshold. The summary is written
    to ``recall_summary_plot_data.csv`` in the results directory.

    Args:
        thresholds: distance thresholds; must not be empty because the first
            one selects the file used for the total.
        recall_thresholds: recall levels to evaluate.
        leaky_threshold: value interpolated into the file names.
        filename_suffix: middle part of the input file names.
        override: forwarded to ``self.file_out_validation``.

    Returns:
        Path of the written summary CSV.
    """
    resultsdir = self.config('DYESCORE_RESULTS_DIR')

    def plot_data_path(threshold):
        """Path of the plot-data CSV of one distance threshold."""
        return os.path.join(
            resultsdir,
            f'dye_score_plot_data_from_{filename_suffix}_{threshold}_leak_{leaky_threshold}.csv'
        )

    def load_csv(path):
        """Read a CSV through s3 when configured, else from the local path."""
        if self.s3:
            with self.s3.open(path, 'r') as f:
                return pd_read_csv(f)
        return pd_read_csv(path)

    # Infile validation
    for threshold in thresholds:
        self.file_in_validation(plot_data_path(threshold))
    # Outfile validation
    outpath = os.path.join(resultsdir, 'recall_summary_plot_data.csv')
    self.file_out_validation(outpath, override)

    # Gather up relevant results
    results = []
    for threshold in thresholds:
        pr_df = load_csv(plot_data_path(threshold))
        for recall_threshold in recall_thresholds:
            # TODO Use idxmin
            # Filter on the recall column only: comparing the whole frame
            # would also blank out counts that are <= recall_threshold.
            above = pr_df[pr_df.recall > recall_threshold]
            if above.empty:
                # No cut-off reaches this recall: record it as missing.
                n_over_threshold = float('nan')
            else:
                n_over_threshold = above.sort_values(
                    by='recall').iloc[0]['n_over_threshold']
            results.append({
                'distance_threshold': threshold,
                'n_over_threshold': n_over_threshold,
                'recall_threshold': recall_threshold,
            })

    # Make DF and save
    inpath = os.path.join(
        resultsdir,
        f'dye_score_from_{filename_suffix}_{thresholds[0]}_leak_{leaky_threshold}.csv'
    )
    total_results = len(load_csv(inpath))
    results_df = pd_DataFrame.from_records(results)
    results_df['percent'] = (results_df.n_over_threshold / total_results)
    if self.s3:
        with self.s3.open(outpath, 'w') as f:
            results_df.to_csv(f, index=False)
    else:
        results_df.to_csv(outpath, index=False)
    return outpath
from sklearn.cross_validation import train_test_split from numpy import average as np_average from numpy import array as np_array from skfuzzy import cmeans, cmeans_predict from data_loader import kuleuven_loader from itertools import combinations in_source = "kuleuven" dispatcher = WSDispatcher(source=in_source) se_df = dispatcher.academic_clusterer.semesters_features sf_df = dispatcher.academic_clusterer.students_features ss_df = pd_merge(se_df, sf_df, on="student") # cd doc/calibration_test/ abs_df = pd_read_csv("../../data/kuleuven/abs_df_1716653621.csv", index_col=0) abs_df = abs_df.fillna(-1000) ha_df = pd_read_csv("../../data/kuleuven/students_courses.csv", index_col=0) ha_df = ha_df.drop_duplicates(["year", "status", "course", "grade", "student"]) sha_df = pd_merge(ha_df, sf_df, on="student") sha_df = pd_merge(sha_df, abs_df, on="course") OP = [] OP_append = OP.append def plot_calibration_curve_from_data(X, y, est, name, fig_index): """Plot calibration curve for est w/o and with calibration. """ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=7) # Calibrated with isotonic calibration
from detection_and_tracking.datasets.seagull.seagull_txt_handler import Seagull_TXT_Handler
from detection_and_tracking.configuration.seagull import dataset_dir, SeagullPaths
from pandas import read_csv as pd_read_csv

# NOTE(review): seagull_paths is not used below, while the *_txt_dir names and
# dataset_annotations_info are not defined in the visible code -- presumably
# they are meant to come from this object; confirm.
seagull_paths = SeagullPaths()

# TODO - Fix maritime
# Script entry point: exports the annotation TXT files as tracking and
# detection CSV files, one call per TXT directory.
if __name__ == '__main__':
    txt_handler = Seagull_TXT_Handler(dataset_dir)
    # The annotations info file is ';'-separated.
    dataset_info_df = pd_read_csv(dataset_annotations_info, delimiter=';')
    # Complete annotations, visible: tracking and detection exports.
    txt_handler.txts_to_tracking_csv(complete_visible_txt_dir, csv_name='all-complete-visible-tracking.csv', dataset_info_df=dataset_info_df)
    txt_handler.txts_to_detection_csv(complete_visible_txt_dir, csv_name='all-complete-visible-detection_and_tracking.csv')
    # Complete annotations, infrared: tracking and detection exports.
    txt_handler.txts_to_tracking_csv(complete_infrared_txt_dir, csv_name='all-complete-infrared-tracking.csv', dataset_info_df=dataset_info_df)
    txt_handler.txts_to_detection_csv(complete_infrared_txt_dir, csv_name='all-complete-infrared-detection_and_tracking.csv')
    # Incomplete annotations, visible: detection export only.
    txt_handler.txts_to_detection_csv(incomplete_visible_txt_dir, csv_name='all-incomplete-visible-detection_and_tracking.csv')
Students Academic History <class 'pandas.core.frame.DataFrame'> Int64Index: 120080 entries, 0 to 120079 Data columns (total 7 columns): student 120080 non-null int32 grade 120063 non-null float32 course 120080 non-null object name 120080 non-null object status 120080 non-null object performance 120080 non-null float32 year 120080 non-null int32 dtypes: float32(2), int32(2), object(3) """ start = time() ha_df = pd_read_csv("./data/kuleuven/students_courses.csv") # ha_df['cod_materia_acad'] = ha_df['cod_materia_acad'].apply( side_strip ) try: ha_df["student"] = ha_df["student"].values.astype(int32) ha_df["grade"] = ha_df["grade"].values.astype(float32) ha_df["performance"] = ha_df["performance"].values.astype(float32) ha_df["year"] = ha_df["year"].values.astype(int32) except: pass ha_df = ha_df.drop_duplicates(["year", "status", "course", "grade", "student"]) end = time() print("Exe time: %.2f" % (end - start)) print("loaded dataframe from CSV as DataFrame. records: %d" % len(ha_df)) print("\n") #'''