def summary_stats(data_list):
    """
    Accepts a sample of numbers and returns a pretty
    print out of a variety of descriptive statistics.
    """
    mean = calculate.mean(data_list)
    median = calculate.median(data_list)
    mode = calculate.mode(data_list)
    n = len(data_list)
    max_ = max(data_list)
    min_ = min(data_list)
    range_ = calculate.range(data_list)
    standard_deviation = calculate.standard_deviation(data_list)
    variation_coefficient = calculate.variation_coefficient(data_list)
    table = ptable.indent(
        [
            ['Statistic', 'Value'],
            ['n', str(n)],
            ['mean', str(mean)],
            ['median', str(median)],
            ['mode', str(mode)],
            ['maximum', str(max_)],
            ['minimum', str(min_)],
            ['range', str(range_)],
            ['standard deviation', str(standard_deviation)],
            ['variation coefficient', str(variation_coefficient)],
        ],
        hasHeader=True,
        separateRows=False,
        prefix="| ",
        postfix=" |",
    )
    print(table)
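
# A quick way to exercise the helper above (hypothetical usage; assumes the
# `calculate` package exposes summary_stats and bundles the `ptable` module
# used for the table layout):
import calculate

calculate.summary_stats([1, 2, 2, 3, 4, 7])
# Prints a two-column table of n, mean, median, mode, maximum, minimum,
# range, standard deviation and variation coefficient.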
def summary_stats(data_list):
    """
    Accepts a sample of numbers and returns a pretty
    print out of a variety of descriptive statistics.
    """
    mean = calculate.mean(data_list)
    median = calculate.median(data_list)
    mode = calculate.mode(data_list)
    n = len(data_list)
    max_ = max(data_list)
    min_ = min(data_list)
    range_ = calculate.range(data_list)
    standard_deviation = calculate.standard_deviation(data_list)
    # print() as a function so this runs under Python 3 like the
    # rest of these examples
    print("""
Summary statistics
==================

n:      %s
max:    %s
min:    %s
range:  %s
mean:   %s
median: %s
mode:   %s
std:    %s
""" % (n, max_, min_, range_, mean, median, mode, standard_deviation))
def standard_deviation(data_list):
    """
    Returns the standard deviation of a list of numbers.

    h3. Documentation

        http://en.wikipedia.org/wiki/Standard_deviation
    """
    # list() the mapped values so they survive being iterated
    # more than once under Python 3
    data_list = list(map(float, data_list))
    mean = calculate.mean(data_list)
    deviations = [i - mean for i in data_list]
    deviations_squared = [math.pow(i, 2) for i in deviations]
    mean_deviation = calculate.mean(deviations_squared)
    standard_deviation = math.sqrt(mean_deviation)
    return standard_deviation
def variation_coefficient(data_list):
    """
    Accepts a list of values and returns the variation coefficient,
    which is a normalized measure of the distribution.

    This is the sort of thing you can use to compare the standard
    deviation of sets that are measured in different units.

    Note that it uses our "population" standard deviation as part
    of the calculation, not a "sample" standard deviation.

    h3. Example usage

        >>> import calculate
        >>> calculate.variation_coefficient([1, 2, -2, 4, -3])
        6.442049363362563

    h3. Documentation

        * "coefficient of variation":http://en.wikipedia.org/wiki/\
Coefficient_of_variation
    """
    # Convert all the values to floats and test to make sure
    # there aren't any strings in there
    try:
        data_list = list(map(float, data_list))
    except ValueError:
        raise ValueError('Input values must contain numbers')
    std = calculate.standard_deviation(data_list)
    mean = calculate.mean(data_list)
    return std / mean
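
# A sketch of why the variation coefficient is useful: dividing the standard
# deviation by the mean makes it unitless, so samples measured in different
# units can be compared directly (illustrative data, not from the source).
import calculate

inches = [60, 65, 70, 75]
centimeters = [i * 2.54 for i in inches]
calculate.variation_coefficient(inches)       # ~0.0828
calculate.variation_coefficient(centimeters)  # same value despite the unit change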
def has_sound_spike(self):
    """
    Find the standard deviation of the past 10 minutes.

    Send out a tweet if there are any signals greater than two
    standard deviations in the past 10 seconds.
    """
    ten_minutes = timezone.localtime(
        timezone.now()) - datetime.timedelta(minutes=10)
    ten_seconds = timezone.localtime(
        timezone.now()) - datetime.timedelta(seconds=10)
    signals_past_ten_min = self.signal_set.filter(
        timestamp__lt=timezone.localtime(timezone.now()),
        timestamp__gte=ten_minutes)
    # count() must be called here; the bare method reference is always truthy
    if signals_past_ten_min.count() > 0:
        voltages = list(
            signals_past_ten_min.values_list(
                'voltage', flat=True).order_by('voltage'))
        avg = calculate.mean(voltages)
        std_dev = calculate.standard_deviation(voltages)
        twice_std_dev = (std_dev * 2) + avg
        signals_past_10_secs = signals_past_ten_min.filter(
            timestamp__gte=ten_seconds,
            voltage__gte=twice_std_dev)
        # Return the voltage of the highest signal if there has been a spike,
        # or return False
        if signals_past_10_secs.count() > 0:
            signals_past_10_secs = list(
                signals_past_10_secs.values_list(
                    'voltage', flat=True).order_by('-voltage'))
            return signals_past_10_secs[0]
        else:
            return False
    else:
        return False
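
# A minimal sketch of the model context the method above assumes (all names
# hypothetical): a parent model whose related `signal_set` rows carry
# `timestamp` and `voltage` fields.
from django.db import models

class Signal(models.Model):
    sensor = models.ForeignKey(
        "Sensor", related_name="signal_set", on_delete=models.CASCADE)
    timestamp = models.DateTimeField()
    voltage = models.FloatField()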
def standard_deviation(data_list):
    """
    Accepts a list of values and returns the standard deviation.

    Standard deviation measures how widely dispersed the values are
    from the mean. A lower value means the data tend to be bunched
    close to the average. A higher value means they tend to be
    further away.

    This is a "population" calculation that assumes that you are
    submitting all of the values, not a sample.

    h3. Example usage

        >>> import calculate
        >>> calculate.standard_deviation([2,3,3,4])
        0.70710678118654757
        >>> calculate.standard_deviation([-2,3,3,40])
        16.867127793432999

    h3. Documentation

        * "standard deviation":http://en.wikipedia.org/wiki/Standard_deviation
    """
    # Convert all the values to floats and test to make sure
    # there aren't any strings in there
    try:
        data_list = list(map(float, data_list))
    except ValueError:
        raise ValueError('Input values must contain numbers')
    # Find the mean
    mean = calculate.mean(data_list)
    # Create a new list containing the distance from mean
    # for each value in the sample
    deviations = [i - mean for i in data_list]
    # Square the distances
    deviations_squared = [math.pow(i, 2) for i in deviations]
    # Take the average of those squares
    mean_deviation = calculate.mean(deviations_squared)
    # And then take the square root of the mean to find
    # the standard deviation
    return math.sqrt(mean_deviation)
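
# Sanity check against the standard library: statistics.pstdev computes the
# same population standard deviation, so the two should agree.
import statistics

assert abs(statistics.pstdev([2, 3, 3, 4]) - 0.70710678118654757) < 1e-12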
def get_avg_unemployment(data, start_year=2013, end_year=2015):
    avgs = {}
    while start_year <= end_year:
        avg = calculate.mean([
            Decimal(rate.get(str(start_year)))
            for rate in data if rate.get(str(start_year))
        ])
        avgs[str(start_year)] = avg
        start_year += 1
    return avgs
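
# Hypothetical input for the function above: one dict per area, keyed by year
# as a string, with some years missing (the filter skips those records).
data = [
    {"2013": "7.1", "2014": "6.2", "2015": "5.3"},
    {"2013": "8.0", "2015": "5.9"},  # no 2014 rate, so it is filtered out
]
get_avg_unemployment(data)
# Averages per year: 2013 -> 7.55, 2014 -> 6.2, 2015 -> 5.6
# (the exact numeric type depends on how calculate.mean handles Decimal input)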
def main():
    print()
    print("------------------------------------------------------------------------------------------------------------------------")
    print(" # Feature Tester")
    print(" # Purpose : This program is used to test a generated feature set.")
    print(" # You have to manually configure feature_tester.py as follows")
    print(" # [1] list_feature : Features to be tested")
    print(" # [2] row_to_read_file_input : Number of rows in the file mapping between samples and their gene expression to be read")
    print(" # [3] file_training_input : A file containing the mapping between samples and their gene expression")
    print(" # [4] file_training_output : A file containing the mapping between samples and their health status")
    print(" # [5] rows_to_read_file_pathway : Number of rows in the file mapping between pathways and their member genes to be read")
    print(" # [6] file_ref_name : A file mapping between gene probe id and gene entrez id")
    print(" # [7] file_to_convert_name : A file containing the mapping between samples and their gene expression")
    print(" # [8] file_pathway_name : A file mapping between pathways and their member genes")
    print(" # These files must follow the required format shown in file_format.pdf")
    print(" #")
    print(" # You will be asked to provide related files and required information about them including")
    print(" # [1] Number of folds")
    print(" #")
    print(" # You will be asked for the name of an output file.")
    print("------------------------------------------------------------------------------------------------------------------------")
    print()

    # list of features to be tested
    # example : epoch 7 in mean_no_normalize_10_10_10
    list_feature = [
        'BIOCARTA_INTRINSIC_PATHWAY',
        'REACTOME_REGULATION_OF_MRNA_STABILITY_BY_PROTEINS_THAT_BIND_AU_RICH_ELEMENTS',
        'PID_SMAD2_3NUCLEAR_PATHWAY',
        'REACTOME_G1_S_SPECIFIC_TRANSCRIPTION',
        'REACTOME_RESOLUTION_OF_AP_SITES_VIA_THE_SINGLE_NUCLEOTIDE_REPLACEMENT_PATHWAY',
        'KEGG_BASAL_TRANSCRIPTION_FACTORS',
        'REACTOME_EXTENSION_OF_TELOMERES',
        'PID_A6B1_A6B4_INTEGRIN_PATHWAY',
        'REACTOME_LIPID_DIGESTION_MOBILIZATION_AND_TRANSPORT',
        'REACTOME_BASE_FREE_SUGAR_PHOSPHATE_REMOVAL_VIA_THE_SINGLE_NUCLEOTIDE_REPLACEMENT_PATHWAY',
        'REACTOME_ABC_FAMILY_PROTEINS_MEDIATED_TRANSPORT',
        'PID_MET_PATHWAY',
        'KEGG_SPLICEOSOME',
        'BIOCARTA_TOLL_PATHWAY',
        'PID_AVB3_OPN_PATHWAY',
        'REACTOME_CELL_CYCLE_MITOTIC',
        'REACTOME_FORMATION_OF_THE_HIV1_EARLY_ELONGATION_COMPLEX',
        'REACTOME_DNA_STRAND_ELONGATION',
        'REACTOME_CYCLIN_E_ASSOCIATED_EVENTS_DURING_G1_S_TRANSITION_',
        'BIOCARTA_SPPA_PATHWAY',
        'REACTOME_APC_CDC20_MEDIATED_DEGRADATION_OF_NEK2A',
        'REACTOME_INHIBITION_OF_THE_PROTEOLYTIC_ACTIVITY_OF_APC_C_REQUIRED_FOR_THE_ONSET_OF_ANAPHASE_BY_MITOTIC_SPINDLE_CHECKPOINT_COMPONENTS',
        'PID_HIF1A_PATHWAY',
        'BIOCARTA_PTEN_PATHWAY',
        'REACTOME_GRB2_SOS_PROVIDES_LINKAGE_TO_MAPK_SIGNALING_FOR_INTERGRINS_',
        'PID_RETINOIC_ACID_PATHWAY',
    ]

    # prepare data
    # default row_to_read = 22283
    row_to_read_file_input = 22283
    file_training_input = pd.read_csv(
        "GSE2034-22071 (edited).csv", nrows=row_to_read_file_input)
    file_training_output = pd.read_csv(
        "mapping_sample_to_class_full.csv",
        usecols=['GEO asscession number', 'relapse (1=True)'])

    # files to be used to get pathways and their gene expression
    # default rows_to_read_file_pathway = 1329
    rows_to_read_file_pathway = 1329
    file_ref_name = "accession_number_to_entrez_id.csv"
    file_to_convert_name = "GSE2034-22071 (edited).csv"
    file_pathway_name = "c2.cp.v6.2.entrez.gmt.csv"
    file_pathway = pd.read_csv(file_pathway_name, nrows=rows_to_read_file_pathway)

    # get gene order id with its name
    list_gene_name = []
    for i in range(0, row_to_read_file_input):
        # add element with this format (gene_order_id, gene_name)
        gene_name = []
        gene_name.append(i)
        gene_name.append(file_training_input.loc[i, "ID_REF"])
        list_gene_name.append(gene_name)

    # get list of pathway names
    list_pathway_name = []
    for i in range(0, rows_to_read_file_pathway):
        pathway_name = []
        pathway_name.append(i)
        pathway_name.append(file_pathway.loc[i, "PATHWAY_NAME"])
        list_pathway_name.append(pathway_name)

    # consider non-relapse and relapse (not in a specific period of time)
    sample_relapse = file_training_output.loc[
        file_training_output['relapse (1=True)'].isin(['1'])]
    sample_no_relapse = file_training_output.loc[
        file_training_output['relapse (1=True)'].isin(['0'])]

    # add GEO asscession number to each list
    list_sample_relapse = []
    for element in sample_relapse.loc[:, 'GEO asscession number']:
        list_sample_relapse.append(element)
    list_sample_no_relapse = []
    for element in sample_no_relapse.loc[:, 'GEO asscession number']:
        list_sample_no_relapse.append(element)

    # shuffle data so each chunk does not depend on sample order
    random.shuffle(list_sample_relapse)
    print("list_sample_relapse SIZE = " + str(len(list_sample_relapse)))
    random.shuffle(list_sample_no_relapse)
    print("list_sample_no_relapse SIZE = " + str(len(list_sample_no_relapse)))

    # get number of folds
    while True:
        num_of_folds = input("Number of folds: ")
        if (num_of_folds.isnumeric() == False):
            print("WARNING : Input must be numeric")
        elif (int(num_of_folds) > len(list_sample_relapse)):
            print("WARNING : Number of folds exceeds the size of the 1st dataset")
        elif (int(num_of_folds) > len(list_sample_no_relapse)):
            print("WARNING : Number of folds exceeds the size of the 2nd dataset")
        elif (int(num_of_folds) <= 1):
            print("WARNING : Number of folds cannot be lower than or equal to 1")
        else:
            break
    num_of_folds = int(num_of_folds)

    # get the output file's name
    file_name = input("Name of output file : ")

    # prepare a text file for results to be written in
    result_file = open(str(file_name) + ".txt", "w+")

    print("Process : Creating collection to collect samples and their genes' expression")
    # create dictionaries used to collect pathways of each sample
    samples_relapse = {}
    samples_no_relapse = {}

    # get all pathways of all samples in class 'relapse'
    for element_index in range(0, len(list_sample_relapse)):
        print()
        print("Creating pathways for sample " + str(element_index + 1) + " relapse is in progress ...")
        print(str(len(list_sample_relapse) - (element_index + 1)) + " samples left")
        print()
        sample = []
        sample_name = list_sample_relapse[element_index]
        pathways = calculate.getPathway(
            file_ref_name, file_to_convert_name, file_pathway_name,
            sample_name, rows_to_read_file_pathway, normalize=False)
        sample.append(sample_name)
        sample.append(pathways)
        samples_relapse[element_index] = sample

    for element_index in range(0, len(list_sample_no_relapse)):
        print()
        print("Creating pathways for sample " + str(element_index + 1) + " non-relapse is in progress ...")
        print(str(len(list_sample_no_relapse) - (element_index + 1)) + " samples left")
        print()
        sample = []
        sample_name = list_sample_no_relapse[element_index]
        pathways = calculate.getPathway(
            file_ref_name, file_to_convert_name, file_pathway_name,
            sample_name, rows_to_read_file_pathway, normalize=False)
        sample.append(sample_name)
        sample.append(pathways)
        samples_no_relapse[element_index] = sample

    print("Process : Creating collections of samples with their pathways' activity ...")
    # create collections of samples with their pathways
    # data will be collected in this format
    # { GSM1234, {0: ['KEGG_GLYCOLYSIS_GLUCONEOGENESIS', [[55902, 0.0], [2645, 0.0], ...}}
    samples_relapse_pathway_activity = {}
    samples_no_relapse_pathway_activity = {}
    for samples_index in range(0, len(samples_relapse)):
        sample = []
        list_pathway = []
        for pathway_index in range(0, len(samples_relapse[samples_index][1])):
            list_gene_expression_in_pathway = []
            pathway = []
            for gene_index in range(0, len(samples_relapse[samples_index][1][pathway_index][1])):
                gene_expression = samples_relapse[samples_index][1][pathway_index][1][gene_index][1]
                list_gene_expression_in_pathway.append(gene_expression)
            # data to collect as pathway activity
            pathway_name = samples_relapse[samples_index][1][pathway_index][0]
            pathway_activity = calculate.mean(list_gene_expression_in_pathway)
            pathway.append(pathway_name)
            pathway.append(pathway_activity)
            list_pathway.append(pathway)
        sample_name = samples_relapse[samples_index][0]
        sample.append(sample_name)
        sample.append(list_pathway)
        samples_relapse_pathway_activity[samples_index] = sample

    for samples_index in range(0, len(samples_no_relapse)):
        sample = []
        list_pathway = []
        for pathway_index in range(0, len(samples_no_relapse[samples_index][1])):
            list_gene_expression_in_pathway = []
            pathway = []
            for gene_index in range(0, len(samples_no_relapse[samples_index][1][pathway_index][1])):
                gene_expression = samples_no_relapse[samples_index][1][pathway_index][1][gene_index][1]
                list_gene_expression_in_pathway.append(gene_expression)
            # data to collect as pathway activity
            pathway_name = samples_no_relapse[samples_index][1][pathway_index][0]
            pathway_activity = calculate.mean(list_gene_expression_in_pathway)
            pathway.append(pathway_name)
            pathway.append(pathway_activity)
            list_pathway.append(pathway)
        sample_name = samples_no_relapse[samples_index][0]
        sample.append(sample_name)
        sample.append(list_pathway)
        samples_no_relapse_pathway_activity[samples_index] = sample

    # create lists of indexes used to indicate the position in the list
    list_index_samples_relapse = []
    list_index_samples_no_relapse = []
    for index in range(0, len(list_sample_relapse)):
        list_index_samples_relapse.append(index)
    for index in range(0, len(list_sample_no_relapse)):
        list_index_samples_no_relapse.append(index)

    # shuffle so the split changes between epochs
    random.shuffle(list_index_samples_relapse)
    random.shuffle(list_index_samples_no_relapse)

    # split data into k parts
    chunk_relapse_size = math.ceil(len(list_index_samples_relapse) / num_of_folds)
    chunk_no_relapse_size = math.ceil(len(list_index_samples_no_relapse) / num_of_folds)
    chunk_list_relapse = list(calculate.chunks(list_index_samples_relapse, chunk_relapse_size))
    print("number of chunks in chunk_list_relapse = " + str(len(chunk_list_relapse)))
    chunk_list_no_relapse = list(calculate.chunks(list_index_samples_no_relapse, chunk_no_relapse_size))
    print("number of chunks in chunk_list_no_relapse = " + str(len(chunk_list_no_relapse)))
    check_valid, num_of_chunks = calculate.checkEqualListSize(chunk_list_relapse, chunk_list_no_relapse)

    if (check_valid == True):
        # randomly pick the chunk to be tested
        chunk_test_index = random.randint(0, num_of_chunks - 1)

        # separate data into testing and training datasets
        # get testing set
        chunk_test_relapse = chunk_list_relapse[chunk_test_index]
        chunk_test_no_relapse = chunk_list_no_relapse[chunk_test_index]

        # get training set of this fold
        chunk_train_relapse = []
        for chunk_train_relapse_index in range(0, num_of_chunks):
            if (chunk_list_relapse[chunk_train_relapse_index] is not chunk_test_relapse):
                chunk_train_relapse.append(chunk_list_relapse[chunk_train_relapse_index])
        print("chunk train relapse size = " + str(len(chunk_train_relapse)))

        chunk_train_no_relapse = []
        for chunk_train_no_relapse_index in range(0, num_of_chunks):
            if (chunk_list_no_relapse[chunk_train_no_relapse_index] is not chunk_test_no_relapse):
                chunk_train_no_relapse.append(chunk_list_no_relapse[chunk_train_no_relapse_index])
        print("chunk train no relapse size = " + str(len(chunk_train_no_relapse)))

        # merge training data of each class
        list_train_relapse = []
        for i in range(0, len(chunk_train_relapse)):
            list_train_relapse.extend(chunk_train_relapse[i])
        print("size of list_train_relapse : " + str(len(list_train_relapse)))

        list_train_no_relapse = []
        for i in range(0, len(chunk_train_no_relapse)):
            list_train_no_relapse.extend(chunk_train_no_relapse[i])
        print("size of list_train_no_relapse : " + str(len(list_train_no_relapse)))

        # build the classifier
        # get pathways' activity of members in the feature set
        # for class 'relapse'
        list_sample_relapse_pathway_activity_classifier = []
        list_pathway_name_classifier_relapse = []
        for sample_index in range(0, len(list_train_relapse)):
            list_pathway_activity = []
            sample_index_in_list = list_train_relapse[sample_index]
            for feature in list_feature:
                for pathway_index in range(0, len(samples_relapse_pathway_activity[sample_index_in_list][1])):
                    pathway_name = samples_relapse_pathway_activity[sample_index_in_list][1][pathway_index][0]
                    if (pathway_name == feature):
                        pathway_activity = samples_relapse_pathway_activity[sample_index_in_list][1][pathway_index][1]
                        list_pathway_activity.append(pathway_activity)
                        if (pathway_name not in list_pathway_name_classifier_relapse):
                            list_pathway_name_classifier_relapse.append(pathway_name)
            list_sample_relapse_pathway_activity_classifier.append(list_pathway_activity)

        result_file.write("feature set (" + str(len(list_feature)) + ") : \n")
        result_file.write(str(list_feature))
        result_file.write("\n")
        result_file.write("pathway name in class 'relapse' (" + str(len(list_pathway_name_classifier_relapse)) + ") : ")
        result_file.write(str(list_pathway_name_classifier_relapse))
        result_file.write("\n")

        # for class 'non-relapse'
        list_sample_no_relapse_pathway_activity_classifier = []
        list_pathway_name_classifier_no_relapse = []
        for sample_index in range(0, len(list_train_no_relapse)):
            list_pathway_activity = []
            sample_index_in_list = list_train_no_relapse[sample_index]
            for feature in list_feature:
                for pathway_index in range(0, len(samples_no_relapse_pathway_activity[sample_index_in_list][1])):
                    pathway_name = samples_no_relapse_pathway_activity[sample_index_in_list][1][pathway_index][0]
                    if (pathway_name == feature):
                        pathway_activity = samples_no_relapse_pathway_activity[sample_index_in_list][1][pathway_index][1]
                        list_pathway_activity.append(pathway_activity)
                        if (pathway_name not in list_pathway_name_classifier_no_relapse):
                            list_pathway_name_classifier_no_relapse.append(pathway_name)
            list_sample_no_relapse_pathway_activity_classifier.append(list_pathway_activity)

        result_file.write("pathway name in class 'non-relapse' (" + str(len(list_pathway_name_classifier_no_relapse)) + ") : ")
        result_file.write(str(list_pathway_name_classifier_no_relapse))
        result_file.write("\n")

        # prepare the testing set
        # each sample contains only pathways in the feature set
        # for class 'relapse'
        list_sample_relapse_pathway_activity_testing_set = []
        for sample_index in range(0, len(chunk_test_relapse)):
            list_pathway_activity = []
            sample_index_in_list = chunk_test_relapse[sample_index]
            for feature in list_feature:
                for pathway_index in range(0, len(samples_relapse_pathway_activity[sample_index_in_list][1])):
                    pathway_name = samples_relapse_pathway_activity[sample_index_in_list][1][pathway_index][0]
                    if (pathway_name == feature):
                        pathway_activity = samples_relapse_pathway_activity[sample_index_in_list][1][pathway_index][1]
                        list_pathway_activity.append(pathway_activity)
            list_sample_relapse_pathway_activity_testing_set.append(list_pathway_activity)

        # for class 'non-relapse'
        list_sample_no_relapse_pathway_activity_testing_set = []
        for sample_index in range(0, len(chunk_test_no_relapse)):
            list_pathway_activity = []
            sample_index_in_list = chunk_test_no_relapse[sample_index]
            for feature in list_feature:
                for pathway_index in range(0, len(samples_no_relapse_pathway_activity[sample_index_in_list][1])):
                    pathway_name = samples_no_relapse_pathway_activity[sample_index_in_list][1][pathway_index][0]
                    if (pathway_name == feature):
                        pathway_activity = samples_no_relapse_pathway_activity[sample_index_in_list][1][pathway_index][1]
                        list_pathway_activity.append(pathway_activity)
            list_sample_no_relapse_pathway_activity_testing_set.append(list_pathway_activity)

        # merge testing data to be used in lda for feature selection
        list_sample_all_pathway_activity_testing_set = []
        list_sample_all_pathway_activity_testing_set.extend(list_sample_relapse_pathway_activity_testing_set)
        list_sample_all_pathway_activity_testing_set.extend(list_sample_no_relapse_pathway_activity_testing_set)

        # get sample names of samples in the testing set
        list_sample_relapse_name_testing_set = []
        for index in range(0, len(chunk_test_relapse)):
            sample_index_in_list = chunk_test_relapse[index]
            list_sample_relapse_name_testing_set.append(samples_relapse[sample_index_in_list][0])

        list_sample_no_relapse_name_testing_set = []
        for index in range(0, len(chunk_test_no_relapse)):
            sample_index_in_list = chunk_test_no_relapse[index]
            list_sample_no_relapse_name_testing_set.append(samples_no_relapse[sample_index_in_list][0])

        # merge samples' names of both classes
        list_sample_name_testing_set = []
        list_sample_name_testing_set.extend(list_sample_relapse_name_testing_set)
        list_sample_name_testing_set.extend(list_sample_no_relapse_name_testing_set)

        # create list of desired outputs
        file_desired_outputs_testing = file_training_output.loc[
            file_training_output['GEO asscession number'].isin(list_sample_name_testing_set)]
        file_desired_outputs_testing['sample_id'] = file_desired_outputs_testing['GEO asscession number'].apply(
            lambda name: list_sample_name_testing_set.index(name))
        file_desired_outputs_testing = file_desired_outputs_testing.sort_values(by=['sample_id'])
        file_desired_outputs_testing.drop(columns='sample_id', inplace=True)

        list_desired_outputs_testing = []
        for element in file_desired_outputs_testing.loc[:, 'relapse (1=True)']:
            list_desired_outputs_testing.append(element)

        # linear discriminant analysis
        list_actual_outputs_testing = calculate.lda(
            list_sample_all_pathway_activity_testing_set,
            list_sample_relapse_pathway_activity_classifier,
            list_sample_no_relapse_pathway_activity_classifier)

        # calculate AUC score
        auc_score = roc_auc_score(list_desired_outputs_testing, list_actual_outputs_testing)

        result_file.write("list_sample_name_testing_set (" + str(len(list_sample_name_testing_set)) + ") : " + str(list_sample_name_testing_set) + "\n")
        result_file.write("list_desired_outputs_testing (" + str(len(list_desired_outputs_testing)) + ") : \n")
        result_file.write(str(list_desired_outputs_testing))
        result_file.write("\n")
        result_file.write("list_actual_outputs_testing (" + str(len(list_actual_outputs_testing)) + ") : \n")
        result_file.write(str(list_actual_outputs_testing))
        result_file.write("\n")
        result_file.write("AUC score : " + str(auc_score) + "\n")

    result_file.close()
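
# The script above leans on a calculate.chunks helper that is not shown here.
# A minimal sketch of the behavior the call sites assume (successive
# fixed-size slices, with a shorter final chunk when the division is uneven):
def chunks(data, size):
    """Yield successive `size`-item slices of `data`."""
    for start in range(0, len(data), size):
        yield data[start:start + size]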
def standard_deviation_ellipses(
        geoqueryset,
        point_attribute_name='point',
        num_of_std=1,
        fix_points=True):
    """
    Accepts a GeoQuerySet and generates one or more standard deviation
    ellipses demonstrating the geospatial distribution of where its
    points occur.

    Returns a one-to-many list of the ellipses as Polygon objects.

    The standard deviation ellipse illustrates the average variation in
    the distance of points from the mean center, as well as their
    direction.

    By default, the function expects the Point field on your model to be
    called 'point'. If the point field is called something else, change
    the kwarg 'point_attribute_name' to whatever your field might be
    called.

    Also by default, the function will nudge slightly apart any identical
    points and only return the first standard deviation ellipse. If you'd
    like to change that behavior, change the corresponding kwargs.

    h3. Example usage

        >>> import calculate
        >>> calculate.standard_deviation_ellipses(qs)
        [<Polygon object at 0x77a1c34>]

    h3. Dependencies

        * "django":http://www.djangoproject.com/
        * "geodjango":http://www.geodjango.org/
        * "psql ellipse() function":http://postgis.refractions.net/support/\
wiki/index.php?plpgsqlfunctions

    h3. Documentation

        * "standard deviation ellipse":http://www.spatialanalysisonline.com/\
output/html/Directionalanalysisofpointdatasets.html
        * "This code is translated from SQL by Francis Dupont":http://\
postgis.refractions.net/pipermail/postgis-users/2008-June/020354.html
    """
    if not isinstance(geoqueryset, GeoQuerySet):
        error = 'First parameter must be a GeoQuerySet. You submitted a %s'
        raise TypeError(error % type(geoqueryset))

    n = len(geoqueryset)
    if n < 3:
        return [None]

    if fix_points:
        geoqueryset = calculate.nudge_points(
            geoqueryset,
            point_attribute_name=point_attribute_name
        )

    avg_x = calculate.mean([
        abs(getattr(p, point_attribute_name).x) for p in geoqueryset
    ])
    avg_y = calculate.mean([
        abs(getattr(p, point_attribute_name).y) for p in geoqueryset
    ])
    center_x = calculate.mean([
        getattr(p, point_attribute_name).x for p in geoqueryset
    ])
    center_y = calculate.mean([
        getattr(p, point_attribute_name).y for p in geoqueryset
    ])

    sum_square_diff_avg_x = sum([
        math.pow((abs(getattr(p, point_attribute_name).x) - avg_x), 2)
        for p in geoqueryset
    ])
    sum_square_diff_avg_y = sum([
        math.pow((abs(getattr(p, point_attribute_name).y) - avg_y), 2)
        for p in geoqueryset
    ])
    sum_diff_avg_x_y = sum([
        (abs(getattr(p, point_attribute_name).x) - avg_x) *
        (abs(getattr(p, point_attribute_name).y) - avg_y)
        for p in geoqueryset
    ])
    sum_square_diff_avg_x_y = sum([
        math.pow(
            (abs(getattr(p, point_attribute_name).x) - avg_x) *
            (abs(getattr(p, point_attribute_name).y) - avg_y),
            2
        ) for p in geoqueryset
    ])
    constant = math.sqrt(
        math.pow((sum_square_diff_avg_x - sum_square_diff_avg_y), 2) +
        (4 * sum_square_diff_avg_x_y)
    )
    theta = math.atan(
        (sum_square_diff_avg_x - sum_square_diff_avg_y + constant) /
        (2 * sum_diff_avg_x_y)
    )

    stdx_sum_x_y_cos_sin_theta = sum([
        math.pow(
            (((getattr(p, point_attribute_name).x - center_x) * math.cos(theta)) -
             ((getattr(p, point_attribute_name).y - center_y) * math.sin(theta))),
            2
        ) for p in geoqueryset
    ])
    stdy_sum_x_y_sin_cos_theta = sum([
        math.pow(
            (((getattr(p, point_attribute_name).x - center_x) * math.sin(theta)) -
             ((getattr(p, point_attribute_name).y - center_y) * math.cos(theta))),
            2
        ) for p in geoqueryset
    ])
    stdx = math.sqrt((2 * stdx_sum_x_y_cos_sin_theta) / (n - 2))
    stdy = math.sqrt((2 * stdy_sum_x_y_sin_cos_theta) / (n - 2))

    results = []
    from django.db import connection
    cursor = connection.cursor()
    while num_of_std:
        sql = "SELECT ellipse(%s, %s, (%s * %s), (%s * %s), %s, 40);" % (
            center_x, center_y, num_of_std, stdx, num_of_std, stdy, theta
        )
        cursor.execute(sql)
        results.append(fromstr(cursor.fetchall()[0][0], srid=4326))
        num_of_std -= 1
    return results
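
# A sketch of the nudge_points dependency used above (an assumption about its
# behavior, not the package's actual source): jitter coincident points apart
# so duplicate coordinates cannot collapse the ellipse math.
import random

def nudge_points(geoqueryset, point_attribute_name='point', radius=0.0001):
    seen = set()
    for obj in geoqueryset:
        point = getattr(obj, point_attribute_name)
        key = (point.x, point.y)
        if key in seen:
            # shift duplicates by a tiny random offset
            point.x += random.uniform(-radius, radius)
            point.y += random.uniform(-radius, radius)
        seen.add(key)
    return geoqueryset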
def main(): # record start time start_time = time.time() print() print( "------------------------------------------------------------------------------------------------------------------------" ) print(" # Method : Gene-Based Classification") print(" # Experiment : Within Dataset") print( " # You will be asked to provide related files and required information about them including " ) print( " # [1] A file contains mapping between gene probe IDs and samples") print( " # [2] Number of rows of the file containing mapping between gene probe IDs and samples to be read" ) print(" # [3] A file contains mapping between samples and their class") print( " # These files must follow a required format shown in file_format.pdf" ) print(" #") print( " # You will be asked to provide required information to conduct an experiment including" ) print(" # [1] Number of epochs") print(" # [2] Number of folds") print(" # [3] Number of top-ranked feature") print(" #") print(" # You will be asked for the name of an output file.") print( "------------------------------------------------------------------------------------------------------------------------" ) print() # prepare variables file_training_input_name = None row_to_read = None file_training_output_name = None epoch = None num_of_folds = None number_of_ranked_gene = None file_name = None print(" # Enter required information about the first dataset ") print( " 1. Enter name of a file containing mapping between probes IDs and samples " ) file_training_input_name = add_ons.getFile() print() print(" 2. Enter number of rows of this file to be read ") while True: row_to_read = input(" Number of rows : ") if (row_to_read.isnumeric() == False): print(" WARNING : Number of rows must be numeric.") elif (int(row_to_read) < 1): print("WARNING : Number of rows cannot be lower than 1.") else: break row_to_read = int(row_to_read) print() print( " 3. 
Enter name of a file containing mapping between samples and their class" ) file_training_output_name = add_ons.getFile() print() # prepare data # row_to_read = 22283 # file_training_input = pd.read_csv("GSE2034-22071 (edited).csv", nrows = row_to_read) file_training_input = pd.read_csv(file_training_input_name, nrows=row_to_read) # consider non-relapse and relapse (not in specific period of time) file_training_output = pd.read_csv( file_training_output_name, usecols=['GEO asscession number', 'relapse (1=True)']) # this will be used in calculating lda training_input = file_training_input training_output = file_training_output # get gene order id with its name list_gene_name = [] for i in range(0, row_to_read): # add element with this format (gene_order_id, gene_name) gene_name = [] gene_name.append(i) gene_name.append(file_training_input.loc[i, "ID_REF"]) list_gene_name.append(gene_name) # separate data into 2 classes # consider non-relapse and relapse (not in specific period of time) sample_relapse = file_training_output.loc[ file_training_output['relapse (1=True)'].isin(['1'])] sample_no_relapse = file_training_output.loc[ file_training_output['relapse (1=True)'].isin(['0'])] # print(sample_no_relapse) # add GEO asscession number to each list list_sample_relapse = [] for element in sample_relapse.loc[:, 'GEO asscession number']: list_sample_relapse.append(element) # print(list_sample_relapse) list_sample_no_relapse = [] for element in sample_no_relapse.loc[:, 'GEO asscession number']: list_sample_no_relapse.append(element) # shuffle data to make each chunk does not depend on sample order random.shuffle(list_sample_relapse) random.shuffle(list_sample_no_relapse) print(" # Enter required information to conduct an experiment") print(" 1. Enter number of epochs ") while True: epoch = input(" Epochs : ") if (epoch.isnumeric() == False): print(" WARNING : Number of epochs must be numeric.") elif (int(epoch) <= 0): print(" WARINING : Number of epochs must be greater than 0.") else: break print() print(" 2. Enter number of folds ") while True: num_of_folds = input(" Number of folds: ") if (num_of_folds.isnumeric() == False): print(" WARNING : Number of folds must be numeric") # these conditions are not available in mock-up elif (int(num_of_folds) > len(list_sample_relapse)): print( "WARNING : Number of folds exceeds the size of samples in class relapse" ) elif (int(num_of_folds) > len(list_sample_no_relapse)): print( "WARNING : Number of folds exceeds the size of samples in class non-relapse" ) elif (int(num_of_folds) <= 1): print(" WARNING : Number of folds cannot lower than or equal to 1") else: break num_of_folds = int(num_of_folds) print() print(" 3. Enter number of top-ranked features") while True: number_of_ranked_gene = input(" Number of top-ranked features: ") if (number_of_ranked_gene.isnumeric() == False): print(" WARNING : Number of top-ranked features must be numeric.") # these conditions are not available in mock-up elif (int(number_of_ranked_gene) > row_to_read): print( " WARINING : Number of top-ranked features must not exceed available genes from the first file." ) elif (int(number_of_ranked_gene) <= 0): print( " WARNING : Number of top-ranked features must not be lower than or equal to 0." 
) else: break print() file_name = input(" # Enter name of an output file : ") # prepare text file for results to be written in result_file = open("./result/" + str(file_name) + ".txt", "w+") # record file name result_file.write("Dataset : " + str(file_training_input_name) + "\n") result_file.write("Number of epochs : " + str(epoch) + "\n") result_file.write("Number of folds : " + str(num_of_folds) + "\n") result_file.write("Number of top-ranked features : " + str(number_of_ranked_gene) + "\n") result_file.write("\n") print(" Number of samples in class relapse : " + str(len(list_sample_relapse))) print(" Number of samples in class non-relapse : " + str(len(list_sample_no_relapse))) # list used to collect average auc score of each epoch list_avg_auc_each_epoch = [] # list to collect feature counter list_feature_counter = [] for epoch_count in range(0, int(epoch)): start_epoch_time = time.time() result_file.write("#################################### Epoch : " + str(epoch_count + 1) + " ####################################\n") print("#################################### Epoch : " + str(epoch_count + 1) + " ####################################\n") # split data into k parts chunk_relapse_size = math.ceil(len(list_sample_relapse) / num_of_folds) chunk_no_relapse_size = math.ceil( len(list_sample_no_relapse) / num_of_folds) chunk_list_relapse = list( calculate.chunks(list_sample_relapse, chunk_relapse_size)) print(" Number of chunks in class relapse : " + str(len(chunk_list_relapse))) chunk_list_no_relapse = list( calculate.chunks(list_sample_no_relapse, chunk_no_relapse_size)) print(" Number of chunks in class non-relapse = " + str(len(chunk_list_no_relapse))) print() check_valid, num_of_chunks = calculate.checkEqualListSize( chunk_list_relapse, chunk_list_no_relapse) # list to collect maximun AUC in each fold list_max_auc = [] # list and variable to track feature set that has the best auc score auc_score_max = 0 list_feature_set_max_auc = [] list_auc_score = [] print(" # Process : Cross-validation") # do only if number of chunks of both datasets are equal if (check_valid == True): for first_layer_test_index in range(0, num_of_chunks): feature_set = [] feature_set_name = [] top_n_genes_name_for_eval = [] # keep testing data from each class first_layer_test_relapse = chunk_list_relapse[ first_layer_test_index] first_layer_test_no_relapse = chunk_list_no_relapse[ first_layer_test_index] print("\n------------------------------------------ K : " + str(first_layer_test_index + 1) + " of Epoch " + str(epoch_count + 1) + " --------------------------------") print(" Samples in class relapse used as testing set :" + str(first_layer_test_relapse) + "\n") print(" Samples in class non-relapse used as testing set : " + str(first_layer_test_no_relapse) + "\n") print() # find training data # first layer first_layer_train_relapse = [] for first_layer_train_index in range(0, num_of_chunks): if (chunk_list_relapse[first_layer_train_index] is not first_layer_test_relapse): first_layer_train_relapse.append( chunk_list_relapse[first_layer_train_index]) first_layer_train_no_relapse = [] for first_layer_train_index in range(0, num_of_chunks): if (chunk_list_no_relapse[first_layer_train_index] is not first_layer_test_no_relapse): first_layer_train_no_relapse.append( chunk_list_no_relapse[first_layer_train_index]) # merge all element in the same class second_list_sample_relapse = [] for i in range(0, len(first_layer_train_relapse)): second_list_sample_relapse.extend( first_layer_train_relapse[i]) print(" Samples in 
class relapse used as trainning set = " + str(second_list_sample_relapse) + "\n") second_list_sample_no_relapse = [] for i in range(0, len(first_layer_train_no_relapse)): second_list_sample_no_relapse.extend( first_layer_train_no_relapse[i]) print(" Samples in class non-relapse used as training set : " + str(second_list_sample_no_relapse) + "\n") # splitting lists to use them as marker evaluation set and feature selection set # given that we separate it into 3 parts print(" Process : Feature selection") print( "\n #### divide training set into 3 parts (2/3 for marker evaluation and 1/3 for feature selection) ####" ) second_num_of_fold = 3 second_chunk_relapse_size = math.ceil( len(second_list_sample_relapse) / second_num_of_fold) second_chunk_no_relapse_size = math.ceil( len(second_list_sample_no_relapse) / second_num_of_fold) second_chunk_list_relapse = list( calculate.chunks(second_list_sample_relapse, second_chunk_relapse_size)) second_chunk_list_no_relapse = list( calculate.chunks(second_list_sample_no_relapse, second_chunk_no_relapse_size)) second_check_valid, second_num_of_chunks = calculate.checkEqualListSize( second_chunk_list_relapse, second_chunk_list_no_relapse) # do only if number of chunks of both datasets are equal if (second_check_valid == True): second_layer_test_index = random.randint( 0, second_num_of_chunks - 1) # keep testing data from eacch class second_layer_test_relapse = second_chunk_list_relapse[ second_layer_test_index] second_layer_test_no_relapse = second_chunk_list_no_relapse[ second_layer_test_index] print( " Samples in class relapse used as feature selection set : " + str(second_layer_test_relapse) + "\n") print( " Samples in class non-relapse used as feature selection set : " + str(second_layer_test_no_relapse)) print() # separate training dataset from testing dataset to use in t-test ranking second_layer_train_relapse = [] for second_layer_train_index in range( 0, second_num_of_chunks): if (second_chunk_list_relapse[second_layer_train_index] is not second_layer_test_relapse): second_layer_train_relapse.append( second_chunk_list_relapse[ second_layer_train_index]) second_layer_train_no_relapse = [] for second_layer_train_index in range( 0, second_num_of_chunks): if (second_chunk_list_no_relapse[ second_layer_train_index] is not second_layer_test_no_relapse): second_layer_train_no_relapse.append( second_chunk_list_no_relapse[ second_layer_train_index]) # prepare dataset for t-test # merge all samples in the same class ttest_list_sample_relapse = [] for i in range(0, len(second_layer_train_relapse)): ttest_list_sample_relapse.extend( second_layer_train_relapse[i]) print( " Samples in class relapse used as marker evaluation set : " + str(ttest_list_sample_relapse) + "\n") ttest_list_sample_no_relapse = [] for i in range(0, len(second_layer_train_no_relapse)): ttest_list_sample_no_relapse.extend( second_layer_train_no_relapse[i]) print( " Samples in class non-relapse used as marker evaluation set : " + str(ttest_list_sample_no_relapse) + "\n") # get gene expression for each gene from samples with relapse list_gene_exp_relapse = [] for i in range(0, row_to_read): gene_exp_relapse = [] for column in file_training_input.loc[ i, ttest_list_sample_relapse]: gene_exp_relapse.append(column) list_gene_exp_relapse.append(gene_exp_relapse) # get gene expression for each gene from samples with no relapse list_gene_exp_no_relapse = [] for i in range(0, row_to_read): gene_exp_no_relapse = [] for column in file_training_input.loc[ i, ttest_list_sample_no_relapse]: 
gene_exp_no_relapse.append(column) list_gene_exp_no_relapse.append(gene_exp_no_relapse) print(" # Process : Calculating t-score") # conducting t-test ttest_result = [] for i in range(0, row_to_read): score = [] # get absolute magnitude of t-test value abs_ttest_value = math.fabs( stats.ttest_ind(list_gene_exp_relapse[i], list_gene_exp_no_relapse[i], equal_var=False)[0]) p_value = stats.ttest_ind(list_gene_exp_relapse[i], list_gene_exp_no_relapse[i], equal_var=False)[1] # add element with this format (gene_order_id, ttest_value) score.append(i) score.append(abs_ttest_value) ttest_result.append(score) # ranking elements using their t-test value in descending order ttest_result.sort(key=lambda x: x[1], reverse=True) # create list of ranked gene ranked_gene = [] for i in range(0, len(ttest_result)): gene_order_id = ttest_result[i][0] ranked_gene.append(list_gene_name[gene_order_id][1]) # show top ranked feature top_n_genes_name = [] print(" #### t-score ranking ####") for i in range(0, int(number_of_ranked_gene)): top_n_genes_name.append(ranked_gene[i]) print(" " + str(ranked_gene[i]) + " => " + " t-score : " + str(ttest_result[i][1])) print() # rank gene id of each sample in training data # for class 'relapse' # print("#### class 'Relapse' ####") col_to_read_relapse = ["ID_REF"] col_to_read_relapse.extend(ttest_list_sample_relapse) file_training_input_relapse = pd.read_csv( "GSE2034-22071 (edited).csv", nrows=row_to_read, usecols=col_to_read_relapse) top_n_genes_relapse = file_training_input_relapse.loc[ file_training_input_relapse['ID_REF'].isin( top_n_genes_name)] top_n_genes_relapse['gene_id'] = top_n_genes_relapse[ 'ID_REF'].apply( lambda name: top_n_genes_name.index(name)) top_n_genes_relapse_sorted = top_n_genes_relapse.sort_values( by=['gene_id']) top_n_genes_relapse_sorted.drop(columns='gene_id', inplace=True) top_n_genes_relapse_sorted_train = top_n_genes_relapse_sorted top_n_genes_relapse_sorted_train.drop(columns='ID_REF', inplace=True) # for class 'no relapse' # print("#### class 'no Relapse' ####") col_to_read_no_relapse = ["ID_REF"] col_to_read_no_relapse.extend(ttest_list_sample_no_relapse) file_training_input_no_relapse = pd.read_csv( "GSE2034-22071 (edited).csv", nrows=row_to_read, usecols=col_to_read_no_relapse) top_n_genes_no_relapse = file_training_input_no_relapse.loc[ file_training_input_no_relapse['ID_REF'].isin( top_n_genes_name)] top_n_genes_no_relapse['gene_id'] = top_n_genes_no_relapse[ 'ID_REF'].apply( lambda name: top_n_genes_name.index(name)) top_n_genes_no_relapse_sorted = top_n_genes_no_relapse.sort_values( by=['gene_id']) top_n_genes_no_relapse_sorted.drop(columns='gene_id', inplace=True) top_n_genes_no_relapse_sorted_train = top_n_genes_no_relapse_sorted top_n_genes_no_relapse_sorted_train.drop(columns='ID_REF', inplace=True) # Preparing testing data for feature selection second_layer_test_all = [] second_layer_test_all.extend(second_layer_test_relapse) second_layer_test_all.extend(second_layer_test_no_relapse) # output for testing data # sort gene order of testing data col_to_read_second_layer_test_gene = ["ID_REF"] col_to_read_second_layer_test_gene.extend( second_layer_test_all) second_layer_test_gene = pd.read_csv( "GSE2034-22071 (edited).csv", nrows=row_to_read, usecols=col_to_read_second_layer_test_gene) second_layer_top_n_test = second_layer_test_gene.loc[ second_layer_test_gene['ID_REF'].isin( top_n_genes_name)] second_layer_top_n_test[ 'gene_id'] = second_layer_top_n_test['ID_REF'].apply( lambda name: top_n_genes_name.index(name)) 
second_layer_top_n_test_sorted = second_layer_top_n_test.sort_values( by=['gene_id']) second_layer_top_n_test_sorted.drop(columns='gene_id', inplace=True) top_n_test_sorted = second_layer_top_n_test_sorted top_n_test_sorted.drop(columns='ID_REF', inplace=True) # use top-rank feature as the first feature in lda classifier # prepare list for input # list of all input data (testing data) list_second_layer_top_n_test_sorted = [] for column in range(0, len(top_n_test_sorted)): list_each_sample = [] for element in top_n_test_sorted.iloc[column]: list_each_sample.append(element) list_second_layer_top_n_test_sorted.append( list_each_sample) list_second_layer_top_n_test_sorted = list( np.transpose(list_second_layer_top_n_test_sorted)) # output for testing data second_layer_test_output = training_output.loc[ training_output['GEO asscession number'].isin( second_layer_test_all)] # sorting data according to its order in testing data list_sample_to_read = list( second_layer_top_n_test_sorted.columns.values) second_layer_test_output[ 'sample_id'] = second_layer_test_output[ 'GEO asscession number'].apply( lambda name: list_sample_to_read.index(name)) second_layer_test_output = second_layer_test_output.sort_values( by=['sample_id']) second_layer_test_output.drop(columns='sample_id', inplace=True) # create list of output list_desired_output = [] for element in second_layer_test_output.loc[:, 'relapse (1=True)']: list_desired_output.append(element) # list of gene expression and sample of class 'relapse' list_top_n_gene_relapse_sorted = [] for column in range(0, len(top_n_genes_relapse_sorted_train)): list_each_sample = [] for element in top_n_genes_relapse_sorted_train.iloc[ column]: list_each_sample.append(element) list_top_n_gene_relapse_sorted.append(list_each_sample) list_top_n_gene_relapse_sorted = list( np.transpose(list_top_n_gene_relapse_sorted)) # list of gene expression and sample of class 'no relapse' list_top_n_gene_no_relapse_sorted = [] for column in range( 0, len(top_n_genes_no_relapse_sorted_train)): list_each_sample = [] for element in top_n_genes_no_relapse_sorted_train.iloc[ column]: list_each_sample.append(element) list_top_n_gene_no_relapse_sorted.append( list_each_sample) list_top_n_gene_no_relapse_sorted = list( np.transpose(list_top_n_gene_no_relapse_sorted)) print(" # Process : Sequential Forward Selection (SFS)") # find set of genes to be used as a feature check_finish = False count_iteration = 1 gene_order = [0] list_auc = [] while (check_finish == False): if (count_iteration >= int(number_of_ranked_gene)): check_finish = True else: max_auc_score = 0 gene_index_in_list = None for i in range(0, int(number_of_ranked_gene)): gene_order_test = deepcopy(gene_order) gene_order_test.extend([i]) # select gene to be used in lda input_relapse = [] for sample_index in range( 0, len(list_top_n_gene_relapse_sorted)): list_each_sample = [] for element_id in range( 0, len(list_top_n_gene_relapse_sorted[ sample_index])): if (element_id in gene_order_test): list_each_sample.append( list_top_n_gene_relapse_sorted[ sample_index][element_id]) input_relapse.append(list_each_sample) # print(input_relapse) input_no_relapse = [] for sample_index in range( 0, len(list_top_n_gene_no_relapse_sorted) ): list_each_sample = [] for element_id in range( 0, len(list_top_n_gene_no_relapse_sorted[ sample_index])): if (element_id in gene_order_test): list_each_sample.append( list_top_n_gene_no_relapse_sorted[ sample_index][element_id]) input_no_relapse.append(list_each_sample) # print(input_no_relapse) 
input_testing_data = [] for sample_index in range( 0, len(list_second_layer_top_n_test_sorted )): list_each_sample = [] for element_id in range( 0, len(list_second_layer_top_n_test_sorted[ sample_index])): if (element_id in gene_order_test): list_each_sample.append( list_second_layer_top_n_test_sorted[ sample_index][element_id]) input_testing_data.append(list_each_sample) list_actual_output = calculate.lda( input_testing_data, input_relapse, input_no_relapse) # calculate AUC score auc_score = roc_auc_score( list_desired_output, list_actual_output) if (auc_score > max_auc_score): max_auc_score = auc_score gene_index_in_list = i # print(max_auc_score) if max_auc_score not in list_auc: list_auc.append(max_auc_score) # do not add gene that already exists in a feature if (gene_index_in_list not in gene_order): gene_order.extend([gene_index_in_list]) count_iteration += 1 list_max_auc.append(max(list_auc)) gene_order.sort() # get gene_name gene_order_name = [] for element in gene_order: gene_order_name.append(top_n_genes_name[element]) # copy required data to be used in evaluation top_n_genes_name_for_eval = deepcopy(top_n_genes_name) feature_set = deepcopy(gene_order) feature_set_name = deepcopy(gene_order_name) # count feature frequency if (int(epoch) > 1): for feature_index in range(0, len(feature_set_name)): # if list feature counter is empty if not list_feature_counter: feature_counter = [] feature_name = feature_set_name[feature_index] feature_frequency = 1 feature_counter.append(feature_name) feature_counter.append(feature_frequency) list_feature_counter.append(feature_counter) else: feature_name = feature_set_name[feature_index] # check if this feature exist in the feature counter list check_found = False for feature_counter_index in range( 0, len(list_feature_counter)): feature_counter_name = list_feature_counter[ feature_counter_index][0] if (feature_name == feature_counter_name): feature_frequency = list_feature_counter[ feature_counter_index][1] feature_frequency += 1 list_feature_counter[ feature_counter_index][ 1] = feature_frequency check_found = True # if this feature is not exist in a list feature counter if (check_found == False): feature_counter = [] feature_name = feature_set_name[feature_index] feature_frequency = 1 feature_counter.append(feature_name) feature_counter.append(feature_frequency) list_feature_counter.append(feature_counter) # preparing data for evaluation and creating classifier # for class 'relapse' print(" # Process : Prepare classifiers and testing data") col_to_read_relapse_for_eval = ["ID_REF"] col_to_read_relapse_for_eval.extend(second_list_sample_relapse) file_training_input_relapse_for_eval = pd.read_csv( "GSE2034-22071 (edited).csv", nrows=row_to_read, usecols=col_to_read_relapse_for_eval) top_n_genes_relapse_for_eval = file_training_input_relapse.loc[ file_training_input_relapse['ID_REF'].isin( feature_set_name)] top_n_genes_relapse_for_eval[ 'gene_id'] = top_n_genes_relapse_for_eval['ID_REF'].apply( lambda name: feature_set_name.index(name)) top_n_genes_relapse_sorted_for_eval = top_n_genes_relapse_for_eval.sort_values( by=['gene_id']) top_n_genes_relapse_sorted_for_eval.drop(columns='gene_id', inplace=True) top_n_genes_relapse_sorted_for_eval.drop(columns='ID_REF', inplace=True) # print(top_n_genes_relapse_sorted_for_eval) # for class 'no relapse' col_to_read_no_relapse_for_eval = ["ID_REF"] col_to_read_no_relapse_for_eval.extend( second_list_sample_no_relapse) file_training_input_no_relapse_for_eval = pd.read_csv( "GSE2034-22071 (edited).csv", 
nrows=row_to_read, usecols=col_to_read_no_relapse_for_eval)
# select the top-n marker genes and restore their feature-set order;
# .copy() avoids pandas SettingWithCopy problems when the 'gene_id'
# helper column is added below
top_n_genes_no_relapse_for_eval = file_training_input_no_relapse_for_eval.loc[
    file_training_input_no_relapse_for_eval['ID_REF'].isin(feature_set_name)].copy()
top_n_genes_no_relapse_for_eval['gene_id'] = top_n_genes_no_relapse_for_eval[
    'ID_REF'].apply(lambda name: feature_set_name.index(name))
top_n_genes_no_relapse_sorted_for_eval = top_n_genes_no_relapse_for_eval.sort_values(by=['gene_id'])
top_n_genes_no_relapse_sorted_for_eval.drop(columns='gene_id', inplace=True)
top_n_genes_no_relapse_sorted_for_eval.drop(columns='ID_REF', inplace=True)

# gather the sample ids of both classes in this fold's test split
first_layer_test_all = []
first_layer_test_all.extend(first_layer_test_relapse)
first_layer_test_all.extend(first_layer_test_no_relapse)

col_to_read_first_layer_test_gene = ["ID_REF"]
col_to_read_first_layer_test_gene.extend(first_layer_test_all)
first_layer_test_gene = pd.read_csv(
    "GSE2034-22071 (edited).csv",
    nrows=row_to_read,
    usecols=col_to_read_first_layer_test_gene)
first_layer_top_n_test = first_layer_test_gene.loc[
    first_layer_test_gene['ID_REF'].isin(feature_set_name)].copy()
first_layer_top_n_test['gene_id'] = first_layer_top_n_test[
    'ID_REF'].apply(lambda name: feature_set_name.index(name))
first_layer_top_n_test_sorted = first_layer_top_n_test.sort_values(by=['gene_id'])
first_layer_top_n_test_sorted.drop(columns='gene_id', inplace=True)

# the two names alias the same frame on purpose, so dropping 'ID_REF'
# here leaves only sample columns in first_layer_top_n_test_sorted too
top_n_test_sorted_for_eval = first_layer_top_n_test_sorted
top_n_test_sorted_for_eval.drop(columns='ID_REF', inplace=True)

# prepare the testing data as a list of samples; each dataframe row is
# one gene across samples, so the list is transposed afterwards
list_first_layer_top_n_test_sorted = []
for column in range(0, len(top_n_test_sorted_for_eval)):
    list_each_sample = []
    for element in top_n_test_sorted_for_eval.iloc[column]:
        list_each_sample.append(element)
    list_first_layer_top_n_test_sorted.append(list_each_sample)
list_first_layer_top_n_test_sorted = list(np.transpose(list_first_layer_top_n_test_sorted))

# desired output (labels) for the testing data; note that the column
# name 'GEO asscession number' matches the header spelling in the CSV
first_layer_test_output = training_output.loc[
    training_output['GEO asscession number'].isin(first_layer_test_all)].copy()
# sort the labels into the same order as the testing data columns
list_sample_to_read_for_eval = list(first_layer_top_n_test_sorted.columns.values)
first_layer_test_output['sample_id'] = first_layer_test_output[
    'GEO asscession number'].apply(lambda name: list_sample_to_read_for_eval.index(name))
first_layer_test_output = first_layer_test_output.sort_values(by=['sample_id'])
first_layer_test_output.drop(columns='sample_id', inplace=True)

# create the list of desired output
list_desired_output_for_eval = []
for element in first_layer_test_output.loc[:, 'relapse (1=True)']:
    list_desired_output_for_eval.append(element)

# gene expression per sample of class 'relapse' for evaluation
list_top_n_gene_relapse_sorted_for_eval = []
for column in range(0, len(top_n_genes_relapse_sorted_for_eval)):
    list_each_sample = []
    for element in top_n_genes_relapse_sorted_for_eval.iloc[column]:
        list_each_sample.append(element)
    list_top_n_gene_relapse_sorted_for_eval.append(list_each_sample)
list_top_n_gene_relapse_sorted_for_eval = list(np.transpose(list_top_n_gene_relapse_sorted_for_eval))

# gene expression per sample of class 'no relapse' for evaluation
list_top_n_gene_no_relapse_sorted_for_eval = []
for column in range(0, len(top_n_genes_no_relapse_sorted_for_eval)):
    list_each_sample = []
    for element in top_n_genes_no_relapse_sorted_for_eval.iloc[column]:
        list_each_sample.append(element)
    list_top_n_gene_no_relapse_sorted_for_eval.append(list_each_sample)
list_top_n_gene_no_relapse_sorted_for_eval = list(np.transpose(list_top_n_gene_no_relapse_sorted_for_eval))

# keep only the features selected for this feature set,
# then run LDA to get the actual output
input_relapse_for_eval = []
for sample_index in range(0, len(list_top_n_gene_relapse_sorted_for_eval)):
    list_each_sample = []
    for element_id in range(0, len(list_top_n_gene_relapse_sorted_for_eval[sample_index])):
        if element_id in feature_set:
            list_each_sample.append(list_top_n_gene_relapse_sorted_for_eval[sample_index][element_id])
    input_relapse_for_eval.append(list_each_sample)

input_no_relapse_for_eval = []
for sample_index in range(0, len(list_top_n_gene_no_relapse_sorted_for_eval)):
    list_each_sample = []
    for element_id in range(0, len(list_top_n_gene_no_relapse_sorted_for_eval[sample_index])):
        if element_id in feature_set:
            list_each_sample.append(list_top_n_gene_no_relapse_sorted_for_eval[sample_index][element_id])
    input_no_relapse_for_eval.append(list_each_sample)

input_testing_data_for_eval = []
for sample_index in range(0, len(list_first_layer_top_n_test_sorted)):
    list_each_sample = []
    for element_id in range(0, len(list_first_layer_top_n_test_sorted[sample_index])):
        if element_id in feature_set:
            list_each_sample.append(list_first_layer_top_n_test_sorted[sample_index][element_id])
    input_testing_data_for_eval.append(list_each_sample)

list_actual_output_for_eval = calculate.lda(
    input_testing_data_for_eval, input_relapse_for_eval, input_no_relapse_for_eval)

# calculate the AUC score for this fold
auc_score_for_eval = roc_auc_score(list_desired_output_for_eval, list_actual_output_for_eval)
list_auc_score.append(auc_score_for_eval)

print("#### Evaluation of fold " + str(first_layer_test_index + 1) + " ####")
print(" Feature Set : " + str(feature_set_name))
print(" Actual Output : " + str(list_actual_output_for_eval))
print(" Desired Output : " + str(list_desired_output_for_eval))
print(" AUC ROC score = " + str(auc_score_for_eval))

# track the feature set that gives the maximum AUC score from testing
if auc_score_for_eval > auc_score_max:
    list_feature_set_max_auc = deepcopy(feature_set_name)
    auc_score_max = auc_score_for_eval

# write this fold's results to the output file
result_file.write("Fold : " + str(first_layer_test_index + 1) + "\n")
result_file.write("Feature Set : " + str(feature_set_name) + "\n")
result_file.write("Actual Output : " + str(list_actual_output_for_eval) + "\n")
result_file.write("Desired Output : " + str(list_desired_output_for_eval) + "\n")
result_file.write("AUC ROC Score from testing : " + str(auc_score_for_eval) + "\n")
result_file.write("\n")

# epoch-level bookkeeping, once every fold has been evaluated
list_avg_auc_each_epoch.append(calculate.mean(list_auc_score))

# record the ending time of this epoch
end_epoch_time = time.time()
time_elapse_epoch_second = end_epoch_time - start_epoch_time
time_elapse_epoch_minute = round(time_elapse_epoch_second / 60, 2)
time_elapse_epoch_hour = round(time_elapse_epoch_second / 3600, 2)

result_file.write("\n#### Summary ####\n")
result_file.write("Average AUC score : " + str(calculate.mean(list_auc_score)) + "\n")
result_file.write("AUC score from feature selection in each fold : " + str(list_max_auc) + "\n")
result_file.write("Size of feature set which gives the highest AUC score from testing : " +
                  str(len(list_feature_set_max_auc)) + "\n")
result_file.write("Feature set which gives the highest AUC score from testing :\n")
result_file.write(str(list_feature_set_max_auc))
result_file.write("\n")
result_file.write("Time Elapse : " + str(time_elapse_epoch_minute) + " minutes (" +
                  str(time_elapse_epoch_hour) + " hours)\n")
print(" Time Elapse : " + str(time_elapse_epoch_minute) + " minutes (" +
      str(time_elapse_epoch_hour) + " hours)\n")
print(" AUC score from feature selection in each fold = " + str(list_max_auc))

# calculate the mean AUC over all epochs
mean_over_all_epoch = calculate.mean(list_avg_auc_each_epoch)
print(" Average AUC score over " + str(epoch) + " epoch(s) : " + str(mean_over_all_epoch))
result_file.write("\n")
result_file.write("Average AUC score over " + str(epoch) + " epoch(s) : " +
                  str(mean_over_all_epoch) + "\n")
result_file.write("\n")

# rank features by how often they were selected;
# by default the 10 most frequent are shown
if len(list_feature_counter) < 10:
    num_of_top_frequent_pathway = len(list_feature_counter)
else:
    num_of_top_frequent_pathway = 10

# sort pathway frequency in descending order and keep the top entries
list_feature_counter.sort(key=lambda x: x[1], reverse=True)
list_top_pathway_frequency = []
for top_pathway_index in range(0, num_of_top_frequent_pathway):
    list_top_pathway_frequency.append(list_feature_counter[top_pathway_index])

print(" Feature frequency : ")
result_file.write("\n")
result_file.write("Feature frequency :\n")
for index in range(0, len(list_top_pathway_frequency)):
    feature_name = list_top_pathway_frequency[index][0]
    feature_frequency = list_top_pathway_frequency[index][1]
    print(" " + str(index + 1) + ". " + str(feature_name) + " : " + str(feature_frequency))
    result_file.write(str(index + 1) + ". " + str(feature_name) + " : " +
                      str(feature_frequency) + "\n")
print()
result_file.write("\n")

# record the total running time
end_time = time.time()
time_elapse_second = end_time - start_time
time_elapse_minute = round(time_elapse_second / 60, 2)
time_elapse_hour = round(time_elapse_second / 3600, 2)
print(" Total Time Elapse : " + str(time_elapse_minute) + " minutes (" +
      str(time_elapse_hour) + " hours)")
result_file.write("Total Time Elapse : " + str(time_elapse_minute) + " minutes (" +
                  str(time_elapse_hour) + " hours)\n")
result_file.close()
def test_mean(self):
    self.assertEqual(calculate.mean([1, 2, 3]), 2.0)
    self.assertEqual(calculate.mean([1, 99]), 50.0)
    self.assertEqual(calculate.mean([2, 3, 3]), 2.6666666666666665)
    self.assertRaises(ValueError, calculate.mean, ['a', 0.2, 3])
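
# A minimal implementation consistent with the assertions above might
# look like this. It is a sketch, not necessarily the library's actual
# code: it follows the float-conversion-and-ValueError pattern the other
# functions in this module use.
def mean(data_list):
    """
    Return the arithmetic mean of a list of numbers.
    """
    try:
        data_list = list(map(float, data_list))
    except ValueError:
        raise ValueError('Input values must contain numbers')
    return sum(data_list) / len(data_list)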
def standard_deviation_ellipses(
        geoqueryset, point_attribute_name='point', num_of_std=1, fix_points=True):
    """
    Accepts a GeoQuerySet and generates one or more standard deviation
    ellipses demonstrating the geospatial distribution of where its
    points occur.

    Returns a list of one or more of the ellipses as Polygon objects.

    The standard deviation ellipse illustrates the average variation in
    the distance of points from the mean center, as well as their
    direction.

    By default, the function expects the Point field on your model to be
    called 'point'. If your field is called something else, pass its name
    in the 'point_attribute_name' kwarg.

    Also by default, the function nudges slightly apart any identical
    points and returns only the first standard deviation ellipse. The
    'fix_points' and 'num_of_std' kwargs change those behaviors.

    h3. Example usage

        >>> import calculate
        >>> calculate.standard_deviation_ellipses(qs)
        [<Polygon object at 0x77a1c34>]

    h3. Dependencies

        * "django":http://www.djangoproject.com/
        * "geodjango":http://www.geodjango.org/
        * "psql ellipse() function":http://postgis.refractions.net/support/wiki/index.php?plpgsqlfunctions

    h3. Documentation

        * "standard deviation ellipse":http://www.spatialanalysisonline.com/output/html/Directionalanalysisofpointdatasets.html
        * "This code is translated from SQL by Francis Dupont":http://postgis.refractions.net/pipermail/postgis-users/2008-June/020354.html
    """
    if not isinstance(geoqueryset, GeoQuerySet):
        raise TypeError(
            'First parameter must be a Django GeoQuerySet. '
            'You submitted a %s object' % type(geoqueryset))

    # An ellipse cannot be derived from fewer than three points
    n = len(geoqueryset)
    if n < 3:
        return [None]

    if fix_points:
        calculate.nudge_points(geoqueryset, point_attribute_name=point_attribute_name)

    avg_x = calculate.mean(
        [abs(getattr(p, point_attribute_name).x) for p in geoqueryset])
    avg_y = calculate.mean(
        [abs(getattr(p, point_attribute_name).y) for p in geoqueryset])
    center_x = calculate.mean(
        [getattr(p, point_attribute_name).x for p in geoqueryset])
    center_y = calculate.mean(
        [getattr(p, point_attribute_name).y for p in geoqueryset])

    sum_square_diff_avg_x = sum([
        math.pow((abs(getattr(p, point_attribute_name).x) - avg_x), 2)
        for p in geoqueryset
    ])
    sum_square_diff_avg_y = sum([
        math.pow((abs(getattr(p, point_attribute_name).y) - avg_y), 2)
        for p in geoqueryset
    ])
    sum_diff_avg_x_y = sum([
        (abs(getattr(p, point_attribute_name).x) - avg_x) *
        (abs(getattr(p, point_attribute_name).y) - avg_y)
        for p in geoqueryset
    ])
    sum_square_diff_avg_x_y = sum([
        math.pow(
            (abs(getattr(p, point_attribute_name).x) - avg_x) *
            (abs(getattr(p, point_attribute_name).y) - avg_y), 2)
        for p in geoqueryset
    ])

    # the rotation of the ellipse
    constant = math.sqrt(
        math.pow((sum_square_diff_avg_x - sum_square_diff_avg_y), 2) +
        (4 * sum_square_diff_avg_x_y))
    theta = math.atan(
        (sum_square_diff_avg_x - sum_square_diff_avg_y + constant) /
        (2 * sum_diff_avg_x_y))

    # the standard deviations along the rotated x and y axes
    stdx_sum_x_y_cos_sin_theta = sum([
        math.pow(
            ((getattr(p, point_attribute_name).x - center_x) * math.cos(theta)) -
            ((getattr(p, point_attribute_name).y - center_y) * math.sin(theta)), 2)
        for p in geoqueryset
    ])
    stdy_sum_x_y_sin_cos_theta = sum([
        math.pow(
            ((getattr(p, point_attribute_name).x - center_x) * math.sin(theta)) -
            ((getattr(p, point_attribute_name).y - center_y) * math.cos(theta)), 2)
        for p in geoqueryset
    ])
    stdx = math.sqrt((2 * stdx_sum_x_y_cos_sin_theta) / (n - 2))
    stdy = math.sqrt((2 * stdy_sum_x_y_sin_cos_theta) / (n - 2))

    # draw an ellipse for each standard deviation requested,
    # working from the outermost ellipse inward
    results = []
    from django.db import connection
    cursor = connection.cursor()
    while num_of_std:
        cursor.execute("""SELECT ellipse(%s, %s, (%s * %s), (%s * %s), %s, 40);""" % (
            center_x, center_y, num_of_std, stdx, num_of_std, stdy, theta))
        results.append(fromstr(cursor.fetchall()[0][0], srid=4326))
        num_of_std -= 1
    return results
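
# The raw-SQL step above depends on a custom PostGIS ellipse() function
# being installed. If that function is unavailable, an approximating
# polygon can be built in pure Python from the same center, axis lengths
# and rotation. The helper below is an illustrative alternative sketch,
# not part of this library.
import math

from django.contrib.gis.geos import Polygon


def ellipse_polygon(center_x, center_y, stdx, stdy, theta, segments=40):
    """
    Approximate an ellipse with a closed Polygon by sampling points
    around its edge, rotating them by theta and translating them to
    the mean center.
    """
    ring = []
    for i in range(segments):
        angle = 2 * math.pi * i / segments
        x = stdx * math.cos(angle)
        y = stdy * math.sin(angle)
        ring.append((
            center_x + (x * math.cos(theta) - y * math.sin(theta)),
            center_y + (x * math.sin(theta) + y * math.cos(theta)),
        ))
    # repeat the first point so GEOS accepts it as a closed LinearRing
    ring.append(ring[0])
    return Polygon(ring, srid=4326)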
def percentile(data_list, value, kind='weak'):
    """
    Accepts a sample of values and a single number to compare against it
    and determine its percentile rank.

    A percentile of, for example, 80 means that 80 percent of the
    scores in the sequence are below the given score.

    In the case of gaps or ties, the exact definition depends on the type
    of calculation stipulated by the "kind" keyword argument. There are
    three kinds of percentile calculations provided here. The default
    is "weak".

        1. "weak"

            Corresponds to the definition of a cumulative distribution
            function, with the result generated by returning the
            percentage of values less than or equal to the provided value.

        2. "strict"

            Similar to "weak", except that only values strictly less than
            the given score are counted. This can produce a result much
            lower than "weak" when the provided score occurs many times
            in the sample.

        3. "mean"

            The average of the "weak" and "strict" scores.

    h3. Example usage

        >>> import calculate
        >>> calculate.percentile([1, 2, 3, 4], 3)
        75.0
        >>> calculate.percentile([1, 2, 3, 3, 4], 3, kind='strict')
        40.0
        >>> calculate.percentile([1, 2, 3, 3, 4], 3, kind='weak')
        80.0
        >>> calculate.percentile([1, 2, 3, 3, 4], 3, kind='mean')
        60.0

    h3. Documentation

        * "Percentile rank":http://en.wikipedia.org/wiki/Percentile_rank

    h3. Credits

        This function is a modification of scipy.stats.percentileofscore.
        The only major difference is that I eliminated the numpy
        dependency and omitted the "rank" kwarg option until I can find
        time to translate the numpy parts out.
    """
    # Convert all the values to floats and test to make sure
    # there aren't any strings in there
    try:
        data_list = list(map(float, data_list))
    except ValueError:
        raise ValueError('Input values must contain numbers')

    # Find the number of values in the sample
    n = float(len(data_list))

    if kind == 'strict':
        # Count the values strictly below the provided one,
        # then divide by n
        return len([i for i in data_list if i < value]) / n * 100
    elif kind == 'weak':
        # Count the values at or below the provided one,
        # then divide by n
        return len([i for i in data_list if i <= value]) / n * 100
    elif kind == 'mean':
        # Average the strict and weak scores
        strict = len([i for i in data_list if i < value]) / n * 100
        weak = len([i for i in data_list if i <= value]) / n * 100
        return calculate.mean([strict, weak])
    else:
        raise ValueError(
            "The kind kwarg must be 'strict', 'weak' or 'mean'. You can "
            "also opt to leave it out and rely on the default method.")
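
# If scipy is installed, these results can be cross-checked against
# scipy.stats.percentileofscore, which the function above was adapted
# from. A quick, illustrative sanity check:
from scipy.stats import percentileofscore

import calculate

sample = [1, 2, 3, 3, 4]
assert calculate.percentile(sample, 3, kind='weak') == \
    percentileofscore(sample, 3, kind='weak')  # 80.0
assert calculate.percentile(sample, 3, kind='strict') == \
    percentileofscore(sample, 3, kind='strict')  # 40.0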