def take_user_input(): global attribute_selection_count global progressbar_total global preprocessing_switch selection_count = input( "Enter the number of gene attributes to select (default is " + str(attribute_selection_count) + "): ") if not selection_count: selection_count = "100" attribute_selection_count = int(selection_count) preprocessing_choice = input( "Enter your pre-processing choice. Select 1 for SD and 2 for |SNR| (defult is 1): " ) if not preprocessing_choice: preprocessing_choice = 1 preprocessing_switch = int(preprocessing_choice) progressbar.show(0, progressbar_total, prefix='Progress:', suffix='Complete', length=50)
def sort_by_standard_deviation(): global transposed_raw_data_matrix global ALL_count global MLL_count global AML_count global snr_tuples global progressbar_total sample_length = len(transposed_raw_data_matrix[0]) sample_count = len(transposed_raw_data_matrix) MLL_end_index = ALL_count + MLL_count for attribute_index in range(sample_length - 1): attribute_list = list() for sample_index in range(sample_count): attribute_list.append( transposed_raw_data_matrix[sample_index][attribute_index]) sd_value = statistics.standard_deviation( attribute_list[:ALL_count]) + statistics.standard_deviation( attribute_list[ALL_count:MLL_end_index] ) + statistics.standard_deviation(attribute_list[MLL_end_index:]) rounded_sd = math.ceil(sd_value * 10000) / 10000 snr_tuples.append((attribute_index, rounded_sd)) sort.randomized_quick_sort_for_tuples(snr_tuples, 0, len(snr_tuples) - 1) progressbar.show(9, progressbar_total, prefix="Progress:", suffix="Complete", length=50)
def write_as_csv(): global data_matrix global gene_attributes global progressbar_total filename = "child-all.csv" writefile = open(filename, 'w+') write_file_content = "" for attribute in gene_attributes: write_file_content += attribute + "," write_file_content += "class\n" for sample in data_matrix: line = "" for value in sample[:-1]: line += str(value) + "," line += sample[-1] + "\n" write_file_content += line writefile.write(write_file_content) writefile.close() progressbar.show(3, progressbar_total, prefix='Progress:', suffix='Complete', length=50)
def write_as_csv(): global transposed_raw_data_matrix global gene_attributes global progressbar_total filename = "mll-leukemia.csv" writefile = open(filename, "w+") write_file_content = "" for attribute in gene_attributes: write_file_content += attribute + "," write_file_content += "class\n" for sample in transposed_raw_data_matrix: line = "" for value in sample[:-1]: line += str(value) + "," line += sample[-1] + "\n" write_file_content += line writefile.write(write_file_content) writefile.close() progressbar.show(6, progressbar_total, prefix="Progress:", suffix="Complete", length=50)
def normalize_data(): global data_matrix global progressbar_total sample_length = len(data_matrix[0]) sample_count = len(data_matrix) for attribute_index in range(sample_length - 1): attribute_list = list() normalized_attribute_list = list() for sample_index in range(sample_count): attribute_list.append(data_matrix[sample_index][attribute_index]) for attribute in attribute_list: z_score = zscore.calculate_zscore(attribute, attribute_list) rounded_zscore = math.ceil(z_score * 100000) / 100000 normalized_attribute_list.append(rounded_zscore) for sample_index in range(sample_count): data_matrix[sample_index][ attribute_index] = normalized_attribute_list[sample_index] progressbar.show(7, progressbar_total, prefix='Progress:', suffix='Complete', length=50)
def handle_missing_values(): global data_matrix global gene_attributes global progressbar_total missing_attribute_indices = list() third_sample = data_matrix[2] shift_counter = 0 for attribute_index in range(len(third_sample)): if third_sample[attribute_index] == '?': missing_attribute_indices.append(attribute_index) for index in range(len(missing_attribute_indices)): missing_attribute_indices[ index] = missing_attribute_indices[index] - shift_counter shift_counter += 1 for index in missing_attribute_indices: gene_attributes.pop(index) for sample in data_matrix: for index in missing_attribute_indices: sample.pop(index) progressbar.show(5, progressbar_total, prefix='Progress:', suffix='Complete', length=50)
def sort_by_SNR(): global data_matrix global after_th_count global snr_tuples global progressbar_total sample_length = len(data_matrix[0]) sample_count = len(data_matrix) for attribute_index in range(sample_length - 1): attribute_list = list() for sample_index in range(sample_count): attribute_list.append(data_matrix[sample_index][attribute_index]) snr_value = snr.mod_SNR(attribute_list[:after_th_count], attribute_list[after_th_count:]) rounded_snr = math.ceil(snr_value * 10000) / 10000 snr_tuples.append((attribute_index, rounded_snr)) sort.randomized_quick_sort_for_tuples(snr_tuples, 0, len(snr_tuples) - 1) progressbar.show(8, progressbar_total, prefix='Progress:', suffix='Complete', length=50)
def write_to_file(): global selected_data_matrix global selected_gene_attributes global attribute_selection_count global progressbar_total global preprocessing_switch preprocess_type = "-sd" if preprocessing_switch == 1 else "-snr" filename = "leukemia-selected-" + str(attribute_selection_count) + preprocess_type + ".csv" writefile = open("../datasets/preprocessed/" + filename, 'w+') write_file_content = "" for attribute in selected_gene_attributes: write_file_content += attribute + "," write_file_content += "class\n" for sample in selected_data_matrix: line = "" for value in sample[:-1]: line += str(value) + "," line += sample[-1] + "\n" write_file_content += line writefile.write(write_file_content) writefile.close() progressbar.show(9, progressbar_total, prefix = 'Progress:', suffix = 'Complete', length = 50) print("The pre-processed data has been saved in the file: ", filename, "!", end = "\n")
def prepare_selected_dataset(): global data_matrix global gene_attributes global selected_data_matrix global selected_gene_attributes global snr_tuples global attribute_selection_count global progressbar_total flag = 0 for sample in data_matrix: selected_attribute_sample = list() for index in range(attribute_selection_count): snr_tuple = snr_tuples[index] selected_index = snr_tuple[0] if flag == 0: selected_gene_attributes.append(gene_attributes[selected_index - 1]) selected_attribute_sample.append(sample[selected_index]) selected_attribute_sample.append(sample[0]) selected_data_matrix.append(copy.deepcopy(selected_attribute_sample)) flag = 1 progressbar.show(8, progressbar_total, prefix = 'Progress:', suffix = 'Complete', length = 50)
def sort_by_standard_deviation(): global data_matrix global ALL_count global snr_tuples global progressbar_total sample_length = len(data_matrix[0]) sample_count = len(data_matrix) for attribute_index in range(1, sample_length): attribute_list = list() for sample_index in range(sample_count): attribute_list.append(data_matrix[sample_index][attribute_index]) sd_value = statistics.standard_deviation( attribute_list[:ALL_count]) + statistics.standard_deviation(attribute_list[(ALL_count + 1):]) rounded_sd = math.ceil(sd_value * 10000) / 10000 snr_tuples.append((attribute_index, rounded_sd)) sort.randomized_quick_sort_for_tuples(snr_tuples, 0, len(snr_tuples) - 1) progressbar.show(7, progressbar_total, prefix='Progress:', suffix='Complete', length=50)
def aggregate_same_class_samples(): global data_matrix global ALL_count global AML_count global progressbar_total ALL_list = list() AML_list = list() for sample in data_matrix: if sample[0] == "ALL": ALL_list.append(copy.deepcopy(sample)) elif sample[0] == "AML": AML_list.append(copy.deepcopy(sample)) ALL_count = len(ALL_list) AML_count = len(AML_list) data_matrix = list() transfer_list(ALL_list, data_matrix) transfer_list(AML_list, data_matrix) progressbar.show(3, progressbar_total, prefix='Progress:', suffix='Complete', length=50)
def convert_datapoints_to_number(): global data_matrix global progressbar_total for sample in data_matrix: for index in range(1, len(sample)): sample[index] = float(sample[index]) progressbar.show(5, progressbar_total, prefix = 'Progress:', suffix = 'Complete', length = 50)
def tidy_raw_dataset(): global raw_data_matrix global data_matrix global gene_attributes gene_attributes = raw_data_matrix[0][:-1] data_matrix = raw_data_matrix[3:] progressbar.show(2, 9, prefix='Progress:', suffix='Complete', length=50)
def arrange_dataset(): global raw_data_matrix global gene_attributes global ALL_samples global MLL_samples global AML_samples global ALL_count global AML_count global MLL_count global progressbar_total linecount = 0 for line in raw_data_matrix[1:]: splitted_line_list = line.split("\t") gene_attributes.append(splitted_line_list[0]) ALL_start_index = 2 ALL_end_index = ALL_count + 2 MLL_start_index = ALL_end_index MLL_end_index = ALL_count + 2 + MLL_count AML_start_index = MLL_end_index AML_end_index = ALL_count + MLL_count + AML_count + 2 for index in range( len(splitted_line_list[ALL_start_index:ALL_end_index])): if linecount == 0: ALL_samples.append(list()) ALL_samples[index].append(splitted_line_list[index + ALL_start_index]) for index in range( len(splitted_line_list[MLL_start_index:MLL_end_index])): if linecount == 0: MLL_samples.append(list()) MLL_samples[index].append(splitted_line_list[index + MLL_start_index]) for index in range( len(splitted_line_list[AML_start_index:AML_end_index])): if linecount == 0: AML_samples.append(list()) AML_samples[index].append(splitted_line_list[index + AML_start_index]) linecount = linecount + 1 progressbar.show(3, progressbar_total, prefix="Progress:", suffix="Complete", length=50)
def read_leukemia_raw_dataset(): global raw_data_matrix global progressbar_total with open("../datasets/leukemia.txt", 'r') as datafile: for line in datafile: line = line.rstrip() splitted_line_list = line.split("\t") raw_data_matrix.append(splitted_line_list) progressbar.show(1, progressbar_total, prefix = 'Progress:', suffix = 'Complete', length = 50)
def convert_datapoints_to_number(): global transposed_raw_data_matrix global progressbar_total for sample in transposed_raw_data_matrix: for index in range(len(sample) - 1): sample[index] = float(sample[index]) progressbar.show(7, progressbar_total, prefix="Progress:", suffix="Complete", length=50)
def read_mll_leukemia_raw_dataset(): global raw_data_matrix global progressbar_total with open("mll_leukemia.txt", "r") as dataset: for line in dataset: raw_data_matrix.append(line.rstrip()) progressbar.show(1, progressbar_total, prefix="Progress:", suffix="Complete", length=50)
def count_class_strength(): global data_matrix global before_th_count global after_th_count global progressbar_total for sample in data_matrix: if sample[-1] == "before Th": before_th_count += 1 elif sample[-1] == "after Th": after_th_count += 1 progressbar.show(4, progressbar_total, prefix='Progress:', suffix='Complete', length=50)
def count_class_strength(): global data_matrix global DLBCL_count global FL_count global progressbar_total for sample in data_matrix: if sample[-1] == "DLBCL": DLBCL_count += 1 elif sample[-1] == "FL": FL_count += 1 progressbar.show(4, progressbar_total, prefix='Progress:', suffix='Complete', length=50)
def take_user_input(): global attribute_selection_count global progressbar_total selection_count = input( "Enter the number of gene attributes to select (default is " + str(attribute_selection_count) + "): ") if not selection_count: selection_count = "100" attribute_selection_count = int(selection_count) progressbar.show(0, progressbar_total, prefix='Progress:', suffix='Complete', length=50)
def add_class_labels(): global ALL_samples global MLL_samples global AML_samples global progressbar_total for sample in ALL_samples: sample.append("ALL") for sample in MLL_samples: sample.append("MLL") for sample in AML_samples: sample.append("AML") progressbar.show(4, progressbar_total, prefix="Progress:", suffix="Complete", length=50)
def write_to_file(): global selected_data_matrix global selected_gene_attributes global attribute_selection_count global progressbar_total filename = "child-all-selected-" + str(attribute_selection_count) + ".csv" writefile = open(filename, 'w+') write_file_content = "" for attribute in selected_gene_attributes: write_file_content += attribute + "," write_file_content += "class\n" for sample in selected_data_matrix: line = "" for value in sample[:-1]: line += str(value) + "," line += sample[-1] + "\n" write_file_content += line writefile.write(write_file_content) writefile.close() progressbar.show(10, progressbar_total, prefix='Progress:', suffix='Complete', length=50) print("The pre-processed data has been saved in the file: ", filename, "!", end="\n")
def count_class_strength(): global raw_data_matrix global ALL_count global AML_count global MLL_count global progressbar_total splitted_attributes = raw_data_matrix[0].split("\t")[2:] for word in splitted_attributes: if "ALL" in word: ALL_count = ALL_count + 1 elif "AML" in word: AML_count = AML_count + 1 elif "MLL" in word: MLL_count = MLL_count + 1 progressbar.show(2, progressbar_total, prefix="Progress:", suffix="Complete", length=50)
def transpose_dataset(): global raw_data_matrix global gene_attributes global transposed_raw_data_matrix global ALL_samples global MLL_samples global AML_samples global progressbar_total for sample in ALL_samples: transposed_raw_data_matrix.append(sample) for sample in MLL_samples: transposed_raw_data_matrix.append(sample) for sample in AML_samples: transposed_raw_data_matrix.append(sample) progressbar.show(5, progressbar_total, prefix="Progress:", suffix="Complete", length=50)
def progressbar(self): progressbar = Dec_Progressbar(self) progressbar.show()