def main():
    """Run the action selected by ``args.choice`` on the input matrix.

    The matrix is read from ``args.input_file_txt`` and then:

    * ``"Histogram"``   -- ``Histo(matrix)`` is called.
    * ``"CreateFiles"`` -- ``CreateFiles(args.output_file_info)`` is called.
    * ``"Variance"``    -- the row or column variance filter selected by
      ``args.axes`` (``"Row"`` / ``"Column"``) is run and its result is
      written to ``args.output_file_txt`` through ``labeler``.

    Exit status: -1 for an unknown axis, -2 for an unknown choice, -3 when
    any other exception escapes (its traceback is printed first).
    """
    try:
        args = get_args()
        matrix, og_cols, og_rows = reader(args.input_file_txt)

        if args.choice == "Histogram":
            Histo(matrix)
        elif args.choice == "CreateFiles":
            CreateFiles(args.output_file_info)
        elif args.choice == "Variance":
            if args.axes == "Row":
                variance_filter = Variance_Percent_Filter_row
            elif args.axes == "Column":
                variance_filter = Variance_Percent_Filter_col
            else:
                print('Invalid Axes = ' + str(args.axes))
                sys.exit(-1)

            # The filter returns six values; the deletion count and the
            # min/max values are not used by this entry point.
            matrix, filter_rows, filter_cols, _, _, _ = variance_filter(
                matrix, 1, og_rows, og_cols, True)
            # NOTE(review): labels are passed as (rows, cols) here, while the
            # other entry points in this source pass (cols, rows) to their
            # labeler -- confirm against this labeler's signature.
            labeler(matrix, filter_rows, filter_cols, args.output_file_txt)
        else:
            print("Invalid Filter Choice = " + str(args.choice))
            sys.exit(-2)
    except Exception:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the explicit exit codes above pass through untouched.
        traceback.print_exc()
        sys.exit(-3)
def main():
    """Apply the transformation selected by ``args.choice`` and write it out.

    The matrix is read from ``args.input_file_txt``, transformed, written to
    ``args.output_file_txt`` through ``labeler`` and a one-line summary is
    printed.  Axis-aware transformations use ``args.axes`` (``"Row"`` or
    ``"Column"``); ``args.scalevalue`` and ``args.offsetvalue`` are parsed as
    floats before the matrix is read.

    An unknown choice or axis only prints a message: no output file is
    written and the function returns normally.  Any exception prints its
    traceback and exits with status 1.
    """
    try:
        args = get_args()
        scaleValue = float(args.scalevalue)
        offsetValue = float(args.offsetvalue)
        matrix, og_cols, og_rows = reader(args.input_file_txt)

        choice = args.choice
        axes = args.axes
        # Label order handed to labeler(); only "transpose" swaps it.
        col_labels, row_labels = og_cols, og_rows
        done = None      # summary printed after a successful write
        problem = None   # message printed instead when the request is invalid

        if choice == "z_score_normalization":
            if axes == "Row":
                matrix = Zscore_row(matrix)
                done = "zcore, row"
            elif axes == "Column":
                matrix = Zscore_col(matrix)
                done = "zscore, column"
            else:
                problem = "zscore, invalid axis"
        elif choice == "mean_center_normalization":
            if axes == "Row":
                matrix = MeanMedianCenter_row(matrix, "mean")
                done = "mean-center by row"
            elif axes == "Column":
                matrix = MeanMedianCenter_col(matrix, "mean")
                done = "mean-center by column"
            else:
                problem = "meancenter, invalid axis"
        elif choice == "median_center_normalization":
            if axes == "Row":
                matrix = MeanMedianCenter_row(matrix, "median")
                done = "median-center by row"
            elif axes == "Column":
                matrix = MeanMedianCenter_col(matrix, "median")
                done = "median-center by column"
            else:
                # Previously reported "meancenter" (copy-paste slip).
                problem = "mediancenter, invalid axis"
        elif choice == "add_offset":
            if axes == "Row":
                matrix = ScaleOffset_row(matrix, 1.0, offsetValue)
                done = "offset of " + str(offsetValue) + " by row"
            elif axes == "Column":
                matrix = ScaleOffset_col(matrix, 1.0, offsetValue)
                done = "offset of " + str(offsetValue) + " by column"
            else:
                problem = "offset" + str(offsetValue) + " invalid axis -not row or column"
        elif choice == "scale":
            if axes == "Row":
                matrix = ScaleOffset_row(matrix, scaleValue, 0.0)
                done = "scaling " + str(scaleValue) + " by row"
            elif axes == "Column":
                matrix = ScaleOffset_col(matrix, scaleValue, 0.0)
                done = "scaling " + str(scaleValue) + " by column"
            else:
                problem = "scaling " + str(scaleValue) + " invalid axis"
        elif choice == "transpose":
            matrix = Transpose(matrix)
            # Row and column labels trade places along with the data.
            col_labels, row_labels = og_rows, og_cols
            done = "transpose mxn matrix to nxm size"
        elif choice == "ln_normalization":
            # NOTE(review): the option is named "ln" but "log2" is requested
            # from Convert2Logs -- confirm which base is intended.
            matrix = Convert2Logs(matrix, "log2", offsetValue)
            done = "log2 plus " + str(offsetValue) + " normalization for all values"
        elif choice == "log_normalization":
            matrix = Convert2Logs(matrix, "log10", offsetValue)
            done = "log10 normalization for all values"
        elif choice == "rank":
            if axes == "Row":
                matrix = Rankdata_ByRow(matrix)
                done = "performed rank normalization by row"
            elif axes == "Column":
                matrix = Rankdata_ByColumn(matrix)
                done = "performed rank normalization by column"
            else:
                problem = "rank, invalid axis"
        elif choice == "divide_by_sum":
            if axes == "Row":
                matrix = Divide_By_Sum_row(matrix)
                done = "performed divide row N values by row N's sum"
            elif axes == "Column":
                matrix = Divide_By_Sum_col(matrix)
                done = "performed divide column N values by column N's sum"
            else:
                problem = "divide_by_sum, invalid axis"
        else:
            problem = "Invalid normalization Choice"

        if problem is not None:
            print(problem)
        else:
            labeler(matrix, col_labels, row_labels, args.output_file_txt)
            print(done)
    except Exception:
        traceback.print_exc()
        sys.exit(1)
def _report_and_exit(nonNumCnt, nanCnt, cell_count, lead):
    """Print the validation summary for a processed matrix and exit.

    nonNumCnt  -- count of cells that are neither numbers nor NAN identifiers.
    nanCnt     -- count of cells holding a known NAN identifier.
    cell_count -- total number of cells, used for the percentages.
    lead       -- prefix for the messages of the error branch ('' or '\\n').

    Exits with status -1 when ``nonNumCnt`` is positive, otherwise 0.
    """
    if nonNumCnt > 0:
        error_text = (
            lead
            + 'ERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '
            + str(nonNumCnt)
            + ', %.2f' % (100.0 * nonNumCnt / (1.0 * cell_count))
            + '%')
        # The same text goes to both streams.
        print(error_text)
        sys.stderr.write(error_text)
        if nanCnt > 0:
            print(lead + 'WARNING Matrix has ' + str(nanCnt)
                  + ' that is %.2f' % (100.0 * nanCnt / (1.0 * cell_count))
                  + '% known NAN identifiers')
        sys.exit(-1)

    if nanCnt > 0:
        print('\nWARNING Matrix has ' + str(nanCnt)
              + ' that is %.2f' % (100.0 * nanCnt / (1.0 * cell_count))
              + '% known NAN identifiers')
    else:
        print('\nMatrix is Good-to-Go -- all numbers in matrix. ')
    sys.exit(0)


def main():
    """Replace NAN cells of the input matrix and report what was found.

    ``args.replacement`` selects the strategy: ``"Mean"`` (per ``"Row"`` or
    ``"Column"`` according to ``args.axes``) or ``"Zero"``.  The processed
    matrix is written to ``args.output_file_txt`` through ``Labeler`` and a
    summary is printed.

    Exit status: 0 when no unknown strings were counted, -1 when some were,
    -2 for an unknown replacement or (with ``"Mean"``) an unknown axis.
    """
    args = get_args()
    matrix, og_cols, og_rows = reader(args.input_file_txt)

    if args.replacement == "Mean":
        if args.axes == "Row":
            matrix, nonNumCnt, nanCnt = nan_replacer_mean_rows(matrix)
            lead = ''    # the row variant prints its error lines without '\n'
        elif args.axes == "Column":
            matrix, nonNumCnt, nanCnt = nan_replacer_mean_columns(matrix)
            lead = '\n'
        else:
            complaint = 'Mean, but given Invalid Axis= ' + str(args.axes)
            print(complaint)
            sys.stderr.write(complaint)
            # Always fail here so a bad axis can never look like success.
            sys.exit(-2)
    elif args.replacement == "Zero":
        matrix, nonNumCnt, nanCnt = nan_replacer_zero(matrix)
        lead = '\n'
    else:
        # The old text blamed the axis ('zero, but given Invalid Axis= ...')
        # although it is the replacement choice that is unrecognised.
        complaint = 'Invalid replacement choice = ' + str(args.replacement)
        print(complaint)
        sys.stderr.write(complaint)
        sys.exit(-2)

    Labeler(matrix, og_cols, og_rows, args.output_file_txt)
    _report_and_exit(nonNumCnt, nanCnt, len(og_cols) * len(og_rows), lead)
#----------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: read two labelled matrices and check that the
    # first one's columns line up with the second one's rows.
    # TODO address NANs ???
    try:
        args = get_args()
        selection = args.choice
        matrix1, column_labels1, row_labels1 = reader(args.input_file1)  # to be transposed later
        matrix2, column_labels2, row_labels2 = reader(args.input_file2)

        if args.transpose == 'y' or args.input_file1 == args.input_file2:
            matrix1 = Transpose(matrix1)
            print("\n>>>NOTICE Transposed first matrix so matrix 1 columns = Matrix 2 number rows ")
            # Swap the labels so they still describe the transposed matrix.
            row_labels1, column_labels1 = column_labels1, row_labels1

        MatchLabels(column_labels1, row_labels2)  # verify labels and their order match

        if len(column_labels1) != len(row_labels2):
            print("\n>>> ERROR attempting to multiple Matrices of incompatible dimensions ")
            # Uses row_labels2: the previously referenced name og_row2 is
            # never assigned in this script and raised NameError here.
            print("First Matrix is " + str(len(row_labels1)) + " by " + str(len(column_labels1))
                  + " where second Matrix is " + str(len(row_labels2)) + " by " + str(len(column_labels2)) + "\n")
        # NOTE(review): the visible source stops at this point, inside the
        # try block and without a handler; whatever followed the dimension
        # check is not shown and has not been reconstructed.
    except Exception:
        # Handler mirrors the main() entry points in this source -- TODO
        # confirm against the complete script.
        traceback.print_exc()
        sys.exit(-3)
def _exit_nothing_filtered(stdout_msg, stderr_msg):
    """Report that a filter removed nothing, then exit with status -1."""
    print(stdout_msg)
    sys.stderr.write(stderr_msg)
    sys.exit(-1)


def _exit_invalid_axes(axes):
    """Report an unrecognised axis selection, then exit with status -1."""
    print('Invalid Axes = ' + str(axes))
    sys.exit(-1)


def main():
    """Filter rows or columns of the input matrix and write the result.

    ``args.choice`` selects the filter (``VariancePercent``/``VarianceCount``,
    ``LowerLimit``, ``UpperLimit``, ``MADcount``/``MADpercent``,
    ``NANlimit``/``NANpercent``), ``args.axes`` selects ``"Row"`` or
    ``"Column"`` and ``args.thresh`` is the threshold.  The filtered matrix is
    written to ``args.output_file_txt`` through ``Labeler`` before the outcome
    is reported.

    Exit status: -4 when the threshold is below 0.000001, -1 when nothing was
    filtered or the axis is unknown, -2 for an unknown choice, -3 when any
    other exception escapes (its traceback is printed first).
    """
    try:
        args = get_args()
        nanList = ["NAN", "NA", "N/A", "-", "?", "nan", "na", "n/a"]
        matrix, column_header_list, row_header_list = reader(args.input_file_txt)

        threshold = float(args.thresh)
        if threshold < 0.000001:
            print('Invalid negative or near-zero threshold chosen = ' + str(args.thresh) + " choose positive value")
            sys.exit(-4)

        choice = args.choice
        axes = args.axes
        out_path = args.output_file_txt
        shown = str(args.thresh)   # threshold exactly as the user typed it

        # ---- variance percentile / count ---------------------------------
        if choice in ("VariancePercent", "VarianceCount"):
            if axes == "Row":
                if choice == "VarianceCount":
                    # Convert a count into the percentile the filter expects.
                    threshold = (1 - threshold / len(row_header_list)) * 100.0
                matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = Variance_Percent_Filter_row(
                    matrix, threshold, row_header_list, column_header_list)
                Labeler(matrix, filter_cols, filter_rows, out_path)
                if delCnt < 1:
                    _exit_nothing_filtered(
                        '\nNO Filtering occurred for rows using variance percentile < ' + shown
                        + ' by row. Matrix row minimum variance= %.2f' % minVal
                        + ' and maximum variance= %.2f' % maxVal,
                        '\nFiltering out rows using variance percentile < ' + shown
                        + ' removed ' + str(delCnt) + ' rows')
                else:
                    print('\nFiltering out rows using variance percentile < ' + shown
                          + ' removed ' + str(delCnt) + ' rows')
            elif axes == "Column":
                if choice == "VarianceCount":
                    threshold = (1 - threshold / len(column_header_list)) * 100.0
                matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = Variance_Percent_Filter_col(
                    matrix, threshold, row_header_list, column_header_list)
                Labeler(matrix, filter_cols, filter_rows, out_path)
                if delCnt < 1:
                    _exit_nothing_filtered(
                        '\nNO Filtering occurred for columns using variance percentile < ' + shown
                        + ' by columns. Matrix columns minimum variance= %.2f' % minVal
                        + ' and maximum variance= %.2f' % maxVal,
                        '\nNO Filtering out rows using variance percentile < ' + shown
                        + ' removed ' + str(delCnt) + ' rows')
                else:
                    print('\nFiltering out columns using variance percentile < ' + shown
                          + ' removed ' + str(delCnt) + ' columns')
            else:
                # Previously this message printed the threshold, not the axis.
                _exit_invalid_axes(axes)

        # ---- lower limit --------------------------------------------------
        elif choice == "LowerLimit":
            if axes == "Row":
                matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = UpperLowerLimit_Filter_Row(
                    'lower', matrix, threshold, row_header_list, column_header_list)
                Labeler(matrix, filter_cols, filter_rows, out_path)
                if delCnt < 1:
                    _exit_nothing_filtered(
                        '\nNO Filtering occurred for rows using LowerLimit < ' + shown
                        + ' by row. Matrix row minimum range= %.2f' % minVal
                        + ' and maximum range= %.2f' % maxVal,
                        '\nNO Filtering out rows using LowerLimit < ' + shown
                        + ' removed ' + str(delCnt) + ' rows')
                else:
                    print('\nFiltered out ' + str(delCnt) + ' rows with Lower Limit < ' + shown)
            elif axes == "Column":
                matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = UpperLowerLimit_Filter_Col(
                    'lower', matrix, threshold, row_header_list, column_header_list)
                Labeler(matrix, filter_cols, filter_rows, out_path)
                if delCnt < 1:
                    _exit_nothing_filtered(
                        '\nNO Filtering occurred for columns using Lower Limit < ' + shown
                        + ' by columns. Matrix columns minimum range= %.2f' % minVal
                        + ' and maximum range= %.2f' % maxVal,
                        '\nNO Filtering out rows using Lower Limit < ' + shown
                        + ' removed ' + str(delCnt) + ' rows')
                else:
                    print('\nFiltered out ' + str(delCnt) + ' columns with Lower Limit < ' + shown)
            else:
                # An unknown axis used to fall through without any output.
                _exit_invalid_axes(axes)

        # ---- upper limit --------------------------------------------------
        elif choice == "UpperLimit":
            if axes == "Row":
                matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = UpperLowerLimit_Filter_Row(
                    'upper', matrix, threshold, row_header_list, column_header_list)
                Labeler(matrix, filter_cols, filter_rows, out_path)
                if delCnt < 1:
                    _exit_nothing_filtered(
                        '\nNO Filtering occurred for rows using Upper Limit < ' + shown
                        + ' by row. Matrix row minimum range= %.2f' % minVal
                        + ' and maximum range= %.2f' % maxVal,
                        '\nNO Filtering out rows using Upper Limit < ' + shown
                        + ' by row. Matrix row minimum range= %.2f' % minVal
                        + ' and maximum range= %.2f' % maxVal)
                else:
                    print('\nFiltered out ' + str(delCnt) + ' rows with UpperLimit < ' + shown)
            elif axes == "Column":
                matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = UpperLowerLimit_Filter_Col(
                    'upper', matrix, threshold, row_header_list, column_header_list)
                Labeler(matrix, filter_cols, filter_rows, out_path)
                if delCnt < 1:
                    _exit_nothing_filtered(
                        '\nNO Filtering occurred for columns using UpperLimit < ' + shown
                        + ' by columns. Matrix columns minimum range= %.2f' % minVal
                        + ' and maximum range= %.2f' % maxVal,
                        '\nFiltering out rows using UpperLimit < ' + shown
                        + ' by columns. Matrix columns minimum range= %.2f' % minVal
                        + ' and maximum range= %.2f' % maxVal)
                else:
                    print('\nFiltered out ' + str(delCnt) + ' columns with UpperLimit < ' + shown)
            else:
                _exit_invalid_axes(axes)

        # ---- median absolute deviation -------------------------------------
        elif choice in ("MADcount", "MADpercent"):
            if axes == "Row":
                if choice == "MADpercent":
                    threshold = len(row_header_list) * threshold / 100.0
                matrix, filter_rows, filter_cols, delCnt, maxVal = Row_Value_MAD(
                    matrix, threshold, row_header_list, column_header_list)
                Labeler(matrix, filter_cols, filter_rows, out_path)
                if delCnt < 1:
                    _exit_nothing_filtered(
                        '\nNO Filtering occurred for rows using MAD < ' + str(threshold)
                        + ' by row. Matrix row MAD maximum value= %.2f' % maxVal,
                        '\nFiltering out rows using MAD < ' + str(threshold)
                        + ' by row. Matrix row MAD maximum value= %.2f' % maxVal)
                else:
                    print('\nFiltered out ' + str(delCnt) + ' rows using MAD maximum value > ' + str(threshold))
            elif axes == "Column":
                if choice == "MADpercent":
                    threshold = len(column_header_list) * threshold / 100.0
                matrix, filter_rows, filter_cols, delCnt, maxVal = Col_Value_MAD(
                    matrix, threshold, row_header_list, column_header_list)
                Labeler(matrix, filter_cols, filter_rows, out_path)
                if delCnt < 1:
                    _exit_nothing_filtered(
                        '\nNO Filtering occurred for columns using MAD < ' + str(threshold)
                        + ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal,
                        '\nFiltering out columns using MAD < ' + str(threshold)
                        + ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal)
                else:
                    print('\nFiltered out ' + str(delCnt) + ' columns using MAD maximum value > ' + str(threshold))
            else:
                _exit_invalid_axes(axes)

        # ---- NAN count / percent -------------------------------------------
        elif choice in ("NANlimit", "NANpercent"):
            maxNANs = int(args.thresh)
            val = ' '
            if choice == "NANpercent":
                # NOTE(review): the percentage is taken of the first matrix
                # dimension for both axes -- confirm that is intended.
                n, _ = np.shape(matrix)
                maxNANs = int(int(args.thresh) * n / 100)
                val = '%'
            if axes == "Row":
                matrix, filter_rows, filter_cols, delCnt, maxFoundNANs = NAN_Filter_Row(
                    matrix, nanList, maxNANs, row_header_list, column_header_list)
                Labeler(matrix, filter_cols, filter_rows, out_path)
                if delCnt < 1:
                    _exit_nothing_filtered(
                        '\nNO Filtering occurred for rows using NAN limit = or > ' + shown + val
                        + ' by row. Matrix row max NAN count is =' + str(maxFoundNANs),
                        '\nNO Filtering out rows using NAN limit = or > ' + shown + val
                        + ' by row. Matrix row max NAN count is =' + str(maxFoundNANs))
                else:
                    print('\nFiltered out ' + str(delCnt) + ' rows using NAN limit = or > ' + shown + val)
            elif axes == "Column":
                matrix, filter_rows, filter_cols, delCnt, maxFoundNANs = NAN_Filter_Column(
                    matrix, nanList, maxNANs, row_header_list, column_header_list)
                Labeler(matrix, filter_cols, filter_rows, out_path)
                if delCnt < 1:
                    _exit_nothing_filtered(
                        '\nNO Filtering occurred for columns using NAN limit = or > ' + shown + val
                        + ' by columns. Matrix columns max NAN count is = ' + str(maxFoundNANs),
                        '\nNO Filtering out columns using NAN limit = or > ' + shown + val
                        + ' by columns. Matrix columns max NAN count is = ' + str(maxFoundNANs))
                else:
                    print('\nFiltered out ' + str(delCnt) + ' columns using NAN limit = or > ' + shown + val)
            else:
                _exit_invalid_axes(axes)

        else:
            print("Invalid Filter Choice = " + str(choice))
            sys.exit(-2)
    except Exception:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the explicit exit codes above pass through untouched.
        traceback.print_exc()
        sys.exit(-3)