def svd(args):
    """Filter an RPKM matrix through an SVD and write the results to disk.

    The input is the tab-delimited file ``args.rpkm_matrix``.  Its first row
    is a header and its first four columns are skipped when the numeric data
    is loaded; header fields from index 4 onwards are used as sample names.

    Singular values that are not strictly below ``0.7 * mean(S)`` are set to
    zero before the matrix is reconstructed, i.e. the strongest components
    are the ones removed.
    NOTE(review): presumably those components represent systematic noise --
    confirm the threshold with the method's author.

    Files written (``<out>`` is ``args.rpkm_matrix``, or ``args.output``
    placed in the input file's directory when ``args.output`` is set):

    * ``<out>.U``, ``<out>.S``, ``<out>.Vt`` -- the *unfiltered* SVD factors.
    * ``<out>.SVD`` -- the reconstructed matrix, transposed and handed to
      ``jf.saveRPKMMatrix`` together with the sample and target names.

    Args:
        args: object exposing ``rpkm_matrix`` (input path) and ``output``
            (optional output base name; falsy means "derive from input").
    """
    filename = args.rpkm_matrix
    f_dir = os.path.dirname(filename)
    if f_dir != '':
        f_dir = f_dir + '/'
    output = filename
    if args.output:
        output = f_dir + str(args.output)

    # Only the header line is needed to count columns; the context manager
    # guarantees the handle is released (it used to be left open).
    with open(filename) as header_file:
        header = header_file.readline().strip().split('\t')
    colsnum = len(header)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Loading file...')
    # Skip the header row and the first four columns.  The builtin float
    # replaces the np.float alias, which newer NumPy releases no longer ship.
    data = np.loadtxt(filename, dtype=float, delimiter='\t', skiprows=1,
                      usecols=list(range(4, colsnum)))

    # Target labels come from the project helper; sample names from the header.
    targets = jf.loadTargetsStrFromFirstCol(filename)
    samples = header[4:]

    print('SVD...')
    U, S, Vt = np.linalg.svd(data, full_matrices=False)
    # Boolean mask: True for singular values to keep (the weaker ones).
    keep = S < 0.7 * np.mean(S)
    new_S = np.diag(S * keep)
    # Rebuild the data matrix from the filtered singular values.
    data_new = np.dot(U, np.dot(new_S, Vt))

    print('Saving SVD files...')
    # Each factor goes to its own file; 'with' closes the file even when
    # savetxt raises part-way through.
    for suffix, factor in (('.U', U), ('.S', S), ('.Vt', Vt)):
        with open(output + suffix, 'w') as handle:
            np.savetxt(handle, factor, delimiter='\t')

    print('Saving matrix..')
    jf.saveRPKMMatrix(output + '.SVD', samples, targets,
                      np.transpose(data_new))
def _group_rows_by_value(values):
    """Map each distinct entry of *values* to the list of row indices holding it."""
    groups = {}
    for row, value in enumerate(values):
        groups.setdefault(value, []).append(row)
    return groups


def _median_normalize(source, groups, samples, progress_fmt, warning_fmt):
    """Return *source* with every row group rescaled to the per-sample overall median.

    For each sample column, each group's values are multiplied by
    ``overall_median / group_median``.  A group whose median is 0 is set to 0
    and a warning (``warning_fmt % (sample, group_value)``) is printed.
    """
    # np.shape also copes with a matrix that has no rows, where indexing
    # row 0 to discover the width would raise.
    normalized = np.zeros(np.shape(source), dtype=float)
    for col, sample in enumerate(samples):
        print(progress_fmt % sample)
        overall_median = np.median(source[:, col])
        for value, rows in groups.items():
            group_median = np.median(source[rows, col])
            if group_median == 0:
                print(warning_fmt % (sample, value))
                normalized[rows, col] = 0
            else:
                normalized[rows, col] = (
                    source[rows, col] * overall_median / group_median)
    return normalized


def _save_sample_matrix(path, matrix, samples):
    """Write *matrix* to *path* as tab-delimited text with a sample-name header row."""
    with open(path, 'w') as handle:
        np.savetxt(handle, matrix, delimiter='\t',
                   header='\t'.join(samples), comments='')


def normalize(args):
    """Median-normalise an RPKM matrix by GC, mappability and exon length.

    The matrix is loaded with ``jf.loadRPKMMatrix(args.rpkm_matrix)``; the
    returned mapping must provide ``'rpkm'``, ``'samples'``, ``'annotation'``
    and ``'targets_str'``.  Annotation columns 0, 1 and 2 are used as GC
    percentage, mapping ability and exon length respectively.

    Three passes run in sequence, each working on the previous pass's output.
    Rows sharing the same annotation value form a group, and within every
    sample column each group is rescaled so that its median equals the
    column's overall median.

    Files written (``<out>`` is ``<input>.normalized``, or
    ``<input dir>/<args.output>.normalized`` when ``args.output`` is set):

    * ``<out>.GC``, ``<out>.MAP``, ``<out>.exon_length`` -- the matrix after
      each pass.
    * ``<out>`` -- the final matrix, transposed and handed to
      ``jf.saveRPKMMatrix`` with the sample and target names.

    Args:
        args: object exposing ``rpkm_matrix`` (input path) and ``output``
            (optional output base name; falsy means "derive from input").
    """
    rpkm_matrix = str(args.rpkm_matrix)
    raw_dir = os.path.dirname(rpkm_matrix)
    if raw_dir != '':
        raw_dir = raw_dir + '/'
    output = rpkm_matrix + '.normalized'
    if args.output:
        output = raw_dir + str(args.output) + '.normalized'

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Loading matrix...')
    result = jf.loadRPKMMatrix(rpkm_matrix)
    rpkm = result['rpkm']
    samples = result['samples']
    annotation = result['annotation']
    targets_str = result['targets_str']

    # One entry per pass: (annotation column, banner, per-sample progress
    # message, zero-median warning, save banner, output suffix).
    # NOTE(review): the warnings format the group value with %d, which
    # truncates fractional values; kept unchanged so log output stays the same.
    passes = (
        (0,
         'Normalizing by GC percentage...',
         'Normalizing RPKM by GC content for sample: %s',
         'WARNING. Median == 0, sample: %s, GC: %d',
         'Saving GC normalized matrix..',
         '.GC'),
        (1,
         'Normalizing by Mapping ability...',
         'Normalizing RPKM by mapping ability for sample %s',
         'WARNING. Median == 0, sample: %s, Mapping ability: %d',
         'Saving Mappability normalized matrix..',
         '.MAP'),
        (2,
         'Normalizing by exon length...',
         'Normalizing RPKM for by exon length for sample %s',
         'WARNING. Median == 0, sample: %s, Exome length: %d',
         'Saving exon_length normalized matrix..',
         '.exon_length'),
    )

    corrected_rpkm = rpkm
    for column, banner, progress_fmt, warning_fmt, save_banner, suffix in passes:
        groups = _group_rows_by_value(annotation[:, column])
        print(banner)
        corrected_rpkm = _median_normalize(
            corrected_rpkm, groups, samples, progress_fmt, warning_fmt)
        print(save_banner)
        _save_sample_matrix(output + suffix, corrected_rpkm, samples)

    print('Saving matrix..')
    jf.saveRPKMMatrix(output, samples, targets_str,
                      np.transpose(corrected_rpkm))