Example #1
import os

import numpy as np

# `jf` is a project-local helper module used by these examples
# (loadTargetsStrFromFirstCol, loadRPKMMatrix, saveRPKMMatrix).
import jf


def svd(args):
    filename = args.rpkm_matrix
    f_dir = os.path.dirname(filename)
    if f_dir != '':
        f_dir = f_dir + '/'

    output = filename
    if args.output:
        output = f_dir + str(args.output)

    # count the number of columns from the header line of the data file
    with open(filename) as f:
        temp = f.readline().strip().split('\t')
    colsnum = len(temp)

    # skip the header row and the first 4 annotation columns
    print('Loading file...')
    data = np.loadtxt(filename, dtype=float, delimiter='\t', skiprows=1, usecols=range(4, colsnum))
    # target identifiers from the first column
    targets = jf.loadTargetsStrFromFirstCol(filename)
    # sample names from the header
    samples = temp[4:]

    print('SVD...')
    U, S, Vt = np.linalg.svd(data, full_matrices=False)

    # keep only singular values below 0.7 * mean(S); the dominant
    # components above that threshold are zeroed out
    index = S < 0.7 * np.mean(S)
    new_S = np.diag(S * index)

    # reconstruct the data matrix from the filtered singular values
    data_new = np.dot(U, np.dot(new_S, Vt))

    # save the SVD factors to separate files
    print('Saving SVD files...')
    np.savetxt(output + '.U', U, delimiter='\t')
    np.savetxt(output + '.S', S, delimiter='\t')
    np.savetxt(output + '.Vt', Vt, delimiter='\t')

    print('Saving matrix...')
    jf.saveRPKMMatrix(output + '.SVD', samples, targets, np.transpose(data_new))
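
A small standalone illustration (toy data, not from the original tool) of the filtering step above: the boolean mask keeps only singular values below 0.7 * mean(S), so the reconstruction retains the weak components and discards the dominant ones.

import numpy as np

# toy 4x3 matrix standing in for the RPKM data
data = np.array([[1.0, 2.0, 3.0],
                 [2.0, 4.0, 6.1],
                 [1.0, 0.0, 1.0],
                 [0.0, 1.0, 1.2]])

U, S, Vt = np.linalg.svd(data, full_matrices=False)
keep = S < 0.7 * np.mean(S)                      # True only for the weaker singular values
reconstructed = np.dot(U, np.dot(np.diag(S * keep), Vt))
print(S)              # singular values, largest first
print(reconstructed)  # what remains after zeroing the dominant components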
Example #2
import os

import numpy as np

import jf  # project-local helpers, as in Example #1


def normalize(args):
    rpkm_matrix = str(args.rpkm_matrix)
    raw_dir = os.path.dirname(rpkm_matrix)
    if raw_dir != '':
        raw_dir = raw_dir + '/'

    output = rpkm_matrix + '.normalized'
    if args.output:
        output = raw_dir + str(args.output) + '.normalized'

    print('Loading matrix...')
    result = jf.loadRPKMMatrix(rpkm_matrix)
    rpkm = result['rpkm']
    samples = result['samples']
    annotation = result['annotation']
    targets = result['targets']
    targets_str = result['targets_str']

    GC_percentage = annotation[:, 0]
    map_ability = annotation[:, 1]
    exon_length = annotation[:, 2]

    # group exon indices by GC percentage
    GC_index = {}
    for ind in range(len(GC_percentage)):
        gc = GC_percentage[ind]
        if gc in GC_index:
            GC_index[gc].append(ind)
        else:
            GC_index[gc] = [ind]

    print('Normalizing by GC percentage...')
    corrected_rpkm = np.zeros(rpkm.shape, dtype=float)
    for i in range(len(samples)):
        print('Normalizing RPKM by GC content for sample: ' + samples[i])
        overall_median = np.median(rpkm[:, i])
        for gc in GC_index.keys():
            t_ind = GC_index[gc]
            t_median = np.median(rpkm[t_ind, i])
            if t_median == 0:
                print('WARNING. Median == 0, sample: %s, GC: %d' % (samples[i], gc))
                corrected_rpkm[t_ind, i] = 0
            else:
                corrected_rpkm[t_ind, i] = rpkm[t_ind, i] * overall_median / t_median

    print('Saving GC normalized matrix...')
    np.savetxt(output + '.GC', corrected_rpkm, delimiter='\t', header='\t'.join(samples), comments='')

    # group exon indices by mapping ability
    map_index = {}
    for ind in range(len(map_ability)):
        _map = map_ability[ind]
        if _map in map_index:
            map_index[_map].append(ind)
        else:
            map_index[_map] = [ind]

    print('Normalizing by mapping ability...')
    for i in range(len(samples)):
        print('Normalizing RPKM by mapping ability for sample %s' % samples[i])
        overall_median = np.median(corrected_rpkm[:, i])
        for _map in map_index.keys():
            t_ind = map_index[_map]
            t_median = np.median(corrected_rpkm[t_ind, i])
            if t_median == 0:
                print('WARNING. Median == 0, sample: %s, mapping ability: %s' % (samples[i], _map))
                corrected_rpkm[t_ind, i] = 0
            else:
                corrected_rpkm[t_ind, i] = corrected_rpkm[t_ind, i] * overall_median / t_median

    print('Saving mappability normalized matrix...')
    np.savetxt(output + '.MAP', corrected_rpkm, delimiter='\t', header='\t'.join(samples), comments='')

    # group exon indices by exon length
    length_index = {}
    for ind in range(len(exon_length)):
        _length = exon_length[ind]
        if _length in length_index:
            length_index[_length].append(ind)
        else:
            length_index[_length] = [ind]

    print('Normalizing by exon length...')
    for i in range(len(samples)):
        print('Normalizing RPKM by exon length for sample %s' % samples[i])
        overall_median = np.median(corrected_rpkm[:, i])
        for _length in length_index.keys():
            t_ind = length_index[_length]
            t_median = np.median(corrected_rpkm[t_ind, i])
            if t_median == 0:
                print('WARNING. Median == 0, sample: %s, exon length: %d' % (samples[i], _length))
                corrected_rpkm[t_ind, i] = 0
            else:
                corrected_rpkm[t_ind, i] = corrected_rpkm[t_ind, i] * overall_median / t_median

    print('Saving exon_length normalized matrix...')
    np.savetxt(output + '.exon_length', corrected_rpkm, delimiter='\t', header='\t'.join(samples), comments='')

    print('Saving matrix...')
    jf.saveRPKMMatrix(output, samples, targets_str, np.transpose(corrected_rpkm))
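
To make the per-bin scaling rule concrete, here is a toy illustration (not part of the original code) of the median-ratio correction used for each GC, mapping-ability, and exon-length bin: values in a bin are multiplied by overall_median / bin_median, which pulls that bin's median up or down to the sample-wide median.

import numpy as np

# one sample's RPKM column; pretend rows 0-2 share the same GC percentage
rpkm_col = np.array([2.0, 4.0, 6.0, 10.0, 12.0])
bin_rows = [0, 1, 2]

overall_median = np.median(rpkm_col)           # 6.0
bin_median = np.median(rpkm_col[bin_rows])     # 4.0

corrected = rpkm_col.copy()
corrected[bin_rows] = rpkm_col[bin_rows] * overall_median / bin_median
print(corrected)   # [ 3.  6.  9. 10. 12.] -- the bin's median now equals 6.0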