Example no. 1
def precompile_clusters_sam_nobcs(rawdata_file,write_output):

    rawdata = []
    try:
        with open(rawdata_file, 'rb') as csvfile:
            print 'opened csv'
            csvreader = csv.reader(csvfile, delimiter=',')
            rawdata = [row for row in csvreader]
    except:
        sys_ops.throw_exception("Could not open file " + rawdata_file)
        sys.exit(1) 

    rawdata_sort = sorted(rawdata)
    chrids = map(itemgetter(0), rawdata_sort)

    #build a [start, end) interval of rows in rawdata_sort for each chromosome id
    ct = 0
    unique_chrids = {}
    for chrid in chrids:
        if chrid not in unique_chrids:
            unique_chrids[chrid] = [ct,ct+1]
        else:
            unique_chrids[chrid][1] = ct+1

        ct+=1
    if write_output:
        print 'writing clusters' 
        for chrid in unique_chrids:
            out_file = rawdata_file.replace('.csv','_'+str(chrid)+'.csv')
            with open(out_file,'w') as csvfile:
                mywriter = csv.writer(csvfile, delimiter=',')
                for row in rawdata_sort[unique_chrids[chrid][0]:unique_chrids[chrid][1]]:
                    #ct here is the total number of input rows (it appears to be read back as the repreadct column in Example no. 15)
                    mywriter.writerow(row+[ct])
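A minimal sketch of the interval bookkeeping above, on made-up rows; it only illustrates how unique_chrids maps each chromosome id to a [start, end) slice of the sorted data and is not part of the pipeline:

# hypothetical sorted rows of the form [chrid, position]
rawdata_sort = [['chr1', '10'], ['chr1', '42'], ['chr2', '7']]

unique_chrids = {}
for ct, row in enumerate(rawdata_sort):
    chrid = row[0]
    if chrid not in unique_chrids:
        unique_chrids[chrid] = [ct, ct + 1]
    else:
        unique_chrids[chrid][1] = ct + 1

print(unique_chrids)  # {'chr1': [0, 2], 'chr2': [2, 3]}
print(rawdata_sort[unique_chrids['chr1'][0]:unique_chrids['chr1'][1]])  # [['chr1', '10'], ['chr1', '42']]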
Example no. 2
def fastq_get_unique_bcs_inds(bc_fastqfile, BC_READS_THRESHOLD):
    try:
        print "fastq_get_unique_bcs, opening " + bc_fastqfile
        bc_handle = open(bc_fastqfile, "rU")
    except:
        sys_ops.throw_exception("Could not find file " + bc_fastqfile)
        return

    #read all unique bcs into list and record counts for each bc
    ct = 0
    BCSEQUNIQUE = {}
    for record in SeqIO.parse(bc_handle, "fastq"):
        bc = str(record.seq)
        #group read indices by barcode sequence (keyed by hash of the sequence)
        if hash(bc) not in BCSEQUNIQUE:
            BCSEQUNIQUE[hash(bc)] = [[], bc]
        BCSEQUNIQUE[hash(bc)][0].append(ct)
        ct += 1

    bc_handle.close()

    #keep only barcodes supported by at least BC_READS_THRESHOLD reads (bc[0] is the list of read indices)
    BCSEQ = []
    for bchash in BCSEQUNIQUE:
        bc = BCSEQUNIQUE[bchash]
        if len(bc[0]) >= BC_READS_THRESHOLD:
            BCSEQ.append(bc)

    print "number of reads containing bc for clustering:"
    print ct

    return BCSEQ
Example no. 3
def write_sams_location_orientation(sam_filename,sampid,outfile_clust_ts,outfile_clust_bs,CLUSTER_READS_THRESHOLD,write_output):

    print "reading input files"
    try:
        samfile = pysam.Samfile(sam_filename)
    except:
        sys_ops.throw_exception("Could not find file "+sam_filename)
        return          

    sam_alignments_ts = []
    sam_alignments_bs = []
    with open(outfile_clust_bs,'w') as csvfile_bs:
        mywriter_bs = csv.writer(csvfile_bs, delimiter=',')   
        with open(outfile_clust_ts,'w') as csvfile_ts:
            mywriter_ts = csv.writer(csvfile_ts, delimiter=',')         
            for read in samfile.fetch():

                #check that read is mapped (mapq>0) and then check read orientation
                if read.mapq>0:
                    if read.is_reverse:
                        #using len(read.seq) is not entirely accurate (should make function to parse cigar string and output mapping length)
                        mywriter_bs.writerow([read.rname,(read.aend-1),(read.aend-1),(read.aend-1),1,1,0,0,1,hash(sampid)])
                    else:
                        mywriter_ts.writerow([read.rname,read.pos,read.pos,read.pos,1,1,0,0,0,hash(sampid)])    
    samfile.close()

    return 1
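For orientation, the rows written above appear to line up with the columns read back by compile_clusters_sam_nobcs in Example no. 15 (itemgetter(0) through itemgetter(9) there); the column names below are borrowed from that function, so this mapping is an inference rather than part of the original code:

# apparent layout of each *_sam_clusters_{ts,bs}.csv row:
#   0 chrid     read.rname (reference id)
#   1 chrlb     read.pos (top strand) or read.aend - 1 (bottom strand)
#   2 chrub     same position repeated for a single read
#   3 chrmed    same position repeated
#   4 readct    1 (one read per row at this stage)
#   5 uniquect  1
#   6 empty1    0 (placeholder)
#   7 empty2    0 (placeholder)
#   8 orientid  0 = top strand, 1 = bottom strand (read.is_reverse)
#   9 sampid    hash(sampid)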
Example no. 4
def fastq_get_unique_bcs_inds(bc_fastqfile,BC_READS_THRESHOLD):
    try:
        print "fastq_get_unique_bcs, opening "+bc_fastqfile
        bc_handle = open(bc_fastqfile, "rU")
    except:
        sys_ops.throw_exception("Could not find file "+bc_fastqfile)
        return      
    
    #read all unique bcs into list and record counts for each bc
    ct = 0
    BCSEQUNIQUE = {}
    for record in SeqIO.parse(bc_handle, "fastq"):
        bc = str(record.seq)
        #group read indices by barcode sequence (keyed by hash of the sequence)
        if hash(bc) not in BCSEQUNIQUE:
            BCSEQUNIQUE[hash(bc)] = [[], bc]
        BCSEQUNIQUE[hash(bc)][0].append(ct)
        ct+=1

    bc_handle.close()

    #keep only barcodes supported by at least BC_READS_THRESHOLD reads (bc[0] is the list of read indices)
    BCSEQ = []
    for bchash in BCSEQUNIQUE:
        bc = BCSEQUNIQUE[bchash]
        if len(bc[0])>=BC_READS_THRESHOLD:
            BCSEQ.append(bc)

    print "number of reads containing bc for clustering:"
    print ct

    return BCSEQ
Example no. 5
def compile_clusters_sam_nobcs(rawdata_filename):
    #assemble output for parallel clustering jobs

    outfile = rawdata_filename.replace('.csv','c.csv')
    
    finished = post_align_ops_v204.compile_clusters_sam_nobcs(rawdata_filename,outfile,SAMPLE_CLUSTER_READS_THRESHOLD,write_cluster_output)
    if not os.path.isfile(outfile):
        sys_ops.throw_exception("Failed at compile_clusters_sam on processing " + outfile)
        return
Example no. 6
def compile_clusters_sam_nobcs(rawdata_filename):
    #assemble output for parallel clustering jobs

    outfile = rawdata_filename.replace('.csv', 'c.csv')

    finished = post_align_ops_v204.compile_clusters_sam_nobcs(
        rawdata_filename, outfile, SAMPLE_CLUSTER_READS_THRESHOLD,
        write_cluster_output)
    if not os.path.isfile(outfile):
        sys_ops.throw_exception(
            "Failed at compile_clusters_sam on processing " + outfile)
        return
Example no. 7
def fastq_get_seq_dict(fastqfile):
    try:
        fq_handle = open(fastqfile, "rU")
    except:
        sys_ops.throw_exception("Could not find file " + fastqfile)
        return

    ct = 0
    READS = {}
    for record in SeqIO.parse(fq_handle, "fastq"):
        READS[hash(record.name)] = str(record.seq)

    fq_handle.close()

    return READS
Example no. 8
def fastq_get_seq_dict(fastqfile):
    try:
        fq_handle = open(fastqfile, "rU")
    except:
        sys_ops.throw_exception("Could not find file "+fastqfile)
        return  

    ct = 0
    READS = {}
    for record in SeqIO.parse(fq_handle, "fastq"):
        READS[hash(record.name)] = str(record.seq)
    
    fq_handle.close()

    return READS
Example no. 9
def run_sample_write_sam_location_orientation(sam_filename):
    #cluster sams, then ibcs for sam clusts, then lbcs for ibc clusts

    sam_filename2 = sam_filename.replace(sam_subpath,ibclbc_subpath)

    outfile_ts = sam_filename.replace('.sam','_sam_clusters_ts.csv')
    outfile_bs = sam_filename.replace('.sam','_sam_clusters_bs.csv')
    
    ############################################################
    sampid = sam_filename.split('.')[0]
    print sampid
    ############################################################

    finished = post_align_ops_v204.write_sams_location_orientation(sam_filename,sampid,outfile_ts,outfile_bs,SAMPLE_CLUSTER_READS_THRESHOLD,write_cluster_output)
    if (not os.path.isfile(outfile_ts)) or (not os.path.isfile(outfile_bs)):
        sys_ops.throw_exception("Failed at cluster_sams on processing " + outfile)
        return
Example no. 10
def cluster_bcs(BCSEQ, BC_THRESHOLD, BC_READS_THRESHOLD, BC_EXACT_MATCH,
                UMI_CLUSTER_METHOD):

    #aggregating all NSWMT sequences by sequence, just those that cluster in UMI space

    if 'explicit' in UMI_CLUSTER_METHOD:
        BCSEQC = threshold_cluster_uid_explicit(BCSEQ, BC_THRESHOLD)
    elif 'prelinked' in UMI_CLUSTER_METHOD:
        TEMP = threshold_cluster_uid_prelinked_setup(BCSEQ, BC_THRESHOLD)
        BCSEQC = threshold_cluster_uid_prelinked(TEMP, BC_THRESHOLD)
    else:
        sys_ops.throw_exception(
            "Options for cluster_bcs must be either 'explicit' or 'prelinked'. Exiting..."
        )
        return

    #print 'number of unique BC clusters before filtering:'
    #print len(BCSEQC)

    #filter out all clusters with fewer reads than the read threshold
    BCSEQC2 = []
    #iterate through all clusters and keep those that pass the threshold
    for bcs in BCSEQC:
        #if there is more than 1 barcode in the cluster, or more than 1 read in a single-bc cluster, the cluster is not junk
        if BC_EXACT_MATCH == 1:
            bcs2 = [bcs[0]]
        else:
            bcs2 = bcs

        ct = 0
        for bc in bcs2:
            ct = ct + bc[1]

            if (ct >= BC_READS_THRESHOLD):
                BCSEQC2.append(bcs2)
                break

    #print 'number of unique BC clusters after filtering:'
    #print len(BCSEQC2)

    return BCSEQC2
Example no. 11
def run_sample_write_sam_location_orientation(sam_filename):
    #cluster sams, then ibcs for sam clusts, then lbcs for ibc clusts

    sam_filename2 = sam_filename.replace(sam_subpath, ibclbc_subpath)

    outfile_ts = sam_filename.replace('.sam', '_sam_clusters_ts.csv')
    outfile_bs = sam_filename.replace('.sam', '_sam_clusters_bs.csv')

    ############################################################
    sampid = sam_filename.split('.')[0]
    print sampid
    ############################################################

    finished = post_align_ops_v204.write_sams_location_orientation(
        sam_filename, sampid, outfile_ts, outfile_bs,
        SAMPLE_CLUSTER_READS_THRESHOLD, write_cluster_output)
    if (not os.path.isfile(outfile_ts)) or (not os.path.isfile(outfile_bs)):
        sys_ops.throw_exception("Failed at cluster_sams on processing " +
                                outfile)
        return
Example no. 12
def cluster_bcs(BCSEQ,BC_THRESHOLD,BC_READS_THRESHOLD,BC_EXACT_MATCH,UMI_CLUSTER_METHOD):

    #aggregating all NSWMT sequences by sequence, just those that cluster in UMI space
    
    if 'explicit' in UMI_CLUSTER_METHOD:
        BCSEQC = threshold_cluster_uid_explicit(BCSEQ,BC_THRESHOLD)
    elif 'prelinked' in UMI_CLUSTER_METHOD:
        TEMP = threshold_cluster_uid_prelinked_setup(BCSEQ,BC_THRESHOLD)
        BCSEQC = threshold_cluster_uid_prelinked(TEMP,BC_THRESHOLD)
    else:
        sys_ops.throw_exception("Options for cluster_bcs must be either 'explicit' or 'prelinked'. Exiting...")
        return           

    #print 'number of unique BC clusters before filtering:'
    #print len(BCSEQC)

    #filter out all clusters with fewer reads than the read threshold
    BCSEQC2 = []
    #iterate through all clusters and keep those that pass the threshold
    for bcs in BCSEQC:
        #if there is more than 1 barcode in the cluster, or more than 1 read in a single-bc cluster, the cluster is not junk
        if BC_EXACT_MATCH==1:
            bcs2 = [bcs[0]]
        else:
            bcs2 = bcs

        ct = 0
        for bc in bcs2:
            ct = ct + bc[1]
        
            if (ct>=BC_READS_THRESHOLD):
                BCSEQC2.append(bcs2)
                break

    #print 'number of unique BC clusters after filtering:'
    #print len(BCSEQC2)

    return BCSEQC2 
Example no. 13
def get_params(readparam_fileName):
    readparam_file = []
    try:
        with open(readparam_fileName, 'rb') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',')
            readparam_file = [row for row in csvreader]
    except:
        sys_ops.throw_exception("Could not open read-params " +
                                readparam_fileName)
        sys.exit(1)
    ##############################

    f_barcodes = readparam_file[0]
    if '' in f_barcodes:
        temp_index = f_barcodes.index('')
        f_barcodes = f_barcodes[0:temp_index]
        for i in range(0, len(f_barcodes)):
            f_barcodes[i] = str(f_barcodes[i])

    r_barcodes = readparam_file[1]
    if '' in r_barcodes:
        temp_index = r_barcodes.index('')
        r_barcodes = r_barcodes[0:temp_index]
        for i in range(0, len(r_barcodes)):
            r_barcodes[i] = str(r_barcodes[i])

    groups = readparam_file[2]
    if '' in groups:
        temp_index = groups.index('')
        groups = groups[0:temp_index]
        for i in range(0, len(groups)):
            groups[i] = str(groups[i])


    ##############################
    return [f_barcodes, r_barcodes, groups]
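A minimal usage sketch of get_params, with a hypothetical file name and made-up barcode values; it only illustrates the three-row layout the function appears to expect (forward barcodes, reverse barcodes, group labels, each truncated at the first empty cell):

import csv

with open('read_params.csv', 'w') as csvfile:  # hypothetical input file
    mywriter = csv.writer(csvfile, delimiter=',')
    mywriter.writerow(['ACGT', 'TTAG', '', ''])      # row 0: forward barcodes
    mywriter.writerow(['GGCA', 'CATG', '', ''])      # row 1: reverse barcodes
    mywriter.writerow(['groupA', 'groupB', '', ''])  # row 2: group labels

print(get_params('read_params.csv'))
# expected: [['ACGT', 'TTAG'], ['GGCA', 'CATG'], ['groupA', 'groupB']]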
Example no. 14
def generate_sample_jobfile(param_filename, process_filename,
                            jobfile_filename):

    [f_barcodes, r_barcodes, groups] = get_params(param_filename)

    process_file = []
    try:
        with open(process_filename, 'rb') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',')
            process_file = [row for row in csvreader]
    except:
        sys_ops.throw_exception("Could not open process-list " +
                                process_filename)
        sys.exit(1)

    job_list = []
    for row in process_file:
        print row
        #columns in process_list_file must be, in order:
        f_barcode_checkList = [
            f_barcode in str(row[0]) for f_barcode in f_barcodes
        ]
        r_barcode_checkList = [
            r_barcode in str(row[1]) for r_barcode in r_barcodes
        ]
        #f_barcode_checkList2 = [f_barcode in str(row[3]) for f_barcode in f_barcodes]
        #r_barcode_checkList2 = [r_barcode in str(row[4]) for r_barcode in r_barcodes]

        #NOTE: in each of the above cases, the reader assumes the stored sequences in the function get_sequences() are necessary in their entirety to exist as substrings of the sequences in the process-list
        #if((True in f_barcode_checkList) and (True in r_barcode_checkList) and (True in f_barcode_checkList2) and (True in r_barcode_checkList2)):
        if 0:  #disabled legacy branch, kept for reference
            print row

            if ((len(row[0]) == 0) and (len(row[1]) == 0)):
                job_list.append([str(0), str(-1) + '_' + str(-1)])
                job_list.append([str(0), str(-1) + '_' + str(-1)])
            elif (len(row[0]) == 0):
                job_list.append([
                    str(0),
                    str(-1) + '_' + str(r_barcode_checkList.index(True))
                ])
                job_list.append([
                    str(0),
                    str(-1) + '_' + str(r_barcode_checkList2.index(True))
                ])
            elif (len(row[1]) == 0):
                job_list.append([
                    str(0),
                    str(f_barcode_checkList.index(True)) + '_' + str(-1)
                ])
                job_list.append([
                    str(0),
                    str(f_barcode_checkList2.index(True)) + '_' + str(-1)
                ])
            else:
                job_list.append([
                    str(0),
                    str(f_barcode_checkList.index(True)) + '_' +
                    str(r_barcode_checkList.index(True))
                ])
                job_list.append([
                    str(0),
                    str(f_barcode_checkList2.index(True)) + '_' +
                    str(r_barcode_checkList2.index(True))
                ])
        elif ((True in f_barcode_checkList) and (True in r_barcode_checkList)):
            print row

            if ((len(row[0]) == 0) and (len(row[1]) == 0)):
                job_list.append([str(0), str(-1) + '_' + str(-1)])
            elif (len(row[0]) == 0):
                job_list.append([
                    str(0),
                    str(-1) + '_' + str(r_barcode_checkList.index(True))
                ])
            elif (len(row[1]) == 0):
                job_list.append([
                    str(0),
                    str(f_barcode_checkList.index(True)) + '_' + str(-1)
                ])
            else:
                job_list.append([
                    str(0),
                    str(f_barcode_checkList.index(True)) + '_' +
                    str(r_barcode_checkList.index(True))
                ])
                #include 0 as first element to indicate that the currently-written job has NOT been completed
        #NOTE: the *_checkList2 variants are commented out above, so only the primary barcode lists are checked here
        elif ((True in f_barcode_checkList) or (True in r_barcode_checkList)):
            sys_ops.throw_exception(
                "Process list row " + str(row) +
                " contains mixture of identifiable and unidentifiable sequences. Exiting."
            )
            sys.exit(1)

        print job_list
    with open(jobfile_filename, 'w') as csvfile:
        mywriter = csv.writer(csvfile, delimiter=',')
        for job in job_list:
            mywriter.writerow(job)

    return
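A minimal sketch of the job encoding produced above, with made-up barcodes; each job row is [str(0), '<f_index>_<r_index>'], where the leading 0 marks the job as not yet completed and -1 stands in for a missing barcode:

# hypothetical barcodes and process-list row
f_barcodes = ['ACGT', 'TTAG']
r_barcodes = ['GGCA', 'CATG']
row = ['sample_ACGT_lane1', 'prep_CATG']

f_barcode_checkList = [f_barcode in str(row[0]) for f_barcode in f_barcodes]
r_barcode_checkList = [r_barcode in str(row[1]) for r_barcode in r_barcodes]

job = [str(0), str(f_barcode_checkList.index(True)) + '_' + str(r_barcode_checkList.index(True))]
print(job)  # ['0', '0_1']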
Example no. 15
def compile_clusters_sam_nobcs(rawdata_file,outfile_clust,CLUSTER_READS_THRESHOLD,write_output):

    rawdata = []
    try:
        with open(rawdata_file, 'rb') as csvfile:
            print 'opened csv'
            csvreader = csv.reader(csvfile, delimiter=',')
            rawdata = [row for row in csvreader]
    except:
        sys_ops.throw_exception("Could not open file " + rawdata_file)
        sys.exit(1)         
  
    rawdata_chrid = map(itemgetter(0), rawdata)
    rawdata_chrlb = map(itemgetter(1), rawdata)
    rawdata_chrub = map(itemgetter(2), rawdata)
    rawdata_chrmed = map(itemgetter(3), rawdata)
    rawdata_readct = map(itemgetter(4), rawdata)
    rawdata_uniquect = map(itemgetter(5), rawdata)
    rawdata_empty1 = map(itemgetter(6), rawdata)
    rawdata_empty2 = map(itemgetter(7), rawdata)
    rawdata_orientid = map(itemgetter(8), rawdata)
    rawdata_sampid = map(itemgetter(9), rawdata)
    rawdata_repreadct = map(itemgetter(10), rawdata)

    ct = 0
    repreadcts = {}
    repreadct0 = 0   
    sam_alignments = []
    for el in rawdata_chrid:
        repreadct = rawdata_repreadct[ct]
        if str(repreadct) not in repreadcts:
            repreadcts[str(repreadct)] = 1
            repreadct0+=int(rawdata_repreadct[ct])
        sam_alignments.append([int(rawdata_chrid[ct]),int(rawdata_chrlb[ct]),int(rawdata_chrub[ct]),ct])   
        ct+=1

    #cluster all reads by genomic position
    clusters_loc = clustering_ops.nncluster_chr_positions(sam_alignments,'cbb',CLUSTER_READS_THRESHOLD,ALIGN_NN_THRESHOLD)
    name_clusters = clusters_loc[0]
    alignment_clusters = clusters_loc[1]

    ct = 0
    clusters = []
    #summarize each positional cluster and write one output row per cluster
    with open(outfile_clust,'w') as csvfile:
        mywriter = csv.writer(csvfile, delimiter=',')    
        for name_cluster in name_clusters:
            chrid = alignment_clusters[ct][0][0]
            chrlb = min([int(rawdata_chrlb[c]) for c in name_cluster])
            chrub = max([int(rawdata_chrub[c]) for c in name_cluster])
            chrmed = numpy.median(numpy.array([float(rawdata_chrmed[c]) for c in name_cluster]))
            chrlocs = [str([rawdata_chrlb[c],rawdata_chrub[c],rawdata_chrmed[c]]) for c in name_cluster]

            readct = len(name_cluster)             
            uniquect = {}
            for loc in chrlocs:
                if hash(str(loc)) not in uniquect:
                    uniquect[hash(str(loc))] = loc

            reps = {}
            repstr = ''
            locs_ts = []
            locs_bs = []
            locs_ts2 = []
            locs_bs2 = []
            locs_tsu = []
            locs_bsu = []
            locs_tsu2 = []
            locs_bsu2 = []        
            for c in name_cluster:
                sampid = int(rawdata_sampid[c])
                if sampid not in reps:
                    reps[sampid] = 1
                    repstr+=(':'+str(sampid)) 

                if int(rawdata_orientid[c])==0:
                    locs_ts.append(int(rawdata_chrlb[c]))
                elif int(rawdata_orientid[c])==1:
                    locs_bs.append(int(rawdata_chrlb[c]))      

            #remove PCR amplification bias: keep only unique mapping positions
            locs_tsu = numpy.unique(numpy.array(locs_ts)).tolist()
            locs_bsu = numpy.unique(numpy.array(locs_bs)).tolist()

            if (len(locs_ts)>0 and len(locs_bs)>0):
                locs_ts2 = numpy.matrix([locs_ts]*len(locs_bs))
                locs_bs2 = numpy.matrix([locs_bs]*len(locs_ts))
                locs_diff = locs_ts2-numpy.transpose(locs_bs2)
                locs_g0 = float((locs_diff>=OVERLAP_THRESHOLD).sum())/float((locs_diff.shape[0]*locs_diff.shape[1]))
                locs_l0 = float((locs_diff<OVERLAP_THRESHOLD).sum())/float((locs_diff.shape[0]*locs_diff.shape[1]))

                locs_tsu2 = numpy.matrix([locs_tsu]*len(locs_bsu))
                locs_bsu2 = numpy.matrix([locs_bsu]*len(locs_tsu))
                locs_diffu = locs_tsu2-numpy.transpose(locs_bsu2)
                locs_g0u = float((locs_diffu>=OVERLAP_THRESHOLD).sum())/float((locs_diffu.shape[0]*locs_diffu.shape[1]))
                locs_l0u = float((locs_diffu<OVERLAP_THRESHOLD).sum())/float((locs_diffu.shape[0]*locs_diffu.shape[1]))

                with open(str.replace(outfile_clust,'.csv','')+'_'+str(chrlb)+'_'+str(chrub)+'.csv','w') as csvfile_c:
                    mywriterc = csv.writer(csvfile_c, delimiter=',')
                    for row in locs_diff:
                        mywriterc.writerow(row.tolist()[0])    

            else:
                locs_g0 = -1
                locs_l0 = -1
                locs_g0u = -1
                locs_l0u = -1                                    

            orientid = rawdata_orientid[name_cluster[0]]

            if locs_g0u != -1: #if one of these sentinels is -1, they all are
                mywriter.writerow([chrid,chrlb,chrub,chrmed,readct,len(uniquect),0,0,len(reps),repreadct0,0,0,locs_g0u,locs_l0u])

            ct+=1                         

    return 1
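A minimal sketch of the strand-overlap fractions computed above (locs_g0u and locs_l0u), using made-up positions and a hypothetical OVERLAP_THRESHOLD; the pipeline defines its own threshold elsewhere:

import numpy

OVERLAP_THRESHOLD = 0                  # hypothetical value
locs_tsu = [100, 250]                  # unique top-strand positions (made up)
locs_bsu = [90, 260, 400]              # unique bottom-strand positions (made up)

locs_tsu2 = numpy.matrix([locs_tsu] * len(locs_bsu))
locs_bsu2 = numpy.matrix([locs_bsu] * len(locs_tsu))
locs_diffu = locs_tsu2 - numpy.transpose(locs_bsu2)   # all pairwise ts - bs differences

n_pairs = float(locs_diffu.shape[0] * locs_diffu.shape[1])
locs_g0u = float((locs_diffu >= OVERLAP_THRESHOLD).sum()) / n_pairs
locs_l0u = float((locs_diffu < OVERLAP_THRESHOLD).sum()) / n_pairs
print(locs_g0u, locs_l0u)              # roughly 0.33 and 0.67 for these values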