コード例 #1
0
def density_adjusted(fname, chr_sam, minlength, maxlength, path_wig, path_den,
                     path_gff):
    '''Count read ends per position from a SAM file and write density tracks.

    Two kinds of track are built for each strand:
      - per reference: {GFF record id: [count at 0, count at 1, ...]}
      - size separated: {read length: [count at 0, count at 1, ...]}
    The size separated form makes it easy to select a length range later.

    fname     -- sample name, used in the names of the binary output files
    chr_sam   -- path of the tab-delimited SAM alignment file
    minlength -- shortest read length to keep (inclusive)
    maxlength -- longest read length to keep (inclusive)
    path_wig  -- prefix passed to ribo_util.countstowig
    path_den  -- output directory prefix for pickles and the "binary/" folder
    path_gff  -- GFF file whose records define the references and their sizes

    Returns 0 if either length bound is negative, otherwise None.
    The per-reference tracks are scaled to reads per million counted reads
    before being written; the size separated tracks keep raw counts.
    '''

    # Validate first so that a bad call never opens (and leaks) a file.
    if minlength < 0 or maxlength < 0:
        print("Error. Length input not valid.")
        return 0

    # Per-reference tracks. Each one carries 20 extra slots past the end of
    # the record, which absorb positions pushed outside it by length_shift.
    density_plus = {}
    density_minus = {}
    longest_track = 0
    for sequence in GFF.parse(path_gff):
        track_length = len(sequence) + 20
        density_plus[sequence.id] = [0] * track_length
        density_minus[sequence.id] = [0] * track_length
        longest_track = max(longest_track, track_length)

    # Size separated tracks are shared by all references, so they must be
    # long enough for the longest one (not merely the last record parsed).
    density_plus_sizesep = {}
    density_minus_sizesep = {}
    for length in range(minlength, maxlength + 1):
        density_plus_sizesep[length] = [0] * longest_track
        density_minus_sizesep[length] = [0] * longest_track

    mapped_reads = 0

    with open(chr_sam) as f_samfile:
        # QUOTE_NONE: SAM has no quoting, and '"' is a legal quality
        # character that the default csv dialect would treat as a quote.
        samfile = csv.reader(f_samfile, delimiter='\t',
                             quoting=csv.QUOTE_NONE)

        for read in samfile:
            if not read or read[0].startswith('@'):  # Blank or header line.
                continue

            flag = read[1]
            if flag == '4':  # A bowtie mismatch.
                continue

            chrom = read[2]  # reference the read was aligned to
            # SAM positions start at 1, list indices at 0.
            startp = int(read[3]) - 1
            length = len(read[9])  # length of the read sequence

            # NOTE(review): the test uses 23 but the shift is computed from
            # 24, so a 23 nt read is not shifted -- confirm this is intended.
            if length < 23:
                length_shift = 24 - length
            else:
                length_shift = 0

            if chrom not in density_plus:
                # Previously this message was followed by a KeyError as soon
                # as the read was counted; the read is now skipped instead.
                print("Error: Bowtie index and GFF do not match")
                continue

            # Keep only reads inside the requested length range.
            if length < minlength or length > maxlength:
                continue

            mapped_reads += 1

            # Bowtie reverse complements reads aligned to the reverse strand,
            # so read[3] is the 3'-end of minus strand reads.
            # 16 is the minus strand, 0 is the plus strand.
            if flag == '16':
                start = startp - length_shift
                density_minus[chrom][start] += 1
                density_minus_sizesep[length][start] += 1
            elif flag == '0':
                start = startp + length - 1 + length_shift
                density_plus[chrom][start] += 1
                density_plus_sizesep[length][start] += 1

    path_oldformat = path_den + "binary/"
    if not os.path.exists(path_oldformat):
        os.makedirs(path_oldformat)

    # Scale every per-reference track to reads per million. With no counted
    # reads all counts are zero, so only the type is converted.
    for track in (density_plus, density_minus):
        for chrom in track:
            if mapped_reads:
                track[chrom] = [
                    float(i) * 1000000 / float(mapped_reads)
                    for i in track[chrom]
                ]
            else:
                track[chrom] = [float(i) for i in track[chrom]]

    ribo_util.writebin(density_plus, path_oldformat + fname + "_plus_")
    ribo_util.makePickle(density_plus, path_den + "plus")
    ribo_util.makePickle(density_plus_sizesep, path_den + "plus_sizesep")
    ribo_util.countstowig(density_plus, path_wig + "_plus")

    ribo_util.writebin(density_minus, path_oldformat + fname + "_minus_")
    ribo_util.makePickle(density_minus, path_den + "minus")
    ribo_util.makePickle(density_minus_sizesep, path_den + "minus_sizesep")
    ribo_util.countstowig(density_minus, path_wig + "_minus")
コード例 #2
0
def SD_affinity_genome(paths_in):
    '''Look up a Shine-Dalgarno affinity for the octamer at every genome position.

    paths_in['SD_affinity'] is a CSV with the columns Octamer and SD_affinity;
    paths_in['path_gff'] is the GFF file. Only the first record of the GFF is
    analysed.

    For each position the plus value is the affinity of the 8 nt that end at
    that position, and the minus value is the affinity of the reverse
    complement of the 8 nt that start there. Both windows are transcribed
    before the lookup. Positions closer than 8 nt to either end use the
    placeholder octamer 'AAAAAAAA'.

    Output is a size separated dict (lengths 10 to 45) that can be used like a
    density dict; it is pickled as "plus_sizesep" and "minus_sizesep".
    Returns None.
    '''
    # NOTE(review): `inpath` is not defined inside this function and must come
    # from module scope. It is resolved here, before the genome scan, so that
    # a missing name fails immediately instead of after the whole computation.
    path_den = inpath + 'density/density/SD1/'

    # load octamers: {octamer: affinity}
    affinity_table = pd.read_csv(paths_in['SD_affinity'])
    affinity_list = pd.Series(affinity_table.SD_affinity.values,
                              index=affinity_table.Octamer).to_dict()

    genome_record = next(GFF.parse(paths_in['path_gff']))
    sequence = genome_record.seq
    genome_size = len(sequence)

    placeholder = 'AAAAAAAA'
    progress_marks = (100000, 500000, 1000000, 2000000)

    affinity_plus = []
    affinity_minus = []

    for position in range(0, genome_size):
        if position < 8 or genome_size - position < 8:
            motif = placeholder
            motif_rc = placeholder
        else:
            # Both windows are exactly 8 nt long here. They are converted to
            # plain strings so the lookup does not depend on how sequence
            # objects hash (the table keys come straight from the CSV).
            motif = str(sequence[position - 8:position].transcribe())
            downstream = sequence[position:position + 8].transcribe()
            motif_rc = str(downstream.reverse_complement())

        affinity_plus.append(affinity_list[motif])
        affinity_minus.append(affinity_list[motif_rc])

        if position in progress_marks:  # coarse progress report
            print(str(position))

    # Affinity does not depend on read length, so every length key refers to
    # the same list.
    density_plus_sizesep = {length: affinity_plus for length in range(10, 46)}
    density_minus_sizesep = {length: affinity_minus for length in range(10, 46)}

    ribo_util.makePickle(density_plus_sizesep, path_den + "plus_sizesep")
    ribo_util.makePickle(density_minus_sizesep, path_den + "minus_sizesep")

    return
コード例 #3
0
def _split_umi(field, n_5prime, n_3prime):
    '''Split one FASTQ field into (insert, umi).

    The umi is the first n_5prime characters followed by the last n_3prime
    characters; the insert is whatever lies between them.
    '''
    insert_end = max(len(field) - n_3prime, 0)
    return field[n_5prime:insert_end], field[:n_5prime] + field[insert_end:]


def run_filter_UMI(inputs, paths_in, paths_out):
    '''Trim UMI bases from both ends of each read and tally the UMIs seen.

    inputs    -- dict with 'files' (sample names), 'run_filter_UMI' ('yes' to
                 run) and 'linker_UMI' (number of bases to trim)
    paths_in  -- not used
    paths_out -- dict with 'path_filter' (FASTQ directory prefix) and
                 'path_log' (log directory prefix)

    For every sample, <name>_UMI-trimmed.fastq is read four lines at a time
    and the trimmed records are written to <name>-trimmed.fastq. A trailing
    incomplete record is ignored. A dict with the keys 'UMI', 'count' and
    'unique' (parallel lists, in order of first appearance) is pickled through
    ribo_util.makePickle. Returns None.
    '''
    print("\n\tStarted UMI removal at " + str(datetime.now()))

    files = inputs['files']
    run = inputs['run_filter_UMI']
    linker_UMI = inputs['linker_UMI']
    # NOTE(review): the 5' (RT) UMI length is read from the 'linker_UMI' key,
    # which looks like a copy-paste slip. Left unchanged because the intended
    # key is not visible here -- confirm against the caller.
    RT_UMI = inputs['linker_UMI']

    for fname in files:

        if not run == 'yes':
            print(fname + " will not be filtered for a UMI")
            continue

        file_in = paths_out['path_filter'] + fname + '_UMI-trimmed.fastq'
        file_out = paths_out['path_filter'] + fname + '-trimmed.fastq'
        file_log = paths_out['path_log'] + fname + '_filter'

        umi = []
        n_umi = []
        umi_unique = []
        umi_position = {}  # UMI -> index in the three lists above

        # Both handles are closed even if a record fails to parse.
        with open(file_in, 'rb') as f, open(file_out, "w") as trimmed:
            record = []
            for line in f:
                record.append(line.rstrip('\n'))
                if len(record) < 4:
                    continue

                Identifier, Sequence, QIdentifier, PHRED = record
                record = []

                # The UMI is taken from the untrimmed read; slicing it from
                # the trimmed read would return insert bases instead.
                insert_seq, umi_seq = _split_umi(Sequence, RT_UMI, linker_UMI)
                insert_qual = _split_umi(PHRED, RT_UMI, linker_UMI)[0]

                trimmed.write(Identifier + "\n" + insert_seq + "\n" +
                              QIdentifier + "\n" + insert_qual + "\n")

                index = umi_position.get(umi_seq)
                if index is None:
                    umi_position[umi_seq] = len(umi)
                    umi.append(umi_seq)
                    n_umi.append(1)
                    umi_unique.append('yes')
                else:
                    n_umi[index] += 1
                    umi_unique[index] = 'no'

        UMI = {}
        UMI['UMI'] = umi
        UMI['count'] = n_umi
        UMI['unique'] = umi_unique

        # file_log already contains fname; the doubled name is kept so that
        # existing readers of this pickle still find it.
        ribo_util.makePickle(UMI, file_log + fname + '_UMI', protocol=pickle.HIGHEST_PROTOCOL)

    print("\tFinished UMI removal at " + str(datetime.now()))
    print("\tCOMPLETED UMI REMOVAL")
コード例 #4
0
def _flanked_gene_sequence(record, start, end, strand, flank=50):
    '''Return the gene plus `flank` nt on each side, as a string on the gene's strand.

    Parts of the window that fall outside the record are filled with N, so the
    gene always begins at index `flank` and ends `flank` before the end of the
    returned string. Any strand other than 1 is treated as the minus strand.
    '''
    window_start = start - flank
    window_end = end + flank
    left = max(window_start, 0)
    right = min(window_end, len(record.seq))
    pad_left = 'N' * (left - window_start)
    pad_right = 'N' * (window_end - right)
    core = record.seq[left:right]
    if strand == 1:
        return pad_left + str(core) + pad_right
    # Reverse complementing swaps the two pads (N stays N).
    return pad_right + str(core.reverse_complement()) + pad_left


def GFF_to_dict(paths_in, gff_settings):
    '''Parse gff into dict:
        - feat_of_interest = what to look for in gff (protein_coding, tRNA, rRNA, etc)
        - name_qual        = qualifier for alias/gene name (Name, gene_id)
        - name_qual_alt    = alternative qualifier, if none, set as 'none'
        - remove_genes     = 'yes' keeps only non-pseudo features whose first
                             sub-feature has type feat_of_interest
        - path_out         = output path, or 0 to use paths_in['path_gff_dict']

        These values must correspond to values in the GFF.

        Every kept feature contributes one entry to each of the parallel lists
        in the result (Chromosome, Alias, Strand, Start, Stop, Sequence,
        Start_Codon, Stop_Codon, Type and the four *_content values returned
        by GC_of_CDS). Sequence is the gene plus 50 nt on each side, read on
        the gene's strand. The dict is pickled to the output path, shown as a
        DataFrame and saved as <output path>.csv. Returns None.

        Bad-gene filtering and the SD affinity column are currently disabled.'''

    # Unload gff_settings
    path_out = gff_settings['path_out']
    feat_of_interest = gff_settings[
        'feat_of_interest']  # all, protein_coding, tRNA, rRNA
    name_qual = gff_settings['name_qual']
    name_qual_alt = gff_settings['name_qual_alt']
    remove_genes = gff_settings['remove_genes']

    # Output path can be defined, or use 0 to set as the annotation file for
    # the main pipeline
    if path_out == 0:
        path_gff_dict = paths_in['path_gff_dict']
    else:
        path_gff_dict = path_out

    flank = 50  # nt saved on each side of a gene

    # Data arrays: will be used as columns for the pandas DataFrame
    chromosome = []
    aliaslist = []
    startlist = []
    stoplist = []
    seqlist = []
    typelist = []
    strandlist = []
    startcodon = []
    stopcodon = []
    G_content = []
    C_content = []
    A_content = []
    T_content = []

    # NOTE(review): the returned tables are not used in this function.
    aa_code, codon_code = ribo_util.get_genetic_code()

    # Sift through GFF for relevant information. Every record is visited;
    # there is no cap on the number of records.
    for record in GFF.parse(paths_in['path_gff']):
        chromosome_id = record.id

        for feature in record.features:

            if feature.sub_features == []:
                continue

            is_pseudo = 'pseudo' in feature.qualifiers

            if remove_genes == 'yes':
                # Skip over annotations that are not of interest, and
                # pseudogenes
                if not feature.sub_features[0].type == feat_of_interest:
                    continue
                if is_pseudo:
                    continue
                feature_type = 'CDS'
            elif is_pseudo:
                # Add feat type to GFF, noting pseudogenes
                feature_type = 'pseudo'
            else:
                feature_type = feature.sub_features[0].type

            # Get feature name; features without either qualifier are skipped
            if name_qual in feature.qualifiers:
                alias = feature.qualifiers[name_qual][0]
            elif name_qual_alt in feature.qualifiers:
                alias = feature.qualifiers[name_qual_alt][0]
            else:
                continue

            # Get start, end, and strand position
            start = feature.location.start.position
            end = feature.location.end.position
            strand = feature.strand

            # Gene sequence + 50 nt from each end makes it easy to analyze
            # start and stop sequence context without the whole genome.
            sequence = _flanked_gene_sequence(record, start, end, strand,
                                              flank)

            # Each strand is treated differently, + strand == 1
            if strand == 1:
                strand_val = '+'
                startcodon_pos = start
                stopcodon_pos = end - 1
            else:
                # For minus strand, 'end' is start codon, 'start' is stop codon
                strand_val = '-'
                startcodon_pos = end - 1
                stopcodon_pos = start

            start_codon = sequence[flank:flank + 3]
            stop_codon = sequence[-(flank + 3):-flank]

            # Sequence from start to stop for GC analysis
            CDS_seq = sequence[flank:-flank]
            G, C, A, T = GC_of_CDS(CDS_seq)

            # Debugging aid kept from the original.
            if alias == 'trmD':
                print(sequence)

            # Append data to lists
            chromosome.append(chromosome_id)
            typelist.append(feature_type)
            aliaslist.append(alias)
            seqlist.append(sequence)
            strandlist.append(strand_val)
            startlist.append(startcodon_pos)
            stoplist.append(stopcodon_pos)
            startcodon.append(start_codon)
            stopcodon.append(stop_codon)
            G_content.append(G)
            C_content.append(C)
            A_content.append(A)
            T_content.append(T)

    # Append lists to gff_dict
    gff_dict = {}
    gff_dict['Chromosome'] = chromosome
    gff_dict['Alias'] = aliaslist
    gff_dict['Strand'] = strandlist
    gff_dict['Start'] = startlist
    gff_dict['Stop'] = stoplist
    gff_dict['Sequence'] = seqlist
    gff_dict['Start_Codon'] = startcodon
    gff_dict['Stop_Codon'] = stopcodon
    gff_dict['Type'] = typelist
    gff_dict['G_content'] = G_content
    gff_dict['C_content'] = C_content
    gff_dict['A_content'] = A_content
    gff_dict['T_content'] = T_content

    # Pickle dict for use later
    ribo_util.makePickle(gff_dict, path_gff_dict)

    # Show dataframe to check, and save as .csv for use later
    gff_df = pd.DataFrame(gff_dict)
    display(gff_df)
    gff_df.to_csv(path_gff_dict + '.csv')

    return
コード例 #5
0
def run_avggene_transcriptend(fname, settings, plus, minus, gff, path_start, path_stop):
    '''Accumulate average read density around gene starts and transcript ends.

    fname      -- sample name, used only in the summary message
    settings   -- dict with 'next_gene', 'equal_weight', 'length_in_ORF',
                  'length_out_ORF', 'minlength' and 'maxlength'; the optional
                  key 'path_transcript_ends' overrides the default annotation
                  file passed to get_transcript_end
    plus/minus -- size separated density dicts {read length: [values]}
    gff        -- dict with parallel 'Alias', 'Strand', 'Start', 'Stop' lists
    path_start, path_stop -- prefixes of the pickles that are written

    Only genes listed by get_transcript_end are used, and each gene's stop is
    moved downstream by the distance listed for it. Genes are skipped when a
    neighbour is closer than 'next_gene', when they are shorter than
    length_in_ORF + 20, when equal weighting is on ('y') and they have fewer
    than 10 reads, or when a window leaves the genome.

    Four pickles are written: per-position totals ('_all_end') and per-length
    profiles ('_HM_end') for the start window and for the stop window.
    Returns None.
    '''
    next_gene = settings['next_gene']
    equal_weight = settings['equal_weight']
    length_in_ORF = settings['length_in_ORF']
    length_out_ORF = settings['length_out_ORF']
    minlength = settings['minlength']
    maxlength = settings['maxlength']
    # The default keeps the location that used to be hard-coded.
    path_transcript_ends = settings.get(
        'path_transcript_ends',
        '/Volumes/HDD/data/gff/BS/rho-independent_ends.csv')

    density_plus = plus
    density_minus = minus
    gff_dict = gff

    window_length = length_in_ORF + length_out_ORF
    positionindex = range(0, window_length)
    lengthindex = range(minlength, maxlength + 1)

    lastgene, dist_from_lastgene = get_transcript_end(path_transcript_ends)

    # alias -> distance from the stop to the transcript end. The first entry
    # wins for a repeated alias, as list.index did.
    end_distance = {}
    for name, distance in zip(lastgene, dist_from_lastgene):
        end_distance.setdefault(name, distance)

    # datastructure:
    # for heatmap - each dict has read counts at each position separated by length keys
    # for avggene summary - dictionary with position as keys
    averagegene_start = {length: [0] * window_length for length in lengthindex}
    averagegene_stop = {length: [0] * window_length for length in lengthindex}
    start_all = {position: 0 for position in positionindex}
    stop_all = {position: 0 for position in positionindex}

    genes_too_close = set()
    genes_below_thresh = []

    # The length of any one size separated track is used as the genome length.
    if density_plus:
        genomelength = len(next(iter(density_plus.values())))
    else:
        genomelength = 0

    gene_rows = zip(gff_dict['Alias'], gff_dict['Start'], gff_dict['Stop'],
                    gff_dict['Strand'])

    for alias, start, stop, strand in gene_rows:

        if alias not in end_distance:
            continue
        endpos = end_distance[alias]

        genelength = abs(start - stop) + 1

        nextgene = ribo_util.nextgene(alias, gff_dict)

        # step is the direction of travel along the genome in the 5'->3'
        # direction of the gene.
        if strand == '+':
            step = 1
            density_dict = density_plus
        elif strand == '-':
            step = -1
            density_dict = density_minus
        else:
            continue

        # define plot start and stop windows
        stop = stop + step * endpos
        start_window = start - step * length_out_ORF
        stop_window = stop - step * length_in_ORF
        stop_window_max = stop + step * length_out_ORF

        # exclude genes that are too close, together with the neighbour
        if next_gene > 0 and nextgene['distance'] < next_gene:
            genes_too_close.add(alias)
            genes_too_close.add(nextgene['alias'])
            continue

        if alias in genes_too_close:
            continue

        if genelength < length_in_ORF + 20:  # exclude genes that are too small
            continue

        if equal_weight == 'y':
            gene_reads, length_reads = ribo_util.get_genecounts(
                start_window, stop_window, strand, density_plus,
                density_minus, minlength, maxlength)
            gene_reads = sum(gene_reads)
            if gene_reads < 10:
                genes_below_thresh.append(alias)
                continue
        else:
            gene_reads = 1

        gene_reads = float(gene_reads)

        if not 0 <= start_window < genomelength:
            continue
        if not 0 <= stop_window_max < genomelength:
            continue

        for length in lengthindex:
            track = density_dict[length]
            start_profile = averagegene_start[length]
            stop_profile = averagegene_stop[length]
            for position in positionindex:
                offset = step * position

                start_density = float(track[start_window + offset]) / gene_reads
                start_profile[position] += start_density
                start_all[position] += start_density

                stop_density = float(track[stop_window + offset]) / gene_reads
                stop_profile[position] += stop_density
                stop_all[position] += stop_density

    # The two counts used to be glued onto the file name without a separator.
    print('\t' + str(len(genes_too_close)) + ' genes excluded from ' + fname +
          ' (too close to a neighbour); ' + str(len(genes_below_thresh)) +
          ' below read threshold')

    ribo_util.makePickle(start_all, path_start + '_all_end', protocol=pickle.HIGHEST_PROTOCOL)
    ribo_util.makePickle(averagegene_start, path_start + '_HM_end', protocol=pickle.HIGHEST_PROTOCOL)
    ribo_util.makePickle(stop_all, path_stop + '_all_end', protocol=pickle.HIGHEST_PROTOCOL)
    ribo_util.makePickle(averagegene_stop, path_stop + '_HM_end', protocol=pickle.HIGHEST_PROTOCOL)