Ejemplos de chromosome_position en Python

Lenguaje de programación: Python

Namespace/Package Name: chromosome_and_gene_positions

Método / Función: chromosome_position

Ejemplos en hotexamples.com: 6

Python chromosome_position - 6 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de chromosome_and_gene_positions.chromosome_position extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

Archivo: transposonread_profileplot.py Proyecto: SATAY-LL/LaanLab-SATAY-DataAnalysis

def profile_plot(bed_file,
                 variable="transposons",
                 chrom='I',
                 bar_width=None,
                 savefig=False):
    '''Create a bar plot of transposon or read counts along one chromosome.

    The height of each bar is the summed number of transposons (or reads) in a
    window of ``bar_width`` basepairs; the x-axis is the basepair position on
    the chromosome. A thin track below the bar plot marks every gene on the
    chromosome: genes found in the known-essentials list are drawn in green
    (#00F28E), all other genes in red (#F20064).

    Input:
        - bed_file: absolute path to the bed file.
        - variable: either "transposons" or "reads". Any other value prints an
          error and exits with status 1.
        - chrom: roman numeral indicating the chromosome to plot. A chromosome
          that is not present in the gff file prints an error and exits with
          status 1.
        - bar_width: integer number of basepairs per bin. Defaults to
          length_chromosome/800 (never less than 1). Few basepairs per bin may
          be slow to plot; too many may obscure low-transposon areas.
        - savefig: when True the figure is saved next to the bed file as
          '<bed name>_transposonplot_chrom<chrom>.png' or
          '<bed name>_readplot_chrom<chrom>.png'; otherwise it is shown.

    Required files/helpers: a .gff file (for 'chromosome_position' and
    'gene_position'), two lists of essential genes (for
    'list_known_essentials') and the helper 'chromosome_name_bedfile'.
    '''
    #%% VALIDATE THE REQUESTED VARIABLE BEFORE DOING ANY EXPENSIVE FILE READING
    if variable == "transposons":
        ylabel = 'Transposon count'
        figure_suffix = '_transposonplot_chrom'
    elif variable == "reads":
        ylabel = 'Read count'
        figure_suffix = '_readplot_chrom'
    else:
        print(
            "ERROR: No valid variable argument given. Use transposons or reads"
        )
        sys.exit(1)

    #%% USED FILES
    gff_file = os.path.join(file_dirname, '..', 'data_files',
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    essential_genes_files = [
        os.path.join(file_dirname, '..', 'data_files',
                     'Cerevisiae_EssentialGenes_List_1.txt'),
        os.path.join(file_dirname, '..', 'data_files',
                     'Cerevisiae_EssentialGenes_List_2.txt')
    ]

    #%% GET CHROMOSOME LENGTH
    chr_length_dict = chromosome_position(gff_file)[0]
    chrom_length = chr_length_dict.get(chrom)
    if chrom_length is None:
        # Without this check the code below fails with an opaque TypeError.
        print("ERROR: Chromosome %s not found in %s" % (chrom, gff_file))
        sys.exit(1)
    print('Chromosome length: ', chrom_length)

    if bar_width is None:
        bar_width = int(chrom_length / 800)
    # A chromosome shorter than 800 bp would otherwise give a bin width of 0.
    bar_width = max(1, int(bar_width))

    #%% GET ALL GENES IN CURRENT CHROMOSOME
    gene_pos_dict = gene_position(gff_file)
    genes_currentchrom_pos_list = [
        gene for gene, info in gene_pos_dict.items() if chrom in info
    ]
    genes_essential_list = list_known_essentials(essential_genes_files)

    #%% READ BED FILE
    with open(bed_file) as f:
        lines = f.readlines()

    #%% GET THE LINE RANGE OF THE CURRENT CHROMOSOME IN THE BED FILE
    chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(
        bed_file)[1:3]
    first_line = chrom_start_index_dict.get(chrom)
    last_line = chrom_end_index_dict.get(chrom)

    #%% GET ALL COUNTS PER BASEPAIR
    # Index i of allcounts_list holds the count for bed position i + 1.
    # NOTE(review): a bed position of 0 would wrap to the last element; confirm
    # the bed files used here never report position 0.
    allcounts_list = np.zeros(chrom_length + 1)
    count_reads = variable == "reads"
    for line in lines[first_line:last_line + 1]:
        fields = line.strip('\n').split()
        position_index = int(fields[1]) - 1
        if count_reads:
            # The bed score column encodes the read count as reads*20 + 100.
            allcounts_list[position_index] += (int(fields[4]) - 100) / 20
        else:
            allcounts_list[position_index] += 1

    #%% BINNING OF THE COUNTS
    # Plotting every basepair is slow, so the counts are summed in consecutive
    # windows of bar_width basepairs. Every position belongs to exactly one
    # window, including the (possibly shorter) last one, so no counts are lost.
    bin_start_list = np.arange(0, len(allcounts_list), bar_width)
    allcounts_binnedlist = np.add.reduceat(allcounts_list, bin_start_list)

    #%% PLOTTING
    print('Plotting chromosome ', chrom, '...')
    print('bar width for plotting is ', bar_width)

    textsize = 18
    textcolor = "#000000"

    plt.figure(figsize=(19, 9))
    grid = plt.GridSpec(20, 1, wspace=0.0, hspace=0.0)

    ax = plt.subplot(grid[0:19, 0])
    # align='edge' makes each bar span exactly the basepairs it summarizes.
    ax.bar(bin_start_list,
           allcounts_binnedlist,
           width=bar_width,
           align='edge',
           color="#000000")
    ax.tick_params(axis='both', which='major', labelsize=textsize)
    ax.set_axisbelow(True)
    ax.grid(True)
    ax.set_xlim(0, chrom_length)
    ax.tick_params(axis='x', which='major', pad=30)
    ax.ticklabel_format(axis='x', style='sci', scilimits=(0, 0))
    ax.xaxis.get_offset_text().set_fontsize(textsize)
    ax.set_xlabel("Basepair position on chromosome " + chrom,
                  fontsize=textsize,
                  color=textcolor,
                  labelpad=10)
    ax.set_ylabel(ylabel, fontsize=textsize, color=textcolor, labelpad=25)

    # Gene track below the bar plot: one colored span per gene.
    axc = plt.subplot(grid[19, 0])
    for gene in genes_currentchrom_pos_list:
        gene_info = gene_pos_dict.get(gene)
        gene_start_pos = int(gene_info[1])
        gene_end_pos = int(gene_info[2])
        if gene in genes_essential_list:
            gene_color = "#00F28E"
        else:
            gene_color = "#F20064"
        axc.axvspan(gene_start_pos,
                    gene_end_pos,
                    facecolor=gene_color,
                    alpha=0.8)
    axc.set_xlim(0, chrom_length)
    axc.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,  # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off

    axc.tick_params(
        axis='y',  # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left=False,  # ticks along the left edge are off
        right=False,  # ticks along the right edge are off
        labelleft=False)  # labels along the left edge are off

    #%% SAVE OR SHOW THE FIGURE
    if savefig:
        figure_path = (os.path.splitext(bed_file)[0] + figure_suffix + chrom +
                       '.png')
        print('saving figure at %s' % figure_path)
        plt.savefig(figure_path, dpi=400)
        plt.close()
    else:
        plt.show()

Ejemplo n.º 2

Mostrar archivo

Archivo: genomicfeatures_dataframe_with_normalization.py Proyecto: FloorDolsma/normalisation-

def dna_features(region,
                 wig_file,
                 pergene_insertions_file,
                 variable="reads",
                 normalize=True,
                 normalization_window_size=20000,
                 plotting=True,
                 savefigure=False,
                 verbose=True):
    '''This function inputs a wig file and pergene_insertions file created using transposonmapping_satay.py.
    Optional is to define with data is displayed, which can be either "insertions" or "reads".
    Output is a dataframe including major information about all genomic features and optionally a barplot indicating the number of transposons per genomic region.
    A genomic region is here defined as a gene (separated as annotated essential and not essential), telomere, centromere, ars etc.
    This can be used for identifying neutral regions (i.e. genomic regions that, if inhibited, do not influence the fitness of the cells).
    This function can be used for normalizing the transposon insertions per gene using the neutral regions.
    
    Input:
        - Region: e.g. chromosome number (either a normal number between 1 and 16 or in roman numerals between I and XVI), a list like ['V', 0, 14790] which creates a barplot between basepair 0 and 14790) or a genename.
        - wig_file: wiggle file from the output of transposonmapping.py that is used in the processing workflow.
        - pergene_insertions_file: text file from the output of transposonsmapping.py
        - variable: (only for plotting) either 'insertions' or 'reads', which determines what is being plotted.
        - normalize: (only for plotting) either True or False. Normalization only works for when variable is 'reads'. The normalized reads are plotted and adds a column to the dataframe.
        - normalization_window_size: Integer. Normalization relies on windows that corrects for inter chromosomal differences. This determine the size of those windows in terms of basepairs (default=10000)
        - plotting: Either True or False. Determines whether the barplot has to be created.
        - savefigure: (only for plotting) Whether to save the figure at the same location of this script.
        - Verbose: Either True of False. Determines how much textual feedback is given. When set to False, only warnings will be shown.

    Output:
        - dna_df2: Dataframe containing information about the selected chromosome.
    
    Required files (see next section):
        - essentials_file: https://github.com/Gregory94/LaanLab-SATAY-DataAnalysis/blob/master/Data_Files/Cerevisiae_AllEssentialGenes_List.txt
        - gene_information_file: https://github.com/Gregory94/LaanLab-SATAY-DataAnalysis/blob/master/Data_Files/Yeast_Protein_Names.txt
        - gff-file: https://github.com/Gregory94/LaanLab-SATAY-DataAnalysis/blob/master/Data_Files/Saccharomyces_cerevisiae.R64-1-1.99.gff3
        - sgd_features_file: https://github.com/Gregory94/LaanLab-SATAY-DataAnalysis/blob/master/Data_Files/SGD_features.tab
    '''
    #%% FILES
    essentials_file = os.path.join(file_dirname, '..', 'Data_Files',
                                   "Cerevisiae_AllEssentialGenes_List.txt")

    gene_information_file = os.path.join(file_dirname, '..', 'Data_Files',
                                         'Yeast_Protein_Names.txt')

    gff_file = os.path.join(file_dirname, '..', 'Data_Files',
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')

    sgd_features_file = os.path.join(file_dirname, '..', 'Data_Files',
                                     'SGD_features.tab')

    variable = variable.lower()
    if plotting == True:
        create_plottitle = ''

#%% DETERMINE INPUTTED REGION

    warningmessage = "WARNING: Specified chromosome or gene name not found. Enter chromosome as a number (or roman numeral) between 1 and 16 (I and XVI), a list in the form ['chromosome number, start_position, end_position'] or a valid gene name."

    if verbose == True:
        print('Selected region: ', region)

    if type(region) == str:
        if region.upper() in chromosomename_roman_to_arabic()[1]:
            chrom = region.upper()
            roi_start = None
            roi_end = None
            region_type = 'Chromosome'

        elif region.upper() in list_gene_names(gene_information_file):
            gene_pos_dict = gene_position(gff_file)
            region = region.upper()
            if region in gene_pos_dict:
                region_pos = gene_pos_dict.get(region)
                chrom = region_pos[0]
                roi_start = int(region_pos[1])
                roi_end = int(region_pos[2])
            else:
                gene_alias_dict = gene_aliases(gene_information_file)[0]
                region_alias = [
                    key for key, val in gene_alias_dict.items()
                    if region in val
                ]
                if not region_alias == [] and region_alias[0] in gene_pos_dict:
                    region_pos = gene_pos_dict.get(region_alias[0])
                    chrom = region_pos[0]
                    roi_start = int(region_pos[1]) - 100
                    roi_end = int(region_pos[2]) + 100
                    del (region_alias, gene_alias_dict)
                else:
                    print(warningmessage)
#                    return()
            if plotting == True:
                create_plottitle = region
            region_type = 'Gene'
            del (region_pos, gene_pos_dict)

        else:
            print(warningmessage)
#            return()

    elif type(region) == list:
        if type(region[0]) == str:
            chrom = region[0].upper()
        elif type(region[0]) == int:
            if region[0] in chromosomename_roman_to_arabic()[0]:
                chrom = chromosomename_roman_to_arabic()[0].get(region[0])
        else:
            print(warningmessage)
#            return()
        roi_start = region[1]
        roi_end = region[2]
        region_type = 'Chromosome'

    elif type(region) == int:
        if region in chromosomename_roman_to_arabic()[0]:
            chrom = chromosomename_roman_to_arabic()[0].get(region)
            roi_start = None
            roi_end = None
        else:
            print(warningmessage)
#            return()
        region_type = 'Chromosome'

    else:
        print(warningmessage)
#        return()

    del (warningmessage)

    #%% READ WIG FILE FOR GETTING LOCATIONS OF ALL TN INSERTIONS

    with open(wig_file, 'r') as f:
        lines = f.readlines()

    chrom_start_line_dict, chrom_end_line_dict = chromosome_name_wigfile(
        lines)[1:]

    insrt_in_chrom_list = []
    reads_in_chrom_list = []
    for l in lines[chrom_start_line_dict.get(chrom):chrom_end_line_dict.
                   get(chrom)]:
        insrt_in_chrom_list.append(int(l.strip('\n').split(' ')[0]))
        reads_in_chrom_list.append(int(l.strip('\n').split(' ')[1]))

    del (lines, l, f, chrom_start_line_dict, chrom_end_line_dict)

    #%% READ PERGENE_INSERTIONS FILE FOR LOCATION OF ALL INSERTIONS PER EACH GENE.

    with open(pergene_insertions_file) as f:
        lines = f.readlines()

    gene_position_dict = {}
    for line in lines[1:]:
        line_split = line.strip('\n').split('\t')

        if line_split[1] == chrom:
            genename = line_split[0]
            gene_chrom = line_split[1]
            gene_start = int(line_split[2])
            gene_end = int(line_split[3])

            gene_position_dict[genename] = [
                gene_chrom, gene_start, gene_end
            ]  #DICT CONTAINING ALL GENES WITHIN THE DEFINED CHROMOSOME INCLUDING ITS START AND END POSITION

            geneinserts_str = line_split[4].strip('[]')
            if not geneinserts_str == '':
                geneinserts_list = [
                    int(ins) for ins in geneinserts_str.split(',')
                ]
            else:
                geneinserts_list = []

            genereads_str = line_split[5].strip('[]')
            if not genereads_str == '':
                genereads_list = [
                    int(read) for read in genereads_str.split(',')
                ]
            else:
                genereads_list = []

            if len(geneinserts_list) != len(genereads_list):
                print(
                    'WARNING: %s has different number of reads compared with the number of inserts'
                    % genename)

    del (f, lines, line, line_split, genename, gene_chrom, gene_start,
         gene_end, geneinserts_list, geneinserts_str, genereads_str,
         genereads_list)

    #%% DETERMINE THE LOCATION GENOMIC FEATURES IN THE CURRENT CHROMOSOME AND STORE THIS IN A DICTIONARY

    len_chr = chromosome_position(gff_file)[0].get(chrom)
    start_chr = chromosome_position(gff_file)[1].get(chrom)
    end_chr = chromosome_position(gff_file)[2].get(chrom)

    dna_dict = {
    }  #for each bp in chromosome, determine whether it belongs to a noncoding or coding region
    for bp in range(
            start_chr, end_chr + 1
    ):  #initialize dna_dict with all basepair positions as ['noncoding', None]
        dna_dict[bp] = ['noncoding', None]  #form is: ['element_name', 'type']

    feature_orf_dict = sgd_features(sgd_features_file)[1]
    gene_alias_dict = gene_aliases(gene_information_file)[0]

    for gene in gene_position_dict:
        if gene in feature_orf_dict:
            if (not gene.endswith("-A")
                    and not feature_orf_dict.get(gene)[1] == 'Verified') and (
                        not gene.endswith("-B")
                        and not feature_orf_dict.get(gene)[1] == 'Verified'):
                for bp in range(
                        gene_position_dict.get(gene)[1] + start_chr,
                        gene_position_dict.get(gene)[2] + start_chr + 1):
                    dna_dict[bp] = [
                        gene, "Gene; " + feature_orf_dict.get(gene)[1]
                    ]
        else:
            gene_alias = [
                key for key, val in gene_alias_dict.items() if gene in val
            ][0]
            for bp in range(
                    gene_position_dict.get(gene)[1] + start_chr,
                    gene_position_dict.get(gene)[2] + start_chr + 1):
                dna_dict[bp] = [
                    gene_alias, "Gene; " + feature_orf_dict.get(gene_alias)[1]
                ]

    del (gff_file, gene, bp, gene_alias)

    #%% GET FEATURES FROM INTERGENIC REGIONS (-> SEE SGD_features.tab IN DATA_FILES IN GITHUB FOLDER)

    genomicregions_list = sgd_features(sgd_features_file)[0]

    i = 2
    for genomicregion in genomicregions_list[1:]:
        dna_dict = feature_position(
            sgd_features(sgd_features_file)[i], chrom, start_chr, dna_dict,
            genomicregion)
        i += 1

    ### TEST IF ELEMENTS IN FEATURE_ORF_DICT FOR SELECTED CHROMOSOME ARE THE SAME AS THE GENES IN GENE_POSITION_DICT BY CREATING THE DICTIONARY FEATURE_POSITION_DICT CONTAINING ALL THE GENES IN FEATURE_ORF_DICT WITH THEIR CORRESPONDING POSITION IN THE CHROMOSOME
    gene_alias_dict = gene_aliases(gene_information_file)[0]
    orf_position_dict = {}
    for feature in feature_orf_dict:
        if feature_orf_dict.get(feature)[5] == chrom:
            if feature in gene_position_dict:
                orf_position_dict[feature] = [
                    feature_orf_dict.get(feature)[6],
                    feature_orf_dict.get(feature)[7]
                ]
            else:
                for feature_alias in gene_alias_dict.get(feature):
                    if feature_alias in gene_position_dict:
                        orf_position_dict[feature_alias] = [
                            feature_orf_dict.get(feature)[6],
                            feature_orf_dict.get(feature)[7]
                        ]

    if sorted(orf_position_dict) == sorted(gene_position_dict):
        if verbose == True:
            #            print('Everything alright, just ignore me!')
            pass
        else:
            pass
    else:
        print(
            'WARNING: Genes in feature_list are not the same as the genes in the gene_position_dict. Please check!'
        )

    del (sgd_features_file, feature_orf_dict, orf_position_dict, feature,
         feature_alias, gene_position_dict)

    #%% DETERMINE THE NUMBER OF TRANSPOSONS PER BP FOR EACH FEATURE

    reads_loc_list = [0] * len(
        dna_dict
    )  # CONTAINS ALL READS JUST LIKE READS_IN_CHROM_LIST, BUT THIS LIST HAS THE SAME LENGTH AS THE NUMBER OF BP IN THE CHROMOSOME WHERE THE LOCATIONS WITH NO READS ARE FILLED WITH ZEROS
    i = 0
    for ins in insrt_in_chrom_list:
        reads_loc_list[ins] = reads_in_chrom_list[i]
        i += 1

    del (i, ins, insrt_in_chrom_list, reads_in_chrom_list)  #, dna_df)

    #%% CREATE DATAFRAME FOR EACH FEATURE (E.G. NONCODING DNA, GENE, ETC.) IN THE CHROMOSOME AND DETERMINE THE NUMBER OF INSERTIONS AND READS PER FEATURE.

    feature_NameAndType_list = []
    f_previous = dna_dict.get(start_chr)[0]
    f_type = dna_dict.get(start_chr)[1]
    N_reads = []
    N_reads_list = []
    N_reads_truncatedgene_list = []
    N_insrt_truncatedgene_list = []
    N_insrt_list = []
    N_bp = 1
    N_bp_list = []
    f_start = 0
    f_end = 0
    f_pos_list = []
    i = 0
    for bp in dna_dict:
        f_current = dna_dict.get(bp)[0]
        if f_current == f_previous:
            f_type = dna_dict.get(bp)[1]
            f_end += 1
            N_bp += 1
            N_reads.append(reads_loc_list[i])
        elif (f_current != f_previous or
              (i + start_chr) == end_chr):  # and not f_current.endswith('-A'):
            feature_NameAndType_list.append([f_previous, f_type])
            N_reads_list.append(sum(N_reads))
            N_insrt_list.append(len([ins for ins in N_reads if not ins == 0]))
            if not f_type == None and f_type.startswith('Gene'):
                N10percent = 100  #int(len(N_reads) * 0.1)
                N_reads_truncatedgene_list.append(
                    sum(N_reads[N10percent:-N10percent]))
                N_insrt_truncatedgene_list.append(
                    len([
                        ins for ins in N_reads[N10percent:-N10percent]
                        if not ins == 0
                    ]))
            else:
                N_reads_truncatedgene_list.append(sum(N_reads))
                N_insrt_truncatedgene_list.append(
                    len([ins for ins in N_reads if not ins == 0]))

            N_bp_list.append(N_bp)
            N_reads = []
            N_bp = 1
            f_pos_list.append([f_start, f_end + f_start])
            f_start = f_start + f_end + 1
            f_end = 0
            f_previous = f_current
        i += 1

#    N_reads_per_bp_list = []
#    N_reads_per_bp_central80p_list = []
#    N_insrt_per_bp_list = []
#    N_insrt_per_bp_central80p_list = []
    N_reads_per_ins_list = []
    N_reads_per_ins_truncatedgene_list = []
    for i in range(len(N_reads_list)):
        #        N_reads_per_bp_list.append(N_reads_list[i]/N_bp_list[i])
        #        N_insrt_per_bp_list.append(N_insrt_list[i]/N_bp_list[i])
        #        if not feature_NameAndType_list[i][1] == None and feature_NameAndType_list[i][1].startswith('Gene'):
        #            N_reads_per_bp_central80p_list.append(N_reads_truncatedgene_list[i]/(N_bp_list[i]-200))#*0.8
        #            N_insrt_per_bp_central80p_list.append(N_insrt_truncatedgene_list[i]/(N_bp_list[i]-200))#*0.8
        #        else:
        #            N_reads_per_bp_central80p_list.append(N_reads_list[i]/N_bp_list[i])
        #            N_insrt_per_bp_central80p_list.append(N_insrt_list[i]/N_bp_list[i])

        if N_insrt_list[i] == 0:
            N_reads_per_ins_list.append(0)
            N_reads_per_ins_truncatedgene_list.append(0)
        elif N_insrt_truncatedgene_list[i] == 0:
            N_reads_per_ins_list.append(N_reads_list[i] / N_insrt_list[i])
            N_reads_per_ins_truncatedgene_list.append(0)
        else:
            N_reads_per_ins_list.append(N_reads_list[i] / N_insrt_list[i])
            N_reads_per_ins_truncatedgene_list.append(
                N_reads_truncatedgene_list[i] / N_insrt_truncatedgene_list[i])

    #############get all essential genes together with their aliases##############
    with open(essentials_file, 'r') as f:
        essentials_temp_list = f.readlines()[1:]
    essentials_list = [
        essential.strip('\n') for essential in essentials_temp_list
    ]
    del essentials_temp_list

    gene_alias_dict = gene_aliases(gene_information_file)[0]
    for key, val in gene_alias_dict.items():
        if key in essentials_list:
            for alias in val:
                essentials_list.append(alias)

    #ADD
    essentiality_list = []
    for feature in feature_NameAndType_list:
        if not feature[0] == "noncoding":
            if feature[1] in genomicregions_list:
                essentiality_list.append(None)
            elif feature[0] in essentials_list:
                essentiality_list.append(True)
            else:
                essentiality_list.append(False)
        else:
            essentiality_list.append(None)

    del (key, val, alias, essentials_list, feature, gene_information_file
         )  #, gene_alias_dict)#, reads_loc_list)
    ##############################################################################

    feature_name_list = []
    feature_type_list = []
    feature_alias_list = []
    feature_standardname_list = []
    for feature_name in feature_NameAndType_list:
        feature_name_list.append(feature_name[0])
        feature_type_list.append(feature_name[1])
        if feature_name[1] != None and feature_name[1].startswith(
                'Gene') and feature_name[0] in gene_alias_dict:
            if gene_alias_dict.get(feature_name[0])[0] == feature_name[0]:
                feature_standardname_list.append(feature_name[0])
                feature_alias_list.append('')
            else:
                if len(gene_alias_dict.get(feature_name[0])) > 1:
                    feature_standardname_list.append(
                        gene_alias_dict.get(feature_name[0])[0])
                    feature_alias_list.append(
                        gene_alias_dict.get(feature_name[0])[1:])
                else:
                    feature_standardname_list.append(
                        gene_alias_dict.get(feature_name[0])[0])
                    feature_alias_list.append('')
        else:
            feature_standardname_list.append(feature_name[0])
            feature_alias_list.append('')

    all_features = {
        'Feature_name': feature_name_list,
        'Standard_name': feature_standardname_list,
        'Feature_alias': feature_alias_list,
        'Feature_type': feature_type_list,
        'Essentiality': essentiality_list,
        'Position': f_pos_list,
        'Nbasepairs': N_bp_list,
        'Ninsertions': N_insrt_list,
        'Ninsertions_truncatedgene': N_insrt_truncatedgene_list,
        'Nreads': N_reads_list,
        'Nreads_truncatedgene': N_reads_truncatedgene_list,
        #                    'Ninsertionsperbp':N_insrt_per_bp_list,
        #                    'Ninsertionsperbp_gene_central80p':N_insrt_per_bp_central80p_list,
        #                    'Nreadsperbp':N_reads_per_bp_list,
        #                    'Nreadsperbp_gene_central80p':N_reads_per_bp_central80p_list,
        'Nreadsperinsrt': N_reads_per_ins_list,
        'Nreadsperinsrt_truncatedgene': N_reads_per_ins_truncatedgene_list
    }

    dna_df2 = pd.DataFrame(
        all_features, columns=[column_name for column_name in all_features]
    )  #search for feature using: dna_df2.loc[dna_df2['Feature'] == 'CDC42']
    #CREATE NEW COLUMN WITH ALL DOMAINS OF THE GENE (IF PRESENT) AND ANOTHER COLUMN THAT INCLUDES LISTS OF THE BP POSITIONS OF THESE DOMAINS

    #PRINT INFORMATION FOR THE SELECTED GENE
    if region_type == 'Gene':
        for region_info in dna_df2.itertuples():
            if region_info.Feature_name == region.upper(
            ) or region_info.Standard_name == region.upper():
                print(region_info)

    del (dna_dict, feature_NameAndType_list, feature_name_list,
         feature_type_list, feature_name, f_type, f_previous, f_start, f_end,
         f_pos_list, f_current, N_reads, N_reads_list, N_insrt_list,
         N_reads_truncatedgene_list, N_insrt_truncatedgene_list, N10percent,
         N_bp, N_bp_list, bp, i, start_chr, end_chr, all_features,
         essentiality_list, essentials_file, genomicregions_list)

    #%% NORMALIZE USING WINDOWS

    dna_df2, window_edge_list = reads_normalization_fixed_window(
        dna_df2, len_chr, normalization_window_size, wig_file)

    #%% CREATE BAR PLOT
    if plotting == True:
        noncoding_color = "#002538"
        essential_color = "#10e372"
        nonessential_color = "#d9252e"
        codingdna_color = '#29a7e6'
        textcolor = "#000000"
        textsize = 20

        feature_middle_pos_list = []
        sum_bp = 0
        for x in dna_df2['Nbasepairs']:
            feature_middle_pos_list.append(x / 2 + sum_bp)
            sum_bp += x
        del (x, sum_bp)

        feature_width_list = list(dna_df2['Nbasepairs'])

        barcolor_list = []
        for feature in dna_df2['Feature_name']:
            if feature == 'noncoding':
                barcolor_list.append(noncoding_color)
            elif dna_df2.loc[dna_df2['Feature_name'] ==
                             feature]['Essentiality'].iloc[0] == False:
                barcolor_list.append(nonessential_color)
            elif dna_df2.loc[dna_df2['Feature_name'] ==
                             feature]['Essentiality'].iloc[0] == True:
                barcolor_list.append(essential_color)
            elif dna_df2.loc[dna_df2['Feature_name'] ==
                             feature]['Essentiality'].iloc[0] == None:
                barcolor_list.append(codingdna_color)
        del (feature)

        ###PLOTTING
        plt.figure(figsize=(19, 9))
        grid = plt.GridSpec(20, 1, wspace=0.0, hspace=0.01)

        ax = plt.subplot(grid[0:19, 0])
        if variable == "insertions":
            ax.bar(feature_middle_pos_list,
                   list(dna_df2['Ninsertions']),
                   feature_width_list,
                   color=barcolor_list)
            #        ax.set_ylim(0, max(dna_df2['Ninsertionsperbp']) + 0.1*max(dna_df2['Ninsertionsperbp']))
            ax.set_ylabel("Transposons per region",
                          fontsize=textsize,
                          color=textcolor)
        elif variable == "reads":
            if normalize == False:
                ax.bar(feature_middle_pos_list,
                       list(dna_df2['Nreads']),
                       feature_width_list,
                       color=barcolor_list)
                ax.set_ylabel("Reads per region",
                              fontsize=textsize,
                              color=textcolor)
#                ax.set_ylim(0.0,10.0)
            elif normalize == True:
                ax.bar(feature_middle_pos_list,
                       list(dna_df2['Nreads_normalized_byNCregions']),
                       feature_width_list,
                       color=barcolor_list)
                #                ax.bar(feature_middle_pos_list, list(dna_df2['Nreads_normalized']), feature_width_list, color=barcolor_list)
                #                ax.bar(feature_middle_pos_list, list(dna_df2['Nreads_normalized']), feature_width_list, color=barcolor_list)
                ax.set_ylabel("Normalized reads per region",
                              fontsize=textsize,
                              color=textcolor)
#                ax.set_ylim(0.0, 150.0)

        if roi_start != None and roi_end != None and roi_start < len_chr and roi_end < len_chr:
            ax.set_xlim(roi_start, roi_end)
        else:
            ax.set_xlim(0, len_chr)

        ax.grid(linestyle='-', alpha=1.0)
        ax.tick_params(labelsize=textsize)
        #    ax.set_xticklabels([])
        ax.tick_params(axis='x', which='major', pad=30)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(0, 0))
        ax.xaxis.get_offset_text().set_fontsize(textsize)
        ax.set_xlabel("Basepair position on chromosome " + chrom,
                      fontsize=textsize,
                      color=textcolor,
                      labelpad=10)
        ax.set_title(create_plottitle, fontsize=textsize, color=textcolor)
        legend_noncoding = mpatches.Patch(color=noncoding_color,
                                          label="Noncoding DNA")
        legend_essential = mpatches.Patch(color=essential_color,
                                          label="Annotated essential genes")
        legend_nonessential = mpatches.Patch(color=nonessential_color,
                                             label="Nonessential genes")
        legend_coding = mpatches.Patch(color=codingdna_color,
                                       label="Other genomic regions")
        leg = ax.legend(handles=[
            legend_noncoding, legend_essential, legend_nonessential,
            legend_coding
        ])  #ADD
        for text in leg.get_texts():
            text.set_color(textcolor)
        del text

        count = 0
        for i in range(len(window_edge_list) - 1):
            if count % 2 == 0:
                ax.axvspan(window_edge_list[i],
                           window_edge_list[i + 1],
                           facecolor=[0.0, 0.0, 0.0, 0.1])
            else:
                ax.axvspan(window_edge_list[i],
                           window_edge_list[i + 1],
                           facecolor=[0.0, 0.0, 0.0, 0.0])
            count += 1

        axc = plt.subplot(grid[19, 0])

        l = 0
        counter = 0
        for width in feature_width_list:
            if dna_df2.loc[counter][4] == True:
                axc.axvspan(l, l + width, facecolor=essential_color, alpha=0.3)
            elif dna_df2.loc[counter][
                    4] == False and not dna_df2.loc[counter][0] == 'noncoding':
                axc.axvspan(l,
                            l + width,
                            facecolor=nonessential_color,
                            alpha=0.3)
            elif dna_df2.loc[counter][
                    4] == None and not dna_df2.loc[counter][0] == 'noncoding':
                axc.axvspan(l, l + width, facecolor=codingdna_color, alpha=0.5)
            l += width
            counter += 1
        if roi_start != None and roi_end != None and roi_start < len_chr and roi_end < len_chr:
            axc.set_xlim(roi_start, roi_end)
        else:
            axc.set_xlim(0, len_chr)
        axc.tick_params(labelsize=textsize)
        axc.set_yticklabels([])
        axc.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom=False,  # ticks along the bottom edge are off
            top=False,  # ticks along the top edge are off
            labelbottom=False)  # labels along the bottom edge are off

        axc.tick_params(
            axis='y',  # changes apply to the y-axis
            which='both',  # both major and minor ticks are affected
            left=False,  # ticks along the bottom edge are off
            right=False,  # ticks along the top edge are off
            labelleft=False)  # labels along the bottom edge are off

        if savefigure == True:
            if normalize == True and variable == 'reads':
                saving_name = os.path.join(
                    file_dirname, 'GenomicFeaturesReads_Barplot_Chrom' +
                    chrom + '_Normalized_with_Windowsize_' +
                    str(normalization_window_size))
            elif normalize == False and variable == 'reads':
                saving_name = os.path.join(
                    file_dirname, 'GenomicFeaturesReads_Barplot_Chrom' +
                    chrom + '_NonNormalized')
            else:
                saving_name = os.path.join(
                    file_dirname, 'GenomicFeaturesInsertions_Barplot_Chrom' +
                    chrom + '_NonNormalized')
            plt.savefig(saving_name, orientation='landscape', dpi=200)
            plt.close()

#        del (barcolor_list, codingdna_color, essential_color, feature_middle_pos_list, feature_width_list, noncoding_color, nonessential_color, textcolor, textsize, l, counter, width, normalization_window_size)

#%% RETURN STATEMENT
    return (dna_df2)

Ejemplo n.º 3

Mostrar archivo

def compareplot(bed_files=None,
                variable="insertions",
                chromosome=None,
                set_barwidth=None,
                set_logscale=False,
                savefig=False):
    '''This function creates, per chromosome, a figure comparing the binned insertion or read counts of two bed files.
    The top graph shows the first bed file in the list, the bottom graph (drawn with an inverted y-axis) the second bed file.
    On top of the counts, the top graph shows the bins where the first dataset is larger than the second and the bottom graph the bins where the second dataset is larger than the first.
    The input is as follows:
        - bed_files: list containing (at least) two paths, each referring to a bed file; the first two are plotted [mandatory]
        - variable: either 'insertions' (each bed line counts as one) or 'reads' (the fifth column of each bed line is summed)
        - chromosome: roman numeral (string, converted to upper case) or list of chromosome names. When not given, all chromosomes returned by 'chromosomename_roman_to_arabic' are plotted [optional]
        - set_barwidth: integer, number of basepairs per bin. By default the chromosome length divided by 500 [optional]
        - set_logscale: when True the y-axes are logarithmic, otherwise both graphs share the same linear y-limit [optional]
        - savefig: when True each figure is saved next to the first bed file as <bedfilename>_compareplot_chrom<X>.png and closed [optional]

    The bar width determines how many basepairs are put in one bin. Little basepairs per bin may be slow. Too many basepairs in one bin and possible low transposon areas might be obscured.

    The background of the graph is color coded to indicate areas that code for genes (green for genes in the list of known essential genes, red for all other genes).
    For this a list for essential genes is needed (used in 'list_known_essentials' function) and a .gff file is required (for the functions in 'chromosome_and_gene_positions.py') and a list for gene aliases (used in the function 'gene_aliases').

    Raises a ValueError when less than two bed files are given or when 'variable' is not recognized.
    '''
    #%% VALIDATE INPUT
    # Fail early with a clear message instead of halfway through the analysis.
    if bed_files is None or isinstance(bed_files, str) or len(bed_files) < 2:
        raise ValueError(
            "bed_files must be a list containing the paths to two bed files")
    if variable not in ("insertions", "reads"):
        raise ValueError("variable must be 'insertions' or 'reads', got %r" %
                         (variable, ))

    #%% USED FILES
    gff_file = os.path.join(file_dirname, '..', 'data_files',
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    essential_genes_files = [
        os.path.join(file_dirname, '..', 'data_files',
                     'Cerevisiae_EssentialGenes_List_1.txt'),
        os.path.join(file_dirname, '..', 'data_files',
                     'Cerevisiae_EssentialGenes_List_2.txt')
    ]
    gene_information_file = os.path.join(file_dirname, '..', 'data_files',
                                         'Yeast_Protein_Names.txt')

    #%% GET CHROMOSOME LENGTHS
    chr_length_dict = chromosome_position(gff_file)[0]

    #%% GET GENE POSITIONS, ESSENTIAL GENES AND GENE ALIASES
    gene_pos_dict = gene_position(gff_file)
    # A set makes the membership test in the plotting loops constant time.
    genes_essential_set = set(
        list_known_essentials(essential_genes_files, verbose=False))
    gene_alias_dict = gene_aliases(gene_information_file)[0]

    #%% DETERMINE WHICH CHROMOSOMES NEED TO BE ANALYZED
    if isinstance(chromosome, list):
        chrom_list = chromosome
    elif isinstance(chromosome, str):
        chrom_list = [chromosome.upper()]
    else:
        chrom_list = list(chromosomename_roman_to_arabic()[1])

    #%% READ BED FILES
    # Each file is read only once instead of once per chromosome.
    bed_data = []
    for bed_file in bed_files:
        with open(bed_file) as f:
            bed_lines = f.readlines()
        chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(
            bed_file)[1:3]
        bed_data.append((bed_file, bed_lines, chrom_start_index_dict,
                         chrom_end_index_dict))

    #%% LOOP OVER THE CHROMOSOMES
    for chrom in chrom_list:
        print('')
        print('Analyzing chromosome: ', chrom)
        chrom_length = chr_length_dict.get(chrom)
        genes_currentchrom_pos_list = [
            k for k, v in gene_pos_dict.items() if chrom in v
        ]

        if set_barwidth is None:
            bar_width = int(chrom_length / 500)
        else:
            bar_width = set_barwidth

        allinsertionsites_allfiles_list = []
        alltransposoncounts_allfiles_binnedlist = []
        for bed_file, bed_lines, start_index_dict, end_index_dict in bed_data:
            print("Processing file: %s" % bed_file)

            #%% GET ALL COUNTS PER BASEPAIR
            allcounts_list = np.zeros(chrom_length + 2)
            for line in bed_lines[start_index_dict.get(chrom):end_index_dict.
                                  get(chrom) + 1]:
                fields = line.strip('\n').split()
                if variable == "insertions":
                    allcounts_list[int(fields[1])] += 1
                else:
                    allcounts_list[int(fields[1])] += int(fields[4])

            #%% BINNING OF THE COUNTS
            if bar_width == 1:
                # One bar per basepair: the x-position is simply the index.
                # (The previous linspace had a different length than the counts, which made ax.bar fail.)
                allcounts_binnedlist = allcounts_list
                allinsertionsites_list = np.arange(len(allcounts_list))
            else:
                # NOTE(review): the count at every index that is a multiple of bar_width is not added to any bin
                # and the last partial bin is never appended. Kept as-is to preserve the existing plots; confirm whether intended.
                allcounts_binnedlist = []
                sum_values = 0
                for n, count in enumerate(allcounts_list):
                    if n % bar_width != 0:
                        sum_values += count
                    else:
                        allcounts_binnedlist.append(sum_values)
                        sum_values = 0
                # Derive the number of x-positions from the number of bins so both always have the same length.
                allinsertionsites_list = np.linspace(
                    0, chrom_length, len(allcounts_binnedlist))

            allinsertionsites_allfiles_list.append(allinsertionsites_list)
            alltransposoncounts_allfiles_binnedlist.append(
                allcounts_binnedlist)

        #%% DETERMINE DIFFERENCE BETWEEN DATASETS
        transposoncounts_positivedifference_list = []
        transposoncounts_negativedifference_list = []
        for count_set1, count_set2 in zip(
                alltransposoncounts_allfiles_binnedlist[0],
                alltransposoncounts_allfiles_binnedlist[1]):
            difference = count_set1 - count_set2
            transposoncounts_positivedifference_list.append(
                difference if difference >= 0 else 0)
            transposoncounts_negativedifference_list.append(
                -difference if difference < 0 else 0)

        #%% PLOTTING
        print('Plotting chromosome ', chrom, '...')
        print('bar width for plotting is ', bar_width)
        font_size = 12
        count_color = (0.2, 0.2, 0.2, 0.8)
        difference_color = (0.52, 0.71, 0.90, 0.8)
        if variable == "insertions":
            count_ylabel = 'Absolute insertion count'
            count_legend = 'Number of transposons'
        else:
            count_ylabel = 'Absolute read count'
            count_legend = 'Number of reads'

        # Maximum over all datasets so that both graphs get the same y-axis limit.
        max_ylim = max(item
                       for sublist in alltransposoncounts_allfiles_binnedlist
                       for item in sublist)
        max_ylim = max_ylim + 0.1 * max_ylim

        plt.figure(figsize=(19, 9))
        grid = plt.GridSpec(2, 1, wspace=0.0, hspace=0.0)

        # Top graph: first dataset, essential genes are labelled with their alias.
        ax1 = plt.subplot(grid[0, 0])
        _shade_gene_regions(ax1,
                            genes_currentchrom_pos_list,
                            gene_pos_dict,
                            genes_essential_set,
                            alias_dict=gene_alias_dict,
                            label_height=max_ylim)
        ax1.bar(allinsertionsites_allfiles_list[0],
                alltransposoncounts_allfiles_binnedlist[0],
                width=bar_width,
                color=count_color)
        ax1.bar(allinsertionsites_allfiles_list[0],
                transposoncounts_positivedifference_list,
                width=bar_width,
                color=difference_color)

        if set_logscale:
            ax1.set_yscale('log')
        else:
            ax1.set_ylim(0, max_ylim)
        ax1.set_axisbelow(True)
        ax1.grid(True)
        ax1.set_ylabel(count_ylabel, fontsize=font_size)
        ax1.set_xlim(0, chrom_length)

        # Bottom graph: second dataset, mirrored by inverting the y-axis.
        ax2 = plt.subplot(grid[1, 0])
        _shade_gene_regions(ax2, genes_currentchrom_pos_list, gene_pos_dict,
                            genes_essential_set)
        ax2.bar(allinsertionsites_allfiles_list[1],
                alltransposoncounts_allfiles_binnedlist[1],
                width=bar_width,
                color=count_color,
                label=count_legend)
        ax2.bar(allinsertionsites_allfiles_list[1],
                transposoncounts_negativedifference_list,
                width=bar_width,
                color=difference_color,
                label='Absolute difference datasets (set1-set2)')

        if set_logscale:
            ax2.set_yscale('log')
        else:
            ax2.set_ylim(0, max_ylim)
        ax2.set_axisbelow(True)
        ax2.grid(True)
        ax2.set_ylabel(count_ylabel, fontsize=font_size)
        ax2.set_xlabel('Basepair position on chromosome ' + chrom,
                       fontsize=font_size)
        ax2.set_xlim(0, chrom_length)
        ax2.invert_yaxis()
        ax2.legend(loc='lower left', fontsize=font_size)

        plt.tight_layout()

        if savefig:
            # splitext removes only the extension; str.strip(".bed") would also
            # remove any leading/trailing '.', 'b', 'e' or 'd' from the file name.
            bed_basename = os.path.splitext(os.path.basename(bed_files[0]))[0]
            saving_name = os.path.join(
                os.path.dirname(bed_files[0]),
                bed_basename + "_compareplot_chrom" + chrom + ".png")
            plt.savefig(saving_name)
            plt.close()


def _shade_gene_regions(axis,
                        gene_names,
                        gene_pos_dict,
                        essential_genes,
                        alias_dict=None,
                        label_height=None):
    '''Shade the regions of the given genes on 'axis': green for genes in 'essential_genes', red for all other genes.
    When 'alias_dict' is given, each essential gene is also labelled at height 'label_height' with its first alias (or with the gene name when no alias is known).
    '''
    for gene in gene_names:
        gene_info = gene_pos_dict.get(gene)
        gene_start_pos = int(gene_info[1])
        gene_end_pos = int(gene_info[2])
        if gene in essential_genes:
            axis.axvspan(gene_start_pos,
                         gene_end_pos,
                         facecolor='g',
                         alpha=0.3)
            if alias_dict is not None:
                aliases = alias_dict.get(gene)
                axis.text(gene_start_pos,
                          label_height,
                          aliases[0] if aliases else gene,
                          rotation=45)
        else:
            axis.axvspan(gene_start_pos,
                         gene_end_pos,
                         facecolor='r',
                         alpha=0.3)

Ejemplo n.º 4

Mostrar archivo

Archivo: clean_bedwigfiles.py Proyecto: SATAY-LL/LaanLab-SATAY-DataAnalysis

def cleanfiles(filepath=None, custom_header=None, split_chromosomes=False):
    '''
    This code reads a .bed or .wig file and removes any insertions that were mapped outside a chromosome.
    Mapping of a read outside a chromosome can happen during the alignment and transposon mapping steps and means that the position of an insertion site of a read is larger than the length of the chromosome it is mapped to.
    This function creates a new file with the same name as the input file with the extension _clean.bed or _clean.wig.
    This is saved at the same location as the input file.
    In this _clean file the redundant insertions that were mapped outside the chromosome are removed.
    The lengths of the chromosomes are determined by the python function 'chromosome_position' which is part of the python module 'chromosome_and_gene_positions.py'.
    Besides removing the reads outside the chromosomes, it also changes the names of the chromosomes to roman numerals and a custom header can be inputted (optional).
    Finally, the bed and wig files can be split up in separate files for each chromosome. These are placed in a _chromosomesplit folder located at the location of the bed or wig file.

    Input:
        - filepath: path to the .bed or .wig file. When None, the interpreter is exited with status 0.
        - custom_header: name to put in the track header. When None or an empty string, the header of the input file is reused.
        - split_chromosomes: when True, the cleaned file is additionally split into one file per chromosome.
    Files with another extension are not processed; a message is printed instead.
    '''

    if filepath is None:
        sys.exit(0)
    # NOTE(review): an assert is skipped when python runs with -O; kept so that the raised exception type stays the same for callers.
    assert os.path.isfile(filepath), 'File not found: %s' % filepath

    chr_length_dict = chromosome_position()[0]

    filepath_splitext = os.path.splitext(filepath)
    exten = filepath_splitext[1]

    num_roman = ['I','II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII','XIV','XV','XVI']

    # An empty string is treated like 'no custom header' for both file types.
    use_custom_header = custom_header is not None and custom_header != ""

    if exten == ".bed":
        print("Bed file loaded %s" % filepath)

        chrom_names_dict, chrom_start_line_dict, chrom_end_line_dict = chromosome_name_bedfile(filepath)

        # Inverse lookup (name used in the file -> roman numeral), built once instead of scanning the dictionary for every line.
        roman_by_filename = {chromname: romanname for romanname, chromname in chrom_names_dict.items()}

        with open(filepath, "r") as f:
            lines = f.readlines()

        with open(filepath_splitext[0]+"_clean.bed", "w") as w:
            #write header
            if use_custom_header:
                w.write("track name=" + str(custom_header) + " useScore=1\n")
            else:
                w.write(lines[0])

            for chrom in num_roman:
                print("evaluating chromosome %s" % chrom)
                chrom_length = chr_length_dict.get(chrom)

                for line in lines[chrom_start_line_dict.get(chrom): chrom_end_line_dict.get(chrom)+1]:
                    line_list = line.split()
                    if int(line_list[2]) > chrom_length or int(line_list[1]) < 0:
                        print("Line removed: %s" % line)
                    else:
                        # The line lies in the range of 'chrom', so that numeral is used when the name is not found
                        # (previously this gave an unbound or stale variable).
                        chrom_nameroman = roman_by_filename.get(line_list[0].replace("chr",""), chrom)
                        w.write("chr%s %s %s %s %s\n" % (chrom_nameroman, line_list[1], line_list[2], line_list[3], line_list[4]))

            #everything after the last chromosome is written as mitochondrial DNA
            for line in lines[chrom_end_line_dict.get("XVI")+1:]:
                line_list = line.split()
                if not line_list: #skip blank lines, e.g. at the end of the file
                    continue
                w.write("chrM %s %s %s %s\n" % (line_list[1], line_list[2], line_list[3], line_list[4]))

        if split_chromosomes:
            path = os.path.dirname(filepath)
            name = os.path.splitext(os.path.basename(filepath_splitext[0]+"_clean.bed"))[0]

            directoryname = os.path.join(path, name + '_chromosomesplit')
            os.makedirs(directoryname, exist_ok=True)

            cleanfile = os.path.join(path, name+".bed")
            chrom_names_dict, chrom_start_line_dict, chrom_end_line_dict = chromosome_name_bedfile(cleanfile)

            with open(cleanfile, 'r') as f:
                lines = f.readlines()
            header = lines[0]

            for chrom in num_roman:
                outputfile = os.path.join(directoryname, name + '_' + str(chrom) + '.bed')
                with open(outputfile, 'w+') as f:
                    f.write(header)
                    f.writelines(lines[chrom_start_line_dict.get(chrom): chrom_end_line_dict.get(chrom)+1])

            #the mitochondrial file is written once, after the loop (it used to be rewritten for every chromosome)
            outputfile = os.path.join(directoryname, name + '_M.bed')
            with open(outputfile, 'w+') as f:
                f.write(header)
                f.writelines(lines[chrom_end_line_dict.get(num_roman[-1])+1:])

    elif exten == ".wig":
        print("Wig file loaded %s" % filepath)

        chrom_names_dict, chrom_start_line_dict, chrom_end_line_dict = chromosome_name_wigfile(filepath)

        # Inverse lookup (name used in the file, without 'chr' -> roman numeral).
        roman_by_filename = {chromname.replace("chr",""): romanname for romanname, chromname in chrom_names_dict.items()}

        with open(filepath, 'r') as f:
            lines = f.readlines()

        with open(filepath_splitext[0]+"_clean.wig", "w") as w:
            #write header
            if use_custom_header:
                w.write("track type=wiggle_0 maxheightPixels=60 name=" + str(custom_header) + "\n")
            else:
                w.write(lines[0].replace(',',''))

            for chrom in num_roman:
                print("evaluating chromosome %s" % chrom)
                chrom_length = chr_length_dict.get(chrom)

                #replace chromosome names from reference genome with roman numerals
                chrom_headerline = lines[chrom_start_line_dict.get(chrom) - 1]
                chrom_nameline = chrom_headerline.split("=")[1].strip("\n").replace("chr","")
                chrom_nameroman = roman_by_filename.get(chrom_nameline, chrom)
                w.write("variablestep chrom=chr" + str(chrom_nameroman) + "\n") #write header for each chromosome
                for line in lines[chrom_start_line_dict.get(chrom): chrom_end_line_dict.get(chrom)]: #no '+1' in for loop, this is only for bed file
                    position = int(line.split()[0])
                    if position > chrom_length or position < 0:
                        print("Line removed: %s" % line)
                    else:
                        w.write(line)

            w.write("variablestep chrom=chrM\n")
            w.writelines(lines[chrom_end_line_dict.get("XVI")+1:])

        if split_chromosomes:
            path = os.path.dirname(filepath)
            name = os.path.splitext(os.path.basename(filepath_splitext[0]+"_clean.wig"))[0]

            directoryname = os.path.join(path, name + '_chromosomesplit')
            os.makedirs(directoryname, exist_ok=True)

            cleanfile = os.path.join(path, name+".wig")
            chrom_names_dict, chrom_start_line_dict, chrom_end_line_dict = chromosome_name_wigfile(cleanfile)

            with open(cleanfile, 'r') as f:
                lines = f.readlines()
            header = lines[0]

            #for the wig file the mitochondrial DNA is part of the loop
            for chrom in num_roman + ['M']:
                outputfile = os.path.join(directoryname, name + '_' + str(chrom) + '.wig')
                with open(outputfile, 'w+') as f:
                    f.write(header)
                    #start one line earlier to include the 'variablestep' line of the chromosome
                    f.writelines(lines[chrom_start_line_dict.get(chrom)-1: chrom_end_line_dict.get(chrom)])

    else:
        print("Extension not recognized")

Ejemplo n.º 5

Mostrar archivo

Archivo: transposonread_profileplot_genome.py Proyecto: SATAY-LL/LaanLab-SATAY-DataAnalysis

def profile_genome(bed_file=None,
                   variable="transposons",
                   bar_width=None,
                   savefig=False):
    '''This function creates a bar plot along the entire genome (chromosomes I to XVI placed one after another).
    The height of each bar represents the number of transposons or reads at the genomic position indicated on the x-axis.
    The input is as follows:
        - bed_file: path to the bed file [mandatory]
        - variable: either 'transposons' or 'reads'
        - bar_width: number of basepairs per bin. By default the genome length divided by 1000
        - savefig: when True the figure is saved next to the bed file (as <bedfilename>_transposonplot_genome.png or <bedfilename>_readplot_genome.png) and closed, otherwise the figure is shown

    The bar_width determines how many basepairs are put in one bin. Little basepairs per bin may be slow. Too many basepairs in one bin and possible low transposon areas might be obscured.
    The bottom part of the graph is color coded to indicate the gene regions (essential genes and all other genes have a different color, mitochondrial genes are skipped).
    For this a list for essential genes is needed (used in 'list_known_essentials' function) and a .gff file is required (for the functions in 'chromosome_and_gene_positions.py').

    Raises a ValueError when no bed file is given or when 'variable' is not recognized.
    '''

    #%% VALIDATE INPUT
    # Fail early with a clear message; an unknown variable used to give an empty plot without any warning.
    if bed_file is None:
        raise ValueError("bed_file must be the path to a bed file")
    if variable not in ("transposons", "reads"):
        raise ValueError("variable must be 'transposons' or 'reads', got %r" %
                         (variable, ))

    #%% USED FILES
    gff_file = os.path.join(file_dirname, '..', 'data_files',
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    essential_genes_files = [
        os.path.join(file_dirname, '..', 'data_files',
                     'Cerevisiae_EssentialGenes_List_1.txt'),
        os.path.join(file_dirname, '..', 'data_files',
                     'Cerevisiae_EssentialGenes_List_2.txt')
    ]

    chrom_list = [
        'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI',
        'XII', 'XIII', 'XIV', 'XV', 'XVI'
    ]

    #%% GET CHROMOSOME LENGTHS AND THE START OF EACH CHROMOSOME IN THE CONCATENATED GENOME
    chr_length_dict = chromosome_position(gff_file)[0]

    summed_chr_length_dict = {}
    summed_chr_length = 0
    for c in chrom_list:
        summed_chr_length_dict[c] = summed_chr_length
        summed_chr_length += chr_length_dict.get(c)

    l_genome = sum(int(chr_length_dict.get(chrom)) for chrom in chrom_list)
    print('Genome length: ', l_genome)
    if bar_width is None:
        bar_width = l_genome / 1000

    # Tick positions: halfway between the start of a chromosome and the start of the next one (or the genome end).
    chrom_boundaries = [summed_chr_length_dict.get(c) for c in chrom_list]
    chrom_boundaries.append(l_genome)
    middle_chr_position = [
        c1 + (c2 - c1) / 2
        for c1, c2 in zip(chrom_boundaries[:-1], chrom_boundaries[1:])
    ]

    #%% GET GENE POSITIONS AND ESSENTIAL GENES
    gene_pos_dict = gene_position(gff_file)
    # A set makes the membership test in the plotting loop constant time.
    genes_essential_set = set(list_known_essentials(essential_genes_files))

    #%% READ BED FILE
    with open(bed_file) as f:
        lines = f.readlines()

    chrom_names_dict, chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(
        bed_file)

    # Inverse lookup (name used in the bed file -> roman numeral), built once instead of scanning the dictionary for every line.
    # setdefault keeps the first match, like the list lookup it replaces.
    roman_by_bedname = {}
    for romanname, bedname in chrom_names_dict.items():
        roman_by_bedname.setdefault(bedname, romanname)

    #%% GET ALL COUNTS PER BASEPAIR
    allcounts_list = np.zeros(l_genome)
    for line in lines[chrom_start_index_dict.
                      get("I"):chrom_end_index_dict.get("XVI") + 1]:
        fields = line.strip('\n').split()
        chrom_name = roman_by_bedname[fields[0].replace("chr", '')]
        genome_index = summed_chr_length_dict.get(chrom_name) + int(
            fields[1]) - 1
        if variable == "transposons":
            allcounts_list[genome_index] += 1
        else:
            # NOTE(review): presumably converts the bed score back to a read count (score = reads*20 + 100) - confirm.
            allcounts_list[genome_index] += (int(fields[4]) - 100) / 20

    #%% BINNING OF THE COUNTS
    # NOTE(review): the count at every index that starts a new bin is not added to any bin.
    # Kept as-is to preserve the existing plots; confirm whether intended.
    allcounts_binnedlist = []
    sum_values = 0
    for n, count in enumerate(allcounts_list):
        if int(n % bar_width) != 0:
            sum_values += count
        else:
            allcounts_binnedlist.append(sum_values)
            sum_values = 0
    allcounts_binnedlist.append(sum_values)

    # Derive the number of x-positions from the number of bins so both always have the same length
    # (the previous '+1'/'+2' rule failed e.g. when the genome length is divisible by bar_width).
    allinsertionsites_list = np.linspace(0, l_genome,
                                         len(allcounts_binnedlist))

    #%% PLOTTING
    plt.figure(figsize=(19.0, 9.0))
    grid = plt.GridSpec(20, 1, wspace=0.0, hspace=0.0)

    textsize = 12
    textcolor = "#000000"

    ax = plt.subplot(grid[0:19, 0])
    ax.bar(allinsertionsites_list,
           allcounts_binnedlist,
           width=bar_width,
           color="#333333")
    ax.grid(False)
    ax.set_xlim(0, l_genome)

    # Vertical line at the start of each chromosome.
    for chrom in chrom_list:
        ax.axvline(x=summed_chr_length_dict.get(chrom),
                   linestyle='-',
                   color=(0.9, 0.9, 0.9, 1.0))

    ax.set_xticks(middle_chr_position)
    ax.set_xticklabels(chrom_list, fontsize=textsize)
    ax.tick_params(axis='x', which='major', pad=30)
    if variable == "transposons":
        ax.set_ylabel('Transposon Count', fontsize=textsize, color=textcolor)
    else:
        ax.set_ylabel('Read Count', fontsize=textsize, color=textcolor)

    # Narrow strip below the bar plot showing the gene regions.
    axc = plt.subplot(grid[19, 0])
    for gene, gene_info in gene_pos_dict.items():
        if gene_info[0] == 'Mito':
            continue
        chrom_offset = summed_chr_length_dict.get(gene_info[0])
        gene_start_pos = chrom_offset + int(gene_info[1])
        gene_end_pos = chrom_offset + int(gene_info[2])
        gene_color = "#00F28E" if gene in genes_essential_set else "#F20064"
        axc.axvspan(gene_start_pos,
                    gene_end_pos,
                    facecolor=gene_color,
                    alpha=0.8)
    axc.set_xlim(0, l_genome)
    axc.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,  # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off

    axc.tick_params(
        axis='y',  # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left=False,  # ticks along the left edge are off
        right=False,  # ticks along the right edge are off
        labelleft=False)  # labels along the left edge are off

    #%% SAVE OR SHOW THE FIGURE
    if savefig:
        if variable == "transposons":
            figure_suffix = '_transposonplot_genome.png'
        else:
            figure_suffix = '_readplot_genome.png'
        savepath = os.path.splitext(bed_file)[0] + figure_suffix
        print('saving figure at %s' % savepath)
        plt.savefig(savepath, dpi=400)
        plt.close()
    else:
        plt.show()

Ejemplo n.º 6

Mostrar archivo

Archivo: statistics_perchromosome.py Proyecto: LiedewijLaan/LaanLab-SATAY-DataAnalysis

def chromosome_insertion_periodicity(chromosome=None, bed_file=None, gff_file=None, printing=False):
    '''Determine the distances (in basepairs) between subsequent transposon insertions per chromosome.

    The input is as follows:
        - chromosome: roman numeral (string, case-insensitive) of the chromosome to analyse.
          When None, all chromosomes returned by chromosome_name_bedfile are analysed.
        - bed_file: path to the bed file with the transposon insertions (required).
        - gff_file: path to the gff file that is passed to chromosome_position.
          When None, 'Saccharomyces_cerevisiae.R64-1-1.99.gff3' is searched for in 'Data_Files' in the
          current working directory and otherwise assumed to be in '../Data_Files'.
        - printing: when True, statistics are printed per chromosome (and for the entire genome when no
          chromosome is given) and a violin plot is created showing the distribution of the distances
          between insertions, normalized by the chromosome length.

    The output is a dictionary with the chromosome names as keys and, for each chromosome, a list of the
    distances between insertions. The first element is the distance between the start of the chromosome
    and the first insertion, the last element is the distance between the last insertion and the end of
    the chromosome. A chromosome without any insertion yields a single distance equal to its length.

    Raises a TypeError when no bed_file is given and a ValueError when a chromosome cannot be found in
    the bed file or in the gff file.
    '''
    import os  # local import, as in the original function (the top of the file is not guaranteed to import os)

    if bed_file is None:
        raise TypeError('bed_file is required: expected a path to a bed file, got None')

    #%% USED FILES
    if gff_file is None:
        # NOTE: '__file__' is a string literal here, so this resolves relative to the current working
        # directory and not relative to the location of this module. Kept as is on purpose.
        file_dirname = os.path.dirname(os.path.abspath('__file__'))
        gff_name = 'Saccharomyces_cerevisiae.R64-1-1.99.gff3'
        gff_file = os.path.join(file_dirname, 'Data_Files', gff_name)
        if not os.path.isfile(gff_file):
            gff_file = os.path.join(file_dirname, '..', 'Data_Files', gff_name)

    #%% GET CHROMOSOME LENGTHS (start and end positions are returned as well but not needed here)
    chr_length_dict, _, _ = chromosome_position(gff_file)

    #%% OPEN BED FILE
    with open(bed_file) as f:
        lines = f.readlines()

    #%% GET NAMES FOR THE CHROMOSOMES IN THE BED FILE AND THE LINE INDICES WHERE THEY START AND END
    chrom_names_dict, chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(lines)

    #%% DETERMINE STATISTICS FOR INDIVIDUAL CHROMOSOMES AND PUT THE RESULTS IN A DICT
    if chromosome is not None:
        chromosome = chromosome.upper()
        chrom_loop = {chromosome: chrom_names_dict.get(chromosome)}
    else:
        chrom_loop = chrom_names_dict

    bp_between_tn_insertions_dict = {}
    reads_per_tn_dict = {}
    for chrom in chrom_loop:
        start_index = chrom_start_index_dict.get(chrom)
        end_index = chrom_end_index_dict.get(chrom)
        chrom_length = chr_length_dict.get(chrom)
        if start_index is None or end_index is None or chrom_length is None:
            raise ValueError("Chromosome '%s' was not found in the bed file or in the gff file" % chrom)

        tn_insertion_position_list = []
        reads_per_tn_list = []
        for line in lines[start_index:end_index + 1]:
            fields = line.split()
            tn_insertion_position_list.append(int(fields[1]))
            # presumably undoes a score encoding of reads*20+100 in the bed file -- TODO confirm
            reads_per_tn_list.append((int(fields[4]) - 100) / 20)

        if tn_insertion_position_list:
            bp_between_tn_insertions = [abs(y - x) for x, y in zip(tn_insertion_position_list,
                                                                   tn_insertion_position_list[1:])]
            # ADD DISTANCE BETWEEN START OF CHROMOSOME (bp=0) AND FIRST INSERTION
            bp_between_tn_insertions.insert(0, tn_insertion_position_list[0])
            # ADD DISTANCE BETWEEN LAST INSERTION AND END OF CHROMOSOME
            bp_between_tn_insertions.append(chrom_length - tn_insertion_position_list[-1])
        else:
            # NO INSERTIONS: THE WHOLE CHROMOSOME IS ONE AREA DEVOID OF TRANSPOSONS
            bp_between_tn_insertions = [chrom_length]

        bp_between_tn_insertions_dict[chrom] = bp_between_tn_insertions
        reads_per_tn_dict[chrom] = reads_per_tn_list

        if printing:
            print('')
            print('For chromosome ', chrom, ' with length ', chrom_length, ':')
            print('Number of transposon insertions is ', len(reads_per_tn_list))
            print('Coverage is %.2f percent' % (len(tn_insertion_position_list)/chrom_length*100))
            print('Mean transposon insertion periodicity is once every %.2f bp' % np.nanmean(bp_between_tn_insertions))
            print('Standard deviation in transposon insertion periodicity is %.2f' % np.nanstd(bp_between_tn_insertions))
            print('Median transposon insertion periodicity is once every %.2f bp' % np.nanmedian(bp_between_tn_insertions))
            print('Largest area devoid of transposons is %.2f' % max(bp_between_tn_insertions))
            print('Mean number of reads per transposon is %.2f' % np.nanmean(reads_per_tn_list))
            print('Median number of reads per transposon is %.2f' % np.nanmedian(reads_per_tn_list))
            print('')

    # TODO: checking the periodicity with an autocorrelation of the insertion positions is not implemented.

    #%% DETERMINE STATISTICS FOR THE ENTIRE GENOME
    if chromosome is None and printing:
        # NOTE: the distances include, for each chromosome, the distance between the start of the chromosome
        # and the first insertion and the distance between the last insertion and the end of the chromosome.
        # The original author flagged that this might not be accurate. Please check!
        bp_between_tn_insertions_genome = [bp_between
                                           for chrom in chrom_loop
                                           for bp_between in bp_between_tn_insertions_dict[chrom]]
        reads_per_tn_genome = [reads_tn
                               for chrom in chrom_loop
                               for reads_tn in reads_per_tn_dict[chrom]]
        # COUNT INSERTIONS, NOT DISTANCES (there is one more distance than insertions for each chromosome)
        number_tn_insertions = sum(len(reads_per_tn_dict[chrom]) for chrom in chrom_loop)

        print('')
        print('For the entire genome:')
        print('Coverage is %.2f percent' % (number_tn_insertions/sum(chr_length_dict.values())*100))
        print('Mean transposon insertion periodicity for the entire genome is %.2f' % np.nanmean(bp_between_tn_insertions_genome))
        print('Median transposon insertion periodicity for the entire genome is %.2f' % np.nanmedian(bp_between_tn_insertions_genome))
        print('Mean number of reads per transposon for the entire genome is %.2f' % np.nanmean(reads_per_tn_genome))
        print('Median number of reads per transposon for the entire genome is %.2f' % np.nanmedian(reads_per_tn_genome))

    #%% DETERMINE THE DISTRIBUTION OF THE NUMBER OF BP BETWEEN SUBSEQUENT TRANSPOSON INSERTIONS
    if printing:
        # LONG-FORMAT TABLE: ONE ROW FOR EACH DISTANCE, NORMALIZED BY THE LENGTH OF ITS CHROMOSOME.
        # The labels are taken from the analysed chromosomes themselves so they always match the data.
        plot_records = [(chrom, bp_between / chr_length_dict.get(chrom))
                        for chrom in chrom_loop
                        for bp_between in bp_between_tn_insertions_dict[chrom]]
        df_melt = pd.DataFrame(plot_records, columns=['chromosomes', 'bp between insertions'])

        v = sb.violinplot(x='chromosomes', y='bp between insertions', data=df_melt, inner='quartile', gridsize=3000, cut=0)
        v.set_yscale('log')

    #%%
    return bp_between_tn_insertions_dict