Ejemplo n.º 1
0
def linkage_decay_from_IS(IS, plot_dir=False, **kwargs):
    """Write one linkage-decay page per genome to ``LinkageDecay_plot.pdf``.

    Args:
        IS: Object exposing ``get(name)``; ``'raw_linkage_table'`` and
            ``'scaffold2bin'`` are read from it (presumably an inStrain
            profile -- confirm against the caller).
        plot_dir: String prefix the PDF file name is appended to with plain
            ``+``, so it must already end in a path separator. The ``False``
            default cannot be concatenated and is only a placeholder.
        **kwargs: Forwarded unchanged to ``plot_genome``, which decides
            whether a given genome is drawn.

    Returns:
        None. When the tables cannot be loaded or end up empty, an error is
        logged, the traceback is printed, and no file is written.
    """
    # Load the required data
    try:
        db = IS.get('raw_linkage_table')
        # Keep one row per site pair -- the one with the largest 'mm' --
        # then restore the original row order and drop the 'mm' column.
        db = db.sort_values('mm')\
               .drop_duplicates(subset=['scaffold', 'position_A', 'position_B'], keep='last')\
               .sort_index().drop(columns=['mm'])

        stb = IS.get('scaffold2bin')
        Mdb = inStrain.genomeUtilities._add_stb(db, stb, verbose=False)
        # Explicit check instead of `assert`, which is stripped under -O.
        if len(Mdb) == 0:
            raise ValueError("no linkage data available for plot 5")
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        logging.error("Skipping plot 5 - you don't have all required information. You need to run inStrain genome_wide first")
        traceback.print_exc()
        return

    # Make the plot
    logging.info("Plotting plot 5")
    name = 'LinkageDecay_plot.pdf'
    try:
        # The context manager finalizes the PDF even if a plot call raises.
        with PdfPages(plot_dir + name) as pp:
            for genome, mdb in Mdb.groupby('genome'):
                if not plot_genome(genome, IS, **kwargs):
                    continue
                linkage_decay_plot(mdb, title=genome)
                fig = plt.gcf()
                fig.set_size_inches(6, 4)
                fig.tight_layout()
                pp.savefig(fig)
                plt.close(fig)
    finally:
        # Release any figure left open, including on failure.
        plt.close('all')
Ejemplo n.º 2
0
def genome_plot_from_IS(IS, plot_dir=False, **kwargs):
    """Write one windowed-metrics page per genome to
    ``genomeWide_microdiveristy_metrics.pdf``.

    Args:
        IS: Object exposing ``get(name)`` and ``get_read_length()``;
            ``'scaffold2bin'`` and ``'scaffold2length'`` are read from it
            (presumably an inStrain profile -- confirm against the caller).
        plot_dir: String prefix the PDF file name is appended to with plain
            ``+``, so it must already end in a path separator. The ``False``
            default cannot be concatenated and is only a placeholder.
        **kwargs: ``covT``, ``clonT``, ``raw_linkage_table`` and
            ``cumulative_snv_table`` are looked up here (``None`` when
            absent) and handed to ``load_windowed_metrics``; the full set of
            keyword arguments is also forwarded to ``plot_genome``.

    Returns:
        None. When the required data cannot be loaded, an error is logged,
        the traceback is printed, and no file is written.
    """
    # Load the required data
    try:
        stb = IS.get('scaffold2bin')
        # Invert the scaffold -> genome mapping into genome -> [scaffolds].
        b2s = defaultdict(list)
        for s, b in stb.items():
            b2s[b].append(s)
        # Explicit check instead of `assert`, which is stripped under -O.
        if not b2s:
            raise ValueError("scaffold2bin is empty; nothing to plot for plot 2")

        # Load the cache (values are None when the caller did not supply them)
        covTs = kwargs.get('covT')
        clonTs = kwargs.get('clonT')
        raw_linkage_table = kwargs.get('raw_linkage_table')
        cumulative_snv_table = kwargs.get('cumulative_snv_table')
        scaffold2length = IS.get('scaffold2length')
        rl = IS.get_read_length()
        profiled_scaffolds = set(scaffold2length.keys())
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        logging.error("Skipping plot 2 - you don't have all required information. You need to run inStrain genome_wide first")
        traceback.print_exc()
        return

    # Make the plot
    logging.info("Plotting plot 2")
    name = 'genomeWide_microdiveristy_metrics.pdf'
    try:
        # The context manager finalizes the PDF even if a plot call raises.
        with PdfPages(plot_dir + name) as pp:
            for genome, scaffolds in b2s.items():
                if not plot_genome(genome, IS, **kwargs):
                    continue
                # Only scaffolds that have a recorded length can be windowed.
                present_scaffolds = list(set(scaffolds) & profiled_scaffolds)
                Wdb, breaks, midpoints = load_windowed_metrics(
                    present_scaffolds,
                    scaffold2length,
                    rl,
                    report_midpoints=True,
                    covTs=covTs, clonTs=clonTs,
                    raw_linkage_table=raw_linkage_table,
                    cumulative_snv_table=cumulative_snv_table)
                if len(Wdb) == 0:
                    logging.debug(f"{genome} could not have windowed metrics loaded")
                    continue
                genomeWide_microdiveristy_metrics_plot(Wdb, breaks, title=genome)
                fig = plt.gcf()
                fig.set_size_inches(8, 5)
                fig.tight_layout()
                pp.savefig(fig)
                plt.close(fig)
    finally:
        # Release any figure left open, including on failure.
        plt.close('all')
Ejemplo n.º 3
0
def linkage_decay_type_from_IS(IS, plot_dir=False, **kwargs):
    """Write one per-link-type linkage-decay page per genome to
    ``LinkageDecay_types_plot.pdf``.

    Args:
        IS: Object exposing ``get(name)``; ``'raw_linkage_table'``,
            ``'scaffold2bin'`` and ``'SNP_mutation_types'`` are read from it
            (presumably an inStrain profile -- confirm against the caller).
        plot_dir: String prefix the PDF file name is appended to with plain
            ``+``, so it must already end in a path separator. The ``False``
            default cannot be concatenated and is only a placeholder.
        **kwargs: ``debug`` (default ``False``) controls whether the
            traceback is printed when loading fails; the full set of keyword
            arguments is also forwarded to ``plot_genome``.

    Returns:
        None. Returns silently when the mutation-type table is empty; when
        loading fails, an error is logged and no file is written.
    """
    # Load the required data
    try:
        # Prepare
        db = IS.get('raw_linkage_table')
        # Keep one row per site pair -- the one with the largest 'mm' --
        # then restore the original row order and drop the 'mm' column.
        db = db.sort_values('mm')\
               .drop_duplicates(subset=['scaffold', 'position_A', 'position_B'], keep='last')\
               .sort_index().drop(columns=['mm'])
        stb = IS.get('scaffold2bin')
        Mdb = inStrain.genomeUtilities._add_stb(db, stb, verbose=False)

        SNdb = IS.get('SNP_mutation_types')
        # Explicit checks instead of `assert`, which is stripped under -O.
        if SNdb is None:
            raise ValueError("SNP_mutation_types is not available for plot 8")
        if len(SNdb) == 0:
            return
        # Key every SNP as "scaffold:position" so calc_link_type can look up
        # its mutation type through k2t.
        SNdb.loc[:, 'key'] = ["{0}:{1}".format(s, p) for s, p in zip(SNdb['scaffold'], SNdb['position'])]
        k2t = SNdb.set_index('key')['mutation_type'].to_dict()

        Mdb.loc[:, 'link_type'] = Mdb.apply(calc_link_type, axis=1, k2t=k2t)
        if len(Mdb) == 0:
            raise ValueError("no linkage data available for plot 8")
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        logging.error("Skipping plot 8 - you don't have all required information. You need to run inStrain profile_genes first")
        if kwargs.get('debug', False):
            traceback.print_exc()
        return

    # Make the plot
    logging.info("Plotting plot 8")
    name = 'LinkageDecay_types_plot.pdf'
    try:
        # The context manager finalizes the PDF even if a plot call raises.
        with PdfPages(plot_dir + name) as pp:
            for genome, mdb in Mdb.groupby('genome'):
                if not plot_genome(genome, IS, **kwargs):
                    continue
                linkage_decay_type(mdb, title=genome)
                fig = plt.gcf()
                fig.set_size_inches(6, 4)
                fig.tight_layout()
                pp.savefig(fig)
                plt.close(fig)
    finally:
        # Release any figure left open, including on failure.
        plt.close('all')
Ejemplo n.º 4
0
def allele_freq_plot_from_IS(IS, plot_dir=False, **kwargs):
    """Write one major-allele-frequency page per genome to
    ``MajorAllele_frequency_plot.pdf``.

    Args:
        IS: Object exposing ``get(name)``; ``'cumulative_snv_table'`` and
            ``'scaffold2bin'`` are read from it (presumably an inStrain
            profile -- confirm against the caller).
        plot_dir: String prefix the PDF file name is appended to with plain
            ``+``, so it must already end in a path separator. The ``False``
            default cannot be concatenated and is only a placeholder.
        **kwargs: Forwarded unchanged to ``plot_genome``, which decides
            whether a given genome is drawn.

    Returns:
        None. Returns silently when the SNV table is empty; when loading
        fails (including an outdated seaborn), an error is logged, the
        traceback is printed, and no file is written.
    """
    # Load the required data
    try:
        # The previous guard, `not hasattr(...) and callable(getattr(...))`,
        # parsed as `(not hasattr) and callable(...)`: on an old seaborn it
        # hit AttributeError instead of raising the message below, and it
        # never rejected a non-callable attribute.
        if not callable(getattr(sns, 'histplot', None)):
            raise Exception("Cannot make plot 4 because your seaborn is out of date- need v0.11+")

        db = IS.get('cumulative_snv_table')
        if len(db) == 0:
            return
        # Keep one row per site -- the one with the largest 'mm' -- then
        # restore the original row order and drop the 'mm' column.
        db = db.sort_values('mm')\
               .drop_duplicates(subset=['scaffold', 'position'], keep='last')\
               .sort_index().drop(columns=['mm'])
        # `== False` is kept on purpose: it is an element-wise comparison
        # and, unlike `~`, does not require a strictly boolean column.
        db = db[(db['cryptic'] == False)]
        # The allele-count column name differs between tables; filter on
        # whichever one is present.
        if 'allele_count' in db.columns:
            db = db[db['allele_count'] >= 2]
        if 'morphia' in db.columns:
            db = db[db['morphia'] >= 2]

        stb = IS.get('scaffold2bin')
        Mdb = inStrain.genomeUtilities._add_stb(db, stb, verbose=False)
        # Explicit check instead of `assert`, which is stripped under -O.
        if len(Mdb) == 0:
            raise ValueError("no SNV data available for plot 4")
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        logging.error("Skipping plot 4 - you don't have all required information. You need to run inStrain genome_wide first")
        traceback.print_exc()
        return

    # Make the plot
    logging.info("Plotting plot 4")
    name = 'MajorAllele_frequency_plot.pdf'
    try:
        # The context manager finalizes the PDF even if a plot call raises.
        with PdfPages(plot_dir + name) as pp:
            for genome, mdb in Mdb.groupby('genome'):
                if not plot_genome(genome, IS, **kwargs):
                    continue
                major_allele_freq_plot(mdb, title=genome)
                fig = plt.gcf()
                fig.set_size_inches(6, 4)
                fig.tight_layout()
                pp.savefig(fig)
                plt.close(fig)
    finally:
        # Release any figure left open, including on failure.
        plt.close('all')
Ejemplo n.º 5
0
def mm_plot_from_IS(IS, plot_dir=False, **kwargs):
    """Write one coverage/breadth-vs-mismatch page per genome to
    ``CoverageAndBreadth_vs_readMismatch.pdf``.

    Args:
        IS: Object exposing ``get_read_length()`` (presumably an inStrain
            profile -- confirm against the caller).
        plot_dir: String prefix the PDF file name is appended to with plain
            ``+``, so it must already end in a path separator. The ``False``
            default cannot be concatenated and is only a placeholder.
        **kwargs: ``GWdb`` must hold the table to plot, with ``'mm'`` and
            ``'genome'`` columns. NOTE: that table is modified in place --
            ``'mm'`` is cast to int and ``'read_length'`` / ``'ANI_level'``
            columns are added. The full set of keyword arguments is also
            forwarded to ``plot_genome``.

    Returns:
        None. When ``GWdb`` is missing, empty, or lacks ``'mm'``, an error
        is logged, the traceback is printed, and no file is written.
    """
    # Load the required data
    try:
        Mdb = kwargs.get('GWdb')
        # Explicit check instead of `assert`, which is stripped under -O.
        if Mdb is None or len(Mdb) == 0:
            raise ValueError("GWdb is missing or empty; cannot create plot 1")

        if 'mm' not in Mdb:
            raise Exception(
                'Plot 1 cannot be created when run with --database_mode or --skip_mm_profiling'
            )

        # Add the read length and the ANI level implied by each mismatch count
        readLen = int(IS.get_read_length())
        Mdb['read_length'] = readLen
        Mdb['mm'] = Mdb['mm'].astype(int)
        # Plain Python division is kept so a zero read length still raises
        # and the plot is skipped.
        Mdb.loc[:, 'ANI_level'] = [(readLen - mm) / readLen for mm in Mdb['mm']]
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        logging.error(
            "Skipping plot 1 - you don't have all required information. You need to run inStrain genome_wide first"
        )
        traceback.print_exc()
        return

    # Make the plot
    logging.info("Plotting plot 1")
    name = 'CoverageAndBreadth_vs_readMismatch.pdf'
    try:
        # The context manager finalizes the PDF even if a plot call raises.
        with PdfPages(plot_dir + name) as pp:
            for genome, mdb in Mdb.groupby('genome'):
                if not plot_genome(genome, IS, **kwargs):
                    continue
                mm_plot(mdb, title=genome)
                fig = plt.gcf()
                fig.set_size_inches(6, 4)
                pp.savefig(fig)
                plt.close(fig)
    finally:
        # Release any figure left open, including on failure.
        plt.close('all')
Ejemplo n.º 6
0
def gene_histogram_from_IS(IS, plot_dir=False, **kwargs):
    """Write one gene-histogram page per genome to ``GeneHistogram_plot.pdf``.

    Args:
        IS: Passed to ``inStrain.GeneProfile.get_gene_info`` and queried for
            ``'scaffold2bin'`` through ``get(name)`` (presumably an inStrain
            profile -- confirm against the caller).
        plot_dir: String prefix the PDF file name is appended to with plain
            ``+``, so it must already end in a path separator. The ``False``
            default cannot be concatenated and is only a placeholder.
        **kwargs: ``debug`` (default ``False``) controls whether the
            traceback is printed when loading fails; the full set of keyword
            arguments is also forwarded to ``plot_genome``.

    Returns:
        None. When the gene table cannot be loaded or is empty, an error is
        logged and no file is written.
    """
    # Load the required data
    try:
        # Prepare
        db = inStrain.GeneProfile.get_gene_info(IS)
        stb = IS.get('scaffold2bin')
        Gdb = inStrain.genomeUtilities._add_stb(db, stb, verbose=False)
        # When only 'clonality' is reported, derive 'nucl_diversity' as its
        # complement.
        if 'clonality' in Gdb.columns:
            Gdb.loc[:, 'nucl_diversity'] = 1 - Gdb['clonality']
        # Explicit check instead of `assert`, which is stripped under -O.
        if len(Gdb) == 0:
            raise ValueError("no gene data available for plot 9")
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        logging.error(
            "Skipping plot 9 - you don't have all required information. You need to run inStrain profile_genes first"
        )
        if kwargs.get('debug', False):
            traceback.print_exc()
        return

    # Make the plot
    logging.info("Plotting plot 9")
    name = 'GeneHistogram_plot.pdf'
    try:
        # The context manager finalizes the PDF even if a plot call raises.
        with PdfPages(plot_dir + name) as pp:
            for genome, mdb in Gdb.groupby('genome'):
                if not plot_genome(genome, IS, **kwargs):
                    continue
                gene_histogram_plot(mdb, title=genome)
                fig = plt.gcf()
                fig.set_size_inches(8, 5)
                fig.tight_layout()
                pp.savefig(fig)
                plt.close(fig)
    finally:
        # Release any figure left open, including on failure.
        plt.close('all')
Ejemplo n.º 7
0
def ANI_dist_plot_from_IS(IS, plot_dir=False, **kwargs):
    """Write one read-ANI distribution page per genome to
    ``readANI_distribution.pdf``.

    Args:
        IS: Passed to ``prepare_read_ani_dist_plot`` to build the table that
            is plotted (presumably an inStrain profile -- confirm against
            the caller).
        plot_dir: String prefix the PDF file name is appended to with plain
            ``+``, so it must already end in a path separator. The ``False``
            default cannot be concatenated and is only a placeholder.
        **kwargs: Forwarded unchanged to ``plot_genome``, which decides
            whether a given genome is drawn.

    Returns:
        None. When the table cannot be built, is empty, or holds a single
        distinct ``ANI_level``, an error is logged, the traceback is
        printed, and no file is written.
    """
    # Load the required data
    try:
        Mdb = prepare_read_ani_dist_plot(IS)
        # A single distinct ANI level is treated as "no per-mismatch
        # profiling was done", so there is no distribution to draw.
        if len(Mdb['ANI_level'].unique()) == 1:
            raise Exception(
                'Plot 3 cannot be created when run with --database_mode or --skip_mm_profiling'
            )
        # Explicit check instead of `assert`, which is stripped under -O.
        if len(Mdb) == 0:
            raise ValueError("no read ANI data available for plot 3")
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        logging.error(
            "Skipping plot 3 - you don't have all required information. You need to run inStrain genome_wide first"
        )
        traceback.print_exc()
        return

    # Make the plot
    logging.info("Plotting plot 3")
    name = 'readANI_distribution.pdf'
    try:
        # The context manager finalizes the PDF even if a plot call raises.
        with PdfPages(plot_dir + name) as pp:
            for genome, mdb in Mdb.groupby('genome'):
                if not plot_genome(genome, IS, **kwargs):
                    continue
                read_ani_dist_plot(mdb, title=genome)
                fig = plt.gcf()
                fig.set_size_inches(6, 4)
                fig.tight_layout()
                pp.savefig(fig)
                plt.close(fig)
    finally:
        # Release any figure left open, including on failure.
        plt.close('all')