Code Example #1
File: cs.py  Project: zzygyx9119/plastid
def write_output_files(table, title, args):
    """Write gene info table from :py:func:`do_generate` to several output files:

    OUTBASE_gene.positions
        Tab-delimited text file. Each line is a merged gene, and columns
        indicate the genomic coordinates and lengths of each of the position
        sets above.

    OUTBASE_transcript.positions
        Tab-delimited text file. Each line is a transcript, and columns
        indicate the genomic coordinates and lengths of each of the position
        sets above.

    OUTBASE_gene_REGION.bed
         `BED`_ files showing position sets for `REGION`,
         where `REGION` is one of *exon*, *utr5*, *cds*, *utr3*, or
         *masked*. These contain the same information as
         ``OUTBASE_gene.positions``, but can be visualized easily in a
         :term:`genome browser`.


    Parameters
    ----------
    table : :class:`pandas.DataFrame`
        Gene info table made in :py:func:`do_generate`

    title : str
        Title ("gene" or "transcript")

    args : :py:class:`argparse.Namespace`
        Command-line arguments
    """
    keys = ("utr5", "utr3", "cds", "masked", "exon")
    bed_columns = ["%s_bed" % K for K in keys]

    bedfiles = {
        X: argsopener("%s_%s_%s.bed" % (args.outbase, title, X), args)
        for X in keys
    }

    for _, row in table[bed_columns].iterrows():
        for k in keys:
            bedfiles[k].write(row["%s_bed" % k])

    for k in bedfiles:
        bedfiles[k].close()

    pos_out = argsopener("%s_%s.positions" % (args.outbase, title), args)
    table.to_csv(pos_out,
                 sep="\t",
                 header=True,
                 index=False,
                 na_rep="nan",
                 float_format="%.8f",
                 columns=[
                     "region", "exon", "utr5", "cds", "utr3", "masked",
                     "exon_unmasked", "transcript_ids"
                 ])
    pos_out.close()
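
A hedged aside (not part of cs.py): the minimal DataFrame shape that `write_output_files` consumes, inferred from the function body and from `do_generate` in Code Example #12. All gene names and coordinates below are made up for illustration.

import pandas as pd

# One row per merged gene. Position-set columns hold printable strings;
# "<region>_bed" columns hold pre-rendered BED lines, written out verbatim.
toy_table = pd.DataFrame({
    "region":         ["GENE_A"],
    "transcript_ids": ["tx1,tx2"],
    "exon":           ["chrI:100-500(+)"],
    "exon_unmasked":  ["chrI:100-500(+)"],
    "utr5":           ["chrI:100-150(+)"],
    "cds":            ["chrI:150-450(+)"],
    "utr3":           ["chrI:450-500(+)"],
    "masked":         [""],
    "exon_bed":       ["chrI\t100\t500\tGENE_A\t0\t+\n"],
    "utr5_bed":       ["chrI\t100\t150\tGENE_A\t0\t+\n"],
    "cds_bed":        ["chrI\t150\t450\tGENE_A\t0\t+\n"],
    "utr3_bed":       ["chrI\t450\t500\tGENE_A\t0\t+\n"],
    "masked_bed":     [""],
})
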
Code Example #2
File: cs.py  Project: zzygyx9119/plastid
def do_count(args, alignment_parser):
    """Count the number and density covering each merged gene in an annotation made made using the `generate` subcommand).
    
    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``count`` subprogram    
    """
    # we expect many zero-length SegmentChains, so silence those warnings for now
    warnings.filterwarnings(
        "ignore",
        ".*zero-length SegmentChain.*",
    )

    keys = ("exon", "utr5", "cds", "utr3")
    column_order = ["region"]
    gene_positions = read_pl_table(args.position_file)

    # read count files
    ga = alignment_parser.get_genome_array_from_args(args, printer=printer)
    total_counts = ga.sum()

    normconst = 1000.0 * 1e6 / total_counts

    printer.write("Dataset has %s counts in it." % total_counts)
    printer.write("Tallying genes ...")

    dtmp = {"region": []}
    for x in keys:
        for y in ("reads", "length", "rpkm"):
            label = "%s_%s" % (x, y)
            dtmp[label] = []
            column_order.append(label)

    for i, name in enumerate(gene_positions["region"]):
        dtmp["region"].append(name)
        if i % 500 == 0:
            printer.write("Processed %s genes ..." % i)

        for k in keys:
            ivc = SegmentChain.from_str(gene_positions[k][i])
            total = sum(ivc.get_counts(ga))
            length = ivc.length
            rpkm = (normconst * total / length) if length > 0 else numpy.nan
            dtmp["%s_reads" % k].append(total)
            dtmp["%s_length" % k].append(length)
            dtmp["%s_rpkm" % k].append(rpkm)

    fout = argsopener("%s.txt" % args.outbase, args, "w")
    dtmp = pd.DataFrame(dtmp)
    dtmp.to_csv(fout,
                sep="\t",
                header=True,
                index=False,
                columns=column_order,
                na_rep="nan",
                float_format="%.8f")

    fout.close()
    printer.write("Done.")
Code Example #3
File: crossmap.py  Project: joshuagryphon/plastid
def chrom_worker(chrom_seq,args=None):
    name, seq_or_kmers = chrom_seq
    printer.write("Processing chromosome %s..." % name)
    base         = "%s_%s_%s_%s" % (args.outbase, args.read_length, args.mismatches, name)
    kmer_file    = "%s_kmers.fa"     % base
    toomany_file = "%s_multimap.fa"  % base
    bed_file     = "%s_crossmap.bed" % base
    
    if args.have_kmers == False:
        # simulate k-mers only if they were not supplied; the `with` block
        # closes the file, so no explicit close() is needed
        with open(kmer_file,"w") as kmer:
            simulate_reads(seq_or_kmers,kmer,args.read_length)
    else:
        kmer_file = seq_or_kmers

    argdict = { "mismatches" : args.mismatches,
                "processors" : 1, 
                "bowtie"     : args.bowtie,
                "toomany"    : toomany_file,
                "kmers"      : kmer_file,
                "ebwt"       : args.ebwt,
                "null"       : os.devnull,
                }
    
    cmd  = "%(bowtie)s -m1 -a --best -f -v %(mismatches)s -p %(processors)s %(ebwt)s %(kmers)s --max %(toomany)s >%(null)s" % argdict

    printer.write("Aligning %s-mers for chromosome '%s' :\n\t'%s'" % (args.read_length,name,cmd))
    try:
        retcode = subprocess.call(cmd,shell=True)
        if retcode < 0 or retcode == 2:
            printer.write("Alignment for chromosome '%s' terminated with status %s" % (name,retcode))
        else:
            if os.path.exists(toomany_file):
                printer.write("Assembling multimappers from chromosome '%s' into crossmap..."% name)
                with argsopener(bed_file,args,"w") as bed_out:
                    for plus_chain, minus_chain in fa_to_bed(open(toomany_file),
                                                         args.read_length,
                                                         offset=args.offset):
                        bed_out.write(plus_chain.as_bed())
                        bed_out.write(minus_chain.as_bed())
                
            
            else:
                printer.write("Could not find multimapper source file '%s' ." % toomany_file)
    except OSError as e:
        printer.write("Alignment failed for chromosome '%s': %s" % (name,e))

    printer.write("Cleaning up chromosome '%s'..." % name)
    if os.path.exists(toomany_file): # may not exist if the alignment failed
        os.remove(toomany_file)
    if args.have_kmers == False and args.save_kmers == False:
        os.remove(kmer_file)

    return bed_file
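
The `(name, seq_or_kmers)` tuple argument and the per-chromosome return value suggest `chrom_worker` is meant to be fanned out across chromosomes. A hypothetical driver (not taken from crossmap.py itself) might look like:

import functools
import multiprocessing

def run_crossmap(chrom_seqs, args, processes=4):
    # bind args so the pool only passes (name, sequence) tuples to the worker
    worker = functools.partial(chrom_worker, args=args)
    with multiprocessing.Pool(processes) as pool:
        bed_files = pool.map(worker, chrom_seqs)  # one crossmap BED per chromosome
    return bed_files
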
Code Example #4
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")
    
    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser  = al.get_parser(conflict_handler="resolve")
    
    mp = MaskParser()
    mask_file_parser = mp.get_parser()
    
    bp = BaseParser()
    base_parser = bp.get_parser()
    
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,
                                              alignment_file_parser,
                                              annotation_file_parser,
                                              mask_file_parser],
                                     )
    parser.add_argument("outfile",type=str,help="Output filename")
    
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga          = al.get_genome_array_from_args(args,printer=printer)
    transcripts = ap.get_transcripts_from_args(args,printer=printer,return_type=SegmentChain)
    crossmap    = mp.get_genome_hash_from_args(args,printer=printer)
    
    ga_sum = ga.sum()
    normconst = 1000.0*1e6 / ga_sum
    
    with argsopener(args.outfile,args,"w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write("region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n")
        for n,ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable(masks))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)
                
            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            rpnt = numpy.nan if length == 0 else float(counts)/length
            rpkm = numpy.nan if length == 0 else rpnt * normconst 
            ltmp = [name,
                    str(ivc),
                    "%.8e" % counts,
                    "%.8e" % rpnt,
                    "%.8e" % rpkm,
                    "%d" % length]
            fout.write("%s\n" % "\t".join(ltmp))
    
        
    printer.write("Processed %s regions total." % n)

    printer.write("Done.")
Code Example #5
def main(argv=sys.argv[1:]): 
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    bp = BaseParser()
    annotation_parser = ap.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     parents=[base_parser,annotation_parser],
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--no_escape",default=True,action="store_false",
                        help="If specified and output format is GTF2, special characters in column 9 will be escaped (default: True)")
    parser.add_argument("--output_format",choices=["BED","GTF2"],default="GTF2",
                        help="Format of output file. (default: GTF2)")
    parser.add_argument("--extra_columns",nargs="+",default=[],type=str,
                        help="Attributes (e.g. 'gene_id' to output as extra columns in extended BED format (BED output only).")
    parser.add_argument("--empty_value",default="na",type=str,
                        help="Value to use of an attribute in `extra_columns` is not defined for a particular record (Default: 'na'")
    parser.add_argument("outfile",metavar="outfile.[ bed | gtf ]",type=str,
                        help="Output file")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    end_message = ""    
    extra_cols = args.extra_columns
    if len(extra_cols) > 0:
        if args.output_format == "BED":
            
            # avoid name clashes
            names_used = copy.copy(BED12_RESERVED_NAMES)
            asql_names = [fix_name(X,names_used) for X in extra_cols]
            autosql_str = "\n".join(AUTOSQL_ROW_FMT_STR % (X," "*max(15-len(X),2)) for X in asql_names)
            
            file_info = {
                "outbase" : args.outfile.replace(".bed","").replace(".gtf",""),
                "numcols" : len(extra_cols),
                "autosql" : DEFAULT_AUTOSQL_STR % (os.path.basename(args.outfile[:-4]),autosql_str),
                         
            }
            end_message = MAKE_BIGBED_MESSAGE % file_info
        else:
            warn("`--extra_columns` is ignored for %s-formatted output." % (args.output_format),ArgumentWarning)
            
            
    with argsopener(args.outfile,args,"w") as fout:
        c = 0
        transcripts = ap.get_transcripts_from_args(args,printer=printer)
        
        for transcript in transcripts:
            if args.output_format == "GTF2":
                fout.write(transcript.as_gtf(escape=args.no_escape))
            elif args.output_format == "BED":
                fout.write(transcript.as_bed(extra_columns=extra_cols,empty_value=args.empty_value))
            if c % 1000 == 1:
                printer.write("Processed %s transcripts ..." % c)
            c += 1
    
    printer.write("Processed %s transcripts total." % c)
    printer.write("Done.")
    print(end_message)
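
A note on `--no_escape` above: `default=True` combined with `action="store_false"` means `args.no_escape` is `True` (escaping enabled) until the flag is passed, at which point it becomes `False`. A minimal demonstration of the pattern:

import argparse

p = argparse.ArgumentParser()
p.add_argument("--no_escape", default=True, action="store_false")

assert p.parse_args([]).no_escape is True                 # flag absent: escape
assert p.parse_args(["--no_escape"]).no_escape is False   # flag given: do not escape
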
Code Example #6
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser(input_choices=_ANNOTATION_INPUT_CHOICES)
    annotation_file_parser = ap.get_parser()
    
    bp = BaseParser()
    base_parser = bp.get_parser()
    
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,annotation_file_parser])
    parser.add_argument("--export_tophat",default=False,action="store_true",
                         help="Export tophat `.juncs` file in addition to BED output")
    parser.add_argument("outbase",type=str,help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    
    transcripts = ap.get_transcripts_from_args(args,printer=printer,return_type=SegmentChain)
    
    with argsopener("%s.bed" % args.outbase,args,"w") as bed_out:
        if args.export_tophat == True:
            tophat_out = open("%s.juncs" % args.outbase,"w")
    
        printer.write("params: " +" ".join(argv))
        printer.write("Detecting & comparing junctions...")
        ex_pairs = {}
        
        c = 0
        u = 0
        for chain in transcripts:
            if len(chain) > 1: # if multi-exon
                chrom = chain.chrom
                strand = chain.strand
                try:
                    ep = ex_pairs[(chrom,strand)]
                except KeyError:
                    ex_pairs[(chrom,strand)] = []
                    ep = ex_pairs[(chrom,strand)]

                for i in range(0,len(chain)-1):
                    
                    seg1 = chain[i]
                    seg2 = chain[i+1]
                    if c % 1000 == 0 and c > 0:
                        printer.write("Processed %s junctions. Found %s unique..." % (c,u) )
                    c+=1
                    key = (seg1.end,seg2.start)
                        
                    if key not in ep:
                        ep.append(key)
                        u += 1
                        new_chain = SegmentChain(seg1,seg2)
                        bed_out.write(new_chain.as_bed())
                        if args.export_tophat == True:
                            my_junc = (chrom,seg1.end-1,seg2.start,strand)
                            tophat_out.write("%s\t%s\t%s\t%s\n" % my_junc)
                        
                        del new_chain
                    
                    del seg1
                    del seg2
                    
            del chain
    
        printer.write("Processed %s total junctions. Found %s unique." % (c,u) )
    
        if args.export_tophat == True:
            tophat_out.close()

    printer.write("Done.")
Code Example #7
def main(argv=sys.argv[1:]):
    """Command-line program
	
	Parameters
	----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directly.

		Default: `sys.argv[1:]`. The command-line arguments, if the script is
		invoked from the command line
	"""
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser()])
    parser.add_argument("--exclude",
                        nargs="+",
                        default=[],
                        help="Feature types to exclude from consideration")
    parser.add_argument("infile",
                        metavar="infile.gff",
                        type=str,
                        help="Input GFF3 file")
    parser.add_argument("outfile",
                        metavar="outfile.txt",
                        type=str,
                        help="Name of output file")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    excluded = set(args.exclude)

    fin = sys.stdin if args.infile == "-" else opener(args.infile)

    feature_counts = Counter()
    features_with_parents = []
    feature_types = {}
    name_type = {}

    printer.write("Opening %s..." % args.infile)
    c = 0
    for feature in GFF3_Reader(fin, return_stopfeatures=False):
        if c % 10000 == 0:
            printer.write("Processed %s features..." % c)
        c += 1
        ftype = feature.attr["type"]
        fname = feature.get_name()
        if ftype not in excluded:
            if ftype not in feature_types:
                feature_types[ftype] = Counter()
            feature_counts[ftype] += 1
            if fname is not None:
                name_type[fname] = ftype
            if "Parent" in feature.attr:
                features_with_parents.append(feature)
            else:
                feature_types[ftype]["parent unspecified"] += 1

    printer.write("Sorting parents...")
    c = 0
    for feature in features_with_parents:
        if c % 10000 == 0:
            printer.write("Processed %s parents..." % c)
        c += 1
        pnames = feature.attr["Parent"]
        ftype = feature.attr["type"]
        if pnames == "":
            feature_types[ftype]["parent unspecified"] += 1
        else:
            if len(pnames) > 1:
                feature_types[ftype]["multiple parents"] += 1
            else:
                ptype = name_type.get(pnames[0], "parent not in database")
                feature_types[ftype][ptype] += 1

    rows = sorted(feature_types.keys())
    cols = rows + [
        "parent unspecified", "parent not in database", "multiple parents"
    ]

    with argsopener(args.outfile, args, "w") as fh:
        printer.write("Writing %s..." % args.outfile)
        header = "#feature_type\tcount\t" + "\t".join(cols) + "\n"
        fh.write(header)
        for r in rows:
            sout = "%s\t%s" % (r, feature_counts[r])
            for i in cols:
                sout += "\t%s" % feature_types[r].get(i, 0)
            fh.write("%s\n" % sout)

    printer.write("Done.")
Code Example #8
File: psite.py  Project: zzygyx9119/plastid
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directrly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AlignmentParser(allow_mapping=False,input_choices=["BAM"],
                         disabled=["normalize","big_genome",])
    bp = BaseParser()
    alignment_file_parser = ap.get_parser()
    base_parser = bp.get_parser()
    
    pp = PlottingParser()
    plotting_parser = pp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,
                                              alignment_file_parser,
                                              plotting_parser])
    
    parser.add_argument("--min_counts",type=int,default=10,metavar="N",
                         help="Minimum counts required in normalization region "+
                              "to be included in metagene average (Default: 10)")
    parser.add_argument("--normalize_over",type=int,nargs=2,metavar="N",
                         default=None,
                         #default=(20,50),
                         help="Portion of each window against which its individual raw count profile"+
                              " will be normalized. Specify two integers, in nucleotide"+
                              " distance from landmark (negative for upstream, positive for downstream. Surround negative numbers with quotes.). (Default: 20 50)")
    parser.add_argument("--norm_region",type=int,nargs=2,metavar="N",
                         default=None,
                         help="Deprecated. Use ``--normalize_over`` instead. "+
                              "Formerly, Portion of each window against which its individual raw count profile"+
                              " will be normalized. Specify two integers, in nucleotide"+
                              " distance, from 5\' end of window. (Default: 70 100)")
    parser.add_argument("--require_upstream",default=False,action="store_true",
                        help="If supplied, the P-site offset is taken to be the distance "+
                             "between the largest peak upstream of the start codon and "+
                             "the start codon itself. Otherwise, the P-site offset is taken "+
                             "to be the distance between the largest peak in the entire ROI "+
                             "and the start codon. Ignored if ``--constrain`` is used."
                        )
    parser.add_argument("--constrain",type=int,nargs=2,default=None,metavar="X",
                        help="Constrain P-site offset to be between specified distance from "+
                             "start codon. Useful for noisy data. "+
                             "(Reasonable set: 10 15; default: not constrained)")
    parser.add_argument("--aggregate",default=False,action="store_true",
                        help="Estimate P-site from aggregate reads at each position, instead "+
                             "of median normalized read density. Noisier, but helpful for "+
                             "lower-count data or read lengths with few counts. (Default: False)"
                        )
    parser.add_argument("--keep",default=False,action="store_true",
                        help="Save intermediate count files. Useful for additional computations (Default: False)")
    parser.add_argument("--default",type=int,default=13,
                        help="Default 5\' P-site offset for read lengths that are not present or evaluated in the dataset. Unaffected by ``--constrain`` (Default: 13)")

    parser.add_argument("roi_file",type=str,
                        help="ROI file surrounding start codons, from ``metagene generate`` subprogram")
    
    parser.add_argument("outbase",type=str,help="Basename for output files")
    
    # set manual options
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    # set defaults
    args.mapping = "fiveprime"
    args.offset  = 0
    args.nibble  = 0

    
    # process arguments
    min_len = args.min_length
    max_len = args.max_length
    profiles = max_len + 1 - min_len
    lengths = list(range(min_len,max_len+1))
    outbase = args.outbase
    title  = "Fiveprime read offsets by length" if args.title is None else args.title
    
    pp.set_style_from_args(args)
    colors = pp.get_colors_from_args(args,profiles)
 
    printer.write("Opening ROI file %s ..." % args.roi_file)
    with opener(args.roi_file) as roi_fh:
        roi_table = pd.read_table(roi_fh,sep="\t",comment="#",index_col=None,header=0)
        
    printer.write("Opening count files %s ..." % ",".join(args.count_files))
    ga = ap.get_genome_array_from_args(args,printer=printer)

    
    # remove default size filters
    my_filters = list(ga._filters.keys()) # copy keys; removing filters mutates the dict
    for f in my_filters:
        ga.remove_filter(f)

    norm_start, norm_end = _get_norm_region(roi_table,args)
    
    # count
    count_dict, norm_count_dict, metagene_profile = do_count(roi_table,
                                                             ga,
                                                             norm_start,
                                                             norm_end,
                                                             args.min_counts,
                                                             min_len,
                                                             max_len,
                                                             aggregate=args.aggregate,
                                                             printer=printer)
    
    # save counts
    profile_fn = "%s_metagene_profiles.txt" % outbase
    with argsopener(profile_fn,args,"w") as metagene_out:
        metagene_profile.to_csv(metagene_out,
                                sep="\t",
                                header=True,
                                index=False,
                                na_rep="nan",
                                columns=["x"]+["%s-mers" % X for X in lengths])

    if args.keep == True:
        printer.write("Saving raw and normalized counts ...")
        for k in count_dict:
            count_fn     = "%s_%s_rawcounts.txt.gz"  % (outbase,k)
            normcount_fn = "%s_%s_normcounts.txt.gz" % (outbase,k)
            mask_fn      = "%s_%s_mask.txt.gz" % (outbase,k)
            numpy.savetxt(count_fn,count_dict[k],delimiter="\t")
            numpy.savetxt(normcount_fn,norm_count_dict[k],delimiter="\t")
            numpy.savetxt(mask_fn,norm_count_dict[k].mask,delimiter="\t")
    
    # plotting & offsets
    printer.write("Plotting and determining offsets ...")
    offset_dict = OrderedDict() 

    # Determine scaling factor for plotting metagene profiles
    max_y = numpy.nan 
    with warnings.catch_warnings():
        # ignore warnings for slices that contain only NaNs
        warnings.simplefilter("ignore",category=RuntimeWarning)
        for k in lengths:
            max_y = numpy.nanmax([max_y,
                                  numpy.nanmax(metagene_profile["%s-mers"% k].values)])

    if numpy.isnan(max_y) or max_y == 0:
        max_y = 1.0


    # parse arguments & set styles
    mplrc = matplotlib.rcParams
    plt_incr  = 1.2

    # use this figsize if not specified on command line
    figheight = 1.0 + 0.25*(profiles-1) + 0.75*(profiles)
    default_figsize = (7.5,figheight)

    fig = pp.get_figure_from_args(args,figsize=default_figsize)

    ax = plt.gca()
    plt.title(title)
    plt.xlabel("Distance from CDS start, (nt; 5' end mapping)")
    if args.aggregate == True:
        plt.ylabel("Aggregate read counts (au)")
    else:
        plt.ylabel("Median normalized read density (au)")
        
    plt.axvline(0.0,color=mplrc["axes.edgecolor"],dashes=[3,2])

    x = metagene_profile["x"].values
    xmin = x.min()
    xmax = x.max()
    
    if args.constrain is not None:
        mask = numpy.tile(True,len(x))
        
        zp = (x==0).argmax()
        l,r = args.constrain
        if l == r:
            warnings.warn("Minimum and maximum distance constraints are equal (both '%s'). This is silly." % l,ArgumentWarning)
            
        mindist = min(l,r)
        maxdist = max(l,r)
        
        mask[zp-maxdist:zp-mindist+1] = False
    elif args.require_upstream == True:
        mask = x >= 0
    else:
        mask = numpy.tile(False,len(x))

    for n,k in enumerate(lengths):
        color = colors[n]
        baseline = plt_incr*n
        y = metagene_profile["%s-mers" % k].values
        #ymask = y[mask]
        ymask = numpy.ma.MaskedArray(y,mask=mask)

        if numpy.isnan(y).all():
            plot_y = numpy.zeros_like(x)
        else:
            if args.aggregate == False:
                plot_y = y / max_y
            else:
                plot_y = y.astype(float) / numpy.nanmax(y) * 0.9
 
        # plot metagene profiles on common scale, offset by baseline from bottom to top
        ax.plot(x,baseline + plot_y,color=color)
        ax.text(xmin,baseline,"%s-mers" % k,
                ha="left",
                va="bottom",
                color=color,
                transform=matplotlib.transforms.offset_copy(ax.transData,fig,
                                                            x=6.0,y=3.0,units="points"))

        ymax = baseline + numpy.nanmax(plot_y)

        # if all valid positions are nan, or if all valid positions are <= 0
        if (~mask).sum() == numpy.isnan(ymask).sum() or numpy.nanmax(ymask) == 0:
            offset = args.default
            usedefault = True
        else:
            offset = -x[numpy.ma.argmax(ymask)]
            usedefault = False

        offset_dict[k] = offset
        if usedefault == False:
            yadj = ymax - 0.2 * plt_incr

            ax.plot([-offset,0],[yadj,yadj],color=color,dashes=[3,2])
            ax.text(-offset / 2.0,
                     yadj,
                     "%s nt" % (offset),
                     color=color,
                     ha="center",
                     va="bottom",
                     transform=matplotlib.transforms.offset_copy(ax.transData,fig,
                                                                 x=0.0,y=3.0,units="points")
                    )   

    plt.xlim(xmin,xmax)
    plt.ylim(-0.1,plt_incr+baseline)
    ax.yaxis.set_ticks([])

    # save data as p-site offset table
    fn = "%s_p_offsets.txt" % outbase
    fout = argsopener(fn,args)
    printer.write("Writing offset table to %s ..." % fn)
    fout.write("length\tp_offset\n")
    for k in offset_dict:
        fout.write("%s\t%s\n" % (k,offset_dict[k]))
    
    fout.write("default\t%s" % args.default)
    
    fout.close()

    # save plot
    plot_fn ="%s_p_offsets.%s" % (outbase,args.figformat) 
    printer.write("Saving plot to %s ..." % plot_fn)
    plt.savefig(plot_fn,dpi=args.dpi,bbox_inches="tight")

    printer.write("Done.")
Code Example #9
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(disabled=["normalize","big_genome","spliced_bowtie_files"],
                        input_choices=["BAM"])
    an = AnnotationParser()
    pp = PlottingParser()
    bp = BaseParser()
    plotting_parser = pp.get_parser()
    
    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     conflict_handler="resolve",
                                     parents=[base_parser,
                                              annotation_file_parser,
                                              alignment_file_parser,
                                              plotting_parser])
    
    parser.add_argument("roi_file",type=str,nargs="?",default=None,
                        help="Optional. ROI file of maximal spanning windows surrounding start codons, "+\
                             "from ``metagene generate`` subprogram. Using this instead of `--annotation_files` "+\
                             "prevents double-counting of codons when multiple transcript isoforms exist "+\
                             "for a gene. See the documentation for `metagene` for more info about ROI files."+\
                             "If an ROI file is not given, supply an annotation with ``--annotation_files``")
    parser.add_argument("outbase",type=str,help="Required. Basename for output files")
    parser.add_argument("--codon_buffer",type=int,default=5,
                        help="Codons before and after start codon to ignore (Default: 5)")


    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    pp.set_style_from_args(args)
    gnd = al.get_genome_array_from_args(args,printer=printer)

    read_lengths = list(range(args.min_length,args.max_length+1))
    codon_buffer = args.codon_buffer
    
    dtmp = { "read_length"   : numpy.array(read_lengths),
             "reads_counted" : numpy.zeros_like(read_lengths,dtype=int),
            }

    if args.roi_file is not None:
        using_roi = True
        roi_table = read_pl_table(args.roi_file)
        regions = roi_table.iterrows()
        transform_fn = roi_row_to_cds
        back_buffer = -1
        if len(args.annotation_files) > 0:
            warnings.warn("If an ROI file is given, annotation files are ignored. Pulling regions from '%s'. Ignoring '%s'" % (args.roi_file,
                                                                                                                               ", ".join(args.annotation_files)),
                          ArgumentWarning)
    else:
        using_roi = False
        if len(args.annotation_files) == 0:
            printer.write("Either an ROI file or at least annotation file must be given.")
            sys.exit(1)
        else:
            warnings.warn("Using a transcript annotation file instead of an ROI file can lead to double-counting of codons if the annotation contains multiple transcripts per gene.",
                          ArgumentWarning)        
            regions = an.get_transcripts_from_args(args,printer=printer)
            back_buffer  = -codon_buffer
            transform_fn = lambda x: x.get_cds()
    
    phase_sums = {}
    for k in read_lengths:
        phase_sums[k] = numpy.zeros(3)
    
    for n, roi in enumerate(regions):
        if n % 1000 == 1:
            printer.write("Counted %s ROIs ..." % n)
        
        # transformation needed to extract CDS from transcript or from ROI file window
        cds_part = transform_fn(roi)
        
        # only calculate for coding genes
        if len(cds_part) > 0:

            read_dict     = {}
            count_vectors = {}
            for k in read_lengths:
                read_dict[k]     = []
                count_vectors[k] = []
            
            # for each seg, fetch reads, sort them, and create individual count vectors
            for seg in cds_part:
                reads = gnd.get_reads(seg)
                for read in filter(lambda x: len(x.positions) in read_dict,reads):
                    read_dict[len(read.positions)].append(read)
    
                # map and sort by length
                for read_length in read_dict:
                    count_vector = list(gnd.map_fn(read_dict[read_length],seg)[1])
                    count_vectors[read_length].extend(count_vector)
            
            # add each count vector for each length to total
            for k, vec in count_vectors.items():
                counts = numpy.array(vec)
                if cds_part.strand == "-":
                    counts = counts[::-1]
               
                if len(counts) % 3 == 0:
                    counts = counts.reshape((len(counts)//3,3)) # floor division: reshape needs ints
                else:
                    if using_roi == False:
                        message = "Length of '%s' coding region (%s nt) is not divisible by 3. Ignoring last partial codon." % (roi.get_name(),len(counts))
                        warnings.warn(message,DataWarning)
                    newlen = len(counts)//3
                    counts = counts[:3*newlen]
                    counts = counts.reshape(newlen,3)
    
                phase_sums[k] += counts[codon_buffer:back_buffer,:].sum(0)

    printer.write("Counted %s ROIs total." % (n+1))
    for k in dtmp:
        dtmp[k] = numpy.array(dtmp[k])
    
    # total reads counted for each size    
    for k in read_lengths:
        dtmp["reads_counted"][dtmp["read_length"] == k] = phase_sums[k].sum() 
    
    # read length distribution
    dtmp["fraction_reads_counted"] = dtmp["reads_counted"].astype(float) / dtmp["reads_counted"].sum() 
    
    # phase vectors
    phase_vectors = { K : V.astype(float)/V.astype(float).sum() for K,V in phase_sums.items() }
    for i in range(3):
        dtmp["phase%s" % i] = numpy.zeros(len(dtmp["read_length"])) 

    for k, vec in phase_vectors.items():
        for i in range(3):
            dtmp["phase%s" % i][dtmp["read_length"] == k] = vec[i]
    
    # phase table
    fn = "%s_phasing.txt" % args.outbase
    printer.write("Saving phasing table to %s ..." % fn)
    dtmp = pd.DataFrame(dtmp)
    with argsopener(fn,args) as fh:
        dtmp.to_csv(fh,columns=["read_length",
                                "reads_counted",
                                "fraction_reads_counted",
                                "phase0",
                                "phase1",
                                "phase2",
                                ],
                       float_format="%.6f",
                       na_rep="nan",
                       sep="\t",
                       index=False,
                       header=True
                      )
    
    fig = {}
    if args.figsize is not None:
        fig["figsize"] = tuple(args.figsize)

    colors = pp.get_colors_from_args(args,len(read_lengths))

    fn = "%s_phasing.%s" % (args.outbase,args.figformat)
    printer.write("Plotting to %s ..." % fn)
    plot_counts = numpy.vstack([V for (_,V) in sorted(phase_sums.items())])
    fig, (ax1,_) = phase_plot(plot_counts,labels=read_lengths,lighten_by=0.3,
                                cmap=None,color=colors,fig=fig)

    if args.title is not None:
        ax1.set_title(args.title)
    else:
        ax1.set_title("Phasing stats for %s" % args.outbase)

    fig.savefig(fn,dpi=args.dpi,bbox_inches="tight")
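
The heart of the computation above: reshape each in-frame count vector into one row per codon, sum down the columns to get read density at each of the three codon positions, and normalize to fractions (as `phase_vectors` does). A toy version:

import numpy

# 5'-end counts across three codons (length must be a multiple of 3)
counts = numpy.array([9, 1, 0, 8, 2, 0, 7, 3, 0])

by_codon = counts.reshape((len(counts) // 3, 3))   # one row per codon
phase = by_codon.sum(0) / by_codon.sum()           # fraction at each codon position

assert numpy.allclose(phase, [0.8, 0.2, 0.0])      # strongly phased toward position 0
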
Code Example #10
File: slidejuncs.py  Project: zzygyx9119/plastid
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()
    
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),sp.get_parser(),mp.get_parser()],
                                     )
    parser.add_argument("--maxslide",type=int,default=10,
                        help="Maximum number of nt to search 5\' and 3\' of intron"+
                             " boundaries (Default: 10)")
    parser.add_argument("--ref",type=str,metavar="ref.bed",default=None,
                        help="Reference file describing known splice junctions")
    parser.add_argument("--slide_canonical",action="store_true",default=False,
                        help="Slide junctions to canonical junctions if present within equal support region")
    parser.add_argument("infile",type=str,metavar="input.bed",
                        help="BED file describing discovered junctions")
    parser.add_argument("outbase",type=str,
                        help="Basename for output files")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    
    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)
    
    # load crossmap    
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        known_hash = GenomeHash(list(BED_Reader(open(args.ref))),do_copy=False)
    else:
        known_hash = GenomeHash()

    # set up variables    
    canonicals_plus = [("GT","AG"),
                       ("GC","AG")
                      ]
    
    canonicals_minus = [("CT","AC"),
                        ("CT","GC")
                       ]
    
    known_in_range     = 0
    canonical_in_range = 0
    repetitive         = 0
    untouched          = 0
    c = 0
    
    seen_already = []

    outfiles = {
                 "repetitive" : "%s_repetitive.bed" % args.outbase,
                 "known"      : "%s_shifted_known.bed" % args.outbase,
                 "canonical"  : "%s_shifted_canonical.bed" % args.outbase,
                 "untouched"  : "%s_untouched.bed" % args.outbase,
                }
    outfiles = { K : argsopener(V,args,"w") for K,V in outfiles.items() }

    # process data
    printer.write("Opening junctions from %s..." % args.infile)
    for ivc in BED_Reader(CommentReader(opener(args.infile))):
        processed = False
        tup = None

        if c % 1000 == 0 and c > 0:
            printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                    (c, known_in_range, canonical_in_range, repetitive, untouched))
                   
        assert len(ivc) == 2
        strand = ivc.strand
        
        minus_range, plus_range = find_match_range(ivc,genome,args.maxslide)
        
        # see if either end of splice junction +- match_range lands in repetitive areas of genome
        if covered_by_repetitive(ivc,minus_range,plus_range,cross_hash):
            repetitive += 1
            outfiles["repetitive"].write(ivc.as_bed())
            processed = True

        # see if one or more known junctions in range
        if processed == False and args.ref is not None:
            # find_known_in_range(query_ivc,minus_range,plus_range,knownjunctions)
            known_juncs = find_known_in_range(ivc,minus_range,plus_range,known_hash.get_nearby_features(ivc))
            if len(known_juncs) > 0:
                known_in_range += 1
                for my_known in known_juncs:
                    tup = get_junction_tuple(my_known)
                    if tup not in seen_already:
                        outfiles["known"].write(my_known.as_bed())
                        seen_already.append(tup)
                    
                processed = True
            
        # see if one or more canonical junctions in range
        if processed == False and args.slide_canonical == True:
            canonicals = canonicals_plus if strand == "+" else canonicals_minus
            #find_canonicals_in_range(query_ivc,minus_range,plus_range,genome,canonicals)
            canonical_juncs = find_canonicals_in_range(ivc,minus_range,plus_range,genome,canonicals)
            if len(canonical_juncs) > 0:
                canonical_in_range += 1
                for can in canonical_juncs:
                    tup = get_junction_tuple(can)
                    if tup not in seen_already:
                        outfiles["canonical"].write(can.as_bed())
                        seen_already.append(tup)

                processed = True
                    
        if processed == False:
            outfiles["untouched"].write(ivc.as_bed())
            untouched += 1
            
        c += 1

    # save output
    printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
            (c, known_in_range, canonical_in_range, repetitive, untouched))    

    for v in outfiles.values():
        v.close()
    
    printer.write("Done.")
Code Example #11
File: cs.py  Project: zzygyx9119/plastid
def do_chart(args, plot_parser):
    """Produce a set of charts comparing multiple samples pairwise.
    
    Charts include histograms of log2 fold changes and scatter plots with
    correlation coefficients, both generated for raw count and RPKM data.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``chart`` subprogram       
    """
    plot_parser.set_style_from_args(args)

    outbase = args.outbase
    bins = numpy.array(args.bins)
    figformat = args.figformat

    # read input files
    printer.write("Reading input files: %s ..." % ", ".join(args.infiles))

    samples = { get_short_samplename(X) : read_count_file(opener(X),args.list_of_regions)\
                                          for X in args.infiles }

    # Define some variables for later
    sample_names = sorted(samples.keys())
    comparisons = [
        X for X in itertools.combinations(sample_names, 2) if X[0] != X[1]
    ]
    colors = plot_parser.get_colors_from_args(args, len(comparisons))
    binkeys = tuple(["%s_reads" % k for k in args.regions])

    comparison_labels = sorted(["%s_vs_%s" % (X, Y) for X, Y in comparisons])
    bigtable = {}.fromkeys(comparison_labels)
    corrcoef_by_bin_table = {}.fromkeys(comparison_labels)

    # ki, kj = names of samples i and j
    # vi, vj = data of samples i and j
    for ki, kj in comparisons:
        try:
            assert (samples[ki]["region"] == samples[kj]["region"]).all()
        except AssertionError:
            printer.write(
                "Mismatched line entries for samples %s and %s. Were different gene lists used for counting?"
                % (ki, kj))

        vi = samples[ki]
        vj = samples[kj]
        printer.write("Comparing %s to %s:" % (ki, kj))

        label = "%s_vs_%s" % (ki, kj)

        bigtable[label] = {}
        corrcoef_by_bin_table[label] = {}

        for binkey in binkeys:
            bigtable[label][binkey] = {
                K: copy.deepcopy({})
                for K in args.metrics
            }
            corrcoef_by_bin_table[label][binkey] = {
                K: copy.deepcopy({})
                for K in args.metrics
            }

        for region in args.regions:
            region_counts = "%s_reads" % region
            count_mask = (vi[region_counts].values + vj[region_counts].values
                          >= 128)

            for metric in args.metrics:
                region_metric = "%s_%s" % (region, metric)
                printer.write("    -%s across %s for all >=128 ..." %
                              (metric, region))

                # divide into regions >=128 and <128 and plot
                viover = vi[region_metric][count_mask].values
                vjover = vj[region_metric][count_mask].values

                viunder = vi[region_metric][~count_mask].values
                vjunder = vj[region_metric][~count_mask].values

                pearsonr = scipy.stats.pearsonr(viover, vjover)[0]
                spearmanr = scipy.stats.spearmanr(viover, vjover)[0]

                # log2 fold change stats
                log2_ratios = numpy.log2(
                    numpy.ma.masked_invalid(vjover / viover))
                log2_mean = log2_ratios.mean()
                log2_std = log2_ratios.std()
                min_diff = 2**(3 * log2_std)
                num_genes_log2 = (~log2_ratios.mask).sum()

                # ma plot
                try:
                    ma_title = "%s vs %s (%s %s)" % (ki, kj, region, metric)
                    fig = plot_parser.get_figure_from_args(args)
                    _, axdict = ma_plot(viunder,
                                        vjunder,
                                        color=process_black,
                                        label="< 128 counts",
                                        axes=plt.gca(),
                                        kdalpha=0.2,
                                        title=ma_title)
                    _, _ = ma_plot(viover,
                                   vjover,
                                   axes=axdict,
                                   label=">= 128 counts",
                                   kdalpha=0.8)

                    mainax = axdict["main"]
                    mainax.set_xlabel("%s %s" % (ki, metric))
                    mainax.legend(loc="upper right", frameon=False)

                    text_kwargs = {
                        "horizontalalignment": "right",
                        "verticalalignment": "baseline",
                        "linespacing": 1.6,
                        "transform": mainax.transAxes,
                    }

                    plot_text = "\n".join([
                        "sample mean: %0.3e" % log2_mean,
                        "sample stdev: %0.3e" % log2_std,
                        "3-sigma fold change: %0.2f" % min_diff,
                        "regions_counted: %s" % count_mask.sum(),
                    ])

                    mainax.text(0.96, 0.04, plot_text, **text_kwargs)

                    plt.savefig("%s_ma_%s_%s_%s.%s" %
                                (outbase, label, region, metric, figformat),
                                bbox_inches="tight")
                    plt.close()
                except Exception as e:
                    warnings.warn(
                        "Could not make MA plot for samples %s and %s. Error message:\n    %s"
                        % (ki, kj, e), DataWarning)

                # scatter plot
                try:
                    scatter_title = "%s vs %s (%s %s)" % (ki, kj, region,
                                                          metric)
                    do_scatter(vi[region_metric].values,
                               vj[region_metric].values,
                               count_mask,
                               plot_parser,
                               args,
                               pearsonr=pearsonr,
                               xlabel=ki,
                               ylabel=kj,
                               title=scatter_title)

                    plt.savefig("%s_scatter_%s_%s_%s.%s" %
                                (outbase, label, region, metric, figformat),
                                bbox_inches="tight")
                    plt.close()
                except ValueError as e:
                    warnings.warn(
                        "Could not make scatter plot for samples %s and %s. Error message:\n    %s"
                        % (ki, kj, e), DataWarning)

                # TODO: make these tables into dataframes. this scheme is insane

                # add entries to bigtable for export later
                bigtable[label][region_counts][metric]["pearsonr"] = pearsonr
                bigtable[label][region_counts][metric]["spearmanr"] = spearmanr
                bigtable[label][region_counts][metric]["log2_mean"] = log2_mean
                bigtable[label][region_counts][metric]["log2_std"] = log2_std
                bigtable[label][region_counts][metric]["2**3sigma"] = 2**(
                    3 * log2_std)
                bigtable[label][region_counts][metric][
                    "num_genes_128plus"] = count_mask.sum()
                bigtable[label][region_counts][metric][
                    "num_genes_log2"] = num_genes_log2

                # do bin-by-bin counting
                printer.write("    -%s across %s by bin ..." %
                              (metric, region))
                bin_masks = get_bin_mask_by_summed_key(vi,
                                                       vj,
                                                       bins=args.bins,
                                                       key=region_counts)
                for my_bin, bin_mask in sorted(bin_masks.items()):

                    bin_vec_i = vi[region_metric][bin_mask]
                    bin_vec_j = vj[region_metric][bin_mask]

                    # make sure there are genes in bin before attempting calculations
                    if len(bin_vec_i) > 0:
                        nonzero_binmask = get_nonzero_either_mask(
                            bin_vec_i, bin_vec_j)
                        bin_vi_log2 = bin_vec_i[nonzero_binmask]
                        bin_vj_log2 = bin_vec_j[nonzero_binmask]
                        my_logs = numpy.log2(bin_vj_log2 / bin_vi_log2)
                        my_logmean = numpy.mean(my_logs)
                        my_logstd = numpy.std(my_logs)
                        if len(bin_vec_i) > 2:
                            my_pearsonr = scipy.stats.pearsonr(
                                bin_vec_i, bin_vec_j)[0]
                            my_spearmanr = scipy.stats.spearmanr(
                                bin_vec_i, bin_vec_j)[0]
                        else:
                            my_spearmanr = numpy.nan
                            my_pearsonr = numpy.nan
                    else:
                        # fill with dummy values; nonzero_binmask must be
                        # defined here because it is read below for every bin
                        my_logs = numpy.array([])
                        nonzero_binmask = numpy.array([], dtype=bool)
                        my_logmean = numpy.nan
                        my_logstd = numpy.nan
                        my_spearmanr = numpy.nan
                        my_pearsonr = numpy.nan

                    corrcoef_by_bin_table[label][region_counts][metric][
                        my_bin] = {}
                    corrcoef_by_bin_table[label][region_counts][metric][
                        my_bin]["pearsonr"] = my_pearsonr
                    corrcoef_by_bin_table[label][region_counts][metric][
                        my_bin]["spearmanr"] = my_spearmanr
                    corrcoef_by_bin_table[label][region_counts][metric][
                        my_bin]["genes_in_bin"] = sum(bin_mask)
                    corrcoef_by_bin_table[label][region_counts][metric][
                        my_bin]["log2_genes_in_bin"] = sum(nonzero_binmask)
                    corrcoef_by_bin_table[label][region_counts][metric][
                        my_bin]["log2_mean"] = my_logmean
                    corrcoef_by_bin_table[label][region_counts][metric][
                        my_bin]["log2_std"] = my_logstd

    # export big (non-binned) table
    printer.write("Writing tables ...")

    bigtable_out = argsopener("%s_bigtable.txt" % args.outbase, args, "w")
    stats = (
        "num_genes_128plus",  # 0
        "pearsonr",  # 1
        "spearmanr",  # 2
        "num_genes_log2",  # 3
        "log2_mean",  # 4
        "log2_std",  # 5
        "2**3sigma",  # 6
    )

    header = ["#region", "metric", "statistic"]
    header += [X for X in comparison_labels]

    bigtable_out.write("\t".join(header) + "\n")
    for region in binkeys:
        for metric in args.metrics:
            for stat in stats:
                ltmp = [region, metric, stat]
                for key in comparison_labels:
                    ltmp.append(bigtable[key][region][metric][stat])
                ltmp = [str(X) for X in ltmp]
                bigtable_out.write("\t".join(ltmp) + "\n")

    bigtable_out.close()

    # export binned data table and make bin-by-bin plots
    bintable_out = argsopener("%s_bintable.txt" % args.outbase, args, "w")

    region_metrics = [
        "%s_%s" % (X, Y) for X in args.regions for Y in args.metrics
    ]
    stats = [
        "genes_in_bin",  # 0
        "pearsonr",  # 1
        "spearmanr",  # 2
        "",  # 3
        "log2_genes_in_bin",  # 4
        "log2_mean",  # 5
        "log2_std",  # 6
        ""
    ]  # 7
    for region_metric in region_metrics:
        plt.close()
        fig = plot_parser.get_figure_from_args(args)
        plt.semilogx(basex=2)
        plt.title("Correlation coefficients by bin for %s" % region_metric)
        plt.xlabel("Bin")
        plt.ylabel("Spearman rho")

        region, metric = region_metric.split("_")
        region_counts = "%s_reads" % region
        bintable_out.write("%s\n\n" % region_metric)

        try:
            for n, label in enumerate(comparison_labels):
                corrcoefs = []
                bintable_out.write("%s\t\t\t\t\t\t\t\t" % label)
                bintable_out.write("\n")
                bintable_out.write("bin\t" + "\t".join(stats) + "\n")
                for my_bin in bins:
                    ltmp = [my_bin]
                    for stat in stats:
                        ltmp.append(corrcoef_by_bin_table[label][region_counts]
                                    [metric][my_bin].get(stat, ""))
                    ltmp = [str(X) for X in ltmp]
                    bintable_out.write("\t".join(ltmp) + "\n\n")
                    corrcoefs.append(
                        corrcoef_by_bin_table[label][region_counts][metric]
                        [my_bin]["spearmanr"])
                corrcoefs = ma.masked_invalid(corrcoefs)
                plt.plot(bins[~corrcoefs.mask],
                         corrcoefs[~corrcoefs.mask],
                         label=label,
                         color=colors[n])
            plt.legend(loc="lower right")
            plt.savefig("%s_corrcoef_by_bin_%s_%s.%s" %
                        (outbase, region_metric, label, figformat),
                        bbox_inches="tight")
        except Exception as e:
            warnings.warn(
                "Could not plot correlation-by-bin plot for '%s', '%s': %s" %
                (outbase, region_metric, e), DataWarning)

        bintable_out.write("\n\n")
    bintable_out.close()

    printer.write("Done.")
Code example #12
File: cs.py  Project: zzygyx9119/plastid
def do_generate(args, annotation_parser, mask_parser):
    """Generate gene position files from gene annotations.
    
     1. Genes whose transcripts share exons are first collapsed into merged
        genes.
        
     2. Within merged genes, all positions are classified. All positions are
        included in a set called *exon*. All positions that appear as coding
        regions in all transcripts (i.e. are never part of a 5' UTR or 3' UTR)
        are included in a set called *CDS*. Similarly, all positions that
        appear as 5' UTR or 3' UTR in all transcripts are included in sets
        called *UTR5* or *UTR3*, respectively. (A set-based sketch of this
        classification follows this listing.)
    
     3. Genomic positions that are overlapped by multiple merged genes are
        excluded from the position sets for those genes.
    
     4. If a :term:`mask file` is supplied, positions annotated in the mask file
        are also excluded.
    
     5. Output is given as a series of `BED`_ files and a `positions` file
        containing the same data.
    
    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``generate`` subprogram
    """
    # variables for transcript <-> merged gene mapping
    transcripts = {}
    merged_genes = {}

    # data table for merged genes
    gene_table = pd.DataFrame({
        "region": [],
        "transcript_ids": [],
        "exon_unmasked": [],
        "exon": [],
        "masked": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked_bed": [],
    })

    # data table for transcripts
    transcript_table = pd.DataFrame({
        "region": [],
        "exon": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked": [],
        "exon_unmasked": [],
        "transcript_ids": [],
        "masked_bed": [],
    })

    # data
    is_sorted = args.sorted \
                or args.tabix \
                or (args.annotation_format == "BigBed")

    annotation_message = """`cs` relies upon relationships between
    transcripts and genes to collapse transcripts to genes for quantitation.
    Gene-transcript relationships are not generally preserved in BED or BigBed
    files, and a `gene_id` column could not be found in the input data. This
    may yield nonsensical results in the output.

    Consider either (1) using a GTF2 or GFF3 file or (2) creating an extended
    BED or BigBed file with a `gene_id` column.""".replace("    ", "").replace(
        "\n", " ")

    if args.annotation_format == "BED":
        if not isinstance(args.bed_extra_columns,
                          list) or 'gene_id' not in args.bed_extra_columns:
            warnings.warn(annotation_message, FileFormatWarning)
    elif args.annotation_format == "BigBed":
        reader = BigBedReader(args.annotation_files[0])
        if 'gene_id' not in reader.extension_fields:
            warnings.warn(annotation_message, FileFormatWarning)

    source = annotation_parser.get_transcripts_from_args(args, printer=printer)
    mask_hash = mask_parser.get_genome_hash_from_args(args)

    # loop conditions
    last_chrom = None
    do_loop = True

    # to save memory, if the input file is sorted we process one chromosome at
    # a time, since at a chromosome boundary all of its transcripts are assembled
    while do_loop:
        try:
            tx = next(source)
        except StopIteration:
            do_loop = False

        try:
            # if chromosome is completely processed or EOF
            if (is_sorted and tx.spanning_segment.chrom != last_chrom) \
                    or not do_loop:
                if do_loop:
                    source = itertools.chain([tx], source)

                if last_chrom is not None or not do_loop:
                    printer.write("Merging genes on chromosome/contig '%s'" %
                                  last_chrom)
                    my_gene_table, my_transcript_table, my_merged_genes = process_partial_group(
                        transcripts, mask_hash, printer)
                    gene_table = pd.concat((gene_table, my_gene_table), axis=0)
                    transcript_table = pd.concat(
                        (transcript_table, my_transcript_table), axis=0)
                    merged_genes.update(my_merged_genes)

                del transcripts
                gc.collect()
                del gc.garbage[:]
                transcripts = {}

                # reset last chrom
                last_chrom = tx.spanning_segment.chrom

            # otherwise, remember transcript
            else:
                transcripts[tx.get_name()] = tx

        # exit gracefully if no transcripts found
        except UnboundLocalError:
            pass

    # write output
    printer.write("Writing output ...")

    merged_fn = "%s_merged.txt" % args.outbase
    number_merged = len(set(merged_genes.values()))
    printer.write("Collapsed %s genes to %s merged groups. Writing to %s" %
                  (len(merged_genes), number_merged, merged_fn))
    fout = argsopener(merged_fn, args, "w")
    for gene, merged_name in sorted(merged_genes.items()):
        fout.write("%s\t%s\n" % (gene, merged_name))

    fout.close()

    printer.write("Writing gene table and BED files ...")
    write_output_files(gene_table, "gene", args)
    printer.write("Writing transcript summary table and BED files ...")
    write_output_files(transcript_table, "transcript", args)
    printer.write("Done!")
Code example #13
File: crossmap.py  Project: zzygyx9119/plastid
def chrom_worker(chrom_seq, args=None):
    name, seq_or_kmers = chrom_seq
    printer.write("Processing chromosome %s..." % name)
    base = "%s_%s_%s_%s" % (args.outbase, args.read_length, args.mismatches,
                            name)
    kmer_file = "%s_kmers.fa" % base
    toomany_file = "%s_multimap.fa" % base
    bed_file = "%s_crossmap.bed" % base

    if not args.have_kmers:
        # simulate k-mer reads only when they were not supplied directly;
        # the `with` block closes the file on exit, so no explicit close() is needed
        with open(kmer_file, "w") as kmer:
            simulate_reads(seq_or_kmers, kmer, args.read_length)
    else:
        kmer_file = seq_or_kmers

    argdict = {
        "mismatches": args.mismatches,
        "processors": 1,
        "bowtie": args.bowtie,
        "toomany": toomany_file,
        "kmers": kmer_file,
        "ebwt": args.ebwt,
        "null": os.devnull,
    }

    cmd = "%(bowtie)s -m1 -a --best -f -v %(mismatches)s -p %(processors)s %(ebwt)s %(kmers)s --max %(toomany)s >%(null)s" % argdict

    printer.write("Aligning %s-mers for chromosome '%s' :\n\t'%s'" %
                  (args.read_length, name, cmd))
    try:
        retcode = subprocess.call(cmd, shell=True)
        if retcode < 0 or retcode == 2:
            printer.write(
                "Alignment for chromosome '%s' terminated with status %s" %
                (name, retcode))
        else:
            if os.path.exists(toomany_file):
                printer.write(
                    "Assembling multimappers from chromosome '%s' into crossmap..."
                    % name)
                with argsopener(bed_file, args, "w") as bed_out:
                    for plus_chain, minus_chain in fa_to_bed(
                            open(toomany_file),
                            args.read_length,
                            offset=args.offset):
                        bed_out.write(plus_chain.as_bed())
                        bed_out.write(minus_chain.as_bed())


            else:
                printer.write("Could not find multimapper source file '%s' ." %
                              toomany_file)
    except OSError as e:
        printer.write("Alignment failed for chromosome '%s': %s" % (name, e))

    printer.write("Cleaning up chromosome '%s'..." % name)
    os.remove(toomany_file)
    if not args.have_kmers and not args.save_kmers:
        os.remove(kmer_file)

    return bed_file
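
`simulate_reads` above tiles each chromosome sequence into overlapping k-mers
before alignment. A minimal sketch of that tiling, assuming a simple FASTA
output in which each record name encodes the 0-based start position; plastid's
actual record format may differ:

import io

def write_kmers(name, seq, k, out):
    # write every k-mer of `seq` as one FASTA record per start position
    for i in range(len(seq) - k + 1):
        out.write(">%s:%d\n%s\n" % (name, i, seq[i:i + k]))

buf = io.StringIO()
write_kmers("chrI", "ACGTACGT", 4, buf)
print(buf.getvalue().splitlines()[:4])  # ['>chrI:0', 'ACGT', '>chrI:1', 'CGTA']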
Code example #14
File: make_wiggle.py  Project: zzygyx9119/plastid
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AlignmentParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),ap.get_parser()])
    parser.add_argument("-o","--out",dest="outbase",type=str,required=True,
                        metavar="FILENAME",
                        help="Base name for output files")
    parser.add_argument("--window_size",default=100000,metavar="N",type=int,
                        help="Size of nucleotides to fetch at once for export. "+\
                             "Large values are faster but require more memory "+\
                             "(Default: 100000)")

    track_opts = parser.add_argument_group(title="Browser track options")
    track_opts.add_argument("--color",type=str,default=None,
                        help="An RGB hex string (`'#NNNNNN'`, `N` in `[0-9,A-F]`) specifying \
                              the track color.")
    track_opts.add_argument("-t","--track_name",dest="track_name",type=str,
                        help="Name to give browser track",
                        default=None)
    track_opts.add_argument("--output_format",choices=("bedgraph","variable_step"),
                        default="bedgraph",
                        help="Format of output file (Default: bedgraph)")

    args = parser.parse_args(argv)
    gnd  = ap.get_genome_array_from_args(args,printer=printer)
    bp.get_base_ops_from_args(args)
    
    if args.track_name is None:
        name = args.outbase
    else:
        name = args.track_name
    
    if args.color is not None:
        fw_color = rc_color = "%s,%s,%s" % tuple(get_rgb255(args.color))
    else:
        fw_color = rc_color = "0,0,0"
    
    if args.output_format == "bedgraph":
        outfn = gnd.to_bedgraph
    elif args.output_format == "variable_step":
        outfn = gnd.to_variable_step

    track_fw = "%s_fw.wig" % args.outbase
    track_rc = "%s_rc.wig" % args.outbase

    with argsopener(track_fw,args,"w") as fw_out:
        printer.write("Writing forward strand track to %s ..." % track_fw)
        outfn(fw_out,"%s_fw" % name,"+",window_size=args.window_size,color=fw_color,
                printer=printer)

    with argsopener(track_rc,args,"w") as rc_out:
        printer.write("Writing reverse strand track to %s ..." % track_rc)
        outfn(rc_out,"%s_rc" % name,"-",window_size=args.window_size,color=rc_color,
                printer=printer)
    
    printer.write("Done!")
Code example #15
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(
        disabled=["normalize", "big_genome", "spliced_bowtie_files"],
        input_choices=["BAM"])
    an = AnnotationParser()
    pp = PlottingParser()
    bp = BaseParser()
    plotting_parser = pp.get_parser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[
            base_parser, annotation_file_parser, alignment_file_parser,
            plotting_parser
        ])

    parser.add_argument("roi_file",type=str,nargs="?",default=None,
                        help="Optional. ROI file of maximal spanning windows surrounding start codons, "+\
                             "from ``metagene generate`` subprogram. Using this instead of `--annotation_files` "+\
                             "prevents double-counting of codons when multiple transcript isoforms exist "+\
                             "for a gene. See the documentation for `metagene` for more info about ROI files."+\
                             "If an ROI file is not given, supply an annotation with ``--annotation_files``")
    parser.add_argument("outbase",
                        type=str,
                        help="Required. Basename for output files")
    parser.add_argument(
        "--codon_buffer",
        type=int,
        default=5,
        help="Codons before and after start codon to ignore (Default: 5)")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    pp.set_style_from_args(args)
    gnd = al.get_genome_array_from_args(args, printer=printer)

    read_lengths = list(range(args.min_length, args.max_length + 1))
    codon_buffer = args.codon_buffer

    dtmp = {
        "read_length": numpy.array(read_lengths),
        "reads_counted": numpy.zeros_like(read_lengths, dtype=int),
    }

    if args.roi_file is not None:
        using_roi = True
        roi_table = read_pl_table(args.roi_file)
        regions = roi_table.iterrows()
        transform_fn = roi_row_to_cds
        back_buffer = -1
        if len(args.annotation_files) > 0:
            warnings.warn(
                "If an ROI file is given, annotation files are ignored. Pulling regions from '%s'. Ignoring '%s'"
                % (args.roi_file, ", ".join(args.annotation_files)),
                ArgumentWarning)
    else:
        using_roi = False
        if len(args.annotation_files) == 0:
            printer.write(
                "Either an ROI file or at least one annotation file must be given."
            )
            sys.exit(1)
        else:
            warnings.warn(
                "Using a transcript annotation file instead of an ROI file can lead to double-counting of codons if the annotation contains multiple transcripts per gene.",
                ArgumentWarning)
            regions = an.get_transcripts_from_args(args, printer=printer)
            back_buffer = -codon_buffer
            transform_fn = lambda x: x.get_cds()

    phase_sums = {}
    for k in read_lengths:
        phase_sums[k] = numpy.zeros(3)

    for n, roi in enumerate(regions):
        if n % 1000 == 1:
            printer.write("Counted %s ROIs ..." % n)

        # transformation needed to extract CDS from transcript or from ROI file window
        cds_part = transform_fn(roi)

        # only calculate for coding genes
        if len(cds_part) > 0:

            read_dict = {}
            count_vectors = {}
            for k in read_lengths:
                read_dict[k] = []
                count_vectors[k] = []

            # for each seg, fetch reads, sort them, and create individual count vectors
            for seg in cds_part:
                reads = gnd.get_reads(seg)
                for read in filter(lambda x: len(x.positions) in read_dict,
                                   reads):
                    read_dict[len(read.positions)].append(read)

                # map and sort by length
                for read_length in read_dict:
                    count_vector = list(
                        gnd.map_fn(read_dict[read_length], seg)[1])
                    count_vectors[read_length].extend(count_vector)

            # add each count vector for each length to total
            for k, vec in count_vectors.items():
                counts = numpy.array(vec)
                if cds_part.strand == "-":
                    counts = counts[::-1]

                if len(counts) % 3 == 0:
                    counts = counts.reshape((len(counts) // 3, 3))
                else:
                    if not using_roi:
                        message = "Length of '%s' coding region (%s nt) is not divisible by 3. Ignoring last partial codon." % (
                            roi.get_name(), len(counts))
                        warnings.warn(message, DataWarning)
                    newlen = len(counts) // 3
                    counts = counts[:3 * newlen]
                    counts = counts.reshape((newlen, 3))

                phase_sums[k] += counts[codon_buffer:back_buffer, :].sum(0)

    printer.write("Counted %s ROIs total." % (n + 1))
    for k in dtmp:
        dtmp[k] = numpy.array(dtmp[k])

    # total reads counted for each size
    for k in read_lengths:
        dtmp["reads_counted"][dtmp["read_length"] == k] = phase_sums[k].sum()

    # read length distribution
    dtmp["fraction_reads_counted"] = dtmp["reads_counted"].astype(
        float) / dtmp["reads_counted"].sum()

    # phase vectors
    phase_vectors = {
        K: V.astype(float) / V.astype(float).sum()
        for K, V in phase_sums.items()
    }
    for i in range(3):
        dtmp["phase%s" % i] = numpy.zeros(len(dtmp["read_length"]))

    for k, vec in phase_vectors.items():
        for i in range(3):
            dtmp["phase%s" % i][dtmp["read_length"] == k] = vec[i]

    # phase table
    fn = "%s_phasing.txt" % args.outbase
    printer.write("Saving phasing table to %s ..." % fn)
    dtmp = pd.DataFrame(dtmp)
    with argsopener(fn, args) as fh:
        dtmp.to_csv(fh,
                    columns=[
                        "read_length",
                        "reads_counted",
                        "fraction_reads_counted",
                        "phase0",
                        "phase1",
                        "phase2",
                    ],
                    float_format="%.6f",
                    na_rep="nan",
                    sep="\t",
                    index=False,
                    header=True)

    fig = {}
    if args.figsize is not None:
        fig["figsize"] = tuple(args.figsize)

    colors = pp.get_colors_from_args(args, len(read_lengths))

    fn = "%s_phasing.%s" % (args.outbase, args.figformat)
    printer.write("Plotting to %s ..." % fn)
    plot_counts = numpy.vstack([V for (_, V) in sorted(phase_sums.items())])
    fig, (ax1, _) = phase_plot(plot_counts,
                               labels=read_lengths,
                               lighten_by=0.3,
                               cmap=None,
                               color=colors,
                               fig=fig)

    if args.title is not None:
        ax1.set_title(args.title)
    else:
        ax1.set_title("Phasing stats for %s" % args.outbase)

    fig.savefig(fn, dpi=args.dpi, bbox_inches="tight")
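
The core of the phasing computation above is reshaping a codon-aligned count
vector into an (n_codons, 3) array and summing down the columns. A minimal
sketch with hypothetical counts:

import numpy

counts = numpy.array([5, 1, 0, 7, 2, 1, 4, 0, 0])  # three codons of counts
codons = counts.reshape((len(counts) // 3, 3))

phase_sum = codons.sum(axis=0)            # reads at codon positions 0, 1, 2
phase_frac = phase_sum / phase_sum.sum()  # fraction of reads in each phase
print(phase_sum)   # [16  3  1]
print(phase_frac)  # [0.8  0.15 0.05]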
Code example #16
def main(argv=sys.argv[1:]):
	"""Command-line program
	
	Parameters
	----------
	argv : list, optional
		A list of command-line arguments, which will be processed
		as if the script were called from the command line if
		:py:func:`main` is called directly.

		Default: `sys.argv[1:]`. The command-line arguments, if the script is
		invoked from the command line
	"""
	bp = BaseParser()
	parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
									 formatter_class=argparse.RawDescriptionHelpFormatter,
									 parents=[bp.get_parser()])
	parser.add_argument("--exclude",nargs="+",default=[],
					    help="Feature types to exclude from consideration")
	parser.add_argument("infile",metavar="infile.gff",type=str,
	                   help="Input GFF3 file")
	parser.add_argument("outfile",metavar="outfile.txt",type=str,
					   help="Name of output file")
	
	args = parser.parse_args(argv)
	bp.get_base_ops_from_args(args)
	excluded = set(args.exclude)

	fin = sys.stdin if args.infile == "-" else opener(args.infile)
	
	feature_counts        = Counter()
	features_with_parents = []
	feature_types         = {}
	name_type             = {}

	
	printer.write("Opening %s..." % args.infile)
	c = 0
	for feature in GFF3_Reader(fin,return_stopfeatures=False):
		if c % 10000 == 0:
			printer.write("Processed %s features..." % c)
		c += 1
		ftype = feature.attr["type"]
		fname = feature.get_name()
		if ftype not in excluded:
			if ftype not in feature_types:
				feature_types[ftype] = Counter()
			feature_counts[ftype] += 1
			if fname is not None:
				name_type[fname] = ftype
			if "Parent" in feature.attr:
				features_with_parents.append(feature)
			else:
				feature_types[ftype]["parent unspecified"] += 1
	
	printer.write("Sorting parents...")
	c = 0
	for feature in features_with_parents:
		if c % 10000 == 0:
			printer.write("Processed %s parents..." % c)
		c += 1
		pnames = feature.attr["Parent"]
		ftype = feature.attr["type"]
		if pnames == "":
			feature_types[ftype]["parent unspecified"] += 1
		else:
			if len(pnames) > 1:
				feature_types[ftype]["multiple parents"] += 1
			else:
				ptype = name_type.get(pnames[0],"parent not in database")
				feature_types[ftype][ptype] += 1

	rows = sorted(feature_types.keys())
	cols = rows + ["parent unspecified","parent not in database","multiple parents"]

	with argsopener(args.outfile,args,"w") as fh:
		printer.write("Writing %s..." % args.outfile)
		header = "#feature_type\tcount\t" + "\t".join(cols) + "\n"
		fh.write(header)
		for r in rows:
			sout = "%s\t%s" % (r, feature_counts[r])
			for i in cols:
				sout += "\t%s" % feature_types[r].get(i,0)
			fh.write("%s\n" % sout)

	printer.write("Done.")
Code example #17
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[
            base_parser, alignment_file_parser, annotation_file_parser,
            mask_file_parser
        ],
    )
    parser.add_argument("outfile", type=str, help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = ap.get_transcripts_from_args(args,
                                               printer=printer,
                                               return_type=SegmentChain)
    crossmap = mp.get_genome_hash_from_args(args, printer=printer)

    ga_sum = ga.sum()
    normconst = 1000.0 * 1e6 / ga_sum

    with argsopener(args.outfile, args, "w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write(
            "region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n"
        )
        for n, ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable(masks))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            rpnt = numpy.nan if length == 0 else float(counts) / length
            rpkm = numpy.nan if length == 0 else rpnt * normconst
            ltmp = [
                name,
                str(ivc),
                "%.8e" % counts,
                "%.8e" % rpnt,
                "%.8e" % rpkm,
                "%d" % length
            ]
            fout.write("%s\n" % "\t".join(ltmp))


    printer.write("Processed %s regions total." % n)

    printer.write("Done.")