def main(argv=sys.argv[1:]):
    """Command-line program

    For every region read from the annotation, apply any overlapping masks,
    then write one tab-delimited row containing the region name, the region
    itself, its masked counts, counts per nucleotide, RPKM, and masked length
    to the output file.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,
                                              alignment_file_parser,
                                              annotation_file_parser,
                                              mask_file_parser],
                                     )
    parser.add_argument("outfile", type=str, help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = ap.get_transcripts_from_args(args, printer=printer, return_type=SegmentChain)
    crossmap = mp.get_genome_hash_from_args(args, printer=printer)

    ga_sum = ga.sum()
    # Scale factor turning counts-per-nucleotide into counts per kilobase per
    # million total counts in the dataset.
    # NOTE(review): a dataset with zero total counts makes this division
    # degenerate -- confirm how `ga.sum()` behaves for empty datasets.
    normconst = 1000.0 * 1e6 / ga_sum

    # Number of regions written. Tracked separately from the loop index so the
    # final report is an actual count (index + 1) and is still defined when
    # the annotation yields no regions at all.
    total = 0

    with argsopener(args.outfile, args, "w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write("region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n")
        for n, ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            # Each overlapping feature is iterated to obtain the pieces that
            # are passed to add_masks().
            ivc.add_masks(*itertools.chain.from_iterable(masks))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            if length == 0:
                # Fully masked region: densities are undefined.
                rpnt = rpkm = numpy.nan
            else:
                rpnt = float(counts) / length
                rpkm = rpnt * normconst

            ltmp = [name,
                    str(ivc),
                    "%.8e" % counts,
                    "%.8e" % rpnt,
                    "%.8e" % rpkm,
                    "%d" % length]
            fout.write("%s\n" % "\t".join(ltmp))
            total = n + 1
        # No explicit close: leaving the `with` block closes the file.

    printer.write("Processed %s regions total." % total)

    printer.write("Done.")
def main(args=sys.argv[1:]):
    """Command-line program

    Save the masked count vector of each region of interest to its own text
    file inside the output folder, creating that folder if it does not exist.

    Parameters
    ----------
    args : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    alignment_opts = AlignmentParser()
    annotation_opts = AnnotationParser()
    mask_opts = MaskParser()
    base_opts = BaseParser()

    align_parent = alignment_opts.get_parser(conflict_handler="resolve")
    annot_parent = annotation_opts.get_parser(conflict_handler="resolve")
    mask_parent = mask_opts.get_parser()
    base_parent = base_opts.get_parser()

    cli = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[base_parent, align_parent, annot_parent, mask_parent],
    )
    cli.add_argument(
        "out_folder",
        type=str,
        help="Folder in which to save output vectors",
    )
    cli.add_argument(
        "--out_prefix",
        default="",
        type=str,
        help="Prefix to prepend to output files (default: no prefix)",
    )
    cli.add_argument(
        "--format",
        default="%.8f",
        type=str,
        help=r"printf-style format string for output (default: '%%.8f')",
    )

    opts = cli.parse_args(args)
    base_opts.get_base_ops_from_args(opts)

    # Make sure there is somewhere to put the output
    out_dir = opts.out_folder
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    # Load data sources described by the parsed options
    genome_array = alignment_opts.get_genome_array_from_args(opts, printer=printer)
    regions = annotation_opts.get_segmentchains_from_args(opts, printer=printer)
    mask_hash = mask_opts.get_genome_hash_from_args(opts, printer=printer)

    # One output file per region
    for idx, region in enumerate(regions):
        if not idx % 1000:
            printer.write("Processed %s regions of interest" % idx)

        out_name = "%s%s.txt" % (opts.out_prefix, region.get_name())
        out_path = os.path.join(out_dir, out_name)

        # Mask every segment of each feature that overlaps this region
        for masked_feature in mask_hash.get_overlapping_features(region):
            region.add_masks(*masked_feature.segments)

        numpy.savetxt(out_path, region.get_masked_counts(genome_array), fmt=opts.format)
Example #3
0
def main(args=sys.argv[1:]):
    """Command-line program

    For each region of interest, mask out overlapping masked features, fetch
    the region's masked counts, and save them with :func:`numpy.savetxt` to
    ``<out_folder>/<out_prefix><region name>.txt``.

    Parameters
    ----------
    args : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    aln_helper = AlignmentParser()
    ann_helper = AnnotationParser()
    mask_helper = MaskParser()
    base_helper = BaseParser()

    aln_parser = aln_helper.get_parser(conflict_handler="resolve")
    ann_parser = ann_helper.get_parser(conflict_handler="resolve")
    mask_parser = mask_helper.get_parser()
    common_parser = base_helper.get_parser()

    parser_kwargs = dict(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[common_parser, aln_parser, ann_parser, mask_parser],
    )
    arg_parser = argparse.ArgumentParser(**parser_kwargs)

    arg_parser.add_argument("out_folder", type=str,
                            help="Folder in which to save output vectors")
    arg_parser.add_argument("--out_prefix", default="", type=str,
                            help="Prefix to prepend to output files (default: no prefix)")
    arg_parser.add_argument("--format", default="%.8f", type=str,
                            help=r"printf-style format string for output (default: '%%.8f')")

    parsed = arg_parser.parse_args(args)
    base_helper.get_base_ops_from_args(parsed)

    # Create the destination folder on first use
    folder_exists = os.path.isdir(parsed.out_folder)
    if not folder_exists:
        os.mkdir(parsed.out_folder)

    # Resolve inputs from the parsed arguments
    counts_source = aln_helper.get_genome_array_from_args(parsed, printer=printer)
    chains = ann_helper.get_segmentchains_from_args(parsed, printer=printer)
    masked_regions = mask_helper.get_genome_hash_from_args(parsed, printer=printer)

    # Write one vector file per chain
    for position, chain in enumerate(chains):
        at_reporting_interval = (position % 1000 == 0)
        if at_reporting_interval:
            printer.write("Processed %s regions of interest" % position)

        destination = os.path.join(
            parsed.out_folder,
            "%s%s.txt" % (parsed.out_prefix, chain.get_name()),
        )

        # Apply the segments of every overlapping mask feature to this chain
        hits = masked_regions.get_overlapping_features(chain)
        for hit in hits:
            chain.add_masks(*hit.segments)

        vector = chain.get_masked_counts(counts_source)
        numpy.savetxt(destination, vector, fmt=parsed.format)
Example #4
0
def main(argv=sys.argv[1:]):
    """Command-line program

    Builds a parser with three subcommands and dispatches to the matching
    worker function:

    ``generate``
        calls :func:`do_generate` with the annotation and mask parsers
    ``count``
        calls :func:`do_count` with the alignment parser
    ``chart``
        calls :func:`do_chart` with the plotting parser

    If no subcommand is supplied, the program exits with a usage error
    instead of silently doing nothing.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(disabled=["normalize"])
    an = AnnotationParser()
    mp = MaskParser()
    pl = PlottingParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser()
    annotation_file_parser = an.get_parser()
    mask_file_parser = mp.get_parser()
    plotting_parser = pl.get_parser()
    base_parser = bp.get_parser()

    generator_help = "Create unambiguous position file from GFF3 annotation"
    generator_desc = format_module_docstring(do_generate.__doc__)

    counter_help = "Count reads in unambiguous gene positions"
    counter_desc = format_module_docstring(do_count.__doc__)

    chart_help = "Produce charts comparing reads between samples"
    chart_desc = format_module_docstring(do_chart.__doc__)

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(
        title="subcommands",
        description="choose one of the following",
        dest="program")
    gparser = subparsers.add_parser(
        "generate",
        help=generator_help,
        description=generator_desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[base_parser, annotation_file_parser, mask_file_parser],
    )
    cparser = subparsers.add_parser(
        "count",
        help=counter_help,
        description=counter_desc,
        parents=[base_parser, alignment_file_parser],
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    pparser = subparsers.add_parser(
        "chart",
        help=chart_help,
        description=chart_desc,
        parents=[base_parser, plotting_parser],
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Arguments specific to 'generate'
    gparser.add_argument("outbase",
                         metavar="outbase",
                         type=str,
                         help="Basename for output files")

    # Arguments specific to 'count'
    cparser.add_argument(
        "position_file",
        type=str,
        metavar="file.positions",
        help=
        "File assigning positions to genes or transcripts (made using 'generate' subcommand)"
    )
    cparser.add_argument("outbase", type=str, help="Basename for output files")

    # Arguments specific to 'chart'
    pparser.add_argument("-i",
                         "--in",
                         nargs="+",
                         type=str,
                         dest="infiles",
                         help="input files, made by 'count' subprogram")
    pparser.add_argument(
        "--bins",
        nargs="+",
        type=int,
        default=(0, 32, 64, 128, 256, 512, 1024, 2048, 4096),
        help="Bins into which features are partitioned based on counts")
    pparser.add_argument(
        "--regions",
        nargs="+",
        type=str,
        default=("exon", "utr5", "cds", "utr3"),
        help="Regions to compare (default: exon, utr5, cds, utr3)")
    pparser.add_argument("--metrics",
                         nargs="+",
                         type=str,
                         default=("rpkm", "reads"),
                         help="Metrics to compare (default: rpkm, reads)")
    pparser.add_argument(
        "list_of_regions",
        type=str,
        metavar='gene_list.txt',
        nargs="?",
        default=None,
        help=
        "Optional. File listing regions (genes or transcripts), one per line, to include in comparisons. If not given, all genes are included."
    )
    pparser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)

    # On Python 3 argparse does not require a subcommand by default, so
    # `program` is None when the user supplies none. Fail loudly with a usage
    # message rather than falling through every branch below.
    if getattr(args, "program", None) is None:
        parser.error("a subcommand is required: choose from 'generate', 'count', 'chart'")

    bp.get_base_ops_from_args(args)

    if args.program == "generate":
        #generate position file
        do_generate(args, an, mp)

    elif args.program == "count":
        #use position file to count gene expression in infiles
        do_count(args, al)

    elif args.program == "chart":
        #use count files to generate a family of charts and tables
        do_chart(args, pl)
Example #5
0
def main(argv=sys.argv[1:]):
    """Command-line program

    Reads discovered junctions from a BED file and sorts each one into the
    first category that applies, writing a separate BED file per category:

    ``<outbase>_repetitive.bed``
        junctions for which :func:`covered_by_repetitive` is true
    ``<outbase>_shifted_known.bed``
        known junctions found within range of the query (only when ``--ref``
        is given)
    ``<outbase>_shifted_canonical.bed``
        canonical junctions found within range of the query (only when
        ``--slide_canonical`` is given)
    ``<outbase>_untouched.bed``
        junctions matching none of the above, written unchanged

    Known and canonical junctions are each written at most once across the
    whole run.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),sp.get_parser(),mp.get_parser()],
                                     )
    parser.add_argument("--maxslide",type=int,default=10,
                        help="Maximum number of nt to search 5\' and 3\' of intron"+
                             " boundaries (Default: 10)")
    parser.add_argument("--ref",type=str,metavar="ref.bed",default=None,
                        help="Reference file describing known splice junctions")
    parser.add_argument("--slide_canonical",action="store_true",default=False,
                        help="Slide junctions to canonical junctions if present within equal support region")
    parser.add_argument("infile",type=str,metavar="input.bed",
                        help="BED file describing discovered junctions")
    parser.add_argument("outbase",type=str,
                        help="Basename for output files")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)

    # load crossmap
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        # Read the whole reference inside the `with` so the handle is closed
        # as soon as the junctions are in memory.
        with open(args.ref) as ref_fh:
            known_hash = GenomeHash(list(BED_Reader(ref_fh)),do_copy=False)
    else:
        known_hash = GenomeHash()

    # Dinucleotide pairs treated as canonical junction boundaries, per strand
    canonicals_plus = [("GT","AG"),
                       ("GC","AG")
                      ]

    canonicals_minus = [("CT","AC"),
                        ("CT","GC")
                       ]

    # running tallies, one per category, plus total junctions processed
    known_in_range     = 0
    canonical_in_range = 0
    repetitive         = 0
    untouched          = 0
    c = 0

    # Junction tuples already written to the known/canonical outputs. A set
    # keeps the per-junction duplicate check constant-time.
    # NOTE(review): assumes get_junction_tuple() returns a hashable tuple -- confirm.
    seen_already = set()

    outfiles = {
                 "repetitive" : "%s_repetitive.bed" % args.outbase,
                 "known"      : "%s_shifted_known.bed" % args.outbase,
                 "canonical"  : "%s_shifted_canonical.bed" % args.outbase,
                 "untouched"  : "%s_untouched.bed" % args.outbase,
                }
    outfiles = { K : argsopener(V,args,"w") for K,V in outfiles.items() }

    # process data; the finally clause guarantees output files are closed
    # (and flushed) even if a junction fails to process
    try:
        printer.write("Opening junctions from %s..." % args.infile)
        for ivc in BED_Reader(CommentReader(opener(args.infile))):
            processed = False

            if c % 1000 == 0 and c > 0:
                printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                        (c, known_in_range, canonical_in_range, repetitive, untouched))

            # each input record is expected to have exactly two segments
            assert len(ivc) == 2
            strand = ivc.strand

            minus_range, plus_range = find_match_range(ivc,genome,args.maxslide)

            # see if either end of splice junction +- match_range lands in repetitive areas of genome
            if covered_by_repetitive(ivc,minus_range,plus_range,cross_hash):
                repetitive += 1
                outfiles["repetitive"].write(ivc.as_bed())
                processed = True

            # see if one or more known junctions in range
            if not processed and args.ref is not None:
                known_juncs = find_known_in_range(ivc,minus_range,plus_range,known_hash.get_nearby_features(ivc))
                if len(known_juncs) > 0:
                    known_in_range += 1
                    for my_known in known_juncs:
                        tup = get_junction_tuple(my_known)
                        if tup not in seen_already:
                            outfiles["known"].write(my_known.as_bed())
                            seen_already.add(tup)

                    processed = True

            # see if one or more canonical junctions in range
            if not processed and args.slide_canonical:
                canonicals = canonicals_plus if strand == "+" else canonicals_minus
                canonical_juncs = find_canonicals_in_range(ivc,minus_range,plus_range,genome,canonicals)
                if len(canonical_juncs) > 0:
                    canonical_in_range += 1
                    for can in canonical_juncs:
                        tup = get_junction_tuple(can)
                        if tup not in seen_already:
                            outfiles["canonical"].write(can.as_bed())
                            seen_already.add(tup)

                    processed = True

            # nothing applied: pass the junction through unchanged
            if not processed:
                outfiles["untouched"].write(ivc.as_bed())
                untouched += 1

            c += 1

        # report final tallies
        printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                (c, known_in_range, canonical_in_range, repetitive, untouched))
    finally:
        # save output
        for v in outfiles.values():
            v.close()

    printer.write("Done.")
Example #6
0
def main(argv=sys.argv[1:]):
    """Command-line program

    For every region read from the annotation, apply any overlapping masks,
    then write one tab-delimited row containing the region name, the region
    itself, its masked counts, counts per nucleotide, RPKM, and masked length
    to the output file.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[
            base_parser, alignment_file_parser, annotation_file_parser,
            mask_file_parser
        ],
    )
    parser.add_argument("outfile", type=str, help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = ap.get_transcripts_from_args(args,
                                               printer=printer,
                                               return_type=SegmentChain)
    crossmap = mp.get_genome_hash_from_args(args, printer=printer)

    ga_sum = ga.sum()
    # Scale factor turning counts-per-nucleotide into counts per kilobase per
    # million total counts in the dataset.
    # NOTE(review): a dataset with zero total counts makes this division
    # degenerate -- confirm how `ga.sum()` behaves for empty datasets.
    normconst = 1000.0 * 1e6 / ga_sum

    # Number of regions written. Tracked separately from the loop index so the
    # final report is an actual count (index + 1) and is still defined when
    # the annotation yields no regions at all.
    total = 0

    with argsopener(args.outfile, args, "w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write(
            "region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n"
        )
        for n, ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            # Each overlapping feature is iterated to obtain the pieces that
            # are passed to add_masks().
            ivc.add_masks(*itertools.chain.from_iterable(masks))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            if length == 0:
                # Fully masked region: densities are undefined.
                rpnt = rpkm = numpy.nan
            else:
                rpnt = float(counts) / length
                rpkm = rpnt * normconst

            ltmp = [
                name,
                str(ivc),
                "%.8e" % counts,
                "%.8e" % rpnt,
                "%.8e" % rpkm,
                "%d" % length
            ]
            fout.write("%s\n" % "\t".join(ltmp))
            total = n + 1
        # No explicit close: leaving the `with` block closes the file.

    printer.write("Processed %s regions total." % total)

    printer.write("Done.")