Esempio n. 1
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")
    
    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser  = al.get_parser(conflict_handler="resolve")
    
    mp = MaskParser()
    mask_file_parser = mp.get_parser()
    
    bp = BaseParser()
    base_parser = bp.get_parser()
    
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,
                                              alignment_file_parser,
                                              annotation_file_parser,
                                              mask_file_parser],
                                     )
    parser.add_argument("outfile",type=str,help="Output filename")
    
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga          = al.get_genome_array_from_args(args,printer=printer)
    transcripts = ap.get_transcripts_from_args(args,printer=printer,return_type=SegmentChain)
    crossmap    = mp.get_genome_hash_from_args(args,printer=printer)
    
    ga_sum = ga.sum()
    normconst = 1000.0*1e6 / ga_sum
    
    with argsopener(args.outfile,args,"w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write("region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n")
        for n,ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable((X for X in masks)))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)
                
            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            rpnt = numpy.nan if length == 0 else float(counts)/length
            rpkm = numpy.nan if length == 0 else rpnt * normconst 
            ltmp = [name,
                    str(ivc),
                    "%.8e" % counts,
                    "%.8e" % rpnt,
                    "%.8e" % rpkm,
                    "%d" % length]
            fout.write("%s\n" % "\t".join(ltmp))
    
        fout.close()
        
    printer.write("Processed %s regions total." % n)

    printer.write("Done.")
Esempio n. 2
0
def main(args=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser()
    an = AnnotationParser()
    mp = MaskParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    mask_file_parser = mp.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[base_parser, alignment_file_parser, annotation_file_parser, mask_file_parser],
    )

    parser.add_argument("out_folder", type=str, help="Folder in which to save output vectors")
    parser.add_argument(
        "--out_prefix", default="", type=str, help="Prefix to prepend to output files (default: no prefix)"
    )
    parser.add_argument(
        "--format", default="%.8f", type=str, help=r"printf-style format string for output (default: '%%.8f')"
    )
    args = parser.parse_args(args)
    bp.get_base_ops_from_args(args)

    # if output folder doesn't exist, create it
    if not os.path.isdir(args.out_folder):
        os.mkdir(args.out_folder)

    # parse args
    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = an.get_segmentchains_from_args(args, printer=printer)
    mask_hash = mp.get_genome_hash_from_args(args, printer=printer)

    # evaluate
    for n, tx in enumerate(transcripts):
        if n % 1000 == 0:
            printer.write("Processed %s regions of interest" % n)
        filename = "%s%s.txt" % (args.out_prefix, tx.get_name())
        full_filename = os.path.join(args.out_folder, filename)

        # mask out overlapping masked regions
        overlapping = mask_hash.get_overlapping_features(tx)
        for feature in overlapping:
            tx.add_masks(*feature.segments)

        count_vec = tx.get_masked_counts(ga)
        numpy.savetxt(full_filename, count_vec, fmt=args.format)
Esempio n. 3
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()
    
    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),sp.get_parser(),mp.get_parser()],
                                     )
    parser.add_argument("--maxslide",type=int,default=10,
                        help="Maximum number of nt to search 5\' and 3\' of intron"+
                             " boundaries (Default: 10)")
    parser.add_argument("--ref",type=str,metavar="ref.bed",default=None,
                        help="Reference file describing known splice junctions")
    parser.add_argument("--slide_canonical",action="store_true",default=False,
                        help="Slide junctions to canonical junctions if present within equal support region")
    parser.add_argument("infile",type=str,metavar="input.bed",
                        help="BED file describing discovered junctions")
    parser.add_argument("outbase",type=str,
                        help="Basename for output files")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    
    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)
    
    # load crossmap    
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        known_hash = GenomeHash(list(BED_Reader(open(args.ref))),do_copy=False)
    else:
        known_hash = GenomeHash()

    # set up variables    
    canonicals_plus = [("GT","AG"),
                       ("GC","AG")
                      ]
    
    canonicals_minus = [("CT","AC"),
                        ("CT","GC")
                       ]
    
    known_in_range     = 0
    canonical_in_range = 0
    repetitive         = 0
    untouched          = 0
    c = 0
    
    seen_already = []

    outfiles = {
                 "repetitive" : "%s_repetitive.bed" % args.outbase,
                 "known"      : "%s_shifted_known.bed" % args.outbase,
                 "canonical"  : "%s_shifted_canonical.bed" % args.outbase,
                 "untouched"  : "%s_untouched.bed" % args.outbase,
                }
    outfiles = { K : argsopener(V,args,"w") for K,V in outfiles.items() }

    # process data
    printer.write("Opening junctions from %s..." % args.infile)
    for ivc in BED_Reader(CommentReader(opener(args.infile))):
        processed = False
        tup = None

        if c % 1000 == 0 and c > 0:
            printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                    (c, known_in_range, canonical_in_range, repetitive, untouched))
                   
        assert len(ivc) == 2
        strand = ivc.strand
        
        minus_range, plus_range = find_match_range(ivc,genome,args.maxslide)
        
        # see if either end of splice junction +- match_range lands in repetitive areas of genome
        if covered_by_repetitive(ivc,minus_range,plus_range,cross_hash):
            repetitive += 1
            outfiles["repetitive"].write(ivc.as_bed())
            processed = True

        # see if one or more known junctions in range
        if processed == False and args.ref is not None:
            # find_known_in_range(query_ivc,minus_range,plus_range,knownjunctions)
            known_juncs = find_known_in_range(ivc,minus_range,plus_range,known_hash.get_nearby_features(ivc))
            if len(known_juncs) > 0:
                known_in_range += 1
                for my_known in known_juncs:
                    tup = get_junction_tuple(my_known)
                    if tup not in seen_already:
                        outfiles["known"].write(my_known.as_bed())
                        seen_already.append(tup)
                    
                processed = True
            
        # see if one or more canonical junctions in range
        if processed == False and args.slide_canonical == True:
            canonicals = canonicals_plus if strand == "+" else canonicals_minus
            #find_canonicals_in_range(query_ivc,minus_range,plus_range,genome,canonicals)
            canonical_juncs = find_canonicals_in_range(ivc,minus_range,plus_range,genome,canonicals)
            if len(canonical_juncs) > 0:
                canonical_in_range += 1
                for can in canonical_juncs:
                    tup = get_junction_tuple(can)
                    if tup not in seen_already:
                        outfiles["canonical"].write(can.as_bed())
                        seen_already.append(tup)

                processed = True
                    
        if processed == False:
            outfiles["untouched"].write(ivc.as_bed())
            untouched += 1
            
        c += 1

    # save output
    printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
            (c, known_in_range, canonical_in_range, repetitive, untouched))    

    for v in outfiles.values():
        v.close()
    
    printer.write("Done.")
Esempio n. 4
0
def main(args=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser()
    an = AnnotationParser()
    mp = MaskParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    mask_file_parser = mp.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[
            base_parser, alignment_file_parser, annotation_file_parser,
            mask_file_parser
        ])

    parser.add_argument("out_folder",
                        type=str,
                        help="Folder in which to save output vectors")
    parser.add_argument(
        "--out_prefix",
        default="",
        type=str,
        help="Prefix to prepend to output files (default: no prefix)")
    parser.add_argument(
        "--format",
        default="%.8f",
        type=str,
        help=r"printf-style format string for output (default: '%%.8f')")
    args = parser.parse_args(args)
    bp.get_base_ops_from_args(args)

    # if output folder doesn't exist, create it
    if not os.path.isdir(args.out_folder):
        os.mkdir(args.out_folder)

    # parse args
    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = an.get_segmentchains_from_args(args, printer=printer)
    mask_hash = mp.get_genome_hash_from_args(args, printer=printer)

    # evaluate
    for n, tx in enumerate(transcripts):
        if n % 1000 == 0:
            printer.write("Processed %s regions of interest" % n)
        filename = "%s%s.txt" % (args.out_prefix, tx.get_name())
        full_filename = os.path.join(args.out_folder, filename)

        # mask out overlapping masked regions
        overlapping = mask_hash.get_overlapping_features(tx)
        for feature in overlapping:
            tx.add_masks(*feature.segments)

        count_vec = tx.get_masked_counts(ga)
        numpy.savetxt(full_filename, count_vec, fmt=args.format)
Esempio n. 5
0
def main(argv=sys.argv[1:]):
    """Command-line program
    
    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[
            base_parser, alignment_file_parser, annotation_file_parser,
            mask_file_parser
        ],
    )
    parser.add_argument("outfile", type=str, help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = ap.get_transcripts_from_args(args,
                                               printer=printer,
                                               return_type=SegmentChain)
    crossmap = mp.get_genome_hash_from_args(args, printer=printer)

    ga_sum = ga.sum()
    normconst = 1000.0 * 1e6 / ga_sum

    with argsopener(args.outfile, args, "w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write(
            "region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n"
        )
        for n, ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable((X for X in masks)))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            rpnt = numpy.nan if length == 0 else float(counts) / length
            rpkm = numpy.nan if length == 0 else rpnt * normconst
            ltmp = [
                name,
                str(ivc),
                "%.8e" % counts,
                "%.8e" % rpnt,
                "%.8e" % rpkm,
                "%d" % length
            ]
            fout.write("%s\n" % "\t".join(ltmp))

        fout.close()

    printer.write("Processed %s regions total." % n)

    printer.write("Done.")