Ejemplo n.º 1
0
def main():
    """Read intervals from stdin and pass each one to ``do_interval``.

    Positional arguments (after option parsing):

    * ``args[0]`` -- source list; it is translated with ``tree_tx`` and then
      split on whitespace.
    * ``args[1]`` -- path of the 2bit reference file.
    * ``args[2:]`` -- MAF files handed to ``maf.MultiIndexed``.

    Each non-blank stdin line supplies ``ref_src start end`` in its first
    three whitespace-separated fields.  When the ``strand`` option is set and
    the line has more than five fields, the sixth field is used as the
    strand; otherwise the strand is ``'+'``.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        sources = args[0].translate(tree_tx).split()
        # 2bit is a binary format, so open it in binary mode: text mode would
        # decode / newline-translate the bytes on Python 3 and on Windows.
        ref_2bit = bx.seq.twobit.TwoBitFile(open(args[1], "rb"))
        index = maf.MultiIndexed(args[2:])

        out = maf.Writer(sys.stdout)
        missing_data = bool(options.missingData)
        use_strand = bool(options.strand)
    except Exception:
        # NOTE(review): presumably reports the error and terminates the
        # script -- confirm against doc_optparse.
        doc_optparse.exception()

    for line in sys.stdin:
        fields = line.split()
        if not fields:
            # Blank line: there is no interval to process.
            continue
        ref_src, start, end = fields[0:3]
        if use_strand and len(fields) > 5:
            strand = fields[5]
        else:
            strand = '+'
        do_interval(sources, index, out, ref_src, int(start), int(end),
                    ref_2bit, missing_data, strand)

    out.close()
Ejemplo n.º 2
0
def main():
    """Mask CpG or non-CpG sites in a MAF file and report the masked share.

    Positional arguments: input MAF path, output MAF path, site type
    (``"CpG"`` selects one of the CpG filters, any other value the non-CpG
    filter) and a definition flag (``1`` selects the restricted CpG filter,
    any other integer the inclusive one; only read for ``"CpG"``).

    The ``mask`` option is an integer key into the mask-character table
    below and defaults to 0.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        inp_file, out_file, sitetype, definition = args
        if options.mask:
            mask = int(options.mask)
        else:
            mask = 0
    except Exception:
        sys.stderr.write("Tool initialization error.\n")
        # Exit with a non-zero status so the failure is visible to callers.
        sys.exit(1)

    # Resolve the mask character before any file is opened, so an unknown
    # index fails before the output file is truncated.
    mask_chr_dict = {0: '#', 1: '$', 2: '^', 3: '*', 4: '?', 5: 'N'}
    mask = mask_chr_dict[mask]

    reader = bx.align.maf.Reader(open(inp_file, 'r'))
    writer = bx.align.maf.Writer(open(out_file, 'w'))

    if sitetype == "CpG":
        if int(definition) == 1:
            cpgfilter = bx.align.sitemask.cpg.Restricted(mask=mask)
            defn = "CpG-Restricted"
        else:
            cpgfilter = bx.align.sitemask.cpg.Inclusive(mask=mask)
            defn = "CpG-Inclusive"
    else:
        cpgfilter = bx.align.sitemask.cpg.nonCpG(mask=mask)
        defn = "non-CpG"
    cpgfilter.run(reader, writer.write)

    # Release both handles so the output is flushed before reporting.
    reader.close()
    writer.close()

    # Guard against an input in which no bases were counted.
    if cpgfilter.total:
        percent = float(cpgfilter.masked) / float(cpgfilter.total) * 100
    else:
        percent = 0.0
    print("%2.2f percent bases masked; Mask character = %s, Definition = %s" % (percent, mask, defn))
Ejemplo n.º 3
0
def main():
    """Load wiggle scores into a ``BinnedArray`` and write it to a file.

    Positional arguments: the score file name (opened through
    ``misc.open_compressed``) and the output file name.  The optional
    ``comp`` option is forwarded to ``BinnedArray.to_file`` as ``comp_type``.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        if options.comp:
            comp_type = options.comp
        else:
            comp_type = None
        score_fname = args[0]
        out_fname = args[1]
    except Exception:
        doc_optparse.exit()

    scores = BinnedArray()

    # NOTE: the chromosome yielded by the reader is ignored, so positions
    # from every chromosome in the input land in the same array.
    reader = bx.wiggle.Reader(misc.open_compressed(score_fname))
    for i, (chrom, pos, val) in enumerate(reader):
        scores[pos] = val
        # Status
        if i % 10000 == 0:
            print("%s scores processed" % i)

    # NOTE(review): the file is opened in text mode as before; if
    # BinnedArray.to_file writes packed binary data this needs "wb" on
    # Python 3 -- confirm.
    out = open(out_fname, "w")
    try:
        if comp_type:
            scores.to_file(out, comp_type=comp_type)
        else:
            scores.to_file(out)
    finally:
        # Close the output even when serialization fails.
        out.close()
Ejemplo n.º 4
0
def __main__():
    """Convert SOLiD reads to fastq with ``bwa_solid2fastq_modified.pl``.

    Options read: ``input1`` / ``input2`` (forward reads and qualities),
    ``input3`` / ``input4`` (reverse reads and qualities; the literal string
    ``"None"`` in either one selects single-end mode), ``output1`` and
    ``output2`` (destinations the gunzipped results are appended to).

    The quality files are first passed through ``replaceNeg1`` into
    temporary files.  Any shell command that exits with a non-zero status is
    reported through ``stop_err``.
    """
    def run_shell(cmd):
        # os.system signals failure through its return value and never
        # raises, so turn a non-zero status into an exception here.
        status = os.system(cmd)
        if status != 0:
            raise Exception("Command returned status %s: %s" % (status, cmd))

    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    script_dir = os.path.split(sys.argv[0])[0]
    # common temp file setup
    tmpf = tempfile.NamedTemporaryFile()  # forward reads
    tmpqf = tempfile.NamedTemporaryFile()
    tmpqf = replaceNeg1(open(options.input2), tmpqf)
    # if paired-end data (have reverse input files)
    if options.input3 != "None" and options.input4 != "None":
        tmpr = tempfile.NamedTemporaryFile()  # reverse reads
        # replace the -1 in the qualities file
        tmpqr = tempfile.NamedTemporaryFile()
        tmpqr = replaceNeg1(open(options.input4, 'r'), tmpqr)
        cmd1 = "%s/bwa_solid2fastq_modified.pl 'yes' %s %s %s %s %s %s 2>&1" % (script_dir, tmpf.name, tmpr.name, options.input1, tmpqf.name, options.input3, tmpqr.name)
        try:
            run_shell(cmd1)
            run_shell('gunzip -c %s >> %s' % (tmpf.name, options.output1))
            run_shell('gunzip -c %s >> %s' % (tmpr.name, options.output2))
        except Exception as eq:
            stop_err("Error converting data to fastq format.\n" + str(eq))
        tmpr.close()
        tmpqr.close()
    # if single-end data
    else:
        # The unused reverse-read slots are filled with the literal "None".
        cmd1 = "%s/bwa_solid2fastq_modified.pl 'no' %s %s %s %s %s %s 2>&1" % (script_dir, tmpf.name, None, options.input1, tmpqf.name, None, None)
        try:
            run_shell(cmd1)
            run_shell('gunzip -c %s >> %s' % (tmpf.name, options.output1))
        except Exception as eq:
            stop_err("Error converting data to fastq format.\n" + str(eq))
    tmpqf.close()
    tmpf.close()
    sys.stdout.write('converted SOLiD data')
Ejemplo n.º 5
0
def __main__():
    """Append a computed column to tabular data read from stdin.

    Positional arguments: ``args[0]`` is a Python expression evaluated once
    per data row with the row bound to the name ``row``; ``args[1]`` is the
    column name appended to header lines.  The ``header`` and ``comments``
    options control whether header and comment elements are echoed.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        expr = args[0]
        colname = args[1]
    except Exception:
        doc_optparse.exception()

    # Compile the expression once so it is not re-parsed for every row.
    # SECURITY: the expression comes straight from the command line and is
    # passed to eval() below; only run this with trusted input.
    if expr:
        expr = compile(expr, '<expr arg>', 'eval')

    for element in bx.tabular.io.Reader(sys.stdin):
        if type(element) is bx.tabular.io.Header:
            if keep_header:
                print(str(element) + "\t" + colname)
        elif type(element) is bx.tabular.io.Comment:
            if keep_comments:
                print(element)
        else:
            val = eval(expr, dict(row=element))
            print(str(element) + "\t" + str(val))
def main():
    """Copy MAF blocks from stdin to stdout, limited to the given species.

    The positional arguments are the species to keep; a single argument
    containing commas is split into a list.  Unless the ``nofuse`` option is
    set, the writer is wrapped in ``FusingAlignmentWriter``.  Blocks for
    which ``get_components_for_species`` returns nothing are dropped; kept
    blocks have their all-gap columns removed and their score reset to 0.0.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        wanted = args
        # Allow a comma separated list, TODO: allow a newick format tree
        if len(wanted) == 1 and "," in wanted[0]:
            wanted = wanted[0].split(",")
        fuse = not options.nofuse
    except:
        doc_optparse.exit()

    reader = bx.align.maf.Reader(sys.stdin)
    writer = bx.align.maf.Writer(sys.stdout)
    if fuse:
        writer = FusingAlignmentWriter(writer)

    for block in reader:
        kept = get_components_for_species(block, wanted)
        if not kept:
            continue
        remove_all_gap_columns(kept)
        block.components = kept
        block.score = 0.0
        writer.write(block)

    reader.close()
    writer.close()
def main():
    """Keep the rows of a tab-separated file whose column value is listed.

    Positional arguments: input path, output path, column index (used
    directly as a Python index into the split row) and a comma-separated
    list of accepted values.  Lines starting with ``#`` are copied through
    unchanged; rows that do not have the requested column are dropped.
    """
    # Parsing Command Line here
    options, args = doc_optparse.parse(__doc__)

    try:
        inp_file, out_file, column, features = args
    except Exception:
        stop_err(
            "One or more arguments is missing or invalid.\nUsage: prog input output column features"
        )
    try:
        column = int(column)
    except Exception:
        stop_err("Column %s is an invalid column." % column)

    if features is None:
        stop_err(
            "Column %d has no features to display, select another column." %
            (column + 1))

    # Split the accepted values once instead of once per input line.
    wanted = set(features.split(','))

    with open(out_file, 'w') as fo:
        with open(inp_file) as fi:
            for line in fi:
                line = line.rstrip('\r\n')
                if line.startswith('#'):
                    # Keep valid comment lines in the output
                    fo.write("%s\n" % line)
                    continue
                try:
                    value = line.split('\t')[column]
                except IndexError:
                    # Row is too short to contain the column: skip it.
                    continue
                if value in wanted:
                    fo.write("%s\n" % line)

    print('Column %d features: %s' % (column + 1, features))
Ejemplo n.º 8
0
def main():
    """Intersect two interval datasets and write the result to a file.

    Options read: ``cols1`` / ``cols2`` (column specs handed to
    ``parse_cols_arg``), ``mincols`` (defaults to 1), ``pieces``, and
    ``gff1`` / ``gff2`` (select ``GFFReaderWrapper`` instead of
    ``NiceReaderWrapper`` for the respective input).  Positional arguments:
    first input, second input, output file name.
    """
    mincols = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        pieces = bool(options.pieces)
        in1_gff_format = bool(options.gff1)
        in2_gff_format = bool(options.gff2)
        in_fname, in2_fname, out_fname = args
    except Exception:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    in1_reader_wrapper = GFFReaderWrapper if in1_gff_format else NiceReaderWrapper
    in2_reader_wrapper = GFFReaderWrapper if in2_gff_format else NiceReaderWrapper

    g1 = in1_reader_wrapper(fileinput.FileInput(in_fname),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True)
    if in1_gff_format:
        # Intersect requires coordinates in BED format.
        g1.convert_to_bed_coord = True
    g2 = in2_reader_wrapper(fileinput.FileInput(in2_fname),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True)
    if in2_gff_format:
        # Intersect requires coordinates in BED format.
        g2.convert_to_bed_coord = True

    out_file = open(out_fname, "w")
    try:
        for feature in intersect([g1, g2], pieces=pieces, mincols=mincols):
            if isinstance(feature, GFFFeature):
                # Convert back to GFF coordinates since reader converted automatically.
                convert_bed_coords_to_gff(feature)
                for interval in feature.intervals:
                    out_file.write("%s\n" % "\t".join(interval.fields))
            elif isinstance(feature, GenomicInterval):
                out_file.write("%s\n" % "\t".join(feature.fields))
            else:
                out_file.write("%s\n" % feature)
    except ParseError as e:
        out_file.close()
        fail("Invalid file format: %s" % str(e))

    # Close on the success path as well; closing an already closed file is a
    # no-op, so this is safe even if fail() returns.
    out_file.close()
Ejemplo n.º 9
0
def main():
    """Write the rows of a tab-separated file that match a value list.

    Positional arguments: input path, output path, column index (applied
    as-is to the list of tab-separated cells) and a comma-separated list of
    accepted values.  ``#`` comment lines are always copied to the output;
    rows lacking the requested column are skipped.
    """
    # Parsing Command Line here
    options, args = doc_optparse.parse(__doc__)

    try:
        inp_file, out_file, column, features = args
    except Exception:
        stop_err("One or more arguments is missing or invalid.\nUsage: prog input output column features")
    try:
        column = int(column)
    except Exception:
        stop_err("Column %s is an invalid column." % column)

    if features is None:
        stop_err("Column %d has no features to display, select another column." % (column + 1))

    # Build the lookup once; it does not change between lines.
    accepted = set(features.split(','))

    fo = open(out_file, 'w')
    try:
        # open() replaces the Python 2-only file() builtin, and the input
        # handle is now closed deterministically.
        source = open(inp_file)
        try:
            for line in source:
                line = line.rstrip('\r\n')
                if line.startswith('#'):
                    # Keep valid comment lines in the output
                    fo.write("%s\n" % line)
                else:
                    cells = line.split('\t')
                    try:
                        keep = cells[column] in accepted
                    except IndexError:
                        # Too few cells on this row: drop it.
                        keep = False
                    if keep:
                        fo.write("%s\n" % line)
        finally:
            source.close()
    finally:
        fo.close()

    print('Column %d features: %s' % (column + 1, features))
def main():
    """Print, for every interval, the scores at each of its positions.

    Positional arguments: the score file (opened through
    ``misc.open_compressed`` and parsed by ``read_scores``), the interval
    file, and an optional output file (stdout when omitted).  Each interval
    line must start with ``chrom start stop``; the line is echoed followed
    by the scores for ``range(start, stop)``, or by nothing when the
    chromosome has no scores.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        # Use the parsed positional arguments throughout: indexing sys.argv
        # directly points at the wrong entries as soon as options are given.
        score_file = misc.open_compressed(args[0])
        interval_file = open(args[1])
        if len(args) > 2:
            out_file = open(args[2], "w")
        else:
            out_file = sys.stdout
    except Exception:
        doc_optparse.exit()

    scores_by_chrom = read_scores(score_file)
    for line in interval_file:
        fields = line.split()
        chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
        if chrom in scores_by_chrom:
            ba = scores_by_chrom[chrom]
            scores = [ba[i] for i in range(start, stop)]
        else:
            scores = []
        out_file.write("%s %s\n" % (" ".join(fields), " ".join(map(str, scores))))

    score_file.close()
    interval_file.close()
    # Only close handles this function opened; leave stdout alone.
    if out_file is not sys.stdout:
        out_file.close()
Ejemplo n.º 11
0
def main():
    """Parse clustering options and run ``find_clusters`` on an interval file.

    Options read: ``cols1`` (column spec handed to ``parse_cols_arg``),
    ``distance``, ``overlap`` (stored as a negative distance and, being
    applied second, taking precedence over ``distance``), ``output`` and
    ``minregions`` (defaults: distance 0, output 1, minregions 2).
    Positional arguments: input and output file names.
    """
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1,
                                        mincols=distance,
                                        minregions=minregions)
    except ParseError as exc:
        # "as" works on Python 2.6+ and 3; the comma form is Python 2 only.
        fail("Invalid file format: %s" % str(exc))
Ejemplo n.º 12
0
def main():
    """Run ``coverage`` over two interval datasets and write the result.

    Options ``cols1`` / ``cols2`` are column specs handed to
    ``parse_cols_arg``.  Positional arguments: first input, second input and
    output file name.  ``GenomicInterval`` results are written as their
    tab-joined fields; anything else is written via ``%s`` formatting.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        in_fname, in2_fname, out_fname = args
    except Exception:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for line in coverage([g1, g2]):
            if type(line) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(line.fields))
            else:
                out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))

    # The success path must flush and close the output too; a second close()
    # after the error path is harmless.
    out_file.close()
Ejemplo n.º 13
0
def main():
    """Print reference intervals covered by enough of the listed species.

    Positional arguments: a comma-separated species list and the minimum
    number of components (``nrequired``) whose source prefix, the part of
    ``src`` before the first ``.``, must be in that list.

    Consecutive qualifying MAF blocks from stdin are merged while the gap
    between them on the reference (the first component) is below the
    module-level ``SPAN``; a merged interval is printed when its length is
    at least the module-level ``MIN``.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        species = args[0].split(',')
        nrequired = int(args[1])
    except Exception:
        doc_optparse.exit()

    maf_reader = bx.align.maf.Reader(sys.stdin)

    interval_start = None
    interval_end = None
    ref = None

    for m in maf_reader:
        ref = m.components[0]
        # Does this alignment have enough of the required species
        present = sum(1 for comp in m.components if comp.src.split('.')[0] in species)
        if nrequired <= present:
            if interval_start is None:
                interval_start = ref.start
                interval_end = ref.end
            elif ref.start - interval_end < SPAN:
                # Close enough to the running interval: extend it.
                interval_end = ref.end
            else:
                if interval_end - interval_start >= MIN:
                    print(ref.src.split('.')[1], interval_start, interval_end)
                interval_start = ref.start
                interval_end = ref.end
        else:
            if interval_start is not None and interval_end - interval_start >= MIN:
                print(ref.src.split('.')[1], interval_start, interval_end)
            interval_start = None
            interval_end = None

    # Flush the interval that is still open when the input ends; without
    # this the last covered region is silently lost.
    if interval_start is not None and interval_end - interval_start >= MIN:
        print(ref.src.split('.')[1], interval_start, interval_end)
Ejemplo n.º 14
0
def main():
    """Read clustering parameters and compute clusters for an interval file.

    Defaults are distance 0, minregions 2 and output 1.  The ``distance``,
    ``overlap``, ``output`` and ``minregions`` options override them;
    ``overlap`` is negated and assigned to the distance after ``distance``
    has been handled, so it wins when both are present.  ``cols1`` is the
    column spec for ``parse_cols_arg``; the positional arguments are the
    input and output file names.
    """
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        # Spelled with "as" so the module also parses on Python 3.
        fail("Invalid file format: %s" % str(exc))
Ejemplo n.º 15
0
def __main__():
    """Parse options and, when a FASTA file is given, build a 2bit file.

    ``options.cols`` is a comma-separated column spec: with five entries it
    is unpacked as chrom/start/end/strand/name (BED), otherwise as
    chrom/start/end/strand with ``name_col`` set to ``False`` (GFF).  The
    remaining options (``dbkey``, ``output_format``, ``gff``,
    ``interpret_features``, ``GALAXY_DATA_INDEX_DIR``, ``fasta``) are copied
    into locals; the positional arguments are the input and output file
    names.

    If ``fasta`` is set, ``faToTwoBit`` is run through the shell with its
    stderr captured in a temporary file; a non-zero exit status is reported
    through ``stop_err`` together with the captured stderr.
    """
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse(__doc__)
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(options.cols)
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg(options.cols)
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except Exception:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            # Only the generated name is kept; the temporary file object
            # itself is not retained.
            seq_path = tempfile.NamedTemporaryFile(dir=".").name
            cmd = "faToTwoBit %s %s" % (fasta_file, seq_path)

            tmp_name = tempfile.NamedTemporaryFile(dir=".").name
            tmp_stderr = open(tmp_name, 'wb')
            proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open(tmp_name, 'rb')
            chunks = []
            buffsize = 1048576
            try:
                while True:
                    chunk = tmp_stderr.read(buffsize)
                    # An empty read is the only reliable end-of-file signal;
                    # testing the accumulated length loops forever when the
                    # output is an exact multiple of the buffer size.
                    if not chunk:
                        break
                    chunks.append(chunk)
            except OverflowError:
                pass
            tmp_stderr.close()
            stderr = b''.join(chunks)
            if not isinstance(stderr, str):
                # Python 3: bytes were read, decode them for the message.
                stderr = stderr.decode('utf-8', 'replace')

            # Error checking.
            if returncode != 0:
                raise Exception(stderr)
        except Exception as e:
            stop_err('Error running faToTwoBit. ' + str(e))
Ejemplo n.º 16
0
def main():
    """Print one score per MAF block read from stdin.

    Options: ``recalculate`` rescoring each block with the scoring scheme
    built below instead of using the block's stored score; ``lnorm``
    dividing the score by the block's ``text_size``.  Blocks whose
    ``text_size`` is 0 are reported as ``NA``.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)

    try:
        lnorm = bool(options.lnorm)
        recalculate = bool(options.recalculate)
    except Exception:
        doc_optparse.exit()

    hox70 = score.build_scoring_scheme( """  A    C    G    T
                                      91 -114  -31 -123
                                    -114  100 -125  -31
                                     -31 -125  100 -114
                                    -123  -31 -114   91 """, 400, 30, default=0 )

    maf_reader = maf.Reader(sys.stdin)

    for m in maf_reader:
        if m.text_size == 0:
            # Nothing to score (and nothing to divide by when normalizing).
            print("NA")
            continue
        s = m.score
        # Recalculate?
        if recalculate:
            s = hox70.score_alignment(m)
        # Normalize?
        if lnorm:
            s = s / m.text_size
        # Print
        print(s)
Ejemplo n.º 17
0
def main():
    """Parse scoring parameters and call ``run`` on the data and model files.

    Positional arguments: data file, model file, output file.  Options are
    read through ``getopt`` with these defaults: ``window`` 100, ``shift``
    5, ``low`` -1.0, ``high`` 1.0, ``reorder`` None (otherwise a
    comma-separated list of integers).  ``mapping`` names a file loaded with
    ``rp.mapping.alignment_mapping_from_file``; ``model`` defaults to
    ``"standard"``.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        data_fname, model_fname, out_fname = args
        window = int(getopt(options, "window", 100))
        shift = int(getopt(options, "shift", 5))
        low = float(getopt(options, "low", -1.0))
        high = float(getopt(options, "high", 1.0))
        if options.mapping:
            # open() instead of the Python 2-only file() builtin.
            align_count, mapping = rp.mapping.alignment_mapping_from_file(open(options.mapping))
        else:
            mapping = None
        modname = getattr(options, "model")
        if modname is None:
            modname = "standard"
        reorder = getopt(options, "reorder", None)
        if reorder:
            # A real list on both Python 2 and 3 (map() is lazy on 3).
            reorder = [int(item) for item in reorder.split(",")]
    except Exception:
        doc_optparse.exception()

    out = open(out_fname, "w")
    data_file = open(data_fname)
    model_file = open(model_fname)
    try:
        run(data_file, modname, model_file, out, mapping, window, shift, low, high, reorder)
    finally:
        # Release all three handles even when run() fails.
        model_file.close()
        data_file.close()
        out.close()
Ejemplo n.º 18
0
def main():
    """Split each interval of a tab-separated file into fixed-size windows.

    Positional arguments: input file, window size, output file, sliding
    flag and offset (the last four are parsed as integers where numeric).
    ``options.cols`` is the column spec handed to ``parse_cols_arg``.

    With sliding off (flag 0, or offset 0) windows advance by the window
    size; otherwise they advance by the offset and stop once a window would
    extend past the interval end.  Each window is written as the original
    row with its start and end columns replaced.  Lines that cannot be
    processed are counted and the first one is reported at the end.
    """
    # Parsing Command Line here
    options, args = doc_optparse.parse(__doc__)

    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols)
        inp_file, winsize, out_file, makesliding, offset = args
        winsize = int(winsize)
        offset = int(offset)
        makesliding = int(makesliding)
    except Exception:
        stop_err(
            "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset."
        )

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = None
    if offset == 0:
        makesliding = 0

    # Distance the window start advances between consecutive windows.
    step = winsize if makesliding == 0 else offset

    with open(out_file, "w") as fo:
        with open(inp_file) as fi:
            for i, line in enumerate(fi):
                line = line.strip()
                if not line or line[0:1] == "#":
                    continue
                try:
                    elems = line.split("\t")
                    start = int(elems[start_col_1])
                    end = int(elems[end_col_1])
                    # Floor division keeps the window count an integer on
                    # Python 3 as well.
                    numwin = (end - start) // step
                    for _ in range(numwin):
                        elems[start_col_1] = str(start)
                        elems[end_col_1] = str(start + winsize)
                        fo.write("%s\n" % "\t".join(elems))
                        start = start + step
                        if makesliding != 0 and start + winsize > end:
                            break
                except Exception:
                    skipped_lines += 1
                    if not invalid_line:
                        first_invalid_line = i + 1
                        invalid_line = line

    if makesliding == 1:
        print("Window size=%d, Sliding=Yes, Offset=%d" % (winsize, offset))
    else:
        print("Window size=%d, Sliding=No" % (winsize))
    if skipped_lines > 0:
        print('Skipped %d invalid lines starting with #%d: "%s"' % (skipped_lines, first_invalid_line, invalid_line))
def main():
    """Emit each interval line followed by the scores over its range.

    Positional arguments: score file (read via ``misc.open_compressed`` and
    ``read_scores``), interval file, and optionally an output file; without
    the third argument output goes to stdout.  Interval lines are split on
    whitespace and must begin with ``chrom start stop``.  Chromosomes absent
    from the score data produce an empty score list.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        # Read from the parsed arguments, not from sys.argv: the two differ
        # whenever an option precedes the positional arguments.
        score_file = misc.open_compressed(args[0])
        interval_file = open(args[1])
        if len(args) > 2:
            out_file = open(args[2], 'w')
        else:
            out_file = sys.stdout
    except Exception:
        doc_optparse.exit()

    scores_by_chrom = read_scores(score_file)
    for line in interval_file:
        fields = line.split()
        chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
        if chrom in scores_by_chrom:
            ba = scores_by_chrom[chrom]
            scores = [ba[i] for i in range(start, stop)]
        else:
            scores = []
        out_file.write("%s %s\n" % (" ".join(fields), " ".join(map(str, scores))))

    score_file.close()
    interval_file.close()
    # stdout is not ours to close.
    if out_file is not sys.stdout:
        out_file.close()
Ejemplo n.º 20
0
def main():
    """Write the output of ``coverage`` for two interval datasets to a file.

    ``options.cols1`` and ``options.cols2`` are column specs parsed by
    ``parse_cols_arg``; the positional arguments are the two input file
    names and the output file name.  Results that are exactly
    ``GenomicInterval`` are written as tab-joined fields, all others through
    ``%s`` formatting.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        in_fname, in2_fname, out_fname = args
    except Exception:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for line in coverage([g1, g2]):
            if type(line) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(line.fields))
            else:
                out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))

    # Previously the file was only closed when parsing failed.
    out_file.close()
Ejemplo n.º 21
0
def __main__():
    """Assemble the samtools pileup commands and index the reference if needed.

    Options read: ``dbkey`` / ``indexDir`` (passed to ``check_seq_file``),
    ``lastCol``, ``indels``, ``mapCap``, ``consensus`` (with ``theta``,
    ``hapNum``, ``fraction``, ``phredProb``), ``input1``, ``bamIndex``,
    ``output1``, ``ref`` (``'indexed'`` or ``'history'``) and ``ownFile``.

    The working directory is changed to the system temp directory.  For a
    history reference the FASTA is copied to a temp file and indexed with
    ``samtools faidx``; a missing or failed index is reported through
    ``stop_err``.
    """
    #Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    seq_path = check_seq_file(options.dbkey, options.indexDir)
    tmp_dir = tempfile.gettempdir()
    os.chdir(tmp_dir)
    tmpf0 = tempfile.NamedTemporaryFile(dir=tmp_dir)
    tmpf0bam = '%s.bam' % tmpf0.name
    tmpf0bambai = '%s.bam.bai' % tmpf0.name
    tmpf1 = tempfile.NamedTemporaryFile(dir=tmp_dir)
    tmpf1fai = '%s.fai' % tmpf1.name
    # Optional flags collapse to an empty string when not requested.
    last_col_flag = '-s' if options.lastCol == 'yes' else ''
    indels_flag = '-i' if options.indels == 'yes' else ''
    opts = '%s %s -M %s' % (last_col_flag, indels_flag, options.mapCap)
    if options.consensus == 'yes':
        opts += ' -c -T %s -N %s -r %s -I %s' % (options.theta, options.hapNum, options.fraction, options.phredProb)
    cmd1 = None
    cmd2 = 'cp %s %s; cp %s %s' % (options.input1, tmpf0bam, options.bamIndex, tmpf0bambai)
    cmd3 = 'samtools pileup %s -f %s %s > %s 2> /dev/null'
    if options.ref == 'indexed':
        full_path = "%s.fai" % seq_path
        if not os.path.exists(full_path):
            stop_err("No sequences are available for '%s', request them by reporting this error." % options.dbkey)
        cmd3 = cmd3 % (opts, seq_path, tmpf0bam, options.output1)
    elif options.ref == 'history':
        cmd1 = 'cp %s %s; samtools faidx %s' % (options.ownFile, tmpf1.name, tmpf1.name)
        cmd3 = cmd3 % (opts, tmpf1.name, tmpf0bam, options.output1)
    # index reference if necessary
    if cmd1:
        try:
            # os.system never raises on command failure, so check its
            # return value explicitly.
            status = os.system(cmd1)
            if status != 0:
                raise Exception("Command returned status %s: %s" % (status, cmd1))
            if options.ref == 'history' and not os.path.exists(tmpf1fai):
                stop_err("Problem creating index file from history item.")
        except Exception as eq:
            stop_err('Error handling reference sequence\n' + str(eq))
Ejemplo n.º 22
0
def __main__():
    """Convert paired-end SOLiD reads to fastq via the bundled Perl script.

    Options read: ``input1`` / ``input2`` (forward reads and qualities),
    ``input3`` / ``input4`` (reverse reads and qualities), ``output1`` and
    ``output2``.  Quality files are first passed through ``replaceNeg1``
    into temporary files.  The conversion only runs when neither
    ``input3`` nor ``input4`` is the literal string ``"None"``; any shell
    command exiting with a non-zero status is reported through ``stop_err``.
    """
    def run_shell(cmd):
        # os.system reports failure through its return value only.
        status = os.system(cmd)
        if status != 0:
            raise Exception("Command returned status %s: %s" % (status, cmd))

    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    # common temp file setup
    tmpf = tempfile.NamedTemporaryFile()  # forward reads
    tmpqf = tempfile.NamedTemporaryFile()
    # open() replaces the Python 2-only file() builtin.
    tmpqf = replaceNeg1(open(options.input2, "r"), tmpqf)
    # if paired-end data (have reverse input files)
    if options.input3 != "None" and options.input4 != "None":
        tmpr = tempfile.NamedTemporaryFile()  # reverse reads
        # replace the -1 in the qualities file
        tmpqr = tempfile.NamedTemporaryFile()
        tmpqr = replaceNeg1(open(options.input4, "r"), tmpqr)
        cmd1 = "%s/bwa_solid2fastq_modified.pl 'yes' %s %s %s %s %s %s 2>&1" % (
            os.path.split(sys.argv[0])[0],
            tmpf.name,
            tmpr.name,
            options.input1,
            tmpqf.name,
            options.input3,
            tmpqr.name,
        )
        try:
            run_shell(cmd1)
            run_shell("gunzip -c %s >> %s" % (tmpf.name, options.output1))
            run_shell("gunzip -c %s >> %s" % (tmpr.name, options.output2))
        except Exception as eq:
            stop_err("Error converting data to fastq format.\n" + str(eq))
        tmpr.close()
        tmpqr.close()
    # Release the shared temporary files as well.
    tmpqf.close()
    tmpf.close()
Ejemplo n.º 23
0
def main():
    """Apply a CpG / non-CpG site mask to a MAF file and print a summary.

    Positional arguments, in order: input MAF, output MAF, site type and
    definition.  Site type ``"CpG"`` picks the restricted filter when the
    definition is ``1`` and the inclusive filter otherwise; every other site
    type picks the non-CpG filter.  ``options.mask`` (default 0) selects the
    mask character from the table below.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        inp_file, out_file, sitetype, definition = args
        mask = int(options.mask) if options.mask else 0
    except Exception:
        sys.stderr.write("Tool initialization error.\n")
        # A failed initialization must not look like success to the caller.
        sys.exit(1)

    # Look the character up first so a bad index cannot leave behind a
    # freshly truncated output file.
    mask_chr_dict = {0: '#', 1: '$', 2: '^', 3: '*', 4: '?', 5: 'N'}
    mask = mask_chr_dict[mask]

    reader = bx.align.maf.Reader(open(inp_file, 'r'))
    writer = bx.align.maf.Writer(open(out_file, 'w'))

    if sitetype != "CpG":
        cpgfilter = bx.align.sitemask.cpg.nonCpG(mask=mask)
        defn = "non-CpG"
    elif int(definition) == 1:
        cpgfilter = bx.align.sitemask.cpg.Restricted(mask=mask)
        defn = "CpG-Restricted"
    else:
        cpgfilter = bx.align.sitemask.cpg.Inclusive(mask=mask)
        defn = "CpG-Inclusive"
    cpgfilter.run(reader, writer.write)

    # Close both ends so the masked alignment is fully written out.
    reader.close()
    writer.close()

    # Avoid dividing by zero when no bases were counted.
    total = float(cpgfilter.total)
    percent = float(cpgfilter.masked) / total * 100 if total else 0.0
    print("%2.2f percent bases masked; Mask character = %s, Definition = %s" % (
        percent, mask, defn))
def __main__():
    """Add a column computed from a Python expression to tabular stdin data.

    ``args[0]`` is the expression; it is evaluated for every data row with
    that row available as ``row``.  ``args[1]`` is the name appended to
    header lines.  Header elements are printed only when the ``header``
    option is set, comment elements only when ``comments`` is set.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        expr = args[0]
        colname = args[1]
    except Exception:
        doc_optparse.exception()

    # Compile expression for SPEED
    # SECURITY: this is arbitrary code taken from the command line and run
    # through eval(); do not expose it to untrusted callers.
    if expr:
        expr = compile(expr, '<expr arg>', 'eval')

    for element in bx.tabular.io.Reader(sys.stdin):
        kind = type(element)
        if kind is bx.tabular.io.Header:
            if keep_header:
                print(str(element) + "\t" + colname)
        elif kind is bx.tabular.io.Comment:
            if keep_comments:
                print(element)
        else:
            val = eval(expr, dict(row=element))
            print(str(element) + "\t" + str(val))
Ejemplo n.º 25
0
def main():
    """Write the output of ``proximal_region_finder`` for two datasets.

    ``options.cols1`` / ``options.cols2`` are column specs parsed by
    ``parse_cols_arg``.  Positional arguments: first input, second input,
    output file name and the direction passed on to
    ``proximal_region_finder``.  List results are written tab-joined, all
    others through ``%s`` formatting.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        in_fname, in2_fname, out_fname, direction = args
    except Exception:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for line in proximal_region_finder([g1, g2], direction):
            if type(line) is list:
                out_file.write("%s\n" % "\t".join(line))
            else:
                out_file.write("%s\n" % line)
    except ParseError as exc:
        # Flush what was written before reporting the failure.
        out_file.close()
        fail("Invalid file format: %s" % str(exc))

    # The output was never closed before; do it on the success path too.
    out_file.close()
Ejemplo n.º 26
0
def main():
    """Filter a MAF stream on stdin down to the requested species.

    Species come from the positional arguments, either one per argument or
    as a single comma-separated argument.  The output writer on stdout is
    wrapped in ``FusingAlignmentWriter`` unless ``options.nofuse`` is set.
    For every block, the components returned by
    ``get_components_for_species`` replace the block's components; blocks
    with no such components are not written.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        species = args
        # Allow a comma separated list, TODO: allow a newick format tree
        given_as_csv = len(species) == 1 and ',' in species[0]
        if given_as_csv:
            species = species[0].split(',')
        fuse = not options.nofuse
    except:
        doc_optparse.exit()

    maf_reader = bx.align.maf.Reader(sys.stdin)
    plain_writer = bx.align.maf.Writer(sys.stdout)
    maf_writer = FusingAlignmentWriter(plain_writer) if fuse else plain_writer

    for alignment in maf_reader:
        selected = get_components_for_species(alignment, species)
        if selected:
            remove_all_gap_columns(selected)
            alignment.components = selected
            alignment.score = 0.0
            maf_writer.write(alignment)

    maf_reader.close()
    maf_writer.close()
Ejemplo n.º 27
0
def main():
    """Write the output of ``proximal_region_finder`` for two interval files.

    Expects four positional arguments (first input, second input, output
    file name, direction) plus ``options.cols1`` / ``options.cols2``, which
    ``parse_cols_arg`` turns into chrom/start/end/strand column indexes for
    the respective input.  Each item yielded by ``proximal_region_finder``
    is written on its own line; list items are tab-joined first.

    A ``ParseError`` raised while reading is reported through ``fail``.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        in_fname, in2_fname, out_fname, direction = args
    except Exception:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)
    try:
        # The context manager guarantees the output is flushed and closed on
        # success and before ``fail`` runs on a parse error.
        with open(out_fname, "w") as out_file:
            for line in proximal_region_finder([g1, g2], direction):
                if type(line) is list:
                    out_file.write("%s\n" % "\t".join(line))
                else:
                    out_file.write("%s\n" % line)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
Ejemplo n.º 28
0
def main():
    """Print one score per MAF block read from stdin.

    Blocks whose ``text_size`` is 0 print ``NA``.  Otherwise the block's
    stored score is printed, replaced by ``hox70.score_alignment`` when
    ``options.recalculate`` is set, and divided by the block's ``text_size``
    when ``options.lnorm`` is set.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)

    try:
        lnorm = bool(options.lnorm)
        recalculate = bool(options.recalculate)
    except Exception:
        doc_optparse.exit()

    # Scoring scheme built from the substitution matrix below; the two
    # integers and ``default`` are passed straight to build_scoring_scheme.
    hox70 = score.build_scoring_scheme("""  A    C    G    T
                                      91 -114  -31 -123
                                    -114  100 -125  -31
                                     -31 -125  100 -114
                                    -123  -31 -114   91 """,
                                       400,
                                       30,
                                       default=0)

    for block in maf.Reader(sys.stdin):
        if block.text_size == 0:
            print("NA")
            continue
        value = block.score
        if recalculate:
            value = hox70.score_alignment(block)
        if lnorm:
            value = value / block.text_size
        print(value)
Ejemplo n.º 29
0
def main():
    """Build an interval index file for a MAF alignment file.

    ``args[0]`` is the MAF file.  Files ending in ``.bz2`` or ``.lzo`` are
    opened through a seekable wrapper and need a companion table file (the
    MAF name plus ``t``).  The optional ``args[1]`` names the index file; by
    default it is the MAF name (with the compression suffix stripped) plus
    ``.index``.  ``options.species`` may hold a comma separated list; when
    given, only components whose source prefix (text before the first
    ``.``) is in that list are indexed.
    """
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )

    try:
        maf_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if maf_file.endswith( ".bz2" ):
            table_file = maf_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index bz2 compressed files first "
                                   "create a bz2t file with bzip-table." )
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableBzip2File( maf_file, table_file )
            # Strip .bz2 from the filename before adding ".index"
            maf_file = maf_file[:-4]
        elif maf_file.endswith( ".lzo" ):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = maf_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index lzo compressed files first "
                                   "create a lzot file with lzop_build_offset_table." )
            # Open with SeekableLzopFile so we have tell support
            maf_in = SeekableLzopFile( maf_file, table_file )
            # Strip .lzo from the filename before adding ".index"
            maf_file = maf_file[:-4]
        else:
            maf_in = open( maf_file )
        # Determine the name of the index file
        if len( args ) > 1:
            index_file = args[1]
        else:
            index_file = maf_file + ".index"
        if options.species:
            species = options.species.split( "," )
        else:
            species = None
    except Exception:
        # Not a bare ``except``: the doc_optparse.exit(...) calls above
        # presumably terminate via SystemExit, which must propagate instead
        # of being reported as an unexpected exception.
        doc_optparse.exception()

    maf_reader = bx.align.maf.Reader( maf_in )

    indexes = interval_index_file.Indexes()

    # Need to be a bit tricky in our iteration here to get the 'tells' right:
    # the file position is recorded *before* each block is read.
    while True:
        pos = maf_reader.file.tell()
        block = maf_reader.next()
        if block is None:
            break
        for c in block.components:
            if species is not None and c.src.split('.')[0] not in species:
                continue
            indexes.add( c.src, c.forward_strand_start, c.forward_strand_end, pos, max=c.src_size )

    # Close the index file even if writing fails part-way.
    with open( index_file, 'w' ) as out:
        indexes.write( out )
Ejemplo n.º 30
0
def main():
    """Index the blocks of a MAF file by source interval.

    The first positional argument is the MAF file; ``.bz2`` and ``.lzo``
    inputs are read through seekable wrappers that require a table file
    named like the input plus ``t``.  The second positional argument, when
    present, is the index file name; otherwise the MAF name without its
    compression suffix plus ``.index`` is used.  When ``options.species``
    is set (comma separated), components whose source prefix before the
    first ``.`` is not listed are left out of the index.
    """
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )

    try:
        maf_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if maf_file.endswith( ".bz2" ):
            table_file = maf_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index bz2 compressed files first "
                                   "create a bz2t file with bzip-table." )
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableBzip2File( maf_file, table_file )
            # Strip .bz2 from the filename before adding ".index"
            maf_file = maf_file[:-4]
        elif maf_file.endswith( ".lzo" ):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = maf_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index lzo compressed files first "
                                   "create a lzot file with lzop_build_offset_table." )
            # Open with SeekableLzopFile so we have tell support
            maf_in = SeekableLzopFile( maf_file, table_file )
            # Strip .lzo from the filename before adding ".index"
            maf_file = maf_file[:-4]
        else:
            maf_in = open( maf_file )
        # Determine the name of the index file
        if len( args ) > 1:
            index_file = args[1]
        else:
            index_file = maf_file + ".index"
        if options.species:
            species = options.species.split( "," )
        else:
            species = None
    except Exception:
        # Catch Exception only, so that the exit presumably raised by
        # doc_optparse.exit(...) above is not swallowed and re-reported as
        # an unexpected error.
        doc_optparse.exception()

    maf_reader = bx.align.maf.Reader( maf_in )

    indexes = interval_index_file.Indexes()

    # The offset of each block must be captured before the block is read,
    # hence the explicit loop instead of plain iteration.
    while True:
        pos = maf_reader.file.tell()
        block = maf_reader.next()
        if block is None:
            break
        for c in block.components:
            if species is not None and c.src.split('.')[0] not in species:
                continue
            indexes.add( c.src, c.forward_strand_start, c.forward_strand_end, pos, max=c.src_size )

    # ``with`` closes the index file on both success and failure.
    with open( index_file, 'w' ) as out:
        indexes.write( out )
Ejemplo n.º 31
0
def main():
    """Report average, minimum and maximum score for each interval.

    Positional arguments: score source, interval file and an optional
    output file (stdout when omitted).  ``options.binned`` selects
    ``load_scores_ba_dir`` over ``load_scores_wiggle`` for loading scores;
    ``options.mask``, when set, names a file loaded with
    ``binned_bitsets_from_file`` whose set positions are skipped.

    One tab separated line is printed per interval: chrom, start, stop,
    average, min and max.  The three statistics are ``nan`` when no
    position contributed a score.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        if len(args) > 2:
            out_file = open(args[2], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()

    try:
        if binned:
            scores_by_chrom = load_scores_ba_dir(score_fname)
        else:
            scores_by_chrom = load_scores_wiggle(score_fname)

        if mask_fname:
            with open(mask_fname) as mask_file:
                masks = binned_bitsets_from_file(mask_file)
        else:
            masks = None

        with open(interval_fname) as interval_file:
            for line in interval_file:
                fields = line.split()
                chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
                total = 0
                count = 0
                # Infinite sentinels so any real score replaces them.
                min_score = float("inf")
                max_score = float("-inf")
                for i in range(start, stop):
                    # NOTE(review): the truthiness test also skips scores
                    # equal to 0 -- confirm that this is intended.
                    if chrom in scores_by_chrom and scores_by_chrom[chrom][i]:
                        # Skip if base is masked
                        if masks and chrom in masks:
                            if masks[chrom][i]:
                                continue
                        # Get the score, only count if not 'nan'
                        score = scores_by_chrom[chrom][i]
                        if not isNaN(score):
                            total += score
                            count += 1
                            max_score = max(score, max_score)
                            min_score = min(score, min_score)
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"

                print("\t".join(
                    map(str, [chrom, start, stop, avg, min_score, max_score])),
                      file=out_file)
    finally:
        # Only close a file we opened ourselves; leave sys.stdout alone.
        if out_file is not sys.stdout:
            out_file.close()
Ejemplo n.º 32
0
def __main__():
    """Intersect two files with ``intersectBed`` and write a small report.

    ``options.priority`` decides which of ``options.input1`` /
    ``options.input2`` is passed as ``-a``.  ``options.intersect`` selects
    the mode:

    * ``keep_intersect`` -- plain intersection,
    * ``keep_unique``    -- ``-v`` (entries of A without overlap in B),
    * ``keep_allele``    -- two intermediate intersections combined by
      ``subtract`` and sorted with ``sortBed``.

    The result goes to ``options.output1``.  Line counts before and after
    (via ``wc -l``) and the derived fractions are written to
    ``options.output2``.
    """
    # Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    # get parameters for intersect command
    if options.priority == "second_file":
        file1 = options.input2
        file2 = options.input1
    else:
        file1 = options.input1
        file2 = options.input2

    # NOTE(review): file names are interpolated unquoted into shell command
    # lines below; names containing spaces or shell metacharacters will
    # break (or subvert) these commands.
    cmd = "wc -l %s | cut -f1 -d ' '" % (file1)
    nrBefore = float(subprocess.check_output(cmd, shell=True))

    if options.intersect == 'keep_intersect':
        cmd = 'intersectBed -header -sorted -a %s -b %s > %s' % ( file1, file2, options.output1 )
        subprocess.check_call(cmd, shell=True)
        print(cmd)
    elif options.intersect == 'keep_unique':
        cmd = 'intersectBed -v -header -sorted -a %s -b %s > %s' % ( file1, file2, options.output1 )
        subprocess.check_call(cmd, shell=True)
        print(cmd)
    elif options.intersect == 'keep_allele':
        tmp_paths = []
        try:
            for _ in range(2):
                # mkstemp returns an open descriptor; close it right away
                # because only the path is handed to the shell commands.
                fd, path = tempfile.mkstemp()
                os.close(fd)
                tmp_paths.append(path)
            tmp1, tmp2 = tmp_paths

            cmd = 'intersectBed -v -a %s -b %s > %s' % ( file1, file2, tmp1 )
            subprocess.check_call(cmd, shell=True)
            print(cmd)

            cmd = 'intersectBed -wa -wb -a %s -b %s > %s' % ( file1, file2, tmp2 )
            subprocess.check_call(cmd, shell=True)
            print(cmd)

            subtract(tmp2,tmp1)

            cmd = 'sortBed -i %s > %s' % ( tmp1, options.output1 )
            subprocess.check_call(cmd, shell=True)
            print(cmd)
        finally:
            # Remove only the temporary files that were actually created.
            for path in tmp_paths:
                os.remove(path)

    cmd = "wc -l %s | cut -f1 -d ' '" % (options.output1)
    nrAfter = float(subprocess.check_output(cmd, shell=True))

    if nrBefore:
        fraction_subtracted = (nrBefore - nrAfter) / nrBefore
        fraction_retained = nrAfter / nrBefore
    else:
        # Empty input: the fractions are undefined, report them as nan
        # instead of failing with ZeroDivisionError after all the work.
        fraction_subtracted = fraction_retained = float("nan")

    with open(options.output2, "w") as output_handle:
        output_handle.write("# SNPs before\tSNPs after\tfraction subtracted\tfraction retained\n")
        output_handle.write("%i\t%i\t%.4f\t%.4f\n" % (nrBefore, nrAfter, fraction_subtracted, fraction_retained))

    # check that there are results in the output file
    print(os.path.getsize( options.output1 ))
    sys.stdout.write( 'Intersected VCF A with VCF B\n' )
Ejemplo n.º 33
0
def __main__():
    """Parse a tab separated input file with a selectable column layout.

    ``options.format`` picks where the sequence, location, base and
    coverage columns live: ``six`` and ``ten`` use fixed positions, any
    other value takes 1-based column numbers from ``options.seq_column``,
    ``options.loc_column``, ``options.base_column`` and
    ``options.cvrg_column``.  Lines whose columns are missing or not
    numeric are reported through ``stop_err``.

    NOTE(review): nothing in the visible loop body reads the next line or
    uses the parsed values, so this snippet appears to be truncated --
    confirm against the full script.
    """
    strout = ""
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    coverage = int(options.coverage)
    fin = file(options.input, "r")
    fout = file(options.output, "w")
    inLine = fin.readline()
    # Resolve the zero-based column indexes for the chosen layout.
    if options.format == "six":
        seqIndex = 0
        locIndex = 1
        baseIndex = 2
        covIndex = 3
    elif options.format == "ten":
        seqIndex = 0
        locIndex = 1
        # options.base selects which of the two base columns is used.
        if options.base == "first":
            baseIndex = 2
        else:
            baseIndex = 3
        covIndex = 7
    else:
        # Manual layout: user supplied column numbers are 1-based.
        seqIndex = int(options.seq_column) - 1
        locIndex = int(options.loc_column) - 1
        baseIndex = int(options.base_column) - 1
        covIndex = int(options.cvrg_column) - 1
    lastSeq = ""
    lastLoc = -1
    locs = []
    startLoc = -1
    bases = []
    # Process lines until the first empty / whitespace-only line.
    while inLine.strip() != "":
        lineParts = inLine.split("\t")
        try:
            seq, loc, base, cov = (
                lineParts[seqIndex],
                int(lineParts[locIndex]),
                lineParts[baseIndex],
                int(lineParts[covIndex]),
            )
        except IndexError, ei:
            # A column index beyond the end of the line.
            if options.format == "ten":
                stop_err(
                    "It appears that you have selected 10 columns while your file has 6. Make sure that the number of columns you specify matches the number in your file.\n"
                    + str(ei)
                )
            else:
                stop_err("There appears to be something wrong with your column index values.\n" + str(ei))
        except ValueError, ev:
            # The location or coverage column did not hold an integer.
            if options.format == "six":
                stop_err(
                    "It appears that you have selected 6 columns while your file has 10. Make sure that the number of columns you specify matches the number in your file.\n"
                    + str(ev)
                )
            else:
                stop_err("There appears to be something wrong with your column index values.\n" + str(ev))
def main():
    """Write average, minimum and maximum score for each input interval.

    Positional arguments: score source, interval file and an optional
    output file (stdout when omitted).  Scores are loaded with
    ``load_scores_ba_dir`` when ``options.binned`` is set, otherwise with
    ``load_scores_wiggle``.  When ``options.mask`` names a file it is loaded
    with ``binned_bitsets_from_file`` and positions set in it are skipped.

    Each output line holds chrom, start, stop, average, min and max
    separated by tabs; the statistics are ``nan`` when no position
    contributed a score.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        if len(args) > 2:
            out_file = open(args[2], "w")
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()

    try:
        if binned:
            scores_by_chrom = load_scores_ba_dir(score_fname)
        else:
            scores_by_chrom = load_scores_wiggle(score_fname)

        if mask_fname:
            with open(mask_fname) as mask_file:
                masks = binned_bitsets_from_file(mask_file)
        else:
            masks = None

        with open(interval_fname) as interval_file:
            for line in interval_file:
                fields = line.split()
                chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
                total = 0
                count = 0
                # Infinite sentinels so any real score replaces them.
                min_score = float("inf")
                max_score = float("-inf")
                for i in range(start, stop):
                    # NOTE(review): the truthiness test also skips scores
                    # equal to 0 -- confirm that this is intended.
                    if chrom in scores_by_chrom and scores_by_chrom[chrom][i]:
                        # Skip if base is masked
                        if masks and chrom in masks:
                            if masks[chrom][i]:
                                continue
                        # Get the score, only count if not 'nan'
                        score = scores_by_chrom[chrom][i]
                        if not isNaN(score):
                            total += score
                            count += 1
                            max_score = max(score, max_score)
                            min_score = min(score, min_score)
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"

                # Explicit write: same output as the print-chevron form,
                # but valid on both Python 2 and Python 3.
                out_file.write("\t".join(map(str, [chrom, start, stop, avg, min_score, max_score])) + "\n")
    finally:
        # Only close a file we opened ourselves; leave sys.stdout alone.
        if out_file is not sys.stdout:
            out_file.close()
Ejemplo n.º 35
0
def __main__():
    """Parse a tab separated input file with a selectable column layout.

    ``options.format`` picks the positions of the sequence, location, base
    and coverage columns: ``six`` and ``ten`` use fixed positions, any other
    value takes 1-based column numbers from ``options.seq_column``,
    ``options.loc_column``, ``options.base_column`` and
    ``options.cvrg_column``.  Lines whose columns are missing or not
    numeric are reported through ``stop_err``.

    NOTE(review): the visible loop never reads a further line nor uses the
    parsed values, so this snippet appears to be truncated -- confirm
    against the full script.
    """
    strout = ''
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    coverage = int(options.coverage)
    fin = file(options.input, 'r')
    fout = file(options.output, 'w')
    inLine = fin.readline()
    # Resolve the zero-based column indexes for the chosen layout.
    if options.format == 'six':
        seqIndex = 0
        locIndex = 1
        baseIndex = 2
        covIndex = 3
    elif options.format == 'ten':
        seqIndex = 0
        locIndex = 1
        # options.base selects which of the two base columns is used.
        if options.base == 'first':
            baseIndex = 2
        else:
            baseIndex = 3
        covIndex = 7
    else:
        # Manual layout: user supplied column numbers are 1-based.
        seqIndex = int(options.seq_column) - 1
        locIndex = int(options.loc_column) - 1
        baseIndex = int(options.base_column) - 1
        covIndex = int(options.cvrg_column) - 1
    lastSeq = ''
    lastLoc = -1
    locs = []
    startLoc = -1
    bases = []
    # Process lines until the first empty / whitespace-only line.
    while inLine.strip() != '':
        lineParts = inLine.split('\t')
        try:
            seq, loc, base, cov = lineParts[seqIndex], int(
                lineParts[locIndex]), lineParts[baseIndex], int(
                    lineParts[covIndex])
        except IndexError, ei:
            # A column index beyond the end of the line.
            if options.format == 'ten':
                stop_err(
                    'It appears that you have selected 10 columns while your file has 6. Make sure that the number of columns you specify matches the number in your file.\n'
                    + str(ei))
            else:
                stop_err(
                    'There appears to be something wrong with your column index values.\n'
                    + str(ei))
        except ValueError, ev:
            # The location or coverage column did not hold an integer.
            if options.format == 'six':
                stop_err(
                    'It appears that you have selected 6 columns while your file has 10. Make sure that the number of columns you specify matches the number in your file.\n'
                    + str(ev))
            else:
                stop_err(
                    'There appears to be something wrong with your column index values.\n'
                    + str(ev))
Ejemplo n.º 36
0
def main():
    """Cut every interval of a tab separated file into fixed-size windows.

    Positional arguments: input file, window size, output file, sliding
    flag and offset.  ``options.cols`` is parsed with ``parse_cols_arg`` to
    locate the start and end columns.  With sliding disabled (or an offset
    of 0) consecutive windows advance by the window size; otherwise they
    advance by the offset and emission stops once the next window would
    pass the interval end.  Lines that cannot be processed are counted and
    the first one is reported at the end.
    """
    # Parsing Command Line here
    options, args = doc_optparse.parse(__doc__)

    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols)
        inp_file, winsize, out_file, makesliding, offset = args
        winsize = int(winsize)
        offset = int(offset)
        makesliding = int(makesliding)
    except Exception:
        sys.exit("Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset.")

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = None
    if offset == 0:
        # A zero offset cannot slide; fall back to adjacent windows.
        makesliding = 0

    # Distance between the starts of two consecutive windows.
    step = winsize if makesliding == 0 else offset

    with open(out_file, 'w') as fo, open(inp_file) as fi:
        for lineno, raw in enumerate(fi, 1):
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            try:
                elems = line.split('\t')
                start = int(elems[start_col_1])
                end = int(elems[end_col_1])
                for _ in range((end - start) // step):
                    elems[start_col_1] = str(start)
                    elems[end_col_1] = str(start + winsize)
                    fo.write("%s\n" % '\t'.join(elems))
                    start += step
                    if makesliding != 0 and start + winsize > end:
                        break
            except Exception:
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = lineno
                    invalid_line = line

    if makesliding == 1:
        print('Window size=%d, Sliding=Yes, Offset=%d' % (winsize, offset))
    else:
        print('Window size=%d, Sliding=No' % (winsize))
    if skipped_lines > 0:
        print('Skipped %d invalid lines starting with #%d: "%s"' % (skipped_lines, first_invalid_line, invalid_line))
Ejemplo n.º 37
0
def __main__():
    """Validate options, look up the target database and run megablast.

    The database path is looked up by ``options.db_build`` in the tab
    separated ``blastdb.loc`` file found in ``options.index_dir``.  The
    megablast command line is printed and then executed with ``os.system``.

    NOTE(review): ``output_filename`` is read but never used in the visible
    code, so this snippet is presumably truncated.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)

    db_build = options.db_build
    query_filename = options.input.strip()
    output_filename = options.output.strip()
    mega_word_size = options.word_size  # -W
    mega_iden_cutoff = options.identity_cutoff  # -p
    mega_evalue_cutoff = options.eval_cutoff  # -e
    # NOTE(review): only the name is kept; the NamedTemporaryFile object is
    # discarded immediately, so the file itself presumably no longer exists
    # when megablast writes to that path -- confirm this is intended.
    mega_temp_output = tempfile.NamedTemporaryFile().name
    mega_filter = options.filter_query  # -F
    GALAXY_DATA_INDEX_DIR = options.index_dir
    DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR

    # megablast parameters: only checked for being numeric, the original
    # string values are what gets interpolated into the command line.
    try:
        int(mega_word_size)
    except:
        stop_err('Invalid value for word size')
    try:
        float(mega_iden_cutoff)
    except:
        stop_err('Invalid value for identity cut-off')
    try:
        float(mega_evalue_cutoff)
    except:
        stop_err('Invalid value for Expectation value')

    # prepare the database: map first column (build) to second column
    db = {}
    for i, line in enumerate(file(DB_LOC)):
        line = line.rstrip('\r\n')
        if not line or line.startswith('#'):
            continue
        fields = line.split('\t')
        db[fields[0]] = fields[1]

    if not db.has_key(db_build):
        stop_err(
            'Cannot locate the target database. Please check your location file.'
        )

    # arguments for megablast
    chunk = db[(db_build)]
    megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null 2>&1 " \
        % ( chunk, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, mega_filter )

    print megablast_command

    # NOTE(review): os.system reports a failing command through its return
    # value, which is ignored here; the except clause only covers errors
    # raised by the call itself.
    try:
        os.system(megablast_command)
    except Exception, e:
        stop_err(str(e))
Ejemplo n.º 38
0
def __main__():
    """Validate the ``read_len`` option (visible part of a longer script).

    Converts ``options.read_len`` to an integer and rejects values that are
    not positive; ``error`` collects a message suffix on TypeError.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    # validate parameters
    error = ''
    try:
        read_len = int(options.read_len)
        if read_len <= 0:
            raise Exception, ' greater than 0'
    # NOTE(review): int() raises ValueError for a non-numeric string and the
    # Exception raised above is not a TypeError, so neither is handled by
    # this clause as shown -- the snippet is presumably truncated; confirm.
    except TypeError, e:
        error = ': %s' % str(e)
Ejemplo n.º 39
0
def __main__():
    """Validate the ``read_len`` option (visible part of a longer script).

    Converts ``options.read_len`` to an integer and rejects values that are
    not positive; ``error`` collects a message suffix on TypeError.
    """
    # Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    # validate parameters
    error = ''
    try:
        read_len = int( options.read_len )
        if read_len <= 0:
            raise Exception(' greater than 0')
    # NOTE(review): int() raises ValueError for a non-numeric string and the
    # Exception raised above is not a TypeError, so neither is handled by
    # this clause as shown -- the snippet is presumably truncated; confirm.
    except TypeError, e:
        error = ': %s' % str( e )
Ejemplo n.º 40
0
def main():
    """Read the command line arguments of the script.

    Positional arguments: HDF5 file name, mapping file name, input file
    name, output file name and three 1-based column numbers (chrom, start,
    end).  ``options.perCol`` is read as a boolean flag.  Any problem while
    reading them is reported through ``doc_optparse.exception``.
    """
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )
    try:
        h5_fname = args[0]
        mapping_fname = args[1]
        in_fname = args[2]
        out_fname = args[3]
        # Column numbers are 1-based on the command line; store them 0-based.
        chrom_col, start_col, end_col = [ int( x ) - 1 for x in args[4:7] ]
        per_col = bool( options.perCol )
    except Exception:
        doc_optparse.exception()
Ejemplo n.º 41
0
def main():
    """Read the command line arguments of the script.

    Positional arguments: HDF5 file name, mapping file name, input file
    name, output file name and three 1-based column numbers (chrom, start,
    end).  ``options.perCol`` is read as a boolean flag.  Any problem while
    reading them is reported through ``doc_optparse.exception``.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        h5_fname = args[0]
        mapping_fname = args[1]
        in_fname = args[2]
        out_fname = args[3]
        # Column numbers are 1-based on the command line; store them 0-based.
        chrom_col, start_col, end_col = [int(x) - 1 for x in args[4:7]]
        per_col = bool(options.perCol)
    except Exception:
        doc_optparse.exception()
Ejemplo n.º 42
0
def main():   
    """Parse arguments and resolve per-species file paths from a loc file.

    Nine positional arguments are expected (input file, output file,
    primary species, mask species, quality cutoff, mask character code,
    mask region code, mask length and loc file).  For every primary species
    found in the first column of the loc file, the species is appended to
    ``pspecies`` and the second column to ``filepaths``.

    NOTE(review): several values prepared here are not used in the visible
    code, so this snippet is presumably truncated.
    """
    # Parsing Command Line here
    options, args = doc_optparse.parse( __doc__ )
    
    try:
        inp_file, out_file, pri_species, mask_species, qual_cutoff, mask_chr, mask_region, mask_length, loc_file = args
        qual_cutoff = int(qual_cutoff)
        mask_chr = int(mask_chr)
        mask_region = int(mask_region)
        # Region code 3 carries two comma separated lengths, stored in
        # mask_length_r and mask_length_l; other codes carry a single one.
        if mask_region != 3:
            mask_length = int(mask_length)
        else:
            mask_length_r = int(mask_length.split(',')[0])
            mask_length_l = int(mask_length.split(',')[1])
    except:
        stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." )
    
    if pri_species == 'None':
        stop_err( "No primary species selected, try again by selecting at least one primary species." )
    if mask_species == 'None':
        stop_err( "No mask species selected, try again by selecting at least one species to mask." )

    mask_chr_count = 0
    # Lookup tables keyed by the integer codes parsed above.
    mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'}
    mask_reg_dict = {0:'Current pos', 1:'Current+Downstream', 2:'Current+Upstream', 3:'Current+Both sides'}

    # ensure dbkey is present in the twobit loc file
    filepath = None
    try:
        pspecies_all = pri_species.split(',')
        # Working copy: entries are removed as they are found in the loc file.
        pspecies_all2 = pri_species.split(',')
        pspecies = []
        filepaths = []
        for line in open(loc_file):
            # Stop early once every requested species has been found.
            if pspecies_all2 == []:    
                break
            if line[0:1] == "#":
                continue
            fields = line.split('\t')
            try:
                build = fields[0]
                for i,dbkey in enumerate(pspecies_all2):
                    if dbkey == build:
                        pspecies.append(build)
                        filepaths.append(fields[1])
                        # NOTE(review): deleting from the list being
                        # enumerated shifts the following entries, so the
                        # entry right after a match is not examined for
                        # this line.
                        del pspecies_all2[i]        
                    else:
                        continue
            except:
                # Malformed loc lines are ignored.
                pass
    except Exception, exc:
        stop_err( 'Initialization errorL %s' % str( exc ) )
Ejemplo n.º 43
0
def main():
    """Join two interval files and write the joined rows.

    Positional arguments: first input, second input and output file name.
    ``options.cols1`` / ``options.cols2`` are parsed with ``parse_cols_arg``
    into chrom/start/end/strand column indexes.  ``options.mincols``
    overrides the default of 1, and ``options.fill`` (``both``, ``right``
    or ``left``) sets the ``rightfill`` / ``leftfill`` flags passed to
    ``join``.  Each item yielded by ``join`` is written on its own line;
    list items are tab-joined first.

    A ``ParseError`` raised while reading is reported through ``fail``.
    """
    mincols = 1
    upstream_pad = 0
    downstream_pad = 0
    leftfill = False
    rightfill = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except Exception:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    try:
        # The context manager closes the output on success as well as
        # before ``fail`` runs on a parse error.
        with open(out_fname, "w") as out_file:
            for outfields in join(g1,
                                  g2,
                                  mincols=mincols,
                                  rightfill=rightfill,
                                  leftfill=leftfill):
                if type(outfields) is list:
                    out_file.write("%s\n" % "\t".join(outfields))
                else:
                    out_file.write("%s\n" % outfields)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
Ejemplo n.º 44
0
def __main__():
    """Validate options, look up the target database and run megablast.

    The database path is looked up by ``options.db_build`` in the tab
    separated ``blastdb.loc`` file found in ``options.index_dir``.  The
    megablast command line is printed and then executed with ``os.system``.

    NOTE(review): ``output_filename`` is read but never used in the visible
    code, so this snippet is presumably truncated.
    """
    # Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    
    db_build = options.db_build
    query_filename = options.input.strip()
    output_filename = options.output.strip()
    mega_word_size = options.word_size        # -W
    mega_iden_cutoff = options.identity_cutoff      # -p
    mega_evalue_cutoff = options.eval_cutoff      # -e
    # NOTE(review): only the name is kept; the NamedTemporaryFile object is
    # discarded immediately, so the file itself presumably no longer exists
    # when megablast writes to that path -- confirm this is intended.
    mega_temp_output = tempfile.NamedTemporaryFile().name
    mega_filter = options.filter_query           # -F
    GALAXY_DATA_INDEX_DIR = options.index_dir
    DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR

    # megablast parameters: only checked for being numeric, the original
    # string values are what gets interpolated into the command line.
    try:
        int( mega_word_size )    
    except:
        stop_err( 'Invalid value for word size' )
    try:
        float(mega_iden_cutoff)
    except:
        stop_err( 'Invalid value for identity cut-off' )
    try:
        float(mega_evalue_cutoff)
    except:
        stop_err( 'Invalid value for Expectation value' )

    # prepare the database: map first column (build) to second column
    db = {}
    for i, line in enumerate( file( DB_LOC ) ):
        line = line.rstrip( '\r\n' )
        if not line or line.startswith( '#' ):
            continue
        fields = line.split( '\t' )
        db[ fields[0] ] = fields[1]

    if not db.has_key( db_build ):
        stop_err( 'Cannot locate the target database. Please check your location file.' )
    
    # arguments for megablast    
    chunk = db[ ( db_build ) ]
    megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null 2>&1 " \
        % ( chunk, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, mega_filter ) 
    
    print megablast_command
       
    # NOTE(review): os.system reports a failing command through its return
    # value, which is ignored here; the except clause only covers errors
    # raised by the call itself.
    try:
        os.system( megablast_command )
    except Exception, e:
        stop_err( str( e ) )
Ejemplo n.º 45
0
def main():
    """Merge the intervals of one file and write the result.

    Positional arguments: input and output file names.  ``options.cols1``
    is parsed with ``parse_cols_arg`` into column indexes and
    ``options.mincols`` overrides the default of 1 passed to ``merge``.
    With ``options.threecol`` set, only chrom, start and end are written
    for interval and list items; otherwise all fields are written.  Items
    of any other type are written unchanged.  A ``ParseError`` is reported
    through ``fail``; skipped input lines are summarised at the end.
    """
    mincols = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.mincols:
            mincols = int(options.mincols)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for record in merge(g1, mincols=mincols):
            three_col = options.threecol
            kind = type(record)
            if kind is GenomicInterval:
                if three_col:
                    text = "%s\t%s\t%s\n" % (record.chrom,
                                             str(record.startCol),
                                             str(record.endCol))
                else:
                    text = "%s\n" % "\t".join(record.fields)
            elif kind is list:
                if three_col:
                    text = "%s\t%s\t%s\n" % (record[chr_col_1],
                                             str(record[start_col_1]),
                                             str(record[end_col_1]))
                else:
                    text = "%s\n" % "\t".join(record)
            else:
                # Anything else (e.g. a plain line) is passed through.
                text = "%s\n" % record
            out_file.write(text)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))

    out_file.close()

    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
Ejemplo n.º 46
0
def __main__():
    """Slice axt alignments from stdin by intervals read from a file.

    Positional arguments: the range file (two whitespace separated integers
    per line: start and end) and the index of the reference component.
    ``options.mincols`` overrides the default minimum of 10.  For every
    alignment, each intersecting interval is clipped to the reference
    component and the alignment is sliced accordingly.  A slice is written
    to stdout when every component has size >= 1 and its ``text_size`` is
    greater than ``mincols``.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)

    try:
        range_filename = args[0]
        refindex = int(args[1])
        if options.mincols:
            mincols = int(options.mincols)
        else:
            mincols = 10
    except Exception:
        doc_optparse.exit()

    # Load Intervals (``open`` instead of the Python-2-only ``file``
    # builtin; the handle is closed as soon as loading is done)
    intersecter = intervals.Intersecter()
    with open(range_filename) as range_file:
        for line in range_file:
            fields = line.split()
            intersecter.add_interval(
                intervals.Interval(int(fields[0]), int(fields[1])))

    # Start axt on stdout
    out = bx.align.axt.Writer(sys.stdout)

    # Iterate over input axt
    for axt in bx.align.axt.Reader(sys.stdin):
        ref_component = axt.components[refindex]
        # Find overlap with reference component
        intersections = intersecter.find(ref_component.start,
                                         ref_component.end)
        # Keep output axt ordered
        intersections.sort()
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = axt.slice_by_component(refindex, start, end)
            good = all(c.size >= 1 for c in sliced.components)
            if good and sliced.text_size > mincols:
                out.write(sliced)

    # Close output axt
    out.close()
Ejemplo n.º 47
0
def __main__():
    """Print per-species coverage of intervals using indexed MAF files.

    The positional arguments are MAF files, each with an index file named
    like the MAF file plus ``.index``.  Intervals (source, start, end) are
    read from stdin; ``options.prefix``, when set, is prepended to the
    source name.  For every interval the input line is echoed, followed by
    one line per species giving the covered fraction of the interval.
    Species are taken from the text before the first ``.`` of each
    non-first component's source.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)

    try:
        maf_files = args
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
    except Exception:
        doc_optparse.exit()

    # Open indexed access to mafs
    indexes = [
        bx.align.maf.Indexed(maf_file, maf_file + ".index")
        for maf_file in maf_files
    ]

    # Iterate over input ranges
    for line in sys.stdin:
        fields = line.split()
        src, start, end = fields[0], int(fields[1]), int(fields[2])
        if prefix:
            src = prefix + src

        total_length = end - start

        # Find overlap with reference component
        blocks = []
        for index in indexes:
            blocks += index.get(src, start, end)

        coverage = dict()
        for block in blocks:
            overlap_start = max(start, block.components[0].start)
            overlap_end = min(end, block.components[0].end)
            length = overlap_end - overlap_start
            assert length > 0
            for c in block.components[1:]:
                species = c.src.split('.')[0]
                # Explicit default instead of a bare ``except`` so that
                # unrelated errors are not silently swallowed.
                coverage[species] = coverage.get(species, 0) + length

        print(line, end=' ')
        for key, value in coverage.items():
            print("   ", key.ljust(10), "%0.2f" % (value / total_length))
Ejemplo n.º 48
0
def __main__():
    """Run ``fq_all2std.pl`` to convert a sequence file between formats.

    ``options.command`` selects the conversion and with it the option that
    holds the output path:

    * ``sol2std`` writes to ``options.outputFastqsanger``
    * ``std2sol`` writes to ``options.outputFastqsolexa``
    * ``fq2fa``   writes to ``options.outputFasta``

    The converter is started as ``fq_all2std.pl <command> <input>`` with its
    standard output redirected to the output path.  Any failure (unknown
    command, converter cannot be started, non-zero exit status) is reported
    through ``stop_err``.
    """
    # Local import so the block does not depend on the file's import header.
    import subprocess

    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)

    output_option_for = {
        "sol2std": "outputFastqsanger",
        "std2sol": "outputFastqsolexa",
        "fq2fa": "outputFasta",
    }
    output_option = output_option_for.get(options.command)
    if output_option is None:
        # Previously the unformatted "%s" template was handed to the shell.
        stop_err("Error converting data format.\nUnknown command: %s"
                 % options.command)
        return  # only reached if stop_err returns instead of exiting

    output_path = getattr(options, output_option)
    try:
        # An argument list plus an explicit stdout file avoids building a
        # shell string, so paths with spaces or shell metacharacters are safe.
        with open(output_path, "w") as out:
            status = subprocess.call(
                ["fq_all2std.pl", options.command, options.input],
                stdout=out)
    except Exception as eq:
        stop_err("Error converting data format.\n" + str(eq))
        return

    if status != 0:
        stop_err("Error converting data format.\n"
                 "fq_all2std.pl exited with status %d" % status)
Ejemplo n.º 49
0
def __main__():
    """Print the text following the last ``chr`` in each block's source name.

    The first positional argument is the integer index of the component to
    inspect in every MAF block read from standard input.  If it is missing
    or not an integer, ``doc_optparse.exit()`` is called.

    Note: ``str.rfind`` returns -1 when ``chr`` is absent, in which case the
    slice starts at offset 2 and the first two characters are dropped.
    """
    # Parse command line arguments
    _, positional = doc_optparse.parse(__doc__)

    try:
        ref_position = int(positional[0])
    except Exception:
        doc_optparse.exit()

    for block in maf.Reader(sys.stdin):
        source_name = block.components[ref_position].src
        suffix_start = source_name.rfind("chr") + 3
        print(source_name[suffix_start:])
Ejemplo n.º 50
0
def __main__():
    """Run ``fq_all2std.pl`` to convert a sequence file between formats.

    ``options.command`` selects the conversion and with it the option that
    holds the output path:

    * ``sol2std`` writes to ``options.outputFastqsanger``
    * ``std2sol`` writes to ``options.outputFastqsolexa``
    * ``fq2fa``   writes to ``options.outputFasta``

    The converter is started as ``fq_all2std.pl <command> <input>`` with its
    standard output redirected to the output path.  Any failure (unknown
    command, converter cannot be started, non-zero exit status) is reported
    through ``stop_err``.
    """
    # Local import so the block does not depend on the file's import header.
    import subprocess

    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)

    output_option_for = {
        'sol2std': 'outputFastqsanger',
        'std2sol': 'outputFastqsolexa',
        'fq2fa': 'outputFasta',
    }
    output_option = output_option_for.get(options.command)
    if output_option is None:
        # Previously the unformatted "%s" template was handed to the shell.
        stop_err("Error converting data format.\nUnknown command: %s"
                 % options.command)
        return  # only reached if stop_err returns instead of exiting

    output_path = getattr(options, output_option)
    try:
        # os.system() only returns an exit status and never raises for a
        # failing command, so the old handler could not fire.  An argument
        # list also keeps unquoted paths out of a shell command line.
        with open(output_path, "w") as out:
            status = subprocess.call(
                ["fq_all2std.pl", options.command, options.input],
                stdout=out)
    except Exception as eq:
        stop_err("Error converting data format.\n" + str(eq))
        return

    if status != 0:
        stop_err("Error converting data format.\n"
                 "fq_all2std.pl exited with status %d" % status)
Ejemplo n.º 51
0
def __main__():
    """Print the text following the last ``chr`` in each block's source name.

    The first positional argument is the integer index of the component to
    inspect in every MAF block read from standard input.  If it is missing
    or not an integer, ``doc_optparse.exit()`` is called.

    Note: ``str.rfind`` returns -1 when ``chr`` is absent, in which case the
    slice starts at offset 2 and the first two characters are dropped.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)

    try:
        refindex = int(args[0])
    except Exception:
        doc_optparse.exit()

    maf_reader = maf.Reader(sys.stdin)

    # Indentation is spaces only (tabs under a space-indented ``for`` are a
    # TabError on Python 3), and the single-argument print() call behaves
    # the same on Python 2 and Python 3.
    for m in maf_reader:
        c = m.components[refindex].src
        print(c[c.rfind("chr") + 3:])
Ejemplo n.º 52
0
def __main__():
    """Write the parts of each input axt block that overlap given intervals.

    Positional arguments: the name of a file with one whitespace-separated
    ``start end`` interval per line, and the integer index of the component
    to use as reference.  ``options.mincols`` (default 10) is a lower bound:
    a slice is written only when its ``text_size`` is strictly greater.

    Alignments are read from standard input and the slices are written to
    standard output through ``bx.align.axt.Writer``.  Slices in which any
    component has a size below 1 are dropped.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)

    try:
        range_filename = args[0]
        refindex = int(args[1])
        if options.mincols:
            mincols = int(options.mincols)
        else:
            mincols = 10
    except Exception:
        doc_optparse.exit()

    # Load Intervals.  open() replaces the Python 2-only file() builtin and
    # the with-block makes sure the handle is closed.
    intersecter = intervals.Intersecter()
    with open(range_filename) as range_file:
        for line in range_file:
            fields = line.split()
            if not fields:
                # Skip blank lines instead of failing on fields[0].
                continue
            intersecter.add_interval(
                intervals.Interval(int(fields[0]), int(fields[1])))

    # Start axt on stdout
    out = bx.align.axt.Writer(sys.stdout)

    # Iterate over input axt
    for axt in bx.align.axt.Reader(sys.stdin):
        ref_component = axt.components[refindex]
        # Find overlap with reference component
        intersections = intersecter.find(ref_component.start,
                                         ref_component.end)
        # Keep output axt ordered
        intersections.sort()
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = axt.slice_by_component(refindex, start, end)
            good = all(c.size >= 1 for c in sliced.components)
            if good and sliced.text_size > mincols:
                out.write(sliced)

    # Close output axt
    out.close()
Ejemplo n.º 53
0
def __main__():
    """Filter a table read from standard input and print the selected rows.

    Options read: ``header`` (echo header lines), ``comments`` (echo comment
    lines), ``cols`` (comma-separated column selectors; entries that parse
    as integers are stored as ints, the rest as strings) and
    ``force_header`` (pass ``FIRST_LINE_IS_HEADER`` to the reader).

    The optional first positional argument is a Python expression evaluated
    once per data row with the row bound to the name ``row``; only rows for
    which it is truthy are printed.

    NOTE: the expression is executed with ``eval`` -- do not feed this tool
    expressions from untrusted sources.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        cols = []
        if options.cols:
            for token in options.cols.split(','):
                try:
                    cols.append(int(token))
                except ValueError:
                    cols.append(token)
        expr = args[0] if len(args) > 0 else None
        force_header = (bx.tabular.io.FIRST_LINE_IS_HEADER
                        if options.force_header else None)
    except Exception:
        doc_optparse.exception()

    # Compile expression for SPEED
    if expr:
        expr = compile(expr, '<expr arg>', 'eval')

    def emit(record, lead=""):
        # Print the selected columns tab-joined, or the whole record when no
        # columns were requested.
        if cols:
            print(lead + "\t".join(record[c] for c in cols))
        else:
            print(record)

    reader = bx.tabular.io.TableReader(sys.stdin, force_header=force_header)
    for element in reader:
        if isinstance(element, bx.tabular.io.Header):
            if keep_header:
                emit(element, "#")
            continue
        if isinstance(element, bx.tabular.io.Comment):
            if keep_comments:
                print(element)
            continue
        if expr is None or bool(eval(expr, {"row": element})):
            emit(element)
Ejemplo n.º 54
0
def main():
    """Validate a dataset with its datatype and print the resulting errors.

    Reads ``options.ext`` (dataset extension) and, when present,
    ``options.metadata`` (decoded with ``util.string_to_object``).  The first
    positional argument is the integer dataset id.  The value returned by
    ``data.datatype.validate`` is encoded with ``util.object_to_string`` and
    printed to standard output.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        extension = options.ext
    except Exception:
        doc_optparse.exception()

    # create datatype
    data = model.Dataset(extension=extension, id=int(args[0]))
    # NOTE(review): hard-coded, developer-specific location; kept as-is so
    # behavior is unchanged, but it should come from configuration.
    data.file_path = "/home/ian/trunk/database/files/"

    if options.metadata:
        data.metadata = util.string_to_object(options.metadata)

    errors = data.datatype.validate(data)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(util.object_to_string(errors))
def __main__():
    """Filter a table read from standard input and print the selected rows.

    Options read: ``header`` (echo header lines), ``comments`` (echo comment
    lines), ``cols`` (comma-separated column selectors; entries that parse
    as integers are stored as ints, the rest as strings) and
    ``force_header`` (pass ``FIRST_LINE_IS_HEADER`` to the reader).

    The optional first positional argument is a Python expression evaluated
    once per data row with the row bound to the name ``row``; only rows for
    which it is truthy are printed.

    NOTE: the expression is executed with ``eval`` -- do not feed this tool
    expressions from untrusted sources.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        cols = []
        if options.cols:
            for c in options.cols.split(","):
                try:
                    v = int(c)
                except ValueError:
                    v = c
                # Append the converted value: appending the raw token ``c``
                # silently threw the integer conversion away.
                cols.append(v)
        if len(args) > 0:
            expr = args[0]
        else:
            expr = None
        if options.force_header:
            force_header = bx.tabular.io.FIRST_LINE_IS_HEADER
        else:
            force_header = None
    except Exception:
        doc_optparse.exception()

    # Compile expression for SPEED
    if expr:
        expr = compile(expr, "<expr arg>", "eval")

    # Every print() below takes a single argument, so the output is the same
    # on Python 2 and Python 3.
    for element in bx.tabular.io.TableReader(sys.stdin, force_header=force_header):
        if isinstance(element, bx.tabular.io.Header):
            if keep_header:
                if cols:
                    print("#" + "\t".join(element[c] for c in cols))
                else:
                    print(element)
        elif isinstance(element, bx.tabular.io.Comment):
            if keep_comments:
                print(element)
        else:
            if expr is None or bool(eval(expr, dict(row=element))):
                if cols:
                    print("\t".join(element[c] for c in cols))
                else:
                    print(element)
Ejemplo n.º 56
0
def __main__():
    """Print the nib sequence for each interval as a FASTA-style record.

    Positional arguments: a file with one whitespace-separated ``start end``
    interval per line, and a nib sequence file.  For every interval a
    ``> start end`` header line is printed, followed by the sequence
    ``nib.get(start, end - start)`` passed through ``print_wrapped``.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        # open() replaces the Python 2-only file() builtin.
        range_file = open(args[0])
        # NOTE(review): nib is assumed to be a binary format, hence "rb"
        # (needed on Python 3, harmless on Python 2) -- confirm against
        # bx.seq.nib.NibFile.
        nib_file = open(args[1], "rb")
    except Exception:
        doc_optparse.exit()

    try:
        nib = bx.seq.nib.NibFile(nib_file)

        for line in range_file:
            fields = line.split()
            if not fields:
                # Skip blank lines instead of failing on fields[0].
                continue
            start, end = int(fields[0]), int(fields[1])
            # One pre-formatted argument gives the same "> start end" text
            # on Python 2 and Python 3.
            print("> %d %d" % (start, end))
            print_wrapped(nib.get(start, end - start))
    finally:
        range_file.close()
        nib_file.close()