def __main__():
    """Split every block of a MAF file by species and write the pieces out.

    Command-line arguments (read from ``sys.argv``):
        1: path of the MAF file to read.
        2: path of the MAF file to write.
        3: boolean-like string parsed with ``string_as_bool``; when true,
           ``remove_all_gap_columns()`` is called on each split block before
           it is written.

    A one-line summary of the number of blocks written is printed at the end.
    Failures while opening either file or parsing the flag are reported
    through ``maf_utilities.tool_fail`` (presumably terminating the tool --
    confirm against ``maf_utilities``).
    """
    try:
        # Keep our own reference to the handle so it can be closed later.
        maf_in = open(sys.argv[1])
        maf_reader = maf.Reader(maf_in)
    except Exception as e:
        maf_utilities.tool_fail("Error opening MAF: %s" % e)
    try:
        out = maf.Writer(open(sys.argv[2], "w"))
    except Exception as e:
        maf_utilities.tool_fail("Error opening file for output: %s" % e)
    try:
        collapse_columns = string_as_bool(sys.argv[3])
    except Exception as e:
        maf_utilities.tool_fail("Error determining collapse columns value: %s" % e)

    # start_count ends up as the zero-based index of the last block read.
    start_count = 0
    end_count = 0
    try:
        for start_count, start_block in enumerate(maf_reader):
            for block in maf_utilities.iter_blocks_split_by_species(start_block):
                if collapse_columns:
                    block.remove_all_gap_columns()
                out.write(block)
                end_count += 1
    finally:
        # Release both files even when reading or writing fails part-way.
        out.close()
        maf_in.close()

    # Single-argument print() calls behave the same on Python 2 and 3.
    if end_count:
        print("%i alignment blocks created from %i original blocks." % (end_count, start_count + 1))
    else:
        print("No alignment blocks were created.")
def __main__():
    """Convert a MAF file into one tab-separated interval file per species.

    Command-line arguments (read from ``sys.argv``):
        1: input MAF path.
        2: output path for the primary species.
        3: output id, used in the names of the additional output files.
        4: directory in which the additional output files are created.
        5: primary species name.
        6: comma-separated species to emit a file for (``None`` selects only
           the primary species).
        7: comma-separated list of all species; one sequence column is
           written per entry, in sorted order.
        8: ``partial_disallowed`` skips blocks that have fewer components
           than there are species in the full list.
        9: ``remove_gaps`` strips ``-`` characters from the sequence text.

    Raises:
        AssertionError: if the species parsed from a component's ``src`` does
            not match the species the component was looked up by.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    output_id = sys.argv[3]
    # where to store files that become additional output
    database_tmp_dir = sys.argv[4]
    primary_spec = sys.argv[5]
    species = sys.argv[6].split(',')
    all_species = sys.argv[7].split(',')
    partial = sys.argv[8]
    keep_gaps = sys.argv[9]
    out_files = {}

    if "None" in species:
        species = []

    # The primary species always gets an output file and a column.
    if primary_spec not in species:
        species.append(primary_spec)
    if primary_spec not in all_species:
        all_species.append(primary_spec)

    all_species.sort()
    num_species = len(all_species)
    header = '#chrom\tstart\tend\tstrand\tscore\tname\t%s\n' % ('\t'.join(all_species))

    file_in = None
    try:
        for spec in species:
            if spec == primary_spec:
                out_path = output_filename
            else:
                out_path = os.path.join(database_tmp_dir, 'primary_%s_%s_visible_interval_%s' % (output_id, spec, spec))
            # Text mode: only str is ever written, and writing str to a
            # binary-mode file raises TypeError on Python 3.
            out_files[spec] = open(out_path, 'w+')
            out_files[spec].write(header)

        file_in = open(input_filename, 'r')
        maf_reader = maf.Reader(file_in)

        for i, m in enumerate(maf_reader):
            for j, block in enumerate(maf_utilities.iter_blocks_split_by_species(m)):
                if len(block.components) < num_species and partial == "partial_disallowed":
                    continue
                sequences = {}
                for c in block.components:
                    spec, chrom = maf_utilities.src_split(c.src)
                    if keep_gaps == 'remove_gaps':
                        sequences[spec] = c.text.replace('-', '')
                    else:
                        sequences[spec] = c.text
                # One column per species; species missing from the block are empty.
                sequences = '\t'.join([sequences.get(name, '') for name in all_species])
                for spec in species:
                    c = block.get_component_by_src_start(spec)
                    if c is None:
                        continue
                    spec2, chrom = maf_utilities.src_split(c.src)
                    if spec2 != spec:
                        # Explicit raise: a plain assert is stripped under -O.
                        raise AssertionError('Species name inconsistancy found in component: %s != %s' % (spec, spec2))
                    out_files[spec].write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom, c.forward_strand_start, c.forward_strand_end, c.strand, m.score, "%s_%s_%s" % (spec, i, j), sequences))
    finally:
        # Close everything that was opened, even when processing fails.
        if file_in is not None:
            file_in.close()
        for file_out in out_files.values():
            file_out.close()
def __main__():
    """Write one concatenated FASTA record per species from a MAF file.

    Command-line arguments (read from ``sys.argv``):
        1: species option, parsed by ``maf_utilities.parse_species_option``.
        2: input MAF path.
        3: output FASTA path.

    When the species option yields nothing, the species are taken from the
    MAF file itself.  For each species the MAF is read from the start; every
    block is split by species, stripped of all-gap columns, and either the
    species' component text or a run of ``-`` of the block's ``text_size`` is
    appended to that species' record.  Failures are reported through
    ``maf_utilities.tool_fail``.
    """
    try:
        species = maf_utilities.parse_species_option(sys.argv[1])
    except Exception as e:
        maf_utilities.tool_fail("Error determining species value: %s" % e)
    try:
        input_filename = sys.argv[2]
    except Exception as e:
        maf_utilities.tool_fail("Error reading MAF filename: %s" % e)
    try:
        file_out = open(sys.argv[3], 'w')
    except Exception as e:
        maf_utilities.tool_fail("Error opening file for output: %s" % e)

    # Single-argument print() calls behave the same on Python 2 and 3.
    if species:
        print("Restricted to species: %s" % ', '.join(species))
    else:
        print("Not restricted to species.")
        try:
            species = maf_utilities.get_species_in_maf(input_filename)
        except Exception as e:
            maf_utilities.tool_fail("Error determining species in input MAF: %s" % e)

    try:
        for spec in species:
            file_out.write(">" + spec + "\n")
            try:
                # The MAF is re-read for every species; the with-block makes
                # sure each of those handles is closed again.
                with open(input_filename, 'r') as maf_in:
                    for start_block in maf.Reader(maf_in):
                        for block in maf_utilities.iter_blocks_split_by_species(start_block):
                            block.remove_all_gap_columns()  # remove extra gaps
                            # blocks only have one occurrence of a particular species, so this is safe
                            component = block.get_component_by_src_start(spec)
                            if component:
                                file_out.write(component.text)
                            else:
                                file_out.write("-" * block.text_size)
            except Exception as e:
                maf_utilities.tool_fail("Your MAF file appears to be malformed: %s" % e)
            file_out.write("\n")
    finally:
        file_out.close()
def __main__():
    """Write one concatenated FASTA record per species from a MAF file.

    Command-line arguments (read from ``sys.argv``):
        1: species option, parsed by ``maf_utilities.parse_species_option``.
        2: input MAF path.
        3: output FASTA path.

    When the species option yields nothing, the species are taken from the
    MAF file itself.  For each species the MAF is read from the start; every
    block is split by species, stripped of all-gap columns, and either the
    species' component text or a run of ``-`` of the block's ``text_size`` is
    appended to that species' record.  Failures are reported through
    ``maf_utilities.tool_fail``.
    """
    try:
        species = maf_utilities.parse_species_option(sys.argv[1])
    except Exception as e:
        maf_utilities.tool_fail("Error determining species value: %s" % e)
    try:
        input_filename = sys.argv[2]
    except Exception as e:
        maf_utilities.tool_fail("Error reading MAF filename: %s" % e)
    try:
        file_out = open(sys.argv[3], 'w')
    except Exception as e:
        maf_utilities.tool_fail("Error opening file for output: %s" % e)

    if species:
        print("Restricted to species: %s" % ', '.join(species))
    else:
        print("Not restricted to species.")
        try:
            species = maf_utilities.get_species_in_maf(input_filename)
        except Exception as e:
            maf_utilities.tool_fail("Error determining species in input MAF: %s" % e)

    try:
        for spec in species:
            file_out.write(">" + spec + "\n")
            try:
                # The MAF is re-read for every species; the with-block makes
                # sure each of those handles is closed again.
                with open(input_filename, 'r') as maf_in:
                    for start_block in maf.Reader(maf_in):
                        for block in maf_utilities.iter_blocks_split_by_species(start_block):
                            block.remove_all_gap_columns()  # remove extra gaps
                            # blocks only have one occurrence of a particular species, so this is safe
                            component = block.get_component_by_src_start(spec)
                            if component:
                                file_out.write(component.text)
                            else:
                                file_out.write("-" * block.text_size)
            except Exception as e:
                maf_utilities.tool_fail("Your MAF file appears to be malformed: %s" % e)
            file_out.write("\n")
    finally:
        file_out.close()
if species: print "Restricted to species: %s" % ", ".join(species) else: print "Not restricted to species." if not species: try: species = maf_utilities.get_species_in_maf(input_filename) except Exception, e: maf_utilities.tool_fail("Error determining species in input MAF: %s" % e) for spec in species: file_out.write(">" + spec + "\n") try: for start_block in maf.Reader(open(input_filename, "r")): for block in maf_utilities.iter_blocks_split_by_species(start_block): block.remove_all_gap_columns() # remove extra gaps component = block.get_component_by_src_start( spec ) # blocks only have one occurrence of a particular species, so this is safe if component: file_out.write(component.text) else: file_out.write("-" * block.text_size) except Exception, e: maf_utilities.tool_fail("Your MAF file appears to be malformed: %s" % e) file_out.write("\n") file_out.close() if __name__ == "__main__":
print "Restricted to species: %s" % ', '.join(species) else: print "Not restricted to species." if not species: try: species = maf_utilities.get_species_in_maf(input_filename) except Exception, e: maf_utilities.tool_fail( "Error determining species in input MAF: %s" % e) for spec in species: file_out.write(">" + spec + "\n") try: for start_block in maf.Reader(open(input_filename, 'r')): for block in maf_utilities.iter_blocks_split_by_species( start_block): block.remove_all_gap_columns() #remove extra gaps component = block.get_component_by_src_start( spec ) #blocks only have one occurrence of a particular species, so this is safe if component: file_out.write(component.text) else: file_out.write("-" * block.text_size) except Exception, e: maf_utilities.tool_fail( "Your MAF file appears to be malformed: %s" % e) file_out.write("\n") file_out.close()
def __main__():
    """Extract the MAF blocks that overlap each region of an interval file.

    Options are parsed from the module docstring with ``doc_optparse``.  The
    ones read here are ``dbkey``, ``species``, ``chromCol``, ``startCol``,
    ``endCol``, ``strandCol``, ``interval_file``, ``output_file``,
    ``split_blocks_by_species``, ``remove_all_gap_columns``, ``mafType``,
    ``indexLocation``, ``mafIndexFile``, ``mafFile`` and ``mafIndex``.
    Column options are 1-based on the command line and converted to 0-based.

    For every region, blocks are fetched from the MAF index, optionally split
    by species, chopped to the region, optionally limited to the requested
    species, oriented to the region and written to the output MAF.  A summary
    line is printed at the end.  Missing or invalid options are reported
    through ``maf_utilities.tool_fail``.
    """
    index = index_filename = None

    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)

    if options.dbkey:
        dbkey = options.dbkey
    else:
        dbkey = None
    if dbkey in [None, "?"]:
        maf_utilities.tool_fail(
            "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file."
        )

    species = maf_utilities.parse_species_option(options.species)

    if options.chromCol:
        chromCol = int(options.chromCol) - 1
    else:
        maf_utilities.tool_fail(
            "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes."
        )

    if options.startCol:
        startCol = int(options.startCol) - 1
    else:
        maf_utilities.tool_fail(
            "Start column not set, click the pencil icon in the history item to set the metadata attributes."
        )

    if options.endCol:
        endCol = int(options.endCol) - 1
    else:
        maf_utilities.tool_fail(
            "End column not set, click the pencil icon in the history item to set the metadata attributes."
        )

    # The strand column is optional; -1 is passed on when it is not set.
    if options.strandCol:
        strandCol = int(options.strandCol) - 1
    else:
        strandCol = -1

    if options.interval_file:
        interval_file = options.interval_file
    else:
        maf_utilities.tool_fail("Input interval file has not been specified.")

    if options.output_file:
        output_file = options.output_file
    else:
        maf_utilities.tool_fail("Output file has not been specified.")

    # When splitting by species, all-gap columns are removed only on request;
    # without splitting they are always removed.
    # NOTE(review): this if/else nesting was reconstructed -- confirm.
    split_blocks_by_species = remove_all_gap_columns = False
    if options.split_blocks_by_species and options.split_blocks_by_species == 'split_blocks_by_species':
        split_blocks_by_species = True
        if options.remove_all_gap_columns and options.remove_all_gap_columns == 'remove_all_gap_columns':
            remove_all_gap_columns = True
    else:
        remove_all_gap_columns = True
    # Finish parsing command line

    # Open indexed access to MAFs
    if options.mafType:
        if options.indexLocation:
            index = maf_utilities.maf_index_by_uid(options.mafType, options.indexLocation)
        else:
            index = maf_utilities.maf_index_by_uid(options.mafType, options.mafIndexFile)
        if index is None:
            maf_utilities.tool_fail(
                "The MAF source specified (%s) appears to be invalid." % (options.mafType))
    elif options.mafFile:
        index, index_filename = maf_utilities.open_or_build_maf_index(
            options.mafFile, options.mafIndex, species=[dbkey])
        if index is None:
            maf_utilities.tool_fail("Your MAF file appears to be malformed.")
    else:
        maf_utilities.tool_fail("Desired source MAF type has not been specified.")

    # Create MAF writer
    out = bx.align.maf.Writer(open(output_file, "w"))

    # Iterate over input regions
    num_blocks = 0
    num_regions = None
    try:
        # The with-block closes the interval file once all regions are read.
        with open(interval_file) as interval_handle:
            regions = bx.intervals.io.NiceReaderWrapper(
                interval_handle,
                chrom_col=chromCol,
                start_col=startCol,
                end_col=endCol,
                strand_col=strandCol,
                fix_strand=True,
                return_header=False,
                return_comments=False)
            for num_regions, region in enumerate(regions):  # noqa: B007
                src = maf_utilities.src_merge(dbkey, region.chrom)
                for block in index.get_as_iterator(src, region.start, region.end):
                    if split_blocks_by_species:
                        # Keep only the pieces whose dbkey component overlaps the region.
                        blocks = [
                            new_block
                            for new_block in maf_utilities.iter_blocks_split_by_species(block)
                            if maf_utilities.component_overlaps_region(
                                new_block.get_component_by_src_start(dbkey), region)
                        ]
                    else:
                        blocks = [block]
                    for candidate in blocks:
                        candidate = maf_utilities.chop_block_by_region(candidate, src, region)
                        if candidate is None:
                            continue
                        if species is not None:
                            candidate = candidate.limit_to_species(species)
                        candidate = maf_utilities.orient_block_by_region(candidate, src, region)
                        if remove_all_gap_columns:
                            candidate.remove_all_gap_columns()
                        out.write(candidate)
                        num_blocks += 1
    finally:
        # Close output MAF
        out.close()
        # remove index file if created during run
        maf_utilities.remove_temp_index_file(index_filename)

    if num_blocks:
        print("%i MAF blocks extracted for %i regions." % (num_blocks, (num_regions + 1)))
    elif num_regions is not None:
        print("No MAF blocks could be extracted for %i regions." % (num_regions + 1))
    else:
        print("No valid regions have been provided.")
def __main__():
    """Report, per interval or in total, how much of each region a MAF covers.

    Command-line arguments (read from ``sys.argv``; the first one is popped,
    so the remaining positions are counted after its removal):
        popped: MAF source type, ``user`` or ``cached``.
        1: MAF file (``user``) or MAF source uid (``cached``).
        2: interval file path.
        3: output file path.
        4: dbkey of the primary genome.
        5-7: 1-based chromosome, start and end column numbers.
        8: ``true`` (case-insensitive) to write a per-species summary instead
           of per-interval lines.
        9: directory containing ``maf_index.loc``.
        10: optional MAF index filename.

    Fatal problems are written to stderr and end the process with exit
    status 1.  A short report of the processed regions is printed to stdout.
    """
    maf_source_type = sys.argv.pop(1)
    input_maf_filename = sys.argv[1].strip()
    input_interval_filename = sys.argv[2].strip()
    output_filename = sys.argv[3].strip()
    dbkey = sys.argv[4].strip()
    try:
        chr_col = int(sys.argv[5].strip()) - 1
        start_col = int(sys.argv[6].strip()) - 1
        end_col = int(sys.argv[7].strip()) - 1
    except Exception:
        print(
            "You appear to be missing metadata. You can specify your metadata by clicking on the pencil icon associated with your interval file.",
            file=sys.stderr)
        # Non-zero status so the failure is visible to the caller.
        sys.exit(1)
    summary = sys.argv[8].strip().lower() == "true"

    mafIndexFile = "%s/maf_index.loc" % sys.argv[9]
    try:
        maf_index_filename = sys.argv[10].strip()
    except Exception:
        # The index filename argument is optional.
        maf_index_filename = None
    index = index_filename = None
    if maf_source_type == "user":
        # index maf for use here
        index, index_filename = maf_utilities.open_or_build_maf_index(
            input_maf_filename, maf_index_filename, species=[dbkey])
        if index is None:
            print("Your MAF file appears to be malformed.", file=sys.stderr)
            sys.exit(1)
    elif maf_source_type == "cached":
        # access existing indexes
        index = maf_utilities.maf_index_by_uid(input_maf_filename, mafIndexFile)
        if index is None:
            print("The MAF source specified (%s) appears to be invalid." % (input_maf_filename), file=sys.stderr)
            sys.exit(1)
    else:
        # Sent to stderr like the other fatal messages.
        print('Invalid source type specified: %s' % maf_source_type, file=sys.stderr)
        sys.exit(1)

    num_region = None
    num_bad_region = 0
    species_summary = {}
    total_length = 0
    try:
        with open(output_filename, 'w') as out, open(input_interval_filename, 'r') as interval_handle:
            regions = bx.intervals.io.NiceReaderWrapper(
                interval_handle,
                chrom_col=chr_col,
                start_col=start_col,
                end_col=end_col,
                fix_strand=True,
                return_header=False,
                return_comments=False)
            # loop through interval file
            for num_region, region in enumerate(regions):  # noqa: B007
                src = "%s.%s" % (dbkey, region.chrom)
                region_length = region.end - region.start
                if region_length < 1:
                    num_bad_region += 1
                    continue
                total_length += region_length
                # One bitset per species; a set bit marks a covered position.
                coverage = {dbkey: BitSet(region_length)}

                for block in index.get_as_iterator(src, region.start, region.end):
                    for spec in maf_utilities.get_species_in_block(block):
                        if spec not in coverage:
                            coverage[spec] = BitSet(region_length)
                    for split_block in maf_utilities.iter_blocks_split_by_species(block):
                        if not maf_utilities.component_overlaps_region(
                                split_block.get_component_by_src(src), region):
                            continue
                        # need to chop and orient the block
                        oriented = maf_utilities.orient_block_by_region(
                            maf_utilities.chop_block_by_region(split_block, src, region),
                            src, region, force_strand='+')
                        start_offset, alignment = maf_utilities.reduce_block_by_primary_genome(
                            oriented, dbkey, region.chrom, region.start)
                        for i in range(len(alignment[dbkey])):
                            for spec, text in alignment.items():
                                if text[i] != '-':
                                    coverage[spec].set(start_offset + i)

                if summary:
                    # record summary
                    for key in coverage:
                        species_summary[key] = species_summary.get(key, 0) + coverage[key].count_range()
                else:
                    # print coverage for interval: primary genome first, then
                    # the remaining species in sorted order
                    ordered_keys = [dbkey] + sorted(key for key in coverage if key != dbkey)
                    for key in ordered_keys:
                        coverage_sum = coverage[key].count_range()
                        out.write("%s\t%s\t%s\t%s\n" % ("\t".join(region.fields), key, coverage_sum, region_length - coverage_sum))

            if summary:
                out.write("#species\tnucleotides\tcoverage\n")
                for spec in species_summary:
                    out.write("%s\t%s\t%.4f\n" % (spec, species_summary[spec], float(species_summary[spec]) / total_length))

        if num_region is not None:
            print("%i regions were processed with a total length of %i." % (num_region + 1, total_length))
        if num_bad_region:
            print("%i regions were invalid." % (num_bad_region))
    finally:
        # Remove the index file if one was created, even after a failure.
        maf_utilities.remove_temp_index_file(index_filename)
def __main__():
    """Report, per interval or in total, how much of each region a MAF covers.

    Command-line arguments (read from ``sys.argv``; the first one is popped,
    so the remaining positions are counted after its removal):
        popped: MAF source type, ``user`` or ``cached``.
        1: MAF file (``user``) or MAF source uid (``cached``).
        2: interval file path.
        3: output file path.
        4: dbkey of the primary genome.
        5-7: 1-based chromosome, start and end column numbers.
        8: ``true`` (case-insensitive) to write a per-species summary instead
           of per-interval lines.
        9: directory containing ``maf_index.loc``.
        10: optional MAF index filename.

    Fatal problems are written to stderr and end the process with exit
    status 1.  A short report of the processed regions is printed to stdout.
    """
    maf_source_type = sys.argv.pop(1)
    input_maf_filename = sys.argv[1].strip()
    input_interval_filename = sys.argv[2].strip()
    output_filename = sys.argv[3].strip()
    dbkey = sys.argv[4].strip()
    try:
        chr_col = int(sys.argv[5].strip()) - 1
        start_col = int(sys.argv[6].strip()) - 1
        end_col = int(sys.argv[7].strip()) - 1
    except Exception:
        # Not a bare except: SystemExit and KeyboardInterrupt must propagate.
        print("You appear to be missing metadata. You can specify your metadata by clicking on the pencil icon associated with your interval file.", file=sys.stderr)
        # Non-zero status so the failure is visible to the caller.
        sys.exit(1)
    summary = sys.argv[8].strip().lower() == "true"

    mafIndexFile = "%s/maf_index.loc" % sys.argv[9]
    try:
        maf_index_filename = sys.argv[10].strip()
    except Exception:
        # The index filename argument is optional.
        maf_index_filename = None
    index = index_filename = None
    if maf_source_type == "user":
        # index maf for use here
        index, index_filename = maf_utilities.open_or_build_maf_index(input_maf_filename, maf_index_filename, species=[dbkey])
        if index is None:
            print("Your MAF file appears to be malformed.", file=sys.stderr)
            sys.exit(1)
    elif maf_source_type == "cached":
        # access existing indexes
        index = maf_utilities.maf_index_by_uid(input_maf_filename, mafIndexFile)
        if index is None:
            print("The MAF source specified (%s) appears to be invalid." % (input_maf_filename), file=sys.stderr)
            sys.exit(1)
    else:
        # Sent to stderr like the other fatal messages.
        print('Invalid source type specified: %s' % maf_source_type, file=sys.stderr)
        sys.exit(1)

    num_region = None
    num_bad_region = 0
    species_summary = {}
    total_length = 0
    try:
        with open(output_filename, 'w') as out, open(input_interval_filename, 'r') as interval_handle:
            regions = bx.intervals.io.NiceReaderWrapper(
                interval_handle,
                chrom_col=chr_col,
                start_col=start_col,
                end_col=end_col,
                fix_strand=True,
                return_header=False,
                return_comments=False)
            # loop through interval file
            for num_region, region in enumerate(regions):
                src = "%s.%s" % (dbkey, region.chrom)
                region_length = region.end - region.start
                if region_length < 1:
                    num_bad_region += 1
                    continue
                total_length += region_length
                # One bitset per species; a set bit marks a covered position.
                coverage = {dbkey: BitSet(region_length)}

                for block in index.get_as_iterator(src, region.start, region.end):
                    for spec in maf_utilities.get_species_in_block(block):
                        if spec not in coverage:
                            coverage[spec] = BitSet(region_length)
                    for split_block in maf_utilities.iter_blocks_split_by_species(block):
                        if not maf_utilities.component_overlaps_region(split_block.get_component_by_src(src), region):
                            continue
                        # need to chop and orient the block
                        oriented = maf_utilities.orient_block_by_region(
                            maf_utilities.chop_block_by_region(split_block, src, region),
                            src, region, force_strand='+')
                        start_offset, alignment = maf_utilities.reduce_block_by_primary_genome(
                            oriented, dbkey, region.chrom, region.start)
                        for i in range(len(alignment[dbkey])):
                            for spec, text in alignment.items():
                                if text[i] != '-':
                                    coverage[spec].set(start_offset + i)

                if summary:
                    # record summary
                    for key in coverage:
                        species_summary[key] = species_summary.get(key, 0) + coverage[key].count_range()
                else:
                    # print coverage for interval: primary genome first, then
                    # the remaining species in sorted order
                    ordered_keys = [dbkey] + sorted(key for key in coverage if key != dbkey)
                    for key in ordered_keys:
                        coverage_sum = coverage[key].count_range()
                        out.write("%s\t%s\t%s\t%s\n" % ("\t".join(region.fields), key, coverage_sum, region_length - coverage_sum))

            if summary:
                out.write("#species\tnucleotides\tcoverage\n")
                for spec in species_summary:
                    out.write("%s\t%s\t%.4f\n" % (spec, species_summary[spec], float(species_summary[spec]) / total_length))

        if num_region is not None:
            print("%i regions were processed with a total length of %i." % (num_region + 1, total_length))
        if num_bad_region:
            print("%i regions were invalid." % (num_bad_region))
    finally:
        # Remove the index file if one was created, even after a failure.
        maf_utilities.remove_temp_index_file(index_filename)
def __main__():
    """Extract the MAF blocks that overlap each region of an interval file.

    Options are parsed from the module docstring with ``doc_optparse``.  The
    ones read here are ``dbkey``, ``species``, ``chromCol``, ``startCol``,
    ``endCol``, ``strandCol``, ``interval_file``, ``output_file``,
    ``split_blocks_by_species``, ``remove_all_gap_columns``, ``mafType``,
    ``indexLocation``, ``mafIndexFile``, ``mafFile`` and ``mafIndex``.
    Column options are 1-based on the command line and converted to 0-based.

    For every region, blocks are fetched from the MAF index, optionally split
    by species, chopped to the region, optionally limited to the requested
    species, oriented to the region and written to the output MAF.  A summary
    line is printed at the end.  Missing or invalid options are reported
    through ``maf_utilities.tool_fail``.
    """
    index = index_filename = None

    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)

    if options.dbkey:
        dbkey = options.dbkey
    else:
        dbkey = None
    if dbkey in [None, "?"]:
        maf_utilities.tool_fail("You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file.")

    species = maf_utilities.parse_species_option(options.species)

    if options.chromCol:
        chromCol = int(options.chromCol) - 1
    else:
        maf_utilities.tool_fail("Chromosome column not set, click the pencil icon in the history item to set the metadata attributes.")

    if options.startCol:
        startCol = int(options.startCol) - 1
    else:
        maf_utilities.tool_fail("Start column not set, click the pencil icon in the history item to set the metadata attributes.")

    if options.endCol:
        endCol = int(options.endCol) - 1
    else:
        maf_utilities.tool_fail("End column not set, click the pencil icon in the history item to set the metadata attributes.")

    # The strand column is optional; -1 is passed on when it is not set.
    if options.strandCol:
        strandCol = int(options.strandCol) - 1
    else:
        strandCol = -1

    if options.interval_file:
        interval_file = options.interval_file
    else:
        maf_utilities.tool_fail("Input interval file has not been specified.")

    if options.output_file:
        output_file = options.output_file
    else:
        maf_utilities.tool_fail("Output file has not been specified.")

    # When splitting by species, all-gap columns are removed only on request;
    # without splitting they are always removed.
    # NOTE(review): this if/else nesting was reconstructed -- confirm.
    split_blocks_by_species = remove_all_gap_columns = False
    if options.split_blocks_by_species and options.split_blocks_by_species == 'split_blocks_by_species':
        split_blocks_by_species = True
        if options.remove_all_gap_columns and options.remove_all_gap_columns == 'remove_all_gap_columns':
            remove_all_gap_columns = True
    else:
        remove_all_gap_columns = True
    # Finish parsing command line

    # Open indexed access to MAFs
    if options.mafType:
        if options.indexLocation:
            index = maf_utilities.maf_index_by_uid(options.mafType, options.indexLocation)
        else:
            index = maf_utilities.maf_index_by_uid(options.mafType, options.mafIndexFile)
        if index is None:
            maf_utilities.tool_fail("The MAF source specified (%s) appears to be invalid." % (options.mafType))
    elif options.mafFile:
        index, index_filename = maf_utilities.open_or_build_maf_index(options.mafFile, options.mafIndex, species=[dbkey])
        if index is None:
            maf_utilities.tool_fail("Your MAF file appears to be malformed.")
    else:
        maf_utilities.tool_fail("Desired source MAF type has not been specified.")

    # Create MAF writer
    out = bx.align.maf.Writer(open(output_file, "w"))

    # Iterate over input regions
    num_blocks = 0
    num_regions = None
    try:
        # The with-block closes the interval file once all regions are read.
        with open(interval_file, 'r') as interval_handle:
            regions = bx.intervals.io.NiceReaderWrapper(
                interval_handle,
                chrom_col=chromCol,
                start_col=startCol,
                end_col=endCol,
                strand_col=strandCol,
                fix_strand=True,
                return_header=False,
                return_comments=False)
            for num_regions, region in enumerate(regions):
                src = maf_utilities.src_merge(dbkey, region.chrom)
                for block in index.get_as_iterator(src, region.start, region.end):
                    if split_blocks_by_species:
                        # Keep only the pieces whose dbkey component overlaps the region.
                        blocks = [
                            new_block
                            for new_block in maf_utilities.iter_blocks_split_by_species(block)
                            if maf_utilities.component_overlaps_region(new_block.get_component_by_src_start(dbkey), region)
                        ]
                    else:
                        blocks = [block]
                    for candidate in blocks:
                        candidate = maf_utilities.chop_block_by_region(candidate, src, region)
                        if candidate is None:
                            continue
                        if species is not None:
                            candidate = candidate.limit_to_species(species)
                        candidate = maf_utilities.orient_block_by_region(candidate, src, region)
                        if remove_all_gap_columns:
                            candidate.remove_all_gap_columns()
                        out.write(candidate)
                        num_blocks += 1
    finally:
        # Close output MAF
        out.close()
        # remove index file if created during run
        maf_utilities.remove_temp_index_file(index_filename)

    if num_blocks:
        print("%i MAF blocks extracted for %i regions." % (num_blocks, (num_regions + 1)))
    elif num_regions is not None:
        print("No MAF blocks could be extracted for %i regions." % (num_regions + 1))
    else:
        print("No valid regions have been provided.")