def test_shrink_gaps(self):
    """
    Check that Map.shrink_gaps resizes gaps and keeps records adjacent.

    A single chromosome ('ref_1') is filled with ten fragments of
    random length, each of them followed by a 'GAP' record of random
    length; then the gaps are shrunk to a fixed size.
    """
    shrunk_size = 50
    gapped_map = Map()
    position = 0
    for number in range(10):
        for record_name in ('fr_{}'.format(number), 'GAP'):
            length = random.randrange(100)
            strand = random.choice(('+', '-'))
            gapped_map.add_record(
                Map.Record(record_name, length, 0, length, strand,
                           'ref_1', position, position + length))
            position += length

    gapped_map.shrink_gaps(shrunk_size)

    for chromosome in gapped_map.chromosomes():
        records = list(gapped_map.fragments(chromosome))
        # every gap must have the requested size in all its coordinates
        for record in records:
            if record.fr_name != 'GAP':
                continue
            self.assertEqual(record.fr_length, shrunk_size)
            self.assertEqual(record.fr_end - record.fr_start, shrunk_size)
            self.assertEqual(record.ref_end - record.ref_start,
                             shrunk_size)
        # consecutive records must touch each other on the chromosome
        for left, right in zip(records[:-1], records[1:]):
            self.assertEqual(left.ref_end, right.ref_start)
def test_write(self):
    """
    Test the writing method of the fragment simulator.

    The simulator writes a fragment map, a FASTA file of fragments and
    a FASTA file of chromosomes to temporary files. The test checks
    the number of sequences in both FASTA files and that the written
    map can be read back by Map.
    """
    temp_paths = []
    for _ in range(3):
        descriptor, path = tempfile.mkstemp()
        # mkstemp returns an already opened OS-level descriptor; only
        # the file name is needed here, so close it to avoid a leak
        os.close(descriptor)
        temp_paths.append(path)
    self.__fragments, self.__chromosomes, self.__map = temp_paths

    try:
        self.__simulator.write(self.__map, self.__fragments,
                               self.__chromosomes)

        # check if the correct number of fragment and chromosome
        # sequences was written
        fragment_fasta = pyfaidx.Fasta(self.__fragments)
        self.assertEqual(len(fragment_fasta.keys()),
                         self.__fragment_number + self.__unplaced_number)
        chromosome_fasta = pyfaidx.Fasta(self.__chromosomes)
        self.assertEqual(len(chromosome_fasta.keys()),
                         self.__chromosome_number)

        # check if a correct fragment map was written
        test_map = Map()
        test_map.read(self.__map)
    finally:
        # remove the temporary files even if a check above failed; the
        # '.fai' indices exist only once pyfaidx has opened the FASTA
        # files, hence the existence test
        for path in (self.__fragments, self.__fragments + '.fai',
                     self.__chromosomes, self.__chromosomes + '.fai',
                     self.__map):
            if os.path.exists(path):
                os.unlink(path)
def test_summary(self):
    """
    Check that the summary of a loaded fragment map is a dictionary.
    """
    loaded_map = Map()
    loaded_map.read(self.__test_line)
    summary = loaded_map.summary()
    self.assertIsInstance(summary, dict)
class Transfer(object):
    """
    Implements routines that transfer abstract data from fragments to
    the chromosomes assembled from them.
    """

    def __init__(self, fragment_map):
        """
        Create a Transfer object.

        :param fragment_map: the fragment map source for feature
            transfer; it is passed to Map.read as is (presumably a
            fragment map file name -- confirm against Map.read)
        :type fragment_map: str
        """
        loaded_map = Map()
        loaded_map.read(fragment_map)
        self.__fragment_map = loaded_map

    def find_fragment(self, fragment):
        """
        Given a fragment name, return its record from the fragment
        map. The map is scanned chromosome by chromosome and the first
        record with the matching name is returned; if the fragment is
        absent from the map, None is returned.

        :param fragment: a fragment name
        :type fragment: str
        :return: a fragment map record corresponding to the specified
            fragment or None
        :rtype: Map.Record
        """
        matching_records = (
            record
            for chromosome in self.__fragment_map.chromosomes()
            for record in self.__fragment_map.fragments(chromosome)
            if record.fr_name == fragment
        )
        return next(matching_records, None)

    def coordinate(self, fragment, pos):
        """
        Given a position on a fragment, return the corresponding
        coordinates on the assembled chromosomes according to the
        fragment map specified when the object was created.

        :param fragment: a fragment name
        :param pos: a position on a fragment (zero-based)
        :type fragment: str
        :type pos: int
        :return: a tuple of the chromosome name and a position on it
            or None if the fragment is absent from the map
        :rtype: tuple
        """
        record = self.find_fragment(fragment)
        if record is None:
            # the fragment is absent in the assembly, skip the feature
            return None

        if record.fr_strand == '+':
            position = record.ref_start + pos
        else:
            # NOTE(review): for a reversed fragment the position is
            # counted back from ref_end, so pos == 0 yields ref_end
            # itself; confirm that callers expect this convention
            position = record.ref_end - pos
        return record.ref_chr, position
def test_chromosomes(self):
    """
    Check that the chromosomes iterator of a loaded fragment map
    yields exactly one chromosome named 'chr1'.
    """
    loaded_map = Map()
    loaded_map.read(self.__test_line)
    self.assertEqual(list(loaded_map.chromosomes()), ['chr1'])
def __init__(self, fragment_map):
    """
    Create a Transfer object.

    :param fragment_map: the fragment map source for feature transfer;
        it is passed to Map.read as is (presumably a fragment map
        file name -- confirm against Map.read)
    :type fragment_map: str
    """
    loaded_map = Map()
    loaded_map.read(fragment_map)
    self.__fragment_map = loaded_map
def test_add_record(self):
    """
    Check that a fragment record can be added to an empty map; the
    test passes if neither the record creation nor its addition
    raises an exception.
    """
    record_fields = {
        'fr_name': 'fragment1',
        'fr_length': 180,
        'fr_start': 0,
        'fr_end': 180,
        'fr_strand': '+',
        'ref_chr': 'chr1',
        'ref_start': 5000,
        'ref_end': 5180,
    }
    empty_map = Map()
    empty_map.add_record(Map.Record(**record_fields))
def test_blast(self):
    """
    Test the blast method which utilizes BLASTN alignments to
    construct a fragment map.

    First the map built from the alignments is compared with the
    reference map; then one fragment length is removed and the
    method is expected to raise AlignmentToMapError.
    """
    ratio_threshold = 1.2
    sequence_lengths = SeqLengths(self.__fragment_file)
    creator = AlignmentToMap(self.__gap_size, sequence_lengths.lengths())
    with open(self.__alignment_file) as alignment_file:
        alignments = BlastTab(alignment_file)
        obtained_map = creator.blast(alignments, ratio_threshold)[0]

    expected_map = Map()
    expected_map.read(self.__map_file)

    # compare the obtained fragment map with the original one
    # NOTE(review): izip stops at the shorter sequence, so a differing
    # number of fragments on a chromosome is not detected here
    for chromosome in expected_map.chromosomes():
        record_pairs = izip(expected_map.fragments(chromosome),
                            obtained_map.fragments(chromosome))
        for expected, obtained in record_pairs:
            self.assertEqual(expected, obtained)

    # now test against the situation when a fragment whose length is
    # missing is present in the alignments
    with open(self.__alignment_file) as alignment_file:
        alignments = BlastTab(alignment_file)
        incomplete_lengths = sequence_lengths.lengths()
        first_fragment = sorted(incomplete_lengths.keys())[0]
        del incomplete_lengths[first_fragment]
        creator = AlignmentToMap(self.__gap_size, incomplete_lengths,
                                 min_fragment_length=50)
        with self.assertRaises(AlignmentToMapError):
            creator.blast(alignments, ratio_threshold)
def test_convert2bed(self):
    """
    Test the BED conversion routine: a loaded fragment map is
    converted to a BED file and all records of that file are read
    back. The test passes if no exception is raised on the way.
    """
    source_map = Map()
    source_map.read(self.__test_line)
    source_map.convert2bed(self.__output_file)

    # try to read the produced BED file record by record
    with open(self.__output_file) as bed_file:
        for _record in Reader(bed_file).records():
            continue
def test_fragments(self):
    """
    Test the Map fragments iterator: a known chromosome yields a
    single Map.Record and a missing chromosome leads to MapError.
    """
    loaded_map = Map()
    loaded_map.read(self.__test_line)

    chr1_records = list(loaded_map.fragments('chr1'))
    self.assertEqual(len(chr1_records), 1)
    first_record = chr1_records[0]
    self.assertIsInstance(first_record, Map.Record)

    # check if the missing chromosome is processed correctly; the
    # iterator is both requested and consumed inside the callable
    self.assertRaises(MapError,
                      lambda: list(loaded_map.fragments('chrN')))
def test_read(self):
    """
    Test the Map reading routine: the first record of 'chr1' must
    carry the expected field values, and every file listed as
    incorrect must make the reader raise MapError.
    """
    loaded_map = Map()
    loaded_map.read(self.__test_line)
    record = next(loaded_map.fragments('chr1'))

    expected_fields = (
        ('fr_name', 'fragment1'),
        ('fr_length', 180),
        ('fr_start', 0),
        ('fr_end', 180),
        ('fr_strand', '+'),
        ('ref_chr', 'chr1'),
        ('ref_start', 5000),
        ('ref_end', 5180),
    )
    for field, expected_value in expected_fields:
        self.assertEqual(getattr(record, field), expected_value)

    # check for incorrect input files
    for file_name in self.__incorrect_files:
        incorrect_path = os.path.join(self.__incorrect_file_dir, file_name)
        with self.assertRaises(MapError):
            loaded_map.read(incorrect_path)
def test_write(self):
    """
    Test the Map writing routine: a fragment map is read, written to
    a new file and the written lines are compared with the lines of
    the original file.
    """
    fragment_map = Map()
    fragment_map.read(self.__test_line)

    output_filename = os.path.join('data', 'fragment_map',
                                   'fragment_map_output.txt')
    try:
        fragment_map.write(output_filename)

        # NOTE(review): izip stops at the shorter file, so a truncated
        # or empty output is not detected by this comparison
        with open(output_filename) as output_file:
            with open(self.__test_line) as original_file:
                for x, y in izip(original_file, output_file):
                    self.assertEqual(x, y)
    finally:
        # remove the output file even if writing or a comparison
        # failed, so that a failed run leaves nothing behind
        if os.path.exists(output_filename):
            os.unlink(output_filename)
def chromosomer():
    """
    The main function that is run if Chromosomer was launched. It
    defines a command-line parser which processes the arguments passed
    to the program, configures logging and runs the routine that
    corresponds to the chosen sub-command.
    """
    parser = argparse.ArgumentParser(
        description='Reference-assisted chromosome assembly tool.')
    subparsers = parser.add_subparsers(dest='command')

    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s 0.1.4')
    parser.add_argument('-d', '--debug', action='store_true',
                        help='show debugging messages')

    # Parser for the 'chromosomer assemble' part that produces a FASTA
    # file of assembled chromosomes from the specified fragment map.
    assemble_parser = subparsers.add_parser(
        'assemble',
        help='get sequences of assembled chromosomes',
        description='Get the FASTA file of assembled chromosomes.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # required arguments for the 'assemble' routine
    assemble_parser.add_argument('map',
                                 help='a fragment map file')
    assemble_parser.add_argument('fragment_fasta',
                                 help='a FASTA file of fragment '
                                      'sequences to be assembled')
    assemble_parser.add_argument('output_fasta',
                                 help='the output FASTA file of the '
                                      'assembled chromosome sequences')

    # optional arguments for the 'assemble' routine
    assemble_parser.add_argument('-s', '--save_soft_mask',
                                 action='store_true',
                                 help='keep soft masking from the '
                                      'original fragment sequences')

    # Parser for the 'chromosomer fragmentmap' part that
    # produces a map of fragment positions on reference
    # chromosomes from BLAST alignments of the fragments to the
    # chromosomes.
    fragmentmap_parser = subparsers.add_parser(
        'fragmentmap',
        description='Construct a fragment map from fragment '
                    'alignments to reference chromosomes.',
        help='construct a fragment map from alignments',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # required arguments for the 'fragmentmap' routine
    fragmentmap_parser.add_argument(
        'alignment_file',
        help='a BLAST tabular file of fragment alignments to '
             'reference chromosomes'
    )
    fragmentmap_parser.add_argument(
        'gap_size', type=int,
        help='a size of a gap inserted between mapped fragments'
    )
    fragmentmap_parser.add_argument(
        'fragment_lengths',
        help='a file containing lengths of fragment sequences; it can '
             'be obtained using the \'chromosomer fastalength\' tool'
    )
    fragmentmap_parser.add_argument(
        'output_map',
        help='an output fragment map file name'
    )

    # optional arguments for the 'fragmentmap' routine
    fragmentmap_parser.add_argument(
        '-r', '--ratio_threshold', type=float, default=1.2,
        help='the least ratio of two greatest fragment alignment '
             'scores to determine the fragment placed to a reference '
             'genome'
    )
    fragmentmap_parser.add_argument(
        '-s', '--shrink_gaps', action='store_true',
        help='shrink large interfragment gaps to the specified size'
    )

    # Parser for the 'chromosomer fragmentmapstat' part that reports
    # statistics on a fragment map
    fragmentmapstat_parser = subparsers.add_parser(
        'fragmentmapstat',
        description='Show statistics on a fragment map.',
        help='show fragment map statistics'
    )

    # required arguments for the 'fragmentmapstat' routine
    fragmentmapstat_parser.add_argument('map',
                                        help='a fragment map file')
    fragmentmapstat_parser.add_argument('output',
                                        help='an output file of '
                                             'fragment map statistics')

    # Parser for the 'chromosomer fragmentmapbed' part that converts
    # a fragment map to the BED format
    fragmentmapbed_parser = subparsers.add_parser(
        'fragmentmapbed',
        description='Convert a fragment map to the BED format.',
        help='convert a fragment map to the BED format'
    )

    # required arguments for the 'fragmentmapbed' routine
    fragmentmapbed_parser.add_argument('map',
                                       help='a fragment map file')
    fragmentmapbed_parser.add_argument('output',
                                       help='an output BED file '
                                            'representing the '
                                            'fragment map')

    # Parser for the 'chromosomer transfer' part that transfers
    # genome feature annotation from fragments to their assembly
    transfer_parser = subparsers.add_parser(
        'transfer',
        description='Transfer annotated genomic features from '
                    'fragments to their assembly.',
        help='transfer annotated features from fragments to '
             'chromosomes',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # required arguments for the 'transfer' routine
    transfer_parser.add_argument('map',
                                 help='a fragment map file')
    transfer_parser.add_argument('annotation',
                                 help='a file of annotated genome '
                                      'features')
    transfer_parser.add_argument('output',
                                 help='an output file of the '
                                      'transfered annotation')

    # optional arguments for the 'transfer' routine
    transfer_parser.add_argument('-f', '--format', default='bed',
                                 choices=['bed', 'gff3', 'vcf'],
                                 help='the format of a file of '
                                      'annotated features (bed, '
                                      'gff3 or vcf)')

    # Parser for the 'chromosomer fastalength' part that calculates
    # lengths of sequences in the given FASTA file.
    fastalength_parser = subparsers.add_parser(
        'fastalength',
        description='Get lengths of sequences in the specified FASTA '
                    'file (required to build a fragment map).',
        help='get lengths of sequences from a FASTA file',
    )

    # required arguments for the 'fastalength' routine
    fastalength_parser.add_argument('fasta',
                                    help='a FASTA file which sequence '
                                         'lengths are to be obtained')
    fastalength_parser.add_argument('output',
                                    help='an output file of sequence '
                                         'lengths')

    # Parser for the 'chromosomer simulator' routine
    simulator_parser = subparsers.add_parser(
        'simulator',
        description='Simulate fragments and test assembly for '
                    'testing purposes.',
        help='fragment simulator for testing purposes'
    )

    # required arguments for the 'simulator' routine
    simulator_parser.add_argument('fr_num', type=int,
                                  help='the number of '
                                       'chromosome fragments')
    simulator_parser.add_argument('fr_len', type=int,
                                  help='the length of fragments')
    simulator_parser.add_argument('chr_num', type=int,
                                  help='the number of chromosomes')
    simulator_parser.add_argument('output_dir',
                                  help='the directory for output files')
    simulator_parser.add_argument('-g', '--gap_size', type=int,
                                  default=2000,
                                  help='the size of gaps between '
                                       'fragments on a chromosome')
    # NOTE(review): '--unplaced' has no default, so None is passed to
    # Simulator when the option is omitted -- confirm that Simulator
    # accepts it
    simulator_parser.add_argument('-p', '--unplaced', type=int,
                                  help='the number of unplaced '
                                       'fragments')
    simulator_parser.add_argument('--prefix', default='',
                                  help='the prefix for output file '
                                       'names')

    # Parser for the 'chromosomer agp2map' routine
    agp2map_parser = subparsers.add_parser(
        'agp2map',
        description='Convert an AGP file to the fragment map format.',
        help='convert an AGP file to a fragment map'
    )

    # required arguments for the 'agp2map' routine
    agp2map_parser.add_argument('agp_file', help='an AGP file')
    agp2map_parser.add_argument('output_file', help='the output '
                                                    'fragment map '
                                                    'file')

    args = parser.parse_args()

    # The same level is applied to the loggers and to the handler: a
    # handler drops every record below its own level, so a handler
    # fixed at INFO would hide the messages requested with '--debug'.
    log_level = logging.DEBUG if args.debug else logging.INFO
    logger.setLevel(log_level)
    logger.propagate = False

    formatter = logging.Formatter('%(asctime)-15s - %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    ch = logging.StreamHandler()
    ch.setLevel(log_level)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    logging.basicConfig()
    cli_logger = logging.getLogger(__name__)
    cli_logger.propagate = False
    cli_logger.addHandler(ch)
    # if this logger is the same object as 'logger', a fixed INFO level
    # here would silently cancel the '--debug' setting made above
    cli_logger.setLevel(log_level)

    if args.command == 'assemble':
        fragment_map = Map()
        fragment_map.read(args.map)
        fragment_map.assemble(args.fragment_fasta, args.output_fasta,
                              args.save_soft_mask)

    elif args.command == 'fragmentmap':
        fragment_lengths = read_fragment_lengths(args.fragment_lengths)
        map_creator = AlignmentToMap(args.gap_size, fragment_lengths)
        with open(args.alignment_file) as alignment_file:
            alignments = BlastTab(alignment_file)
            fragment_map, unlocalized, unplaced = map_creator.blast(
                alignments, args.ratio_threshold)
            if args.shrink_gaps:
                fragment_map.shrink_gaps(args.gap_size)
            fragment_map.write(args.output_map)
            # write unlocalized and unplaced fragments
            output_prefix = splitext(args.output_map)[0]
            with open(output_prefix + '_unlocalized.txt',
                      'w') as unlocalized_file:
                for i in unlocalized:
                    unlocalized_file.write('{}\t{}\n'.format(*i))
            with open(output_prefix + '_unplaced.txt',
                      'w') as unplaced_file:
                for i in unplaced:
                    unplaced_file.write('{}\n'.format(i))

    elif args.command == 'transfer':
        total_count = transferred_count = 0
        if args.format == 'bed':
            transferrer = BedTransfer(args.map)
            with open(args.annotation) as input_file:
                with bioformats.bed.Writer(args.output) as output_file:
                    for feature in bioformats.bed.Reader(
                            input_file).records():
                        total_count += 1
                        transferred_feature = transferrer.feature(
                            feature)
                        if transferred_feature is not None:
                            transferred_count += 1
                            output_file.write(transferred_feature)
        elif args.format == 'gff3':
            transferrer = Gff3Transfer(args.map)
            with open(args.annotation) as input_file:
                with bioformats.gff3.Writer(args.output) as output_file:
                    for feature in bioformats.gff3.Reader(
                            input_file).records():
                        total_count += 1
                        transferred_feature = transferrer.feature(
                            feature)
                        if transferred_feature is not None:
                            transferred_count += 1
                            output_file.write(transferred_feature)
        elif args.format == 'vcf':
            transferrer = VcfTransfer(args.map)
            # both files are managed by 'with' so that they are closed
            # even if reading or transferring a variant fails
            with open(args.annotation) as input_file, \
                    open(args.output, 'w') as output_file:
                reader = vcf.Reader(input_file)
                writer = vcf.Writer(output_file, reader)
                for variant in reader:
                    total_count += 1
                    transferred_feature = transferrer.feature(variant)
                    if transferred_feature is not None:
                        transferred_count += 1
                        writer.write_record(transferred_feature)
                writer.close()

        logger.info('%d features transferred', transferred_count)
        logger.info('%d features skipped',
                    total_count - transferred_count)

    elif args.command == 'fastalength':
        seq_lengths = SeqLengths(args.fasta)
        with open(args.output, 'wt') as length_file:
            length_writer = csv.writer(length_file, delimiter='\t')
            for header, length in seq_lengths.lengths().iteritems():
                length_writer.writerow((header, length, ))

    elif args.command == 'simulator':
        fr_simulator = Simulator(args.fr_len, args.fr_num,
                                 args.chr_num, args.unplaced,
                                 args.gap_size)
        map_file = os.path.join(args.output_dir,
                                args.prefix + 'map.txt')
        chr_file = os.path.join(args.output_dir,
                                args.prefix + 'chromosomes.fa')
        fr_file = os.path.join(args.output_dir,
                               args.prefix + 'fragments.fa')
        fr_simulator.write(map_file, fr_file, chr_file)

    elif args.command == 'fragmentmapstat':
        fragment_map = Map()
        fragment_map.read(args.map)
        summary = fragment_map.summary()
        # a chromosome name followed by the summary values of it
        template = '\t'.join(['{}'] * 4) + '\n'
        with open(args.output, 'w') as output_file:
            for chromosome in sorted(summary.keys()):
                output_file.write(template.format(chromosome,
                                                  *summary[chromosome]))

    elif args.command == 'fragmentmapbed':
        fragment_map = Map()
        fragment_map.read(args.map)
        fragment_map.convert2bed(args.output)

    elif args.command == 'agp2map':
        agp2map(args.agp_file, args.output_file)
def test_assemble(self):
    """
    Test the assemble routine.

    Five fragments are placed on two chromosomes in both
    orientations, each fragment being followed by a gap. The
    chromosome sequences produced by Map.assemble are compared with
    the sequences built directly in the test. Finally, a record of a
    fragment that is missing from the FASTA file is added and the
    routine is expected to raise MapError.
    """
    # first, we form fragment and chromosome sequences
    fragments = {}
    fragment_pattern = ['AC', 'AG', 'CT', 'CG', 'AT']
    for i, pattern in enumerate(fragment_pattern):
        fragments['fragment{}'.format(i + 1)] = pattern * 5
    # a negative number indicates reverse orientation of a fragment
    chromosome_content = {'chr1': [1, -2, 3], 'chr2': [-4, 5]}

    # get the expected chromosome sequences
    complement = string.maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx')
    gap_size = 10
    chromosomes = {}
    for name, chromosome_fragments in chromosome_content.iteritems():
        parts = []
        for j in chromosome_fragments:
            fr_seq = fragments['fragment{}'.format(abs(j))]
            if j < 0:
                # a reversed fragment contributes its reverse complement
                fr_seq = fr_seq[::-1].translate(complement)
            parts.append(fr_seq)
            parts.append('N' * gap_size)
        chromosomes[name] = ''.join(parts)

    # construct a fragment map
    fragment_map = Map()
    for name, chromosome_fragments in chromosome_content.iteritems():
        current_start = 0
        for j in chromosome_fragments:
            fr_length = 10
            fragment_map.add_record(
                Map.Record('fragment{}'.format(abs(j)), fr_length, 0,
                           fr_length, '+' if j > 0 else '-', name,
                           current_start, current_start + fr_length))
            current_start += fr_length
            # add the gap
            fragment_map.add_record(
                Map.Record('GAP', gap_size, 0, gap_size, '+', name,
                           current_start, current_start + gap_size))
            current_start += gap_size

    output_chromosomes = os.path.join(self.__output_dir,
                                      'temp_chromosomes.txt')
    output_fragments = os.path.join(self.__output_dir,
                                    'temp_fragments.txt')

    try:
        # write the fragment sequences to a FASTA file
        with Writer(output_fragments) as writer:
            for fr_name, fr_seq in fragments.iteritems():
                writer.write(fr_name, fr_seq)

        fragment_map.assemble(output_fragments, output_chromosomes)

        # read chromosomes from the written FASTA file and compare
        # them to the expected ones
        assembled_chromosomes = pyfaidx.Fasta(output_chromosomes)
        for name, seq in chromosomes.iteritems():
            self.assertEqual(seq, assembled_chromosomes[name][:].seq)

        # try to use the fragment absent in the FASTA file of
        # fragment sequences
        fragment_map.add_record(
            Map.Record(fr_name='missing_fragment', fr_length=0,
                       fr_start=0, fr_end=0, fr_strand='+',
                       ref_chr='chr3', ref_start=0, ref_end=0))
        with self.assertRaises(MapError):
            fragment_map.assemble(output_fragments, output_chromosomes)
    finally:
        # remove the produced files even if a check above failed; a
        # file may be missing if the failure happened before it was
        # created, hence the existence test
        for path in (output_chromosomes, output_chromosomes + '.fai',
                     output_fragments, output_fragments + '.fai'):
            if os.path.exists(path):
                os.unlink(path)
def chromosomer():
    """
    The main function that is run if Chromosomer was launched. It
    defines a command-line parser which processes the arguments passed
    to the program, configures logging and runs the routine that
    corresponds to the chosen sub-command.
    """
    parser = argparse.ArgumentParser(
        description='Reference-assisted chromosome assembly tool.')
    subparsers = parser.add_subparsers(dest='command')

    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s 0.1.4')
    parser.add_argument('-d', '--debug', action='store_true',
                        help='show debugging messages')

    # Parser for the 'chromosomer assemble' part that produces a FASTA
    # file of assembled chromosomes from the specified fragment map.
    assemble_parser = subparsers.add_parser(
        'assemble',
        help='get sequences of assembled chromosomes',
        description='Get the FASTA file of assembled chromosomes.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # required arguments for the 'assemble' routine
    assemble_parser.add_argument('map', help='a fragment map file')
    assemble_parser.add_argument('fragment_fasta',
                                 help='a FASTA file of fragment '
                                 'sequences to be assembled')
    assemble_parser.add_argument('output_fasta',
                                 help='the output FASTA file of the '
                                 'assembled chromosome sequences')

    # optional arguments for the 'assemble' routine
    assemble_parser.add_argument('-s', '--save_soft_mask',
                                 action='store_true',
                                 help='keep soft masking from the '
                                 'original fragment sequences')

    # Parser for the 'chromosomer fragmentmap' part that
    # produces a map of fragment positions on reference
    # chromosomes from BLAST alignments of the fragments to the
    # chromosomes.
    fragmentmap_parser = subparsers.add_parser(
        'fragmentmap',
        description='Construct a fragment map from fragment '
        'alignments to reference chromosomes.',
        help='construct a fragment map from alignments',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # required arguments for the 'fragmentmap' routine
    fragmentmap_parser.add_argument(
        'alignment_file',
        help='a BLAST tabular file of fragment alignments to '
        'reference chromosomes')
    fragmentmap_parser.add_argument(
        'gap_size', type=int,
        help='a size of a gap inserted between mapped fragments')
    fragmentmap_parser.add_argument(
        'fragment_lengths',
        help='a file containing lengths of fragment sequences; it can '
        'be obtained using the \'chromosomer fastalength\' tool')
    fragmentmap_parser.add_argument(
        'output_map', help='an output fragment map file name')

    # optional arguments for the 'fragmentmap' routine
    fragmentmap_parser.add_argument(
        '-r', '--ratio_threshold', type=float, default=1.2,
        help='the least ratio of two greatest fragment alignment '
        'scores to determine the fragment placed to a reference '
        'genome')
    fragmentmap_parser.add_argument(
        '-s', '--shrink_gaps', action='store_true',
        help='shrink large interfragment gaps to the specified size')

    # Parser for the 'chromosomer fragmentmapstat' part that reports
    # statistics on a fragment map
    fragmentmapstat_parser = subparsers.add_parser(
        'fragmentmapstat',
        description='Show statistics on a fragment map.',
        help='show fragment map statistics')

    # required arguments for the 'fragmentmapstat' routine
    fragmentmapstat_parser.add_argument('map', help='a fragment map file')
    fragmentmapstat_parser.add_argument('output',
                                        help='an output file of '
                                        'fragment map statistics')

    # Parser for the 'chromosomer fragmentmapbed' part that converts
    # a fragment map to the BED format
    fragmentmapbed_parser = subparsers.add_parser(
        'fragmentmapbed',
        description='Convert a fragment map to the BED format.',
        help='convert a fragment map to the BED format')

    # required arguments for the 'fragmentmapbed' routine
    fragmentmapbed_parser.add_argument('map', help='a fragment map file')
    fragmentmapbed_parser.add_argument('output',
                                       help='an output BED file '
                                       'representing the '
                                       'fragment map')

    # Parser for the 'chromosomer transfer' part that transfers
    # genome feature annotation from fragments to their assembly
    transfer_parser = subparsers.add_parser(
        'transfer',
        description='Transfer annotated genomic features from '
        'fragments to their assembly.',
        help='transfer annotated features from fragments to '
        'chromosomes',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # required arguments for the 'transfer' routine
    transfer_parser.add_argument('map', help='a fragment map file')
    transfer_parser.add_argument('annotation',
                                 help='a file of annotated genome '
                                 'features')
    transfer_parser.add_argument('output',
                                 help='an output file of the '
                                 'transfered annotation')

    # optional arguments for the 'transfer' routine
    transfer_parser.add_argument('-f', '--format', default='bed',
                                 choices=['bed', 'gff3', 'vcf'],
                                 help='the format of a file of '
                                 'annotated features (bed, '
                                 'gff3 or vcf)')

    # Parser for the 'chromosomer fastalength' part that calculates
    # lengths of sequences in the given FASTA file.
    fastalength_parser = subparsers.add_parser(
        'fastalength',
        description='Get lengths of sequences in the specified FASTA '
        'file (required to build a fragment map).',
        help='get lengths of sequences from a FASTA file',
    )

    # required arguments for the 'fastalength' routine
    fastalength_parser.add_argument('fasta',
                                    help='a FASTA file which sequence '
                                    'lengths are to be obtained')
    fastalength_parser.add_argument('output',
                                    help='an output file of sequence '
                                    'lengths')

    # Parser for the 'chromosomer simulator' routine
    simulator_parser = subparsers.add_parser(
        'simulator',
        description='Simulate fragments and test assembly for '
        'testing purposes.',
        help='fragment simulator for testing purposes')

    # required arguments for the 'simulator' routine
    simulator_parser.add_argument('fr_num', type=int,
                                  help='the number of '
                                  'chromosome fragments')
    simulator_parser.add_argument('fr_len', type=int,
                                  help='the length of fragments')
    simulator_parser.add_argument('chr_num', type=int,
                                  help='the number of chromosomes')
    simulator_parser.add_argument('output_dir',
                                  help='the directory for output files')
    simulator_parser.add_argument('-g', '--gap_size', type=int,
                                  default=2000,
                                  help='the size of gaps between '
                                  'fragments on a chromosome')
    # NOTE(review): '--unplaced' has no default, so None is passed to
    # Simulator when the option is omitted -- confirm that Simulator
    # accepts it
    simulator_parser.add_argument('-p', '--unplaced', type=int,
                                  help='the number of unplaced '
                                  'fragments')
    simulator_parser.add_argument('--prefix', default='',
                                  help='the prefix for output file '
                                  'names')

    # Parser for the 'chromosomer agp2map' routine
    agp2map_parser = subparsers.add_parser(
        'agp2map',
        description='Convert an AGP file to the fragment map format.',
        help='convert an AGP file to a fragment map')

    # required arguments for the 'agp2map' routine
    agp2map_parser.add_argument('agp_file', help='an AGP file')
    agp2map_parser.add_argument('output_file',
                                help='the output '
                                'fragment map '
                                'file')

    args = parser.parse_args()

    # The same level is applied to the loggers and to the handler: a
    # handler drops every record below its own level, so a handler
    # fixed at INFO would hide the messages requested with '--debug'.
    log_level = logging.DEBUG if args.debug else logging.INFO
    logger.setLevel(log_level)
    logger.propagate = False

    formatter = logging.Formatter('%(asctime)-15s - %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    ch = logging.StreamHandler()
    ch.setLevel(log_level)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    logging.basicConfig()
    cli_logger = logging.getLogger(__name__)
    cli_logger.propagate = False
    cli_logger.addHandler(ch)
    # if this logger is the same object as 'logger', a fixed INFO level
    # here would silently cancel the '--debug' setting made above
    cli_logger.setLevel(log_level)

    if args.command == 'assemble':
        fragment_map = Map()
        fragment_map.read(args.map)
        fragment_map.assemble(args.fragment_fasta, args.output_fasta,
                              args.save_soft_mask)

    elif args.command == 'fragmentmap':
        fragment_lengths = read_fragment_lengths(args.fragment_lengths)
        map_creator = AlignmentToMap(args.gap_size, fragment_lengths)
        with open(args.alignment_file) as alignment_file:
            alignments = BlastTab(alignment_file)
            fragment_map, unlocalized, unplaced = map_creator.blast(
                alignments, args.ratio_threshold)
            if args.shrink_gaps:
                fragment_map.shrink_gaps(args.gap_size)
            fragment_map.write(args.output_map)
            # write unlocalized and unplaced fragments
            output_prefix = splitext(args.output_map)[0]
            with open(output_prefix + '_unlocalized.txt',
                      'w') as unlocalized_file:
                for i in unlocalized:
                    unlocalized_file.write('{}\t{}\n'.format(*i))
            with open(output_prefix + '_unplaced.txt',
                      'w') as unplaced_file:
                for i in unplaced:
                    unplaced_file.write('{}\n'.format(i))

    elif args.command == 'transfer':
        total_count = transferred_count = 0
        if args.format == 'bed':
            transferrer = BedTransfer(args.map)
            with open(args.annotation) as input_file:
                with bioformats.bed.Writer(args.output) as output_file:
                    for feature in bioformats.bed.Reader(
                            input_file).records():
                        total_count += 1
                        transferred_feature = transferrer.feature(feature)
                        if transferred_feature is not None:
                            transferred_count += 1
                            output_file.write(transferred_feature)
        elif args.format == 'gff3':
            transferrer = Gff3Transfer(args.map)
            with open(args.annotation) as input_file:
                with bioformats.gff3.Writer(args.output) as output_file:
                    for feature in bioformats.gff3.Reader(
                            input_file).records():
                        total_count += 1
                        transferred_feature = transferrer.feature(feature)
                        if transferred_feature is not None:
                            transferred_count += 1
                            output_file.write(transferred_feature)
        elif args.format == 'vcf':
            transferrer = VcfTransfer(args.map)
            # both files are managed by 'with' so that they are closed
            # even if reading or transferring a variant fails
            with open(args.annotation) as input_file, \
                    open(args.output, 'w') as output_file:
                reader = vcf.Reader(input_file)
                writer = vcf.Writer(output_file, reader)
                for variant in reader:
                    total_count += 1
                    transferred_feature = transferrer.feature(variant)
                    if transferred_feature is not None:
                        transferred_count += 1
                        writer.write_record(transferred_feature)
                writer.close()

        logger.info('%d features transferred', transferred_count)
        logger.info('%d features skipped',
                    total_count - transferred_count)

    elif args.command == 'fastalength':
        seq_lengths = SeqLengths(args.fasta)
        with open(args.output, 'wt') as length_file:
            length_writer = csv.writer(length_file, delimiter='\t')
            for header, length in seq_lengths.lengths().iteritems():
                length_writer.writerow((
                    header,
                    length,
                ))

    elif args.command == 'simulator':
        fr_simulator = Simulator(args.fr_len, args.fr_num, args.chr_num,
                                 args.unplaced, args.gap_size)
        map_file = os.path.join(args.output_dir, args.prefix + 'map.txt')
        chr_file = os.path.join(args.output_dir,
                                args.prefix + 'chromosomes.fa')
        fr_file = os.path.join(args.output_dir,
                               args.prefix + 'fragments.fa')
        fr_simulator.write(map_file, fr_file, chr_file)

    elif args.command == 'fragmentmapstat':
        fragment_map = Map()
        fragment_map.read(args.map)
        summary = fragment_map.summary()
        # a chromosome name followed by the summary values of it
        template = '\t'.join(['{}'] * 4) + '\n'
        with open(args.output, 'w') as output_file:
            for chromosome in sorted(summary.keys()):
                output_file.write(
                    template.format(chromosome, *summary[chromosome]))

    elif args.command == 'fragmentmapbed':
        fragment_map = Map()
        fragment_map.read(args.map)
        fragment_map.convert2bed(args.output)

    elif args.command == 'agp2map':
        agp2map(args.agp_file, args.output_file)