def parse_basic_args(parser):
    '''Parse the command line and prepare the input and output file handles.

    parser is an argparse-style parser: it must provide parse_args() and
    error(). The parsed namespace is expected to have the attributes
    "input", "in_format" and the one named by OUTFILE.

    Raises WrongFormatError when an explicit input format is given and it
    does not match the format detected for the first input file.

    NOTE(review): the visible body ends without a return statement, so this
    block returns None as written; it looks like the tail of the function
    (building and returning the arguments) is missing - confirm.
    '''
    parsed_args = parser.parse_args()

    # if the input is stdin it will be a fhand, not a list of fhands,
    # so we have to convert it to a list
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]

    # we have to wrap the file in a BufferedReader to allow peeking into stdin
    wrapped_fhands = []
    for fhand in in_fhands:
        fhand = wrap_in_buffered_reader(fhand)
        fhand = uncompress_if_required(fhand)
        wrapped_fhands.append(fhand)

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if in_format == GUESS_FORMAT:
        for wrapped_fhand in wrapped_fhands:
            get_format(wrapped_fhand)
    else:
        if in_format != get_format(wrapped_fhands[0]):
            msg = 'The given input format does not correspond to the input'
            msg += ' file'
            raise WrongFormatError(msg)

        if 'fastq' in in_format:
            # the format is still detected per file for the fastq formats
            for wrapped_fhand in wrapped_fhands:
                get_format(wrapped_fhand)
        else:
            # we dont set the first one because we already did it in the
            # previous check
            for wrapped_fhand in wrapped_fhands[1:]:
                set_format(wrapped_fhand, in_format)

    out_fhand = getattr(parsed_args, OUTFILE)
    comp_kind = get_requested_compression(parsed_args)
    if isinstance(out_fhand, list):
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError as error:
                # "except ... as ..." is valid in python 2.6+ and python 3,
                # the old comma form was python 2 only
                parser.error(error)
            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
def _sorted_mapped_reads(ref_fpath, paired_fpaths=None, unpaired_fpaths=None,
                         directory=None, file_format=None, min_seed_len=None):
    '''Map the reads with bwa mem and return them sorted by query name.

    ref_fpath: path of the reference, used to get or create the bwa index.
    paired_fpaths, unpaired_fpaths: sequences of read file paths, at least
        one of them has to be given. Both are handed over to
        map_with_bwamem as they are.
    directory: passed to get_or_create_bwa_index.
    file_format: when given, it is registered for the first read file with
        set_format, otherwise get_format is run on that file.
    min_seed_len: when given, it is passed to bwa as "-k <min_seed_len>".

    Returns a pysam.Samfile opened on a temporary file holding the mapped
    reads sorted with key='queryname'.

    Raises ValueError when no read file is given.
    '''
    if paired_fpaths:
        first_fpath = paired_fpaths[0]
    elif unpaired_fpaths:
        first_fpath = unpaired_fpaths[0]
    else:
        raise ValueError('No paired or unpaired read files were given')

    # Only the first read file is inspected. The handle is not used anymore
    # afterwards, so we close it instead of leaking it.
    fhand = open(first_fpath)
    try:
        if file_format is not None:
            set_format(fhand, file_format)
        else:
            get_format(fhand)
    finally:
        fhand.close()

    index_fpath = get_or_create_bwa_index(ref_fpath, directory)
    extra_params = ['-a', '-M']
    if min_seed_len is not None:
        # command line parameters have to be strings
        extra_params.extend(['-k', str(min_seed_len)])
    bwa = map_with_bwamem(index_fpath, paired_fpaths=paired_fpaths,
                          unpaired_fpath=unpaired_fpaths,
                          extra_params=extra_params)

    # The temporary file goes to the default temporary directory (it honors
    # TMPDIR), not to a path that only exists in one developer's machine.
    # NOTE(review): bam_fhand goes out of scope on return, so the temporary
    # file may be removed while the returned Samfile is still open -
    # presumably this relies on POSIX unlink semantics, confirm.
    bam_fhand = NamedTemporaryFile()
    sort_mapped_reads(bwa, bam_fhand.name, key='queryname')
    bamfile = pysam.Samfile(bam_fhand.name)
    return bamfile
def test_get_format_fhand(self):
    """It checks that get_format and set_format work with file handles."""
    # a real file on disk with one fasta record
    tmp_fhand = NamedTemporaryFile()
    tmp_fhand.write(">seq\natgctacgacta\n")
    tmp_fhand.flush()

    # the inventory entries are looked up by (id of the fhand, file name)
    fpath = tmp_fhand.name
    inventory_key = (id(tmp_fhand), fpath)

    guessed_format = get_format(tmp_fhand)
    assert FILEFORMAT_INVENTORY[inventory_key] == guessed_format
    n_entries = len(FILEFORMAT_INVENTORY)

    # asking again for the same fhand must not add a new entry
    guessed_format = get_format(tmp_fhand)
    assert FILEFORMAT_INVENTORY[inventory_key] == guessed_format
    assert len(FILEFORMAT_INVENTORY) == n_entries

    # a format set by hand is the one that get_format gives back
    tmp_fhand = NamedTemporaryFile()
    set_format(tmp_fhand, "fasta")
    assert "fasta" == get_format(tmp_fhand)
def test_get_format_fhand(self):
    'It checks the get_format and set_format functions using file handles'
    # file fhand: a temporary file that holds a single fasta sequence
    seq_fhand = NamedTemporaryFile()
    seq_fhand.write('>seq\natgctacgacta\n')
    seq_fhand.flush()

    fname = seq_fhand.name
    fhand_id = id(seq_fhand)

    # the detected format is stored in the inventory for this fhand
    detected = get_format(seq_fhand)
    assert FILEFORMAT_INVENTORY[(fhand_id, fname)] == detected

    # a second call gives the same answer and keeps the inventory size
    size_before = len(FILEFORMAT_INVENTORY)
    detected = get_format(seq_fhand)
    assert FILEFORMAT_INVENTORY[(fhand_id, fname)] == detected
    assert len(FILEFORMAT_INVENTORY) == size_before

    # for a new fhand the format set with set_format is the one returned
    seq_fhand = NamedTemporaryFile()
    set_format(seq_fhand, 'fasta')
    assert 'fasta' == get_format(seq_fhand)
def parse_basic_args(parser):
    '''Parse the command line and prepare the input and output file handles.

    parser is an argparse-style parser: it must provide parse_args() and
    error(). The parsed namespace is expected to have the attributes
    "input", "in_format" and the one named by OUTFILE.

    When the input format is GUESS_FORMAT the format of every input is
    detected with get_format, otherwise the given format is set for every
    input with set_format, without checking it against the file content.

    NOTE(review): the visible body ends without a return statement, so this
    block returns None as written; it looks like the tail of the function
    (building and returning the arguments) is missing - confirm.
    '''
    parsed_args = parser.parse_args()

    # if the input is stdin it will be a fhand, not a list of fhands,
    # so we have to convert it to a list
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]

    # we have to wrap the file in a BufferedReader to allow peeking into stdin
    wrapped_fhands = []
    for fhand in in_fhands:
        fhand = wrap_in_buffered_reader(fhand)
        fhand = uncompress_if_required(fhand)
        wrapped_fhands.append(fhand)

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if in_format == GUESS_FORMAT:
        for wrapped_fhand in wrapped_fhands:
            get_format(wrapped_fhand)
    else:
        for wrapped_fhand in wrapped_fhands:
            set_format(wrapped_fhand, in_format)

    out_fhand = getattr(parsed_args, OUTFILE)
    comp_kind = get_requested_compression(parsed_args)
    if isinstance(out_fhand, list):
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError as error:
                # "except ... as ..." is valid in python 2.6+ and python 3,
                # the old comma form was python 2 only
                parser.error(error)
            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
def test_pair_matcher(self):
    'It tests the pair matcher function'

    def match_test_files(fwd_fname, rev_fname):
        '''It runs match_pairs on two files of the test data directory.

        It returns the text written to the paired output and the text
        written to the orphan output.
        '''
        fwd_fpath = os.path.join(TEST_DATA_DIR, fwd_fname)
        rev_fpath = os.path.join(TEST_DATA_DIR, rev_fname)
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        # The input files stay open while match_pairs consumes the reads
        # and are closed afterwards, so no handle is leaked.
        with open(fwd_fpath) as fwd_fhand:
            with open(rev_fpath) as rev_fhand:
                fwd_seqs = read_seqs([fwd_fhand])
                rev_seqs = read_seqs([rev_fhand])
                seqs = flat_zip_longest(fwd_seqs, rev_seqs)
                match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)
        return out_fhand.getvalue(), orphan_out_fhand.getvalue()

    # with equal seqs but the last ones
    output, orp = match_test_files('pairend1.sfastq', 'pairend2.sfastq')
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with the firsts seqs different
    output, orp = match_test_files('pairend1.sfastq', 'pairend3.sfastq')
    assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
    assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    output, orp = match_test_files('pairend4.sfastq', 'pairend2.sfastq')
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

    # with reads with no direction
    output, orp = match_test_files('pairend7.sfastq', 'pairend2.sfastq')
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
    assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    # NOTE(review): this assertion was written twice, word for word. Maybe
    # one of the copies was meant to check a different read - confirm.
    assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
    assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
    assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
    assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp

    # File is not sorted
    file1 = '''@s1.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s2.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s1.r
AACCAGTCAAC
+
CCCFFFFFGHH
'''
    file1 = StringIO(file1)
    set_format(file1, 'fastq')
    seqs = read_seqs([file1])
    out_fhand = StringIO()
    orphan_out_fhand = StringIO()
    out_format = 'fastq'
    try:
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    check_order_buffer_size=10)
    except ItemsNotSortedError:
        pass
    else:
        self.fail('ItemsNotSortedError error expected')