コード例 #1
0
ファイル: bin_utils.py プロジェクト: milw/seq_crumbs
def parse_basic_args(parser):
    """Parse the command line and prepare the input and output file handles.

    Every input handle is passed through wrap_in_buffered_reader and
    uncompress_if_required.  The input format is then either guessed for
    every handle or checked against the format detected for the first
    handle, and every output handle of a list is passed through
    compress_fhand with the requested compression kind.

    parser -- argument parser; its parse_args() result must provide the
              ``input`` and ``in_format`` attributes and the attribute named
              by OUTFILE.  Its error() method is used to report compression
              failures.

    Raises WrongFormatError if an explicit input format differs from the one
    detected for the first input.

    NOTE(review): the original docstring says a dict with the arguments is
    returned, but no return statement is visible in this part of the
    function -- confirm against the complete source.
    """
    parsed_args = parser.parse_args()

    # If the input is stdin it will be a fhand, not a list of fhands, so it
    # has to be converted to a list.
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]

    # The files are wrapped in a BufferedReader to allow peeking into stdin.
    wrapped_fhands = [uncompress_if_required(wrap_in_buffered_reader(fhand))
                      for fhand in in_fhands]

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if in_format == GUESS_FORMAT:
        for wrapped_fhand in wrapped_fhands:
            get_format(wrapped_fhand)
    else:
        if in_format != get_format(wrapped_fhands[0]):
            msg = 'The given input format does not correspond to the input'
            msg += ' file'
            raise WrongFormatError(msg)

        if 'fastq' in in_format:
            # fastq inputs are all run through the format detection.
            for wrapped_fhand in wrapped_fhands:
                get_format(wrapped_fhand)
        else:
            # The first one is not set because get_format was already
            # called on it by the check above.
            for wrapped_fhand in wrapped_fhands[1:]:
                set_format(wrapped_fhand, in_format)

    out_fhand = getattr(parsed_args, OUTFILE)

    comp_kind = get_requested_compression(parsed_args)
    if isinstance(out_fhand, list):
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            # "except X as name" works on Python 2.6+ and Python 3; the old
            # "except X, name" form is a syntax error on Python 3.
            except RuntimeError as error:
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
コード例 #2
0
ファイル: filters.py プロジェクト: radaniba/seq_crumbs
def _sorted_mapped_reads(ref_fpath, paired_fpaths=None,
                     unpaired_fpaths=None, directory=None,
                     file_format=None, min_seed_len=None):
    """Map the reads with bwa mem and return them sorted by query name.

    ref_fpath -- path of the reference, handed to get_or_create_bwa_index.
    paired_fpaths, unpaired_fpaths -- lists of read file paths; at least one
        of them must be non-empty because its first file is opened.
    directory -- forwarded to get_or_create_bwa_index.
    file_format -- if given it is registered for the first read file with
        set_format, otherwise get_format is run on that file.
    min_seed_len -- if given it is passed to bwa as the value of ``-k``.

    Returns the pysam.Samfile opened on the sorted temporary file.
    """
    first_fpath = paired_fpaths[0] if paired_fpaths else unpaired_fpaths[0]
    # The handle is only needed for the format calls, so it is closed right
    # away instead of being left open.
    with open(first_fpath) as fhand:
        if file_format is not None:
            set_format(fhand, file_format)
        else:
            get_format(fhand)

    index_fpath = get_or_create_bwa_index(ref_fpath, directory)
    extra_params = ['-a', '-M']
    if min_seed_len is not None:
        extra_params.extend(['-k', min_seed_len])
    bwa = map_with_bwamem(index_fpath, paired_fpaths=paired_fpaths,
                         unpaired_fpath=unpaired_fpaths,
                         extra_params=extra_params)

    # Use the default temporary directory: the previous hard-coded
    # '/home/carlos/tmp' only exists on the original developer's machine.
    bam_fhand = NamedTemporaryFile()
    sort_mapped_reads(bwa, bam_fhand.name, key='queryname')
    # NOTE(review): the temporary file is deleted when bam_fhand is closed or
    # garbage collected; presumably the returned Samfile keeps its own open
    # descriptor -- confirm this is enough for the callers.
    bamfile = pysam.Samfile(bam_fhand.name)
    return bamfile
コード例 #3
0
ファイル: test_utils.py プロジェクト: radaniba/seq_crumbs
    def test_get_format_fhand(self):
        "Exercise get_format and set_format on temporary file handles"
        # A handle with fasta content: the detected format has to be found
        # in the inventory under the (id, name) key of the handle.
        tmp_fhand = NamedTemporaryFile()
        tmp_fhand.write(">seq\natgctacgacta\n")
        tmp_fhand.flush()
        inventory_key = (id(tmp_fhand), tmp_fhand.name)

        detected = get_format(tmp_fhand)
        assert FILEFORMAT_INVENTORY[inventory_key] == detected
        inventory_size = len(FILEFORMAT_INVENTORY)

        # Asking a second time must not add entries to the inventory.
        detected = get_format(tmp_fhand)
        assert FILEFORMAT_INVENTORY[inventory_key] == detected
        assert len(FILEFORMAT_INVENTORY) == inventory_size

        # A format that was set explicitly is the one reported back.
        tmp_fhand = NamedTemporaryFile()
        set_format(tmp_fhand, "fasta")

        reported = get_format(tmp_fhand)
        assert "fasta" == reported
コード例 #4
0
    def test_get_format_fhand(self):
        'The format of a fhand can be detected, looked up and set'
        seq_fhand = NamedTemporaryFile()
        seq_fhand.write('>seq\natgctacgacta\n')
        seq_fhand.flush()
        fhand_id, fhand_name = id(seq_fhand), seq_fhand.name

        # first detection: the result is stored under (id, name)
        first_format = get_format(seq_fhand)
        stored_format = FILEFORMAT_INVENTORY[(fhand_id, fhand_name)]
        assert stored_format == first_format
        entries_before = len(FILEFORMAT_INVENTORY)

        # second detection: same answer and the inventory does not grow
        second_format = get_format(seq_fhand)
        stored_format = FILEFORMAT_INVENTORY[(fhand_id, fhand_name)]
        assert stored_format == second_format
        entries_after = len(FILEFORMAT_INVENTORY)
        assert entries_after == entries_before

        # a fresh, empty fhand reports the format it was given
        seq_fhand = NamedTemporaryFile()
        set_format(seq_fhand, 'fasta')

        assert 'fasta' == get_format(seq_fhand)
コード例 #5
0
ファイル: bin_utils.py プロジェクト: pziarsolo/seq_crumbs
def parse_basic_args(parser):
    """Parse the command line and prepare the input and output file handles.

    Every input handle is passed through wrap_in_buffered_reader and
    uncompress_if_required.  The input format is guessed for every handle
    when GUESS_FORMAT was requested and otherwise set to the given format.
    Every output handle of a list is passed through compress_fhand with the
    requested compression kind.

    parser -- argument parser; its parse_args() result must provide the
              ``input`` and ``in_format`` attributes and the attribute named
              by OUTFILE.  Its error() method is used to report compression
              failures.

    NOTE(review): the original docstring says a dict with the arguments is
    returned, but no return statement is visible in this part of the
    function -- confirm against the complete source.
    """
    parsed_args = parser.parse_args()

    # If the input is stdin it will be a fhand, not a list of fhands, so it
    # has to be converted to a list.
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]

    # The files are wrapped in a BufferedReader to allow peeking into stdin.
    wrapped_fhands = [uncompress_if_required(wrap_in_buffered_reader(fhand))
                      for fhand in in_fhands]

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if in_format == GUESS_FORMAT:
        for wrapped_fhand in wrapped_fhands:
            get_format(wrapped_fhand)
    else:
        for wrapped_fhand in wrapped_fhands:
            set_format(wrapped_fhand, in_format)

    out_fhand = getattr(parsed_args, OUTFILE)

    comp_kind = get_requested_compression(parsed_args)
    if isinstance(out_fhand, list):
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            # "except X as name" works on Python 2.6+ and Python 3; the old
            # "except X, name" form is a syntax error on Python 3.
            except RuntimeError as error:
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
コード例 #6
0
def parse_basic_args(parser):
    """Parse the command line and get the input and output handles ready.

    The inputs are normalised to a list, and each one is passed through
    wrap_in_buffered_reader and then uncompress_if_required.  With
    GUESS_FORMAT the format of each input is detected via get_format;
    any other value is registered for each input via set_format.  When the
    output is a list, each handle is replaced by the result of
    compress_fhand for the requested compression kind.

    parser -- argument parser; its parse_args() result must provide the
              ``input`` and ``in_format`` attributes and the attribute named
              by OUTFILE.  A RuntimeError from compress_fhand is reported
              through its error() method.

    NOTE(review): the original docstring says a dict with the arguments is
    returned, but no return statement is visible in this part of the
    function -- confirm against the complete source.
    """
    parsed_args = parser.parse_args()

    # stdin arrives as a single fhand, not as a list of fhands.
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]

    # The BufferedReader wrapping is what allows peeking into stdin.
    wrapped_fhands = []
    for fhand in in_fhands:
        buffered = wrap_in_buffered_reader(fhand)
        wrapped_fhands.append(uncompress_if_required(buffered))

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    guess = in_format == GUESS_FORMAT
    for wrapped_fhand in wrapped_fhands:
        if guess:
            get_format(wrapped_fhand)
        else:
            set_format(wrapped_fhand, in_format)

    out_fhand = getattr(parsed_args, OUTFILE)

    comp_kind = get_requested_compression(parsed_args)
    if isinstance(out_fhand, list):
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            # The "as" form is valid on Python 2.6+ and on Python 3, unlike
            # the previous "except RuntimeError, error" spelling.
            except RuntimeError as error:
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
コード例 #7
0
ファイル: test_pairing.py プロジェクト: djinnome/seq_crumbs
    def test_pair_matcher(self):
        'It test the pair matcher function'
        # with equal seqs but the last ones
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with the firsts seqs different
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend3.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        file1 = os.path.join(TEST_DATA_DIR, 'pairend4.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with reads with no direcction
        file1 = os.path.join(TEST_DATA_DIR, 'pairend7.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)
        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output

        orp = orphan_out_fhand.getvalue()
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp

        # File is not sorted
        file1 = '''@s1.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s2.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s1.r
AACCAGTCAAC
+
CCCFFFFFGHH
'''
        file1 = StringIO(file1)
        set_format(file1, 'fastq')
        seqs = read_seqs([file1])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        try:
            match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                        check_order_buffer_size=10)
            output = out_fhand.getvalue()
            self.fail('ItemsNotSortedError error expected')
        except ItemsNotSortedError:
            pass
コード例 #8
0
    def test_pair_matcher(self):
        'It test the pair matcher function'
        # with equal seqs but the last ones
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with the firsts seqs different
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend3.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        file1 = os.path.join(TEST_DATA_DIR, 'pairend4.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with reads with no direcction
        file1 = os.path.join(TEST_DATA_DIR, 'pairend7.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)
        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output

        orp = orphan_out_fhand.getvalue()
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp

        # File is not sorted
        file1 = '''@s1.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s2.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s1.r
AACCAGTCAAC
+
CCCFFFFFGHH
'''
        file1 = StringIO(file1)
        set_format(file1, 'fastq')
        seqs = read_seqs([file1])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        try:
            match_pairs(seqs,
                        out_fhand,
                        orphan_out_fhand,
                        out_format,
                        check_order_buffer_size=10)
            output = out_fhand.getvalue()
            self.fail('ItemsNotSortedError error expected')
        except ItemsNotSortedError:
            pass