def test_fastq(self):
    'It guesses the format of illumina and sanger looking fastq files'

    title = 'HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
    seq = 'TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n'
    # The backslash is written as an explicit '\\'. The former '\]' is not a
    # valid escape sequence and newer interpreters warn about it; the
    # resulting text is exactly the same.
    illumina_qual = ('efcfffffcfeefffcffffffddf`feed]`]_Ba_^__'
                     '[YBBBBBBBBBBRTT\\]][]\n')

    # one-line record with illumina looking qualities
    txt = '@' + title + seq + '+' + title + illumina_qual
    fhand = StringIO(txt)
    assert get_format(fhand) == 'fastq-illumina'

    # records with two sequence lines and two quality lines, given twice
    txt = '@' + title + seq + seq + '+' + title
    txt += illumina_qual + illumina_qual

    fhand = StringIO(txt + txt)
    assert get_format(fhand) == 'fastq-illumina'

    # this content is expected to be rejected with the undecided error, so
    # the returned value is irrelevant and it is not compared with anything
    fhand = StringIO('@HWI-EAS209\n@')
    try:
        get_format(fhand)
    except UndecidedFastqVersionError:
        pass
    else:
        self.fail('UndecidedFastqVersionError expected')

    # sanger
    txt = '@' + title + seq + '+' + title
    txt += '000000000000000000000000000000000000000000000000000000000000\n'
    fhand = StringIO(txt)
    assert get_format(fhand) == 'fastq'
Example #2
0
    def test_fastq(self):
        'It guesses the format of illumina and sanger looking fastq files'

        title = 'HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
        seq = 'TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n'
        # The backslash is written as an explicit '\\'. The former '\]' is
        # not a valid escape sequence and newer interpreters warn about it;
        # the resulting text is exactly the same.
        illumina_qual = ('efcfffffcfeefffcffffffddf`feed]`]_Ba_^__'
                         '[YBBBBBBBBBBRTT\\]][]\n')

        # one-line record with illumina looking qualities
        txt = '@' + title + seq + '+' + title + illumina_qual
        fhand = StringIO(txt)
        assert get_format(fhand) == 'fastq-illumina'

        # records with two sequence lines and two quality lines, given twice
        txt = '@' + title + seq + seq + '+' + title
        txt += illumina_qual + illumina_qual

        fhand = StringIO(txt + txt)
        assert get_format(fhand) == 'fastq-illumina'

        # this content is expected to be rejected with the undecided error,
        # so the returned value is irrelevant and it is not compared
        fhand = StringIO('@HWI-EAS209\n@')
        try:
            get_format(fhand)
        except UndecidedFastqVersionError:
            pass
        else:
            self.fail('UndecidedFastqVersionError expected')

        # sanger
        txt = '@' + title + seq + '+' + title
        txt += '000000000000000000000000000000000000000000000000000000000000\n'
        fhand = StringIO(txt)
        assert get_format(fhand) == 'fastq'
Example #3
0
 def test_empty_file(self):
     'Guessing the format of an empty file should raise FileIsEmptyError'
     empty_fhand = StringIO()
     try:
         get_format(empty_fhand)
     except FileIsEmptyError:
         # this is the expected outcome
         pass
     else:
         self.fail('FileIsEmptyError expected')
Example #4
0
 def test_unkown(self):
     'Guessing an unrecognized content should raise UnknownFormatError'
     odd_fhand = StringIO('xseq\nACTC\n')
     try:
         get_format(odd_fhand)
     except UnknownFormatError:
         # this is the expected outcome
         pass
     else:
         self.fail('UnknownFormatError expected')
 def test_empty_file(self):
     'Guessing the format of an empty file should raise FileIsEmptyError'
     empty_fhand = StringIO()
     try:
         get_format(empty_fhand)
     except FileIsEmptyError:
         # this is the expected outcome
         pass
     else:
         self.fail('FileIsEmptyError expected')
 def test_unkown(self):
     'Guessing an unrecognized content should raise UnknownFormatError'
     odd_fhand = StringIO('xseq\nACTC\n')
     try:
         get_format(odd_fhand)
     except UnknownFormatError:
         # this is the expected outcome
         pass
     else:
         self.fail('UnknownFormatError expected')
 def test_long_illumina(self):
     'Qualities that look illumina in reads that are too long for it'
     read_len = 400
     # a single fastq record: title, sequence, separator and qualities
     record_parts = ['@read\n', 'T' * read_len, '\n', '+\n',
                     '@' * read_len, '\n']
     long_fhand = StringIO(''.join(record_parts))
     try:
         get_format(long_fhand)
     except UndecidedFastqVersionError:
         # this is the expected outcome
         pass
     else:
         self.fail('UndecidedFastqVersionError expected')
Example #8
0
 def test_long_illumina(self):
     'Qualities that look illumina in reads that are too long for it'
     read_len = 400
     # a single fastq record: title, sequence, separator and qualities
     record_parts = ['@read\n', 'T' * read_len, '\n', '+\n',
                     '@' * read_len, '\n']
     long_fhand = StringIO(''.join(record_parts))
     try:
         get_format(long_fhand)
     except UndecidedFastqVersionError:
         # this is the expected outcome
         pass
     else:
         self.fail('UndecidedFastqVersionError expected')
Example #9
0
def parse_basic_args(parser):
    'It parses the command line and it returns a dict with the arguments.'
    # NOTE(review): this definition is cut off after the output handles are
    # prepared; the part that builds and returns the result is not visible
    # here.
    parsed_args = parser.parse_args()
    # we have to wrap the file in a BufferedReader to allow peeking into stdin
    wrapped_fhands = []
    # if input is stdin it will be a fhand not a list of fhands.
    # we have to convert to a list
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]
    for fhand in in_fhands:
        # every input goes through both helpers, in this order
        fhand = wrap_in_buffered_reader(fhand)
        fhand = uncompress_if_required(fhand)
        wrapped_fhands.append(fhand)

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if in_format == GUESS_FORMAT:
        # the returned format is discarded here; presumably get_format
        # records it for each handle -- TODO confirm
        for wrapped_fhand in wrapped_fhands:
            get_format(wrapped_fhand)
    else:
        # an explicit format has to agree with the one guessed for the first
        # input file, the other files are not checked
        if in_format != get_format(wrapped_fhands[0]):
            msg = 'The given input format does not correspond to the input'
            msg += ' file'
            raise WrongFormatError(msg)

        if 'fastq' in in_format:
            # for fastq the format of every file is guessed instead of using
            # the requested one
            for wrapped_fhand in wrapped_fhands:
                get_format(wrapped_fhand)
        else:
            # we dont set the first one because already did in the previous
            # checking
            for wrapped_fhand in wrapped_fhands[1:]:
                set_format(wrapped_fhand, in_format)

    out_fhand = getattr(parsed_args, OUTFILE)

    comp_kind = get_requested_compression(parsed_args)
    if isinstance(out_fhand, list):
        # several outputs: each one is replaced by the handle returned by
        # compress_fhand
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                # a RuntimeError is handed over to the parser error reporting
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
Example #10
0
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the whole title line as the key and not just the id.

    fpath is the path of the sequence file. file_format is the name of its
    format; when it is None the format is guessed from the file content.
    It returns whatever index(fpath, format=...) builds.
    '''
    if file_format is None:
        # the handle is only needed to guess the format, so it is closed as
        # soon as the format is known
        with open(fpath) as fhand:
            file_format = get_format(fhand)

    file_format = remove_multiline(file_format)

    # pylint: disable=W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    fastq_formats = ('fastq', 'fastq-sanger', 'fastq-solexa',
                     'fastq-illumina')
    try:
        for fastq_format in fastq_formats:
            accessor[fastq_format] = FastqRandomAccess

        file_index = index(fpath, format=file_format)
    finally:
        # the saved copy is put back even when the indexing fails
        _index._FormatToRandomAccess = old_accessor

    return file_index
Example #11
0
def _read_seqitems(fhands):
    'It chains the seq items (tuples of name and chunk) of all the files'

    def _seqitems_in(fhand):
        'It prepares the seq item iterator for one file'
        fmt = get_format(fhand)
        return assing_kind_to_seqs(SEQITEM, _itemize_fastx(fhand), fmt)

    # the list is built before chaining, so the format of every file is
    # guessed before this function returns
    return chain.from_iterable([_seqitems_in(fhand) for fhand in fhands])
Example #12
0
def _read_seqitems(fhands):
    'It chains the seq items (tuples of name and chunk) of all the files'

    def _seqitems_in(fhand):
        'It prepares the seq item iterator for one file'
        fmt = get_format(fhand)
        return assing_kind_to_seqs(SEQITEM, _itemize_fastx(fhand), fmt)

    # the list is built before chaining, so the format of every file is
    # guessed before this function returns
    return chain.from_iterable([_seqitems_in(fhand) for fhand in fhands])
Example #13
0
    def test_get_format_stringio(self):
        "The format guessed for a StringIO is found in the inventory"
        fasta_fhand = StringIO(">seq\natgctacgacta\n")

        # the expected key: the object id and the digest of the first 100
        # characters of the content
        content_digest = hashlib.sha224(fasta_fhand.getvalue()[:100])
        inventory_key = (id(fasta_fhand), content_digest.hexdigest())

        guessed_format = get_format(fasta_fhand)
        assert FILEFORMAT_INVENTORY[inventory_key] == guessed_format
    def test_fasta(self):
        'It guesses the fasta and the qual formats'
        quals = ">seq1\n30 30 30 30 30 30 30 30\n>seq2\n30 30 30 30 30 30 30"
        quals += " 30\n>seq3\n30 30 30 30 30 30 30 30\n"

        # pairs of file content and the format expected for it
        cases = (
            ('>seq\nACTC\n', 'fasta'),
            # multiline fasta
            ('>seq\nACTC\nACTG\n>seq2\nACTG\n', 'fasta'),
            # qual
            ('>seq\n10 20\n', 'qual'),
            # qual with several records
            (quals, 'qual'),
        )
        for content, expected_format in cases:
            fhand = StringIO(content)
            assert get_format(fhand) == expected_format
Example #15
0
    def test_fasta(self):
        'It guesses the fasta and the qual formats'
        quals = ">seq1\n30 30 30 30 30 30 30 30\n>seq2\n30 30 30 30 30 30 30"
        quals += " 30\n>seq3\n30 30 30 30 30 30 30 30\n"

        # pairs of file content and the format expected for it
        cases = (
            ('>seq\nACTC\n', 'fasta'),
            # multiline fasta
            ('>seq\nACTC\nACTG\n>seq2\nACTG\n', 'fasta'),
            # qual
            ('>seq\n10 20\n', 'qual'),
            # qual with several records
            (quals, 'qual'),
        )
        for content, expected_format in cases:
            fhand = StringIO(content)
            assert get_format(fhand) == expected_format
Example #16
0
    def test_get_format_stringio(self):
        "The format guessed for a StringIO is found in the inventory"
        fasta_fhand = StringIO('>seq\natgctacgacta\n')

        # the expected key: the object id and the digest of the first 100
        # characters of the content
        content_digest = hashlib.sha224(fasta_fhand.getvalue()[:100])
        inventory_key = (id(fasta_fhand), content_digest.hexdigest())

        guessed_format = get_format(fasta_fhand)
        assert FILEFORMAT_INVENTORY[inventory_key] == guessed_format
Example #17
0
    def test_get_format_fhand(self):
        "The format of a real file is stored once under its id and name"
        fhand = NamedTemporaryFile()
        fhand.write(">seq\natgctacgacta\n")
        fhand.flush()
        inventory_key = (id(fhand), fhand.name)

        # guessing twice must find the same entry and must not add new ones
        inventory_sizes = []
        for _ in range(2):
            guessed_format = get_format(fhand)
            assert FILEFORMAT_INVENTORY[inventory_key] == guessed_format
            inventory_sizes.append(len(FILEFORMAT_INVENTORY))
        assert inventory_sizes[1] == inventory_sizes[0]

        # a format set by hand is the one given back
        fhand = NamedTemporaryFile()
        set_format(fhand, "fasta")

        assert "fasta" == get_format(fhand)
Example #18
0
    def test_get_format_fhand(self):
        "The format of a real file is stored once under its id and name"
        fhand = NamedTemporaryFile()
        fhand.write('>seq\natgctacgacta\n')
        fhand.flush()
        inventory_key = (id(fhand), fhand.name)

        # guessing twice must find the same entry and must not add new ones
        inventory_sizes = []
        for _ in range(2):
            guessed_format = get_format(fhand)
            assert FILEFORMAT_INVENTORY[inventory_key] == guessed_format
            inventory_sizes.append(len(FILEFORMAT_INVENTORY))
        assert inventory_sizes[1] == inventory_sizes[0]

        # a format set by hand is the one given back
        fhand = NamedTemporaryFile()
        set_format(fhand, 'fasta')

        assert 'fasta' == get_format(fhand)
Example #19
0
def sort_by_position_in_ref(in_fhands, ref_fpath, directory=None,
                            tempdir=None):
    '''It maps the reads with bowtie2 and yields them as seq items.

    The reads are yielded in the order in which they are found in the file
    written by sort_mapped_reads. The format guessed for the first input
    file is used for all of them.
    '''
    read_fpaths = [in_fhand.name for in_fhand in in_fhands]
    reads_format = get_format(in_fhands[0])

    # bowtie2 gets the -f flag only for fasta input
    if 'fasta' in reads_format:
        bowtie_params = ['-f']
    else:
        bowtie_params = []

    bowtie_index = get_or_create_bowtie2_index(ref_fpath, directory)
    mapping = map_with_bowtie2(bowtie_index, unpaired_fpaths=read_fpaths,
                               extra_params=bowtie_params)

    sorted_fhand = NamedTemporaryFile()
    sort_mapped_reads(mapping, sorted_fhand.name, tempdir=tempdir)
    for aligned_read in pysam.Samfile(sorted_fhand.name):
        yield alignedread_to_seqitem(aligned_read, reads_format)
Example #20
0
def parse_basic_args(parser):
    'It parses the command line and it returns a dict with the arguments.'
    # NOTE(review): this definition is cut off after the output handles are
    # prepared; the part that builds and returns the result is not visible
    # here.
    parsed_args = parser.parse_args()
    # we have to wrap the file in a BufferedReader to allow peeking into stdin
    wrapped_fhands = []
    # if input is stdin it will be a fhand not a list of fhands.
    # we have to convert to a list
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]
    for fhand in in_fhands:
        # every input goes through both helpers, in this order
        fhand = wrap_in_buffered_reader(fhand)
        fhand = uncompress_if_required(fhand)
        wrapped_fhands.append(fhand)

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if in_format == GUESS_FORMAT:
        # the returned format is discarded here; presumably get_format
        # records it for each handle -- TODO confirm
        for wrapped_fhand in wrapped_fhands:
            get_format(wrapped_fhand)
    else:
        # an explicit format is set on every input without comparing it with
        # the file content
        for wrapped_fhand in wrapped_fhands:
            set_format(wrapped_fhand, in_format)

    out_fhand = getattr(parsed_args, OUTFILE)

    comp_kind = get_requested_compression(parsed_args)
    if isinstance(out_fhand, list):
        # several outputs: each one is replaced by the handle returned by
        # compress_fhand
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                # a RuntimeError is handed over to the parser error reporting
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
Example #21
0
def parse_basic_args(parser):
    'It parses the command line and it returns a dict with the arguments.'
    # NOTE(review): this definition is cut off after the output handles are
    # prepared; the part that builds and returns the result is not visible
    # here.
    parsed_args = parser.parse_args()
    # we have to wrap the file in a BufferedReader to allow peeking into stdin
    wrapped_fhands = []
    # if input is stdin it will be a fhand not a list of fhands.
    # we have to convert to a list
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]
    for fhand in in_fhands:
        # every input goes through both helpers, in this order
        fhand = wrap_in_buffered_reader(fhand)
        fhand = uncompress_if_required(fhand)
        wrapped_fhands.append(fhand)

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if in_format == GUESS_FORMAT:
        # the returned format is discarded here; presumably get_format
        # records it for each handle -- TODO confirm
        for wrapped_fhand in wrapped_fhands:
            get_format(wrapped_fhand)
    else:
        # an explicit format is set on every input without comparing it with
        # the file content
        for wrapped_fhand in wrapped_fhands:
            set_format(wrapped_fhand, in_format)

    out_fhand = getattr(parsed_args, OUTFILE)

    comp_kind = get_requested_compression(parsed_args)
    if isinstance(out_fhand, list):
        # several outputs: each one is replaced by the handle returned by
        # compress_fhand
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                # a RuntimeError is handed over to the parser error reporting
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
Example #22
0
def sort_by_position_in_ref(in_fhand, index_fpath, directory=None,
                            tempdir=None):
    '''It maps the reads with bowtie2 and yields them as seq items.

    in_fhand is the file with the reads, only its name is used.
    index_fpath is handed to map_with_bowtie2 as the index to map against.
    tempdir is passed to map_process_to_sortedbam; directory is not used.
    The reads are yielded in the order found in the bam written by
    map_process_to_sortedbam.
    '''
    # NOTE(review): an older comment here mentioned a change to bwa mem, but
    # the code maps with bowtie2. It also said that the test for this
    # function does not work well and should be checked.
    in_fpath = in_fhand.name
    # this handle is only needed to guess the format, so it is closed as soon
    # as the format is known
    with open(in_fpath) as format_fhand:
        file_format = get_format(format_fhand)
    extra_params = ['--very-fast']
    if 'fasta' in file_format:
        extra_params.append('-f')
    bowtie2_process = map_with_bowtie2(index_fpath, paired_fpaths=None,
                                       unpaired_fpath=in_fpath,
                                       extra_params=extra_params)
    out_fhand = NamedTemporaryFile()
    map_process_to_sortedbam(bowtie2_process, out_fhand.name, tempdir=tempdir)
    samfile = pysam.Samfile(out_fhand.name)
    for aligned_read in samfile:
        yield alignedread_to_seqitem(aligned_read)
Example #23
0
def _sorted_mapped_reads(ref_fpath, paired_fpaths=None,
                     unpaired_fpaths=None, directory=None,
                     file_format=None, min_seed_len=None):
    '''It maps the reads with bwa mem and returns the sorted alignments.

    ref_fpath and directory are used to get or create the bwa index.
    paired_fpaths and unpaired_fpaths are the read files; the first paired
    file, or the first unpaired one when there are no paired files, is the
    one used to set or guess the format.
    file_format, when given, is set on that file instead of being guessed.
    min_seed_len, when given, is passed to the mapper with the -k option.
    It returns a pysam.Samfile opened on a temporary file sorted with
    key='queryname'.
    '''
    first_fpath = paired_fpaths[0] if paired_fpaths else unpaired_fpaths[0]
    # the handle is only used to set or guess the format, so it is closed
    # right after that
    with open(first_fpath) as fhand:
        if file_format is not None:
            set_format(fhand, file_format)
        else:
            file_format = get_format(fhand)
    index_fpath = get_or_create_bwa_index(ref_fpath, directory)
    extra_params = ['-a', '-M']
    if min_seed_len is not None:
        extra_params.extend(['-k', min_seed_len])
    bwa = map_with_bwamem(index_fpath, paired_fpaths=paired_fpaths,
                         unpaired_fpath=unpaired_fpaths,
                         extra_params=extra_params)
    # The default temporary directory is used (it honours TMPDIR). A path
    # inside the home of one developer was hard-coded here before and it
    # failed on any machine without that directory.
    bam_fhand = NamedTemporaryFile()
    sort_mapped_reads(bwa, bam_fhand.name, key='queryname')
    bamfile = pysam.Samfile(bam_fhand.name)
    return bamfile
Example #24
0
def _read_seqitems(fhands):
    'It chains the seq items (tuples of name and chunk) of all the files'

    def _itemize(fhand, fmt):
        'It picks the itemizer that corresponds to the format'
        if fmt == 'fasta':
            return _itemize_fasta(fhand)
        if 'fastq' in fmt and 'multiline' not in fmt:
            try:
                return _itemize_fastq(fhand)
            except ValueError as error:
                # only the quality disagreement is reported as a malformed
                # file, any other ValueError goes on untouched
                if error_quality_disagree(error):
                    raise MalformedFile(str(error))
                raise
        msg = 'Format not supported by the itemizers: ' + fmt
        raise NotImplementedError(msg)

    item_iters = []
    for fhand in fhands:
        fmt = get_format(fhand)
        items = _itemize(fhand, fmt)
        item_iters.append(assing_kind_to_seqs(SEQITEM, items, fmt))
    return chain.from_iterable(item_iters)
Example #25
0
def _read_seqrecords(fhands):
    'It returns an iterator that chains the seqrecords of all the files'
    # formats with a dedicated iterator; all of them take title2ids
    iterator_for_format = {
        'fasta': FastaIterator,
        'qual': QualPhredIterator,
        'fastq': FastqPhredIterator,
        'fastq-sanger': FastqPhredIterator,
        'fastq-solexa': FastqSolexaIterator,
        'fastq-illumina': FastqIlluminaIterator,
    }
    record_iters = []
    for fhand in fhands:
        fmt = get_format(fhand)
        make_iterator = iterator_for_format.get(fmt)
        if make_iterator is None:
            # any other format goes through the generic parser
            records = parse_into_seqrecs(fhand, fmt)
        else:
            records = make_iterator(fhand, title2ids=title2ids)
        record_iters.append(records)
    return chain.from_iterable(record_iters)
Example #26
0
def _read_seqrecords(fhands):
    'It returns an iterator that chains the seqrecords of all the files'
    # formats with a dedicated iterator; all of them take title2ids
    iterator_for_format = {
        'fasta': FastaIterator,
        'qual': QualPhredIterator,
        'fastq': FastqPhredIterator,
        'fastq-sanger': FastqPhredIterator,
        'fastq-solexa': FastqSolexaIterator,
        'fastq-illumina': FastqIlluminaIterator,
    }
    record_iters = []
    for fhand in fhands:
        fmt = get_format(fhand)
        make_iterator = iterator_for_format.get(fmt)
        if make_iterator is None:
            # any other format goes through the generic parser
            records = parse_into_seqrecs(fhand, fmt)
        else:
            records = make_iterator(fhand, title2ids=title2ids)
        record_iters.append(records)
    return chain.from_iterable(record_iters)
Example #27
0
def filter_chimeras(ref_fpath, out_fhand, chimeras_fhand, in_fhands,
                    unknown_fhand, unpaired=False, paired_result=True,
                    settings=get_setting('CHIMERAS_SETTINGS'),
                    min_seed_len=None, directory=None):
    file_format = get_format(in_fhands[0])
    if unpaired:
        unpaired_fpaths = [fhand.name for fhand in in_fhands]
        paired_fpaths = None
    else:
        f_fhand = NamedTemporaryFile()
        r_fhand = NamedTemporaryFile()
        seqs = read_seqs(in_fhands)
        deinterleave_pairs(seqs, f_fhand, r_fhand, file_format)
        paired_fpaths = [f_fhand.name, r_fhand.name]
        unpaired_fpaths = None
    bamfile = _sorted_mapped_reads(ref_fpath, paired_fpaths, unpaired_fpaths,
                                   directory, file_format, min_seed_len)

    total = 0
    chimeric = 0
    unknown = 0
    for pair, kind in classify_mapped_reads(bamfile, settings=settings,
                                           paired_result=paired_result,
                                           file_format=file_format):
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind is CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
            chimeric += 1
        elif kind is UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
            unknown += 1
        total += 1
    mapped = total - chimeric - unknown
    print 'Total pairs analyzed: ', total
    print 'Chimeric pairs filtered: ', chimeric, '\t', chimeric / float(total)
    print 'Unknown pairs found: ', unknown, '\t', unknown / float(total)
    print 'Non-chimeric pairs: ', mapped, '\t', mapped / float(total)
Example #28
0
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    'It writes the sequences of the input files in the given output format'
    supported_formats = get_setting('SUPPORTED_OUTPUT_FORMATS')
    if out_format not in supported_formats:
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = [get_format(in_fhand) for in_fhand in in_fhands]

    # a single input already in the output format is copied or linked
    # without parsing it
    if len(in_fhands) == 1 and in_formats[0] == out_format:
        only_fhand = in_fhands[0]
        if copy_if_same_format:
            copyfileobj(only_fhand, out_fhand)
        else:
            rel_symlink(only_fhand.name, out_fhand.name)
        return

    seqrecords = _read_seqrecords(in_fhands)
    try:
        write_seqrecs(seqrecords, out_fhand, out_format)
    except ValueError as error:
        # two known problems get their own error, any other goes on untouched
        if error_quality_disagree(error):
            raise MalformedFile(str(error))
        if 'No suitable quality scores' in str(error):
            no_quals_msg = 'No qualities available to write output file'
            raise IncompatibleFormatError(no_quals_msg)
        raise
Example #29
0
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    'It writes the sequences of the input files in the given output format'
    supported_formats = get_setting('SUPPORTED_OUTPUT_FORMATS')
    if out_format not in supported_formats:
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = [get_format(in_fhand) for in_fhand in in_fhands]

    # a single input already in the output format is copied or linked
    # without parsing it
    if len(in_fhands) == 1 and in_formats[0] == out_format:
        only_fhand = in_fhands[0]
        if copy_if_same_format:
            copyfileobj(only_fhand, out_fhand)
        else:
            rel_symlink(only_fhand.name, out_fhand.name)
        return

    seqrecords = _read_seqrecords(in_fhands)
    try:
        write_seqrecs(seqrecords, out_fhand, out_format)
    except ValueError as error:
        # two known problems get their own error, any other goes on untouched
        if error_quality_disagree(error):
            raise MalformedFile(str(error))
        if 'No suitable quality scores' in str(error):
            no_quals_msg = 'No qualities available to write output file'
            raise IncompatibleFormatError(no_quals_msg)
        raise
Example #30
0
def read_seqs(fhands, out_format=None, prefered_seq_classes=None):
    '''It returns a stream of seqs in different codings: seqrecords, seqitems...

    fhands are the input files; the format guessed for the first one decides
    which seq classes can be used.
    out_format is the format in which the seqs will be written, if known.
    prefered_seq_classes are the seq classes to try, in order. The default
    is SEQITEM and then SEQRECORD. The given list is not modified.
    An empty list is returned when the first file is empty.
    It raises ValueError when no seq class is left to try or when an unknown
    one is given.
    '''

    if not prefered_seq_classes:
        prefered_seq_classes = [SEQITEM, SEQRECORD]
    else:
        # we work on a copy, the list of the caller should not be changed
        prefered_seq_classes = list(prefered_seq_classes)
    try:
        in_format = get_format(fhands[0])
    except FileIsEmptyError:
        return []
    # seqitems is incompatible with different input and output formats
    # or when in_format != a fasta or fastq
    formats_differ = (out_format not in (None, GUESS_FORMAT) and
                      in_format != out_format)
    itemizable_formats = (('fasta',) + SANGER_FASTQ_FORMATS +
                          ILLUMINA_FASTQ_FORMATS)
    seqitem_not_valid = formats_differ or in_format not in itemizable_formats
    # SEQITEM is removed only when it is in the list. Looking for it
    # unconditionally raised a ValueError for callers that did not ask for
    # seqitems and gave a file that was not fasta or fastq.
    if seqitem_not_valid and SEQITEM in prefered_seq_classes:
        prefered_seq_classes.remove(SEQITEM)

    if not prefered_seq_classes:
        msg = 'No valid seq class left or prefered'
        raise ValueError(msg)

    for seq_class in prefered_seq_classes:
        if seq_class == SEQITEM:
            try:
                return _read_seqitems(fhands)
            except NotImplementedError:
                continue
        elif seq_class == SEQRECORD:
            try:
                seqs = _read_seqrecords(fhands)
                return assing_kind_to_seqs(SEQRECORD, seqs, None)
            except NotImplementedError:
                continue
        else:
            raise ValueError('Unknown class for seq: ' + seq_class)
    raise RuntimeError('We should not be here, fixme')
Example #31
0
def read_seqs(fhands, out_format=None, prefered_seq_classes=None):
    '''It returns a stream of seqs in different codings: seqrecords, seqitems...

    fhands are the input files; the format guessed for the first one decides
    which seq classes can be used.
    out_format is the format in which the seqs will be written, if known.
    prefered_seq_classes are the seq classes to try, in order. The default
    is SEQITEM and then SEQRECORD. The given list is not modified.
    An empty list is returned when the first file is empty.
    It raises ValueError when no seq class is left to try or when an unknown
    one is given.
    '''

    if not prefered_seq_classes:
        prefered_seq_classes = [SEQITEM, SEQRECORD]
    else:
        # we work on a copy, the list of the caller should not be changed
        prefered_seq_classes = list(prefered_seq_classes)
    try:
        in_format = get_format(fhands[0])
    except FileIsEmptyError:
        return []
    # seqitems is incompatible with different input and output formats
    # or when in_format != a fasta or fastq
    formats_differ = (out_format not in (None, GUESS_FORMAT) and
                      in_format != out_format)
    itemizable_formats = (('fasta',) + SANGER_FASTQ_FORMATS +
                          ILLUMINA_FASTQ_FORMATS)
    seqitem_not_valid = formats_differ or in_format not in itemizable_formats
    # SEQITEM is removed only when it is in the list. Looking for it
    # unconditionally raised a ValueError for callers that did not ask for
    # seqitems and gave a file that was not fasta or fastq.
    if seqitem_not_valid and SEQITEM in prefered_seq_classes:
        prefered_seq_classes.remove(SEQITEM)

    if not prefered_seq_classes:
        msg = 'No valid seq class left or prefered'
        raise ValueError(msg)

    for seq_class in prefered_seq_classes:
        if seq_class == SEQITEM:
            try:
                return _read_seqitems(fhands)
            except NotImplementedError:
                continue
        elif seq_class == SEQRECORD:
            try:
                seqs = _read_seqrecords(fhands)
                return assing_kind_to_seqs(SEQRECORD, seqs, None)
            except NotImplementedError:
                continue
        else:
            raise ValueError('Unknown class for seq: ' + seq_class)
    raise RuntimeError('We should not be here, fixme')
Example #32
0
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
    else:
        try:
            out_fhand = compress_fhand(out_fhand, compression_kind=comp_kind)
        except RuntimeError, error:
            parser.error(error)

    # The default output format is the same as the first file
    if 'fastq' in in_format or in_format == GUESS_FORMAT:
        out_format = get_format(wrapped_fhands[0])
    else:
        out_format = in_format

    # The original fhands should be stored, because otherwise they would be
    # closed
    args = {
        'out_fhand': out_fhand,
        'in_fhands': wrapped_fhands,
        'out_format': out_format,
        'original_in_fhands': in_fhands
    }
    return args, parsed_args


def parse_basic_parallel_args(parser):
    def test_with_long_desc(self):
        fhand = StringIO('''>comp27222_c1_seq1 len=4926 path=[89166356:0-46 89167522:47-85 89315292:86-121 89170132:122-176 89377211:177-217 89377235:218-244 89172846:245-247 89172856:248-251 89173028:252-276 89174386:277-292 89174684:293-506 89377352:507-582 89183669:583-587 89183821:588-613 89184868:614-644 89185624:645-719 89187914:720-723 89187935:724-870 89191280:871-887 89377494:888-907 89191517:908-927 89193046:928-1071 89198507:1072-1109 89199632:1110-1170 89201544:1171-1194 89202607:1195-1247 89377606:1248-1252 89377611:1253-1591 89215759:1592-1606 89215815:1607-1636 89216359:1637-1664 89377693:1665-1678 88727916:1679-2152 88743802:2153-2171 88744738:2172-2623 88759485:2624-2648 88759762:2649-2953 88769199:2954-2971 88769596:2972-3657 88791809:3658-3665 88792014:3666-3723 88793720:3724-3731 88794381:3732-3812 88799277:3813-3813 88799328:3814-3996 88807093:3997-3999 88807177:4000-4215 88813164:4216-4246 88814188:4247-4287 88815355:4288-4308 88816198:4309-4352 88817845:4353-4369 88818294:4370-4403 88818879:4404-4465 88821150:4466-4469 88821188:4470-4925]
GAAGGATCGATCGGCCTCGGCGGTGTTCCCAAAAATCTAAGAGCGTTTACTCCAAGCTTC''')
        get_format(fhand)
Example #34
0
    def test_with_long_desc(self):
        fhand = StringIO(
            '''>comp27222_c1_seq1 len=4926 path=[89166356:0-46 89167522:47-85 89315292:86-121 89170132:122-176 89377211:177-217 89377235:218-244 89172846:245-247 89172856:248-251 89173028:252-276 89174386:277-292 89174684:293-506 89377352:507-582 89183669:583-587 89183821:588-613 89184868:614-644 89185624:645-719 89187914:720-723 89187935:724-870 89191280:871-887 89377494:888-907 89191517:908-927 89193046:928-1071 89198507:1072-1109 89199632:1110-1170 89201544:1171-1194 89202607:1195-1247 89377606:1248-1252 89377611:1253-1591 89215759:1592-1606 89215815:1607-1636 89216359:1637-1664 89377693:1665-1678 88727916:1679-2152 88743802:2153-2171 88744738:2172-2623 88759485:2624-2648 88759762:2649-2953 88769199:2954-2971 88769596:2972-3657 88791809:3658-3665 88792014:3666-3723 88793720:3724-3731 88794381:3732-3812 88799277:3813-3813 88799328:3814-3996 88807093:3997-3999 88807177:4000-4215 88813164:4216-4246 88814188:4247-4287 88815355:4288-4308 88816198:4309-4352 88817845:4353-4369 88818294:4370-4403 88818879:4404-4465 88821150:4466-4469 88821188:4470-4925]
GAAGGATCGATCGGCCTCGGCGGTGTTCCCAAAAATCTAAGAGCGTTTACTCCAAGCTTC''')
        get_format(fhand)
Example #35
0
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                parser.error(error)

            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
    else:
        try:
            out_fhand = compress_fhand(out_fhand, compression_kind=comp_kind)
        except RuntimeError, error:
            parser.error(error)

    # The default output format is the same as the first file
    if 'fastq' in in_format or in_format == GUESS_FORMAT:
        out_format = get_format(wrapped_fhands[0])
    else:
        out_format = in_format

    # The original fhands should be stored, because otherwise they would be
    # closed
    args = {'out_fhand': out_fhand, 'in_fhands': wrapped_fhands,
            'out_format': out_format, 'original_in_fhands': in_fhands}
    return args, parsed_args


def parse_basic_parallel_args(parser):
    'It adds the number of processes to the arguments of parse_basic_args.'
    basic_args, namespace = parse_basic_args(parser)
    # the processes option is taken as is from the parsed command line
    basic_args['processes'] = namespace.processes
    return basic_args, namespace