Example #1
0
def _fasta_sniffer(fh):
    """Guess whether ``fh`` holds FASTA data.

    Returns a ``(matched, kwargs)`` tuple; ``kwargs`` is always an empty dict.
    """
    # Approach:
    #   Tolerate at most 5 leading blank/whitespace-only lines, then try to
    #   pull up to 10 records through the raw FASTA parser. The file is
    #   reported as FASTA only when at least one record comes back and the
    #   parser never raises FASTAFormatError. QUAL-looking records must not be
    #   accepted as FASTA: they could technically be read as FASTA, but that is
    #   unlikely to be what the user wants, and a future dedicated QUAL sniffer
    #   should not compete with this one. (Presumably _sniffer_data_parser is
    #   what rejects QUAL-like data -- confirm against its definition.)
    if _too_many_blanks(fh, 5):
        return False, {}

    max_records = 10
    saw_record = False
    try:
        records = _parse_fasta_raw(fh, _sniffer_data_parser, FASTAFormatError)
        # range() comes first so zip stops before pulling an 11th record.
        for _ in zip(range(max_records), records):
            saw_record = True
    except FASTAFormatError:
        return False, {}

    # An empty file yields no records and therefore is not FASTA.
    return saw_record, {}
Example #2
0
def _fasta_sniffer(fh):
    """Guess whether ``fh`` holds FASTA data.

    Returns a ``(matched, kwargs)`` tuple; ``kwargs`` is always an empty dict.
    """
    # Approach:
    #   Tolerate at most 5 leading blank/whitespace-only lines, then try to
    #   pull up to 10 records through the raw FASTA parser. The file is
    #   reported as FASTA only when at least one record comes back and the
    #   parser never raises FASTAFormatError. QUAL-looking records must not be
    #   accepted as FASTA: they could technically be read as FASTA, but that is
    #   unlikely to be what the user wants, and a future dedicated QUAL sniffer
    #   should not compete with this one. (Presumably _sniffer_data_parser is
    #   what rejects QUAL-like data -- confirm against its definition.)
    if _too_many_blanks(fh, 5):
        return False, {}

    max_records = 10
    saw_record = False
    try:
        records = _parse_fasta_raw(fh, _sniffer_data_parser, FASTAFormatError)
        # range() comes first so zip stops before pulling an 11th record.
        for _ in zip(range(max_records), records):
            saw_record = True
    except FASTAFormatError:
        return False, {}

    # An empty file yields no records and therefore is not FASTA.
    return saw_record, {}
Example #3
0
def _embl_sniffer(fh):
    """Guess whether ``fh`` holds EMBL data.

    The first line produced by ``_line_generator`` (called with
    ``skip_blanks=True``) must start with ``'ID'``. Returns a
    ``(matched, kwargs)`` tuple; ``kwargs`` is always an empty dict.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        first_line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # Nothing to inspect, so this cannot be EMBL.
        return False, {}

    return first_line.startswith('ID'), {}
Example #4
0
def _embl_sniffer(fh):
    """Guess whether ``fh`` holds EMBL data.

    The first line produced by ``_line_generator`` (called with
    ``skip_blanks=True``) must start with ``'ID'``. Returns a
    ``(matched, kwargs)`` tuple; ``kwargs`` is always an empty dict.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        first_line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # Nothing to inspect, so this cannot be EMBL.
        return False, {}

    return first_line.startswith('ID'), {}
Example #5
0
def _genbank_sniffer(fh):
    """Guess whether ``fh`` holds GenBank data.

    The first line produced by ``_line_generator`` (called with
    ``skip_blanks=True``) must be accepted by ``_parse_locus``. Returns a
    ``(matched, kwargs)`` tuple; ``kwargs`` is always an empty dict.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        locus_line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # Nothing to inspect, so this cannot be GenBank.
        return False, {}

    # The parsed result is discarded; only whether parsing succeeds matters.
    try:
        _parse_locus([locus_line])
    except GenBankFormatError:
        return False, {}
    else:
        return True, {}
Example #6
0
def _genbank_sniffer(fh):
    """Guess whether ``fh`` holds GenBank data.

    The first line produced by ``_line_generator`` (called with
    ``skip_blanks=True``) must be accepted by ``_parse_locus``. Returns a
    ``(matched, kwargs)`` tuple; ``kwargs`` is always an empty dict.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        locus_line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # Nothing to inspect, so this cannot be GenBank.
        return False, {}

    # The parsed result is discarded; only whether parsing succeeds matters.
    try:
        _parse_locus([locus_line])
    except GenBankFormatError:
        return False, {}
    else:
        return True, {}
Example #7
0
def _gff3_sniffer(fh):
    """Guess whether ``fh`` holds GFF3 data.

    The first line produced by ``_line_generator`` (called with
    ``skip_blanks=True``) must begin with the ``##gff-version`` pragma followed
    by whitespace and a ``3``. Returns a ``(matched, kwargs)`` tuple;
    ``kwargs`` is always an empty dict.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        pragma_line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # Nothing to inspect, so this cannot be GFF3.
        return False, {}

    # re.match anchors at the start of the line only; trailing text after the
    # leading "3" is not examined.
    has_pragma = re.match(r'##gff-version\s+3', pragma_line) is not None
    return has_pragma, {}
Example #8
0
def _gff3_sniffer(fh):
    """Guess whether ``fh`` holds GFF3 data.

    The first line produced by ``_line_generator`` (called with
    ``skip_blanks=True``) must begin with the ``##gff-version`` pragma followed
    by whitespace and a ``3``. Returns a ``(matched, kwargs)`` tuple;
    ``kwargs`` is always an empty dict.
    """
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        pragma_line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # Nothing to inspect, so this cannot be GFF3.
        return False, {}

    # re.match anchors at the start of the line only; trailing text after the
    # leading "3" is not examined.
    has_pragma = re.match(r'##gff-version\s+3', pragma_line) is not None
    return has_pragma, {}
Example #9
0
def _fastq_sniffer(fh):
    """Guess whether ``fh`` holds FASTQ data.

    Returns a ``(matched, kwargs)`` tuple; ``kwargs`` is always an empty dict.
    """
    # Approach:
    #   Tolerate at most 5 leading blank/whitespace-only lines, then try to
    #   read up to 10 records with a Phred offset of 33. The file is reported
    #   as FASTQ when at least one record is read (i.e. it is not empty) and
    #   reading raises neither FASTQFormatError nor ValueError -- which is how
    #   quality scores outside the printable ASCII range are expected to
    #   surface.
    if _too_many_blanks(fh, 5):
        return False, {}

    saw_record = False
    try:
        # range() comes first so zip stops before pulling an 11th record.
        for _ in zip(range(10), _fastq_to_generator(fh, phred_offset=33)):
            saw_record = True
    except (FASTQFormatError, ValueError):
        return False, {}

    return saw_record, {}
Example #10
0
def _fastq_sniffer(fh):
    """Guess whether ``fh`` holds FASTQ data.

    Returns a ``(matched, kwargs)`` tuple; ``kwargs`` is always an empty dict.
    """
    # Approach:
    #   Tolerate at most 5 leading blank/whitespace-only lines, then try to
    #   read up to 10 records with a Phred offset of 33. The file is reported
    #   as FASTQ when at least one record is read (i.e. it is not empty) and
    #   reading raises neither FASTQFormatError nor ValueError -- which is how
    #   quality scores outside the printable ASCII range are expected to
    #   surface.
    if _too_many_blanks(fh, 5):
        return False, {}

    saw_record = False
    try:
        # range() comes first so zip stops before pulling an 11th record.
        for _ in zip(range(10), _fastq_to_generator(fh, phred_offset=33)):
            saw_record = True
    except (FASTQFormatError, ValueError):
        return False, {}

    return saw_record, {}
Example #11
0
def _fastq_sniffer(fh):
    """Guess whether ``fh`` holds FASTQ data and detect Illumina 1.8 headers.

    Returns
    -------
    tuple of (bool, dict)
        ``(True, {'variant': 'illumina1.8'})`` as soon as one of the first 10
        records has an Illumina 1.8-style header, ``(True, {})`` when at least
        one record was read but none matched that header style, and
        ``(False, {})`` otherwise.
    """
    # Strategy:
    #   Ignore up to 5 blank/whitespace-only lines at the beginning of the
    #   file. Read up to 10 records. If at least one record is read (i.e. the
    #   file isn't empty) and the quality scores are in printable ASCII range,
    #   assume the file is FASTQ.
    #
    #   A record is treated as Illumina 1.8 when its id and description,
    #   concatenated, split into exactly 10 colon-separated fields and the
    #   second field of the description is exactly 'Y' or 'N' (presumably the
    #   "is filtered" flag of the CASAVA 1.8 header -- confirm against the
    #   format specification).
    if _too_many_blanks(fh, 5):
        return False, {}

    saw_record = False
    try:
        for _, seq in zip(range(10), _fastq_to_generator(fh, phred_offset=33)):
            seq_id = seq.metadata['id']
            description = seq.metadata['description']
            n_fields = len((seq_id + description).split(':'))
            desc_fields = description.split(':')
            # Guard the index: when the id alone contributes all the colons,
            # the description has a single field and desc_fields[1] would
            # raise an uncaught IndexError. Compare against a tuple rather
            # than the string 'YN' so that '' and 'YN' (both substrings of
            # 'YN') are not mistaken for a valid flag.
            if (n_fields == 10 and len(desc_fields) > 1 and
                    desc_fields[1] in ('Y', 'N')):
                return True, {'variant': 'illumina1.8'}
            saw_record = True
    except (FASTQFormatError, ValueError):
        return False, {}

    return saw_record, {}