Ejemplo n.º 1
0
def test_fasta_reader_cleanup():
    """Assert that the input handle is closed once ``fastaReader`` is exhausted.

    The file is opened without a ``with`` block on purpose: the assertion
    must observe a close that happened during iteration, not one done by
    this test.
    """
    handle = open(_data_path("fasta_reader_1.fasta"))
    with _new_argv([handle]):
        # Keep the reader bound to a local so it stays alive until the
        # assertion below; the records themselves are irrelevant here.
        records = fastaReader(handle)
        for _record in records:
            continue
    assert handle.closed
Ejemplo n.º 2
0
def main():
    """Combine a FASTA file and a quality file into a FASTQ file.

    Positional command line arguments (read from ``sys.argv``):

    1. FASTA input path.
    2. FASTA type; an empty string falls back to ``'fasta'``.
    3. Quality input path, or the literal ``'None'`` to use
       ``fastqFakeFastaScoreReader`` instead of reading a file.
    4. Quality type; an empty string falls back to ``'qualsanger'``.
    5. Output path.
    6. Quality encoding to force, or the literal ``'None'`` to pass ``None``.

    Prints either a "no valid FASTA sequences" message or the result of
    ``qual_input.has_data()`` followed by a summary of how many sequences
    were combined.
    """
    # Read command line arguments
    fasta_filename = sys.argv[1]
    # should always be fasta or csfasta? what if txt?
    fasta_type = sys.argv[2] or 'fasta'
    qual_filename = sys.argv[3]
    qual_type = sys.argv[4] or 'qualsanger'  # qual454 qualsolid
    output_filename = sys.argv[5]
    force_quality_encoding = sys.argv[6]
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    # Named fastq_format so the builtin format() is not shadowed.
    if fasta_type == 'csfasta' or qual_type == 'qualsolid':
        fastq_format = 'cssanger'
    elif qual_type == 'qualsolexa':
        fastq_format = 'solexa'
    elif qual_type == 'qualillumina':
        fastq_format = 'illumina'
    else:
        fastq_format = 'sanger'

    out = fastqWriter(path=output_filename,
                      format=fastq_format,
                      force_quality_encoding=force_quality_encoding)
    qual_handle = None
    try:
        try:
            if qual_filename == 'None':
                qual_input = fastqFakeFastaScoreReader(
                    fastq_format, quality_encoding=force_quality_encoding)
            else:
                qual_handle = open(qual_filename, 'rt')
                qual_input = fastaNamedReader(qual_handle)

            fastq_combiner = fastqCombiner(fastq_format)
            i = None  # stays None when the FASTA file yields no records
            skip_count = 0
            with open(fasta_filename, 'rt') as fasta_handle:
                for i, sequence in enumerate(fastaReader(fasta_handle)):
                    quality = qual_input.get(sequence)
                    if quality:
                        out.write(fastq_combiner.combine(sequence, quality))
                    else:
                        skip_count += 1
        finally:
            # Close the writer even when reading or combining fails.
            out.close()

        if i is None:
            print("Your file contains no valid FASTA sequences.")
        else:
            # The quality handle must still be open here: has_data() is
            # called on the reader that wraps it.
            print(qual_input.has_data())
            total = i + 1
            combined = total - skip_count
            print('Combined %s of %s sequences with quality scores (%.2f%%).' %
                  (combined, total, float(combined) / float(total) * 100.0))
    finally:
        # Closing an already-closed file object is a no-op.
        if qual_handle is not None:
            qual_handle.close()
Ejemplo n.º 3
0
def load_primers_as_re(primer_fasta, mm, rc=False):
    """Compile every primer in a FASTA file into one alternation regex.

    Args:
        primer_fasta: Path of the FASTA file holding the primer sequences.
        mm: Passed through unchanged to ``make_reg_ex_mm`` for each primer
            (presumably the number of mismatches to allow -- confirm
            against that helper).
        rc: When true, each primer sequence is passed through
            ``reverse_complement`` before patterns are generated.

    Returns:
        A ``(count, pattern)`` tuple: the number of FASTA records read, and
        a compiled regular expression that is the ``|``-alternation of every
        distinct pattern string produced, longest pattern string first.
    """
    import sys  # local import keeps this block self-contained

    # The "U" flag was removed from open() in Python 3.11; text mode on
    # Python 3 already uses universal newlines, so only Python 2 needs it.
    mode = "rU" if sys.version_info[0] < 3 else "r"

    # A set discards duplicate patterns.
    primers = set()
    count = 0
    # The with-block closes the handle even if parsing raises.
    with open(primer_fasta, mode) as in_handle:
        for record in fastaReader(in_handle):
            seq = reverse_complement(record.sequence) if rc else record.sequence
            count += 1
            primers.update(make_reg_ex_mm(seq, mm))

    # Longest first, so more specific primers are tried before less specific
    # ones. Ties are broken on the pattern text so that the resulting regex
    # does not depend on set iteration order.
    ordered = sorted(primers, key=lambda p: (-len(p), p))
    # NOTE(review): with no patterns the joined string is empty, and an empty
    # regex matches at every position -- confirm callers cope with that.
    return count, re.compile("|".join(ordered))  # make one monster re!
Ejemplo n.º 4
0
def load_primers_as_re(primer_fasta, mm, rc=False):
    """Build a single compiled regex matching any primer from a FASTA file.

    Args:
        primer_fasta: Path of the FASTA file holding the primer sequences.
        mm: Forwarded as-is to ``make_reg_ex_mm`` for each primer
            (presumably an allowed mismatch count -- confirm against that
            helper).
        rc: When true, ``reverse_complement`` is applied to each primer
            sequence before patterns are generated.

    Returns:
        A ``(count, pattern)`` tuple: how many FASTA records were read, and
        a compiled regular expression joining every distinct pattern string
        with ``|``, longest pattern string first.
    """
    import sys  # local import keeps this block self-contained

    # open() rejects the "U" flag from Python 3.11 onwards; Python 3 text
    # mode already translates newlines, so the flag is only used on Python 2.
    mode = "rU" if sys.version_info[0] < 3 else "r"

    # A set discards duplicate patterns.
    primers = set()
    count = 0
    # Context manager guarantees the handle is closed on errors too.
    with open(primer_fasta, mode) as in_handle:
        for record in fastaReader(in_handle):
            seq = reverse_complement(record.sequence) if rc else record.sequence
            count += 1
            primers.update(make_reg_ex_mm(seq, mm))

    # Sort longest first so more specific primers are found before less
    # specific ones. The pattern text is a tie-breaker, which keeps the
    # alternation order reproducible between runs.
    ordered = sorted(primers, key=lambda p: (-len(p), p))
    # NOTE(review): an empty primer file yields an empty regex, which matches
    # at every position -- confirm callers cope with that.
    return count, re.compile("|".join(ordered))  # make one monster re!
Ejemplo n.º 5
0
def main():
    """Combine a FASTA file and a quality file into a FASTQ file.

    Positional command line arguments (read from ``sys.argv``):

    1. FASTA input path.
    2. FASTA type; an empty string falls back to ``'fasta'``.
    3. Quality input path, or the literal ``'None'`` to use
       ``fastqFakeFastaScoreReader`` instead of reading a file.
    4. Quality type; an empty string falls back to ``'qualsanger'``.
    5. Output path.
    6. Quality encoding to force, or the literal ``'None'`` to pass ``None``.

    Prints either a "no valid FASTA sequences" message or the result of
    ``qual_input.has_data()`` followed by a summary of how many sequences
    were combined.
    """
    # Read command line arguments
    fasta_filename = sys.argv[1]
    # should always be fasta or csfasta? what if txt?
    fasta_type = sys.argv[2] or 'fasta'
    qual_filename = sys.argv[3]
    qual_type = sys.argv[4] or 'qualsanger'  # qual454 qualsolid
    output_filename = sys.argv[5]
    force_quality_encoding = sys.argv[6]
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    # Named fastq_format so the builtin format() is not shadowed.
    if fasta_type == 'csfasta' or qual_type == 'qualsolid':
        fastq_format = 'cssanger'
    elif qual_type == 'qualsolexa':
        fastq_format = 'solexa'
    elif qual_type == 'qualillumina':
        fastq_format = 'illumina'
    else:
        fastq_format = 'sanger'

    # NOTE(review): the binary open modes are kept exactly as before; on
    # Python 3 they hand bytes to the readers/writer -- confirm that is
    # supported before running this variant on 3.x.
    out = fastqWriter(open(output_filename, 'wb'), format=fastq_format,
                      force_quality_encoding=force_quality_encoding)
    qual_handle = None
    try:
        try:
            if qual_filename == 'None':
                qual_input = fastqFakeFastaScoreReader(
                    fastq_format, quality_encoding=force_quality_encoding)
            else:
                qual_handle = open(qual_filename, 'rb')
                qual_input = fastaNamedReader(qual_handle)

            fastq_combiner = fastqCombiner(fastq_format)
            i = None  # stays None when the FASTA file yields no records
            skip_count = 0
            with open(fasta_filename, 'rb') as fasta_handle:
                for i, sequence in enumerate(fastaReader(fasta_handle)):
                    quality = qual_input.get(sequence)
                    if quality:
                        out.write(fastq_combiner.combine(sequence, quality))
                    else:
                        skip_count += 1
        finally:
            # Close the writer even when reading or combining fails.
            out.close()

        # Single-argument parenthesised print works identically as a
        # Python 2 statement and as a Python 3 function call.
        if i is None:
            print("Your file contains no valid FASTA sequences.")
        else:
            # The quality handle must still be open here: has_data() is
            # called on the reader that wraps it.
            print(qual_input.has_data())
            total = i + 1
            combined = total - skip_count
            print('Combined %s of %s sequences with quality scores (%.2f%%).' % (combined, total, float(combined) / float(total) * 100.0))
    finally:
        # Closing an already-closed file object is a no-op.
        if qual_handle is not None:
            qual_handle.close()
Ejemplo n.º 6
0
                # Keep the clipped record only when at least min_len bases
                # remain; its quality is truncated to the first `cut` entries.
                if len(record.sequence) >= min_len:
                    record.quality = record.quality[:cut]
                    clipped += 1
                    writer.write(record)
                else:
                    # Too short after clipping: counted, not written.
                    short_clipped += 1
            # Alternative to the branch above (whose condition lies outside
            # this excerpt): the record is written unmodified when
            # keep_negatives is set and it is long enough.
            elif keep_negatives:
                # NOTE(review): this uses len(record) while the branch above
                # uses len(record.sequence) -- confirm the record type
                # supports len() with the intended meaning.
                if len(record) >= min_len:
                    negs += 1
                    writer.write(record)
                else:
                    short_negs += 1
# FASTA input: same clipping flow, but there is no quality to trim.
elif seq_format.lower() == "fasta":
    # NOTE(review): the "U" mode flag was removed from open() in Python 3.11;
    # confirm which interpreter versions this script must support.
    in_handle = open(in_file, "rU")
    out_handle = open(out_file, "w")
    reader = fastaReader(in_handle)
    writer = fastaWriter(out_handle)
    #Following code is identical to that for FASTQ but without editing qualities
    if forward:
        for record in reader:
            # Search the upper-cased sequence for the primer regex.
            seq = record.sequence.upper()
            result = primer.search(seq)
            if result:
                #Forward primer, take everything after it
                cut = result.end()
                record.sequence = seq[cut:]
                # Write the remainder only if it is still at least min_len long.
                if len(record.sequence) >= min_len:
                    clipped += 1
                    writer.write(record)
                else:
                    short_clipped += 1
Ejemplo n.º 7
0
    # Try to read the Roche XML manifest from the input; a ValueError from
    # the reader is treated as "no manifest".
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    in_handle.seek(0) #start again after getting manifest
    # Stream the records through rename_seqrecords and keep write_file's return value.
    count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename))
    out_handle.close()
    in_handle.close()
else:
    #Use Galaxy for FASTA, QUAL or FASTQ
    # `marker` is set to the leading character of identifier lines for the
    # chosen format (">" or "@").
    if seq_format.lower() in ["fasta", "csfasta"] \
    or seq_format.lower().startswith("qual"):
        from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
        # NOTE(review): the "U" mode flag was removed from open() in
        # Python 3.11; confirm the supported interpreter versions.
        reader = fastaReader(open(in_file, "rU"))
        writer = fastaWriter(open(out_file, "w"))
        marker = ">"
    elif seq_format.lower().startswith("fastq"):
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
        reader = fastqReader(open(in_file, "rU"))
        writer = fastqWriter(open(out_file, "w"))
        marker = "@"
    else:
        # Any other format aborts the script with an error message.
        sys.exit("Unsupported file type %r" % seq_format)
    #Now do the renaming
    count = 0
    renamed = 0
    for record in reader:
        #The [1:] is because the fastaReader leaves the > on the identifier,
        #likewise the fastqReader leaves the @ on the identifier
Ejemplo n.º 8
0
                # Keep the clipped record only when at least min_len bases
                # remain; its quality is truncated to the first `cut` entries.
                if len(record.sequence) >= min_len:
                    record.quality = record.quality[:cut]
                    clipped += 1
                    writer.write(record)
                else:
                    # Too short after clipping: counted, not written.
                    short_clipped += 1
            # Alternative to the branch above (whose condition lies outside
            # this excerpt): the record is written unmodified when
            # keep_negatives is set and it is long enough.
            elif keep_negatives:
                if len(record) >= min_len:
                    negs += 1
                    writer.write(record)
                else:
                    # NOTE(review): the sibling counters here are `negs` and
                    # `short_clipped`; verify that `short_neg` (singular) is
                    # the name initialised earlier -- if the counter is
                    # actually `short_negs`, this line raises NameError.
                    short_neg += 1
# FASTA input: same clipping flow, but there is no quality to trim.
elif seq_format.lower()=="fasta":
    # NOTE(review): the "U" mode flag was removed from open() in Python 3.11;
    # confirm which interpreter versions this script must support.
    in_handle = open(in_file, "rU")
    out_handle = open(out_file, "w")
    reader = fastaReader(in_handle)
    writer = fastaWriter(out_handle)
    #Following code is identical to that for FASTQ but without editing qualities
    if forward:
        for record in reader:
            # Search the upper-cased sequence for the primer regex.
            seq = record.sequence.upper()
            result = primer.search(seq)
            if result:
                #Forward primer, take everything after it
                cut = result.end()
                record.sequence = seq[cut:]
                # Write the remainder only if it is still at least min_len long.
                if len(record.sequence) >= min_len:
                    clipped += 1
                    writer.write(record)
                else:
                    short_clipped += 1
Ejemplo n.º 9
0
    # A ValueError from the preceding try body (outside this excerpt) leaves
    # the manifest unset.
    except ValueError:
        manifest = None
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    in_handle.seek(0)  # start again after getting manifest
    # Stream the records through rename_seqrecords and keep write_file's return value.
    count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename))
    out_handle.close()
    in_handle.close()
else:
    # Use Galaxy for FASTA, QUAL or FASTQ
    # `marker` is set to the leading character of identifier lines for the
    # chosen format (">" or "@").
    if seq_format.lower() in ["fasta", "csfasta"] or seq_format.lower().startswith(
        "qual"
    ):
        from galaxy_utils.sequence.fasta import fastaReader, fastaWriter

        # NOTE(review): the "U" mode flag was removed from open() in
        # Python 3.11; confirm the supported interpreter versions.
        reader = fastaReader(open(in_file, "rU"))
        writer = fastaWriter(open(out_file, "w"))
        marker = ">"
    elif seq_format.lower().startswith("fastq"):
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter

        reader = fastqReader(open(in_file, "rU"))
        writer = fastqWriter(open(out_file, "w"))
        marker = "@"
    else:
        # Any other format aborts the script with an error message.
        sys.exit("Unsupported file type %r" % seq_format)
    # Now do the renaming
    count = 0
    renamed = 0
    for record in reader:
        # The [1:] is because the fastaReader leaves the > on the identifier,