Example #1
0
def main():
    if len(sys.argv) != 5:
        stop_err("Wrong number of arguments. Expect: fasta tabular desrc_split [type]")
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    descr_split = int( sys.argv[3] ) - 1
    if descr_split < 0:
        stop_err("Bad description split value (should be 1 or more)")
    input_type = sys.argv[4] or 'sanger' #input type should ordinarily be unnecessary
    
    num_reads = None
    fastq_read = None
    out = open( output_filename, 'wb' )
    if descr_split == 0:
        #Don't divide the description into multiple columns
        for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
            out.write( "%s\t%s\t%s\n" % ( fastq_read.identifier[1:].replace( '\t', ' ' ), fastq_read.sequence.replace( '\t', ' ' ), fastq_read.quality.replace( '\t', ' ' ) ) )
    else:
        for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
            words = fastq_read.identifier[1:].replace( '\t', ' ' ).split(None, descr_split)
            #pad with empty columns if required
            words += [""]*(descr_split+1-len(words))
            out.write( "%s\t%s\t%s\n" % ("\t".join(words), fastq_read.sequence.replace( '\t', ' ' ), fastq_read.quality.replace( '\t', ' ' ) ) )
    out.close()
    if num_reads is None:
        print "No valid FASTQ reads could be processed."
    else:
        print "%i FASTQ reads were converted to Tabular." % ( num_reads + 1 )
Example #2
0
def main():
    mate1_filename = sys.argv[1]
    mate1_type = sys.argv[2] or 'sanger'
    mate2_filename = sys.argv[3]
    mate2_type = sys.argv[4] or 'sanger'
    outfile_pairs = sys.argv[5]
    outfile_singles = sys.argv[6]

    if mate1_type != mate2_type:
        print(
            "WARNING: You are trying to interlace files of two different types: %s and %s."
            % (mate1_type, mate2_type))
        return

    type = mate1_type
    joiner = fastqJoiner(type)

    nof_singles = 0
    nof_pairs = 0
    i = None
    j = None

    out_pairs = fastqWriter(path=outfile_pairs, format=type)
    out_singles = fastqWriter(path=outfile_singles, format=type)
    mate2_input = fastqNamedReader(path=mate2_filename, format=type)
    mate1_input = fastqNamedReader(path=mate1_filename, format=type)
    reader1 = fastqReader(path=mate1_filename, format=type)
    reader2 = fastqReader(path=mate2_filename, format=type)

    with out_pairs, out_singles, mate2_input, mate1_input, reader1, reader2:
        # Pairs + singles present in mate1
        for i, mate1 in enumerate(reader1):
            mate2 = mate2_input.get(joiner.get_paired_identifier(mate1))
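            # look up the read from the mate2 file whose identifier pairs with this mate1 read; None if absent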
            if mate2:
                out_pairs.write(mate1)
                out_pairs.write(mate2)
                nof_pairs += 1
            else:
                out_singles.write(mate1)
                nof_singles += 1

        # Singles present in mate2
        for j, mate2 in enumerate(reader2):
            mate1 = mate1_input.get(joiner.get_paired_identifier(mate2))
            if not mate1:
                out_singles.write(mate2)
                nof_singles += 1

    if (i is None) and (j is None):
        print("Your input files contained no valid FASTQ sequences.")
    else:
        print('There were %s single reads.' % (nof_singles))
        print('Interlaced %s pairs of sequences.' % (nof_pairs))
Example #3
0
def main():
    mate1_filename   = sys.argv[1]
    mate1_type       = sys.argv[2] or 'sanger'
    mate2_filename   = sys.argv[3]
    mate2_type       = sys.argv[4] or 'sanger'
    outfile_pairs    = sys.argv[5]
    outfile_singles = sys.argv[6]

    if mate1_type != mate2_type:
        print "WARNING: You are trying to interlace files of two different types: %s and %s." % ( mate1_type, mate2_type )
        return

    type = mate1_type
    joiner = fastqJoiner( type )
    out_pairs = fastqWriter( open( outfile_pairs, 'wb' ), format = type )
    out_singles = fastqWriter( open( outfile_singles, 'wb' ), format = type )

    # Pairs + singles present in mate1
    nof_singles = 0
    nof_pairs   = 0
    mate2_input = fastqNamedReader( open( mate2_filename, 'rb' ), format = type )
    i = None
    for i, mate1 in enumerate( fastqReader( open( mate1_filename, 'rb' ), format = type ) ):
        mate2 = mate2_input.get( joiner.get_paired_identifier( mate1 ) )
        if mate2:
            out_pairs.write( mate1 )
            out_pairs.write( mate2 )
            nof_pairs += 1
        else:
            out_singles.write( mate1 )
            nof_singles += 1

    # Singles present in mate2
    mate1_input = fastqNamedReader( open( mate1_filename, 'rb' ), format = type )
    j = None
    for j, mate2 in enumerate( fastqReader( open( mate2_filename, 'rb' ), format = type ) ):
        mate1 = mate1_input.get( joiner.get_paired_identifier( mate2 ) )
        if not mate1:
            out_singles.write( mate2 )
            nof_singles += 1

    if (i is None) and (j is None):
        print "Your input files contained no valid FASTQ sequences."
    else:
        print 'There were %s single reads.' % ( nof_singles )
        print 'Interlaced %s pairs of sequences.' % ( nof_pairs )

    mate1_input.close()
    mate2_input.close()
    out_pairs.close()
    out_singles.close()
Example #4
0
def main():
    #Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    if input1_type != input2_type:
        print "WARNING: You are trying to join files of two different types: %s and %s." % (
            input1_type, input2_type)

    input2 = fastqNamedReader(open(input2_filename, 'rb'), input2_type)
    joiner = fastqJoiner(input1_type)
    out = fastqWriter(open(output_filename, 'wb'), format=input1_type)

    i = None
    skip_count = 0
    for i, fastq_read in enumerate(
            fastqReader(open(input1_filename, 'rb'), format=input1_type)):
        identifier = joiner.get_paired_identifier(fastq_read)
        fastq_paired = input2.get(identifier)
        if fastq_paired is None:
            skip_count += 1
        else:
            out.write(joiner.join(fastq_read, fastq_paired))
    out.close()

    if i is None:
        print "Your file contains no valid FASTQ reads."
    else:
        print input2.has_data()
        print 'Joined %s of %s read pairs (%.2f%%).' % (
            i - skip_count + 1, i + 1,
            float(i - skip_count + 1) / float(i + 1) * 100.0)
Example #5
0
def main():
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    type = input_type
    input = fastqNamedReader(open(input_filename, 'rb'), format=type)
    mate1_out = fastqWriter(open(mate1_filename, 'wb'), format=type)
    mate2_out = fastqWriter(open(mate2_filename, 'wb'), format=type)
    single1_out = fastqWriter(open(single1_filename, 'wb'), format=type)
    single2_out = fastqWriter(open(single2_filename, 'wb'), format=type)
    joiner = fastqJoiner(type)

    i = None
    skip_count = 0
    found = {}
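    # identifiers of second mates already written as part of a pair; the loop skips them when it reaches them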
    for i, read in enumerate(
            fastqReader(open(input_filename, 'rb'), format=type)):

        if read.identifier in found:
            del found[read.identifier]
            continue

        mate1 = input.get(read.identifier)

        mate2 = input.get(joiner.get_paired_identifier(mate1))

        if mate2:
            # This is a mate pair
            found[mate2.identifier] = None
            if joiner.is_first_mate(mate1):
                mate1_out.write(mate1)
                mate2_out.write(mate2)
            else:
                mate1_out.write(mate2)
                mate2_out.write(mate1)
        else:
            # This is a single
            skip_count += 1
            if joiner.is_first_mate(mate1):
                single1_out.write(mate1)
            else:
                single2_out.write(mate1)

    if i is None:
        print "Your input file contained no valid FASTQ sequences."
    else:
        if skip_count:
            print 'There were %i reads with no mate.' % skip_count
        print 'De-interlaced %s pairs of sequences.' % (
            (i - skip_count + 1) / 2)

    input.close()
    mate1_out.close()
    mate2_out.close()
    single1_out.close()
    single2_out.close()
Example #6
0
def main():
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    fastq_manipulator = imp.load_module('fastq_manipulator', open(script_filename), script_filename, ('', 'r', imp.PY_SOURCE))
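    # the imported script must define match_and_manipulate_read(), which is called on every read below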
    i = None
    reads_manipulated = 0

    writer = fastqWriter(path=output_filename, format=input_type)
    reader = fastqReader(path=input_filename, format=input_type)
    with writer, reader:
        for i, fastq_read in enumerate(reader):
            new_read = fastq_manipulator.match_and_manipulate_read(fastq_read)
            if new_read:
                writer.write(new_read)
            if new_read != fastq_read:
                reads_manipulated += 1

    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        print('Manipulated %s of %s reads (%.2f%%).' % (reads_manipulated, i + 1, float(reads_manipulated) / float(i + 1) * 100.0))
Example #7
0
def load_ids(filename, filetype):
    if filetype == "tabular":
        for line in open(filename):
            line = line.rstrip("\n")
            if line and not line.startswith("#"):
                yield line.split("\t", 1)[0]
    elif filetype == "fasta":
        for line in open(filename):
            if line.startswith(">"):
                yield line[1:].rstrip("\n").split(None, 1)[0]
    elif filetype.startswith("fastq"):
        # Use the Galaxy library not Biopython to cope with CS
        from galaxy_utils.sequence.fastq import fastqReader
        handle = open(filename, "rU")
        for record in fastqReader(handle):
            # The [1:] is because the fastqReader leaves the @ on the identifier.
            yield record.identifier.split()[0][1:]
        handle.close()
    elif filetype == "sff":
        try:
            from Bio.SeqIO import index
        except ImportError:
            sys.exit("Require Biopython 1.54 or later (to read SFF files)")
        # This will read the SFF index block if present (very fast)
        for name in index(filename, "sff"):
            yield name
    else:
        sys.exit("Unexpected file type %s" % filetype)
Example #8
0
def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    left_offset = sys.argv[3]
    right_offset = sys.argv[4]
    percent_offsets = sys.argv[5] == 'offsets_percent'
    input_type = sys.argv[6] or 'sanger'
    keep_zero_length = sys.argv[7] == 'keep_zero_length'
    
    out = fastqWriter( open( output_filename, 'wb' ), format = input_type )
    num_reads_excluded = 0
    num_reads = None
    for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
        if percent_offsets:
            left_column_offset = int( round( float( left_offset ) / 100.0 * float( len( fastq_read ) ) ) )
            right_column_offset = int( round( float( right_offset ) / 100.0 * float( len( fastq_read ) ) ) )
        else:
            left_column_offset = int( left_offset )
            right_column_offset = int( right_offset )
        if right_column_offset > 0:
            right_column_offset = -right_column_offset
        else:
            right_column_offset = None
        fastq_read = fastq_read.slice( left_column_offset, right_column_offset )
        if keep_zero_length or len( fastq_read ):
            out.write( fastq_read )
        else:
            num_reads_excluded += 1
    out.close()
    if num_reads is None:
        print "No valid fastq reads could be processed."
    else:
        print "%i fastq reads were processed." % ( num_reads + 1 )
    if num_reads_excluded:
        print "%i reads of zero length were excluded from the output." % num_reads_excluded
Example #9
0
def main():
    #Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'
    
    #Save script file for debugging/verification info later
    os.mkdir( additional_files_path )
    shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) )
    
    out = fastqWriter( open( output_filename, 'wb' ), format = input_type )
    
    i = None
    reads_kept = 0
    for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
        local = {'fastq_read':fastq_read, 'ret_val':False}
        execfile( script_filename, {}, local )
        if local['ret_val']:
            out.write( fastq_read )
            reads_kept += 1
    out.close()
    if i is None:
        print "Your file contains no valid fastq reads."
    else:
        print 'Kept %s of %s reads (%.2f%%).' % ( reads_kept, i + 1, float( reads_kept ) / float( i + 1 ) * 100.0 )
Example #10
0
def load_ids(filename, filetype):
    if filetype == "tabular":
        for line in open(filename):
            line = line.rstrip("\n")
            if line and not line.startswith("#"):
                yield line.split("\t", 1)[0]
    elif filetype == "fasta":
        for line in open(filename):
            if line.startswith(">"):
                yield line[1:].rstrip("\n").split(None, 1)[0]
    elif filetype.startswith("fastq"):
        # Use the Galaxy library not Biopython to cope with CS
        from galaxy_utils.sequence.fastq import fastqReader
        handle = open(filename, "rU")
        for record in fastqReader(handle):
            # The [1:] is because the fastqReader leaves the @ on the identifier.
            yield record.identifier.split()[0][1:]
        handle.close()
    elif filetype == "sff":
        try:
            from Bio.SeqIO import index
        except ImportError:
            sys.exit("Require Biopython 1.54 or later (to read SFF files)")
        # This will read the SFF index block if present (very fast)
        for name in index(filename, "sff"):
            yield name
    else:
        sys.exit("Unexpected file type %s" % filetype)
Example #11
0
def main():
    #Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    #Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename,
                os.path.join(additional_files_path, 'debug.txt'))

    out = fastqWriter(open(output_filename, 'wb'), format=input_type)

    i = None
    reads_kept = 0
    for i, fastq_read in enumerate(
            fastqReader(open(input_filename), format=input_type)):
        local = {'fastq_read': fastq_read, 'ret_val': False}
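        # the user script reads 'fastq_read' from this namespace and sets 'ret_val' to signal whether to keep the read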
        execfile(script_filename, {}, local)
        if local['ret_val']:
            out.write(fastq_read)
            reads_kept += 1
    out.close()
    if i is None:
        print "Your file contains no valid fastq reads."
    else:
        print 'Kept %s of %s reads (%.2f%%).' % (
            reads_kept, i + 1, float(reads_kept) / float(i + 1) * 100.0)
Example #12
0
def main():
    #Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'
    
    #Save script file for debugging/verification info later
    os.mkdir( additional_files_path )
    shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) )
    
    ## Dan, Others: Can we simply drop the "format=input_type" here since it is specified in reader.
    ## This optimization would cut runtime roughly in half (for my test case anyway). -John
    out = fastqWriter( open( output_filename, 'wb' ), format = input_type )
    
    i = None
    reads_kept = 0
    execfile(script_filename, globals())
    for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
        ret_val = fastq_read_pass_filter( fastq_read )  ## fastq_read_pass_filter defined in script_filename
        if ret_val:
            out.write( fastq_read )
            reads_kept += 1
    out.close()
    if i is None:
        print "Your file contains no valid fastq reads."
    else:
        print 'Kept %s of %s reads (%.2f%%).' % ( reads_kept, i + 1, float( reads_kept ) / float( i + 1 ) * 100.0 )
Example #13
0
def main():
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename,
                os.path.join(additional_files_path, 'debug.txt'))

    # Dan, Others: Can we simply drop the "format=input_type" here since it is specified in reader.
    # This optimization would cut runtime roughly in half (for my test case anyway). -John
    out = fastqWriter(path=output_filename, format=input_type)

    i = None
    reads_kept = 0
    execfile(script_filename, globals())
    for i, fastq_read in enumerate(
            fastqReader(path=input_filename, format=input_type)):
        ret_val = fastq_read_pass_filter(
            fastq_read
        )  # fastq_read_pass_filter defined in script_filename  # NOQA
        if ret_val:
            out.write(fastq_read)
            reads_kept += 1
    out.close()
    if i is None:
        print("Your file contains no valid fastq reads.")
    else:
        print('Kept %s of %s reads (%.2f%%).' %
              (reads_kept, i + 1, float(reads_kept) / float(i + 1) * 100.0))
Example #14
0
    def test_invalid_header(self):
        i_path = _data_path('fastqreader_min_invalid-header')
        reader = fastqReader(path=i_path)

        with self.assertRaises(fastqFormatError):
            for _ in reader:
                pass
Example #15
0
def main():
    #Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]
    
    if input1_type != input2_type:
        print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type )
    
    input2 = fastqNamedReader( open( input2_filename, 'rb' ), input2_type )
    joiner = fastqJoiner( input1_type )
    out = fastqWriter( open( output_filename, 'wb' ), format = input1_type )
    
    i = None
    skip_count = 0
    for i, fastq_read in enumerate( fastqReader( open( input1_filename, 'rb' ), format = input1_type ) ):
        identifier = joiner.get_paired_identifier( fastq_read )
        fastq_paired = input2.get( identifier )
        if fastq_paired is None:
            skip_count += 1
        else:
            out.write( joiner.join( fastq_read, fastq_paired ) )
    out.close()
    
    if i is None:
        print "Your file contains no valid FASTQ reads."
    else:
        print input2.has_data()
        print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 )
Example #16
0
def main():
    # Read command line arguments
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    output1_filename = sys.argv[3]
    output2_filename = sys.argv[4]

    splitter = fastqSplitter()
    out1 = fastqWriter(path=output1_filename, format=input_type)
    out2 = fastqWriter(path=output2_filename, format=input_type)

    i = None
    skip_count = 0
    for i, fastq_read in enumerate(fastqReader(path=input_filename, format=input_type)):
        read1, read2 = splitter.split(fastq_read)
        if read1 and read2:
            out1.write(read1)
            out2.write(read2)
        else:
            skip_count += 1
    out1.close()
    out2.close()
    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        print('Split %s of %s reads (%.2f%%).' % (i - skip_count + 1, i + 1, float(i - skip_count + 1) / float(i + 1) * 100.0))
Example #17
0
def main():
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice', default='sanger', choices=('sanger', 'solexa', 'illumina', 'sanger.gz', 'solexa.gz', 'illumina.gz', 'sanger.bz2', 'solexa.bz2', 'illumina.bz2'), help='FASTQ variant type')
    parser.add_option('-m', '--mask_character', dest='mask_character', default='N', help='Mask Character to use')
    parser.add_option('-c', '--score_comparison', type="choice", dest='score_comparison', default='le', choices=('gt', 'ge', 'eq', 'lt', 'le', 'ne'), help='Mask base when score is')
    parser.add_option('-s', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score')
    parser.add_option("-l", "--lowercase", action="store_true", dest="lowercase", default=False, help="Use lowercase masking")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    score_comparer = get_score_comparer(options.score_comparison)

    if options.lowercase:
        base_masker = str.lower
    else:
        base_masker = BaseReplacer(options.mask_character)

    out = fastqWriter(path=args[1], format=options.format)

    num_reads = None
    for num_reads, fastq_read in enumerate(fastqReader(path=args[0], format=options.format)):
        sequence_list = list(fastq_read.sequence)
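        # mask each base whose quality score satisfies the chosen comparison against the threshold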
        for i, quality_score in enumerate(fastq_read.get_decimal_quality_scores()):
            if score_comparer(quality_score, options.quality_score):
                sequence_list[i] = base_masker(sequence_list[i])
        fastq_read.sequence = "".join(sequence_list)
        out.write(fastq_read)

    if num_reads is not None:
        print("Processed %i %s reads." % (num_reads + 1, options.format))
    else:
        print("No valid FASTQ reads were provided.")
Example #18
0
def main():
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    fastq_manipulator = imp.load_module('fastq_manipulator', open(script_filename), script_filename, ('', 'r', imp.PY_SOURCE))

    out = fastqWriter(path=output_filename, format=input_type)

    i = None
    reads_manipulated = 0
    for i, fastq_read in enumerate(fastqReader(path=input_filename, format=input_type)):
        new_read = fastq_manipulator.match_and_manipulate_read(fastq_read)
        if new_read:
            out.write(new_read)
        if new_read != fastq_read:
            reads_manipulated += 1
    out.close()
    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        print('Manipulated %s of %s reads (%.2f%%).' % (reads_manipulated, i + 1, float(reads_manipulated) / float(i + 1) * 100.0))
Example #19
0
def test_fastq_reader_cleanup():
    i_path = _data_path("sanger_full_range_original_sanger.fastqsanger")
    fh = open(i_path)
    with _new_argv([fh]):
        reader = fastqReader(fh)
        for _ in reader:
            pass
    assert (fh.closed)
Example #20
0
def main():
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    type = input_type
    input = fastqNamedReader(path=input_filename, format=type)
    mate1_out = fastqWriter(path=mate1_filename, format=type)
    mate2_out = fastqWriter(path=mate2_filename, format=type)
    single1_out = fastqWriter(path=single1_filename, format=type)
    single2_out = fastqWriter(path=single2_filename, format=type)
    joiner = fastqJoiner(type)

    i = None
    skip_count = 0
    found = {}
    for i, read in enumerate(fastqReader(path=input_filename, format=type)):

        if read.identifier in found:
            del found[read.identifier]
            continue

        mate1 = input.get(read.identifier)

        mate2 = input.get(joiner.get_paired_identifier(mate1))

        if mate2:
            # This is a mate pair
            found[mate2.identifier] = None
            if joiner.is_first_mate(mate1):
                mate1_out.write(mate1)
                mate2_out.write(mate2)
            else:
                mate1_out.write(mate2)
                mate2_out.write(mate1)
        else:
            # This is a single
            skip_count += 1
            if joiner.is_first_mate(mate1):
                single1_out.write(mate1)
            else:
                single2_out.write(mate1)

    if i is None:
        print("Your input file contained no valid FASTQ sequences.")
    else:
        if skip_count:
            print('There were %i reads with no mate.' % skip_count)
        print('De-interlaced %s pairs of sequences.' % ((i - skip_count + 1) / 2))

    input.close()
    mate1_out.close()
    mate2_out.close()
    single1_out.close()
    single2_out.close()
Example #21
0
    def test_file_closed_on_completion(self):
        i_path = _data_path('fastqreader_min')
        fh = open(i_path)
        reader = fastqReader(fh)
        for _ in reader:
            pass

        self.assertTrue(fh.closed,
                        'File should be closed after iteration completes')
Example #22
0
def main():
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    type = input_type
    joiner = fastqJoiner(type)
    i = None
    skip_count = 0
    found = {}

    mate1_out = fastqWriter(path=mate1_filename, format=type)
    mate2_out = fastqWriter(path=mate2_filename, format=type)
    single1_out = fastqWriter(path=single1_filename, format=type)
    single2_out = fastqWriter(path=single2_filename, format=type)
    reader1 = fastqNamedReader(path=input_filename, format=type)
    reader2 = fastqReader(path=input_filename, format=type)

    with mate1_out, mate2_out, single1_out, single2_out, reader1, reader2:

        for i, read in enumerate(reader2):

            if read.identifier in found:
                del found[read.identifier]
                continue

            mate1 = reader1.get(read.identifier)

            mate2 = reader1.get(joiner.get_paired_identifier(mate1))

            if mate2:
                # This is a mate pair
                found[mate2.identifier] = None
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                skip_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)

    if i is None:
        print("Your input file contained no valid FASTQ sequences.")
    else:
        if skip_count:
            print('There were %i reads with no mate.' % skip_count)
        print('De-interlaced %s pairs of sequences.' % ((i - skip_count + 1) / 2))
Example #23
0
    def test_file_closed_on_line3_error(self):
        i_path = _data_path('fastqreader_min_invalid-line3')
        fh = open(i_path)
        with fastqReader(fh) as reader:
            with self.assertRaises(fastqFormatError):
                for _ in reader:
                    pass
        self.assertTrue(
            fh.closed,
            'File should be closed if exception occurs due to invalid line3')
Example #24
0
    def test_read_sequence_multiline(self):
        i_path = _data_path('fastqreader_min-multiline')
        reader = fastqReader(path=i_path)
        rvals = [rval for rval in reader]

        expected_reads = 2
        expected_seqs = 'ACGTACGTACGTACGTACGT', 'CATGCATGCATGCATGCATG'

        self.assertEqual(expected_reads, len(rvals))
        self.assertEqual(expected_seqs[0], rvals[0].get_sequence())
        self.assertEqual(expected_seqs[1], rvals[1].get_sequence())
Example #25
0
    def test_read_sequence(self):
        i_path = _data_path('fastqreader_min')
        with fastqReader(path=i_path) as reader:
            rvals = [rval for rval in reader]

        expected_reads = 2
        expected_seqs = 'ACGTACGTAC', 'CATGCATGCA'

        self.assertEqual(expected_reads, len(rvals))
        self.assertEqual(expected_seqs[0], rvals[0].get_sequence())
        self.assertEqual(expected_seqs[1], rvals[1].get_sequence())
Example #26
0
    def test_read_qualityscores(self):
        i_path = _data_path('fastqreader_min')
        reader = fastqReader(path=i_path)
        rvals = [rval for rval in reader]

        expected_reads = 2
        expected_scores = '!##$%&&()*', '~}|{zyxwvu'

        self.assertEqual(expected_reads, len(rvals))
        self.assertEqual(expected_scores[0], rvals[0].quality)
        self.assertEqual(expected_scores[1], rvals[1].quality)
Example #27
0
    def test_read_qualityscores_multiline(self):
        i_path = _data_path('fastqreader_min-multiline')
        reader = fastqReader(path=i_path)
        rvals = [rval for rval in reader]

        expected_reads = 2
        expected_scores = '!##$%&&()**,-./01234', '~}|{zyxwvutsrqponmlk'

        self.assertEqual(expected_reads, len(rvals))
        self.assertEqual(expected_scores[0], rvals[0].quality)
        self.assertEqual(expected_scores[1], rvals[1].quality)
Example #28
0
def fastq_filter(in_file, out_file, iterator_filter):
    count = 0
    #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
    reader = fastqReader(open(in_file, "rU"))
    writer = fastqWriter(open(out_file, "w"))
    for record in iterator_filter(reader):
        count += 1
        writer.write(record)
    writer.close()
    reader.close()
    return count
Example #29
0
    def test_read_header(self):
        i_path = _data_path('fastqreader_min')
        reader = fastqReader(path=i_path)
        rvals = [rval for rval in reader]

        expected_reads = 2
        expected_headers = '@FAKE-1', '@FAKE-2'

        self.assertEqual(expected_reads, len(rvals))
        self.assertEqual(expected_headers[0], rvals[0].identifier)
        self.assertEqual(expected_headers[1], rvals[1].identifier)
Example #30
0
def fastq_filter(in_file, out_file, iterator_filter):
    count = 0
    #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
    reader = fastqReader(open(in_file, "rU"))
    writer = fastqWriter(open(out_file, "w"))
    for record in iterator_filter(reader):
        count += 1
        writer.write(record)
    writer.close()
    reader.close()
    return count
Example #31
0
def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'

    aggregator = fastqAggregator()
    num_reads = None
    fastq_read = None
    for num_reads, fastq_read in enumerate(
            fastqReader(open(input_filename), format=input_type)):
        aggregator.consume_read(fastq_read)
    out = open(output_filename, 'wb')
    valid_nucleotides = VALID_NUCLEOTIDES
    if fastq_read:
        if fastq_read.sequence_space == 'base':
            out.write(
                '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n'
            )
        else:
            out.write(
                '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n'
            )
            valid_nucleotides = VALID_COLOR_SPACE
    for i in range(aggregator.get_max_read_length()):
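        # emit one row of summary statistics and base counts per read position (column)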
        column_stats = aggregator.get_summary_statistics_for_column(i)
        out.write('%i\t' % (i + 1))
        out.write('%s\t' * len(SUMMARY_STAT_ORDER) %
                  tuple([column_stats[key] for key in SUMMARY_STAT_ORDER]))
        out.write('%s\t' % ','.join(map(str, column_stats['outliers'])))
        base_counts = aggregator.get_base_counts_for_column(i)
        for nuc in valid_nucleotides:
            out.write("%s\t" % base_counts.get(nuc, 0))
        extra_nucs = sorted([
            nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides
        ])
        out.write("%s\t%s\n" % (','.join(extra_nucs), ','.join(
            str(base_counts[nuc]) for nuc in extra_nucs)))
    out.close()
    if num_reads is None:
        print "No valid fastq reads could be processed."
    else:
        print "%i fastq reads were processed." % (num_reads + 1)
        print "Based upon quality values and sequence characters, the input data is valid for: %s" % (
            ", ".join(aggregator.get_valid_formats()) or "None")
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        print "Input ASCII range: %s(%i) - %s(%i)" % (
            repr(ascii_range[0]), ord(ascii_range[0]), repr(
                ascii_range[1]), ord(ascii_range[1])
        )  #print using repr, since \x00 (null) causes info truncation in galaxy when printed
        print "Input decimal range: %i - %i" % (decimal_range[0],
                                                decimal_range[1])
Example #32
0
    def test_read_qualityscores_edgecase_multiline(self):
        # Quality score input designed to confuse the parser
        i_path = _data_path('fastqreader_min-multiline-edgecase')
        reader = fastqReader(path=i_path)
        rvals = [rval for rval in reader]

        expected_reads = 2
        expected_scores = ('+##$%&&()*+##$%&&()*@,-./01234',
                           '@}|{zyxwvu@}|{zyxwvu+srqponmlk')

        self.assertEqual(expected_reads, len(rvals))
        self.assertEqual(expected_scores[0], rvals[0].quality)
        self.assertEqual(expected_scores[1], rvals[1].quality)
Example #33
0
    def test_read_line3(self):
        # Separate test case for line3 containing a copy of line1
        i_path = _data_path('fastqreader_min-line3')
        reader = fastqReader(path=i_path)
        rvals = [rval for rval in reader]

        expected_reads = 2
        expected_seqs = 'ACGTACGTAC', 'CATGCATGCA'
        expected_scores = '!##$%&&()*', '~}|{zyxwvu'

        self.assertEqual(expected_reads, len(rvals))
        for i in range(len(rvals)):
            self.assertEqual(expected_scores[i], rvals[i].quality)
            self.assertEqual(expected_seqs[i], rvals[i].get_sequence())
Example #34
0
def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'  # input type should ordinarily be unnecessary

    num_reads = None
    fastq_read = None
    out = fastaWriter(path=output_filename, format="fasta")
    for num_reads, fastq_read in enumerate(fastqReader(path=input_filename, format=input_type)):
        out.write(fastq_read)
    out.close()
    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were converted to FASTA." % (num_reads + 1))
Example #35
0
    def test_invalid_line3_stripped(self):
        i_path = _data_path('fastqreader_min_invalid-line3')
        # fix_id=True: fix inconsistent identifiers (source: SRA data dumps)
        reader = fastqReader(path=i_path, fix_id=True)
        rvals = [rval for rval in reader]

        expected_reads = 2
        expected_seqs = 'ACGTACGTAC', 'CATGCATGCA'
        expected_scores = '!##$%&&()*', '~}|{zyxwvu'
        expected_line3 = '+'

        self.assertEqual(expected_reads, len(rvals))
        for i in range(len(rvals)):
            self.assertEqual(expected_scores[i], rvals[i].quality)
            self.assertEqual(expected_seqs[i], rvals[i].get_sequence())
            self.assertEqual(expected_line3, rvals[i].description)
Example #36
0
def main():
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    fastq_style = sys.argv[6] or 'old'

    paste = sys.argv[7] or ''
    # --
    if input1_type != input2_type:
        print(
            "WARNING: You are trying to join files of two different types: %s and %s."
            % (input1_type, input2_type))

    if fastq_style == 'new':
        sep = sniff_sep(input1_filename)
        joiner = FastqJoiner(input1_type, sep=sep, paste=paste)
    else:
        joiner = fq.fastqJoiner(input1_type, paste=paste)
    # --
    i = None
    skip_count = 0

    writer = fq.fastqWriter(path=output_filename, format=input1_type)
    reader1 = fq.fastqReader(path=input1_filename, format=input1_type)
    reader2 = fq.fastqNamedReader(path=input2_filename, format=input2_type)

    with writer, reader1, reader2:
        for i, fastq_read in enumerate(reader1):
            identifier = joiner.get_paired_identifier(fastq_read)
            fastq_paired = reader2.get(identifier)
            if fastq_paired is None:
                skip_count += 1
            else:
                writer.write(joiner.join(fastq_read, fastq_paired))

        # this indent is correct: we still need access to reader2
        if i is None:
            print("Your file contains no valid FASTQ reads.")
        else:
            print(reader2.has_data())
            print('Joined %s of %s read pairs (%.2f%%).' %
                  (i - skip_count + 1, i + 1,
                   (i - skip_count + 1) / (i + 1) * 100.0))
Example #37
0
def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[
        3] or 'sanger'  #input type should ordinarily be unnecessary

    num_reads = None
    fastq_read = None
    out = fastaWriter(open(output_filename, 'wb'))
    for num_reads, fastq_read in enumerate(
            fastqReader(open(input_filename), format=input_type)):
        out.write(fastq_read)
    out.close()
    if num_reads is None:
        print "No valid FASTQ reads could be processed."
    else:
        print "%i FASTQ reads were converted to FASTA." % (num_reads + 1)
Example #38
0
def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'  # input type should ordinarily be unnecessary

    num_reads = None
    fastq_read = None
    out = fastaWriter(path=output_filename, format="fasta")

    reader = fastqReader(path=input_filename, format=input_type)
    with reader:
        for num_reads, fastq_read in enumerate(reader):
            out.write(fastq_read)
        out.close()

    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were converted to FASTA." % (num_reads + 1))
Example #39
0
def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger' #input type should ordinarily be unnecessary
    renum = bool(int(sys.argv[4])) 
    num_reads = None
    fastq_read = None
    #out = fastaWriter( open( output_filename, 'wb' ) )
    out = open(output_filename, "w")
    for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
        if not renum:
            out.write( "%s\t%s\t%s\n" % (fastq_read.identifier[1:], fastq_read.sequence, fastq_read.quality))
        else:
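            # replace the identifier with the 0-based read index, written in hexadecimal ("%x")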
            out.write( "%x\t%s\t%s\n" % (num_reads, fastq_read.sequence, fastq_read.quality))

    out.close()
    if num_reads is None:
        print "No valid FASTQ reads could be processed."
    else:
        print "%i FASTQ reads were converted to SFQ." % ( num_reads + 1 )
Example #40
0
def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    left_offset = sys.argv[3]
    right_offset = sys.argv[4]
    percent_offsets = sys.argv[5] == 'offsets_percent'
    input_type = sys.argv[6] or 'sanger'
    keep_zero_length = sys.argv[7] == 'keep_zero_length'

    out = fastqWriter(path=output_filename, format=input_type)
    num_reads_excluded = 0
    num_reads = None
    for num_reads, fastq_read in enumerate(
            fastqReader(path=input_filename, format=input_type)):
        if percent_offsets:
            left_column_offset = int(
                round(float(left_offset) / 100.0 * float(len(fastq_read))))
            right_column_offset = int(
                round(float(right_offset) / 100.0 * float(len(fastq_read))))
        else:
            left_column_offset = int(left_offset)
            right_column_offset = int(right_offset)
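        # a non-zero right offset becomes a negative slice index; zero becomes None so nothing is trimmed from the 3' end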
        if right_column_offset != 0:
            right_column_offset = -right_column_offset
        else:
            right_column_offset = None
        fastq_read = fastq_read.slice(left_column_offset, right_column_offset)
        if keep_zero_length or len(fastq_read):
            out.write(fastq_read)
        else:
            num_reads_excluded += 1
    out.close()
    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." %
              num_reads_excluded)
Example #41
0
def main():
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    fastq_style = sys.argv[6] or 'old'

    paste = sys.argv[7] or ''
    # --
    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    if fastq_style == 'new':
        sep = sniff_sep(input1_filename)
        joiner = FastqJoiner(input1_type, sep=sep, paste=paste)
    else:
        joiner = fq.fastqJoiner(input1_type, paste=paste)
    # --
    input2 = fq.fastqNamedReader(path=input2_filename, format=input2_type)
    out = fq.fastqWriter(path=output_filename, format=input1_type)
    i = None
    skip_count = 0
    for i, fastq_read in enumerate(fq.fastqReader(path=input1_filename, format=input1_type)):
        identifier = joiner.get_paired_identifier(fastq_read)
        fastq_paired = input2.get(identifier)
        if fastq_paired is None:
            skip_count += 1
        else:
            out.write(joiner.join(fastq_read, fastq_paired))
    out.close()

    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        print(input2.has_data())
        print('Joined %s of %s read pairs (%.2f%%).' % (i - skip_count + 1, i + 1, (i - skip_count + 1) / (i + 1) * 100.0))
Example #42
0
def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'

    aggregator = fastqAggregator()
    num_reads = None
    fastq_read = None
    for num_reads, fastq_read in enumerate(fastqReader(path=input_filename, format=input_type)):
        aggregator.consume_read(fastq_read)
    out = open(output_filename, 'w')
    valid_nucleotides = VALID_NUCLEOTIDES
    if fastq_read:
        if fastq_read.sequence_space == 'base':
            out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n')
        else:
            out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n')
            valid_nucleotides = VALID_COLOR_SPACE
    for i in range(aggregator.get_max_read_length()):
        column_stats = aggregator.get_summary_statistics_for_column(i)
        out.write('%d\t' % (i + 1))
        out.write("%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%f\t%d\t%d\t" % tuple(column_stats[key] for key in SUMMARY_STAT_ORDER))
        out.write('%s\t' % ','.join(map(str, column_stats['outliers'])))
        base_counts = aggregator.get_base_counts_for_column(i)
        for nuc in valid_nucleotides:
            out.write("%s\t" % base_counts.get(nuc, 0))
        extra_nucs = sorted(nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides)
        out.write("%s\t%s\n" % (','.join(extra_nucs), ','.join(str(base_counts[nuc]) for nuc in extra_nucs)))
    out.close()
    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
        print("Based upon quality values and sequence characters, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))  # print using repr, since \x00 (null) causes info truncation in galaxy when printed
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))
Example #43
0
                        short_neg += 1

    in_handle = open(in_file, "rb")
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    in_handle.seek(0)
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    writer.write_file(process(SffIterator(in_handle)))
    #End of SFF code
elif seq_format.lower().startswith("fastq"):
    in_handle = open(in_file, "rU")
    out_handle = open(out_file, "w")
    reader = fastqReader(in_handle)
    writer = fastqWriter(out_handle)
    if forward:
        for record in reader:
            seq = record.sequence.upper()
            result = primer.search(seq)
            if result:
                #Forward primer, take everything after it
                cut = result.end()
                record.sequence = seq[cut:]
                if len(record.sequence) >= min_len:
                    record.quality = record.quality[cut:]
                    clipped += 1
                    writer.write(record)
                else:
                    short_clipped += 1
Example #44
0
def main():
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser( usage=usage )
    parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'cssanger', 'solexa', 'illumina' ), help='FASTQ variant type' )
    parser.add_option( '-s', '--window_size', type="int", dest='window_size', default='1', help='Window size' )
    parser.add_option( '-t', '--window_step', type="int", dest='window_step', default='1', help='Window step' )
    parser.add_option( '-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5','3','53','35' ), help='Ends to Trim' )
    parser.add_option( '-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min','max','sum','mean' ), help='Aggregate action for window' )
    parser.add_option( '-x', '--exclude_count', type="int", dest='exclude_count', default='0', help='Maximum number of bases to exclude from the window during aggregation' )
    parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>','>=','==','<', '<=', '!=' ), help='Keep read when aggregate score is' )
    parser.add_option( '-q', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' )
    parser.add_option( "-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length")
    ( options, args ) = parser.parse_args()
    
    if len ( args ) != 2:
        parser.error( "Need to specify an input file and an output file" )
    
    if options.window_size < 1:
        parser.error( 'You must specify a strictly positive window size' )
    
    if options.window_step < 1:
        parser.error( 'You must specify a strictly positive step size' )
    
    #determine an exhaustive list of window indexes that can be excluded from aggregation
    exclude_window_indexes = []
    last_exclude_indexes = []
    for exclude_count in range( min( options.exclude_count, options.window_size ) ):
        if last_exclude_indexes:
            new_exclude_indexes = []
            for exclude_list in last_exclude_indexes:
                for window_index in range( options.window_size ):
                    if window_index not in exclude_list:
                        new_exclude = sorted( exclude_list + [ window_index ] )
                        if new_exclude not in exclude_window_indexes + new_exclude_indexes:
                            new_exclude_indexes.append( new_exclude )
            exclude_window_indexes += new_exclude_indexes
            last_exclude_indexes = new_exclude_indexes
        else:
            for window_index in range( options.window_size ):
                last_exclude_indexes.append( [ window_index ] )
            exclude_window_indexes = list( last_exclude_indexes )
    
    out = fastqWriter( open( args[1], 'wb' ), format = options.format )
    action = ACTION_METHODS[ options.aggregation_action ]
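    #look up the aggregation function (min/max/sum/mean) applied to each window of quality scores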
    
    num_reads = None
    num_reads_excluded = 0
    for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ):
        for trim_end in options.trim_ends:
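            # options.trim_ends is a string such as '53'; iterating over it yields each end ('5' and/or '3') to trim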
            quality_list = fastq_read.get_decimal_quality_scores()
            if trim_end == '5':
                lwindow_position = 0 #left position of window
                while True:
                    if lwindow_position >= len( quality_list ):
                        fastq_read.sequence = ''
                        fastq_read.quality = ''
                        break
                    if exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + options.window_size ], options.score_comparison, options.quality_score, exclude_window_indexes ):
                        fastq_read = fastq_read.slice( lwindow_position, None )
                        break
                    lwindow_position += options.window_step
            else:
                rwindow_position = len( quality_list ) #right position of window
                while True:
                    lwindow_position = rwindow_position - options.window_size #left position of window
                    if rwindow_position <= 0 or lwindow_position < 0:
                        fastq_read.sequence = ''
                        fastq_read.quality = ''
                        break
                    if exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position ], options.score_comparison, options.quality_score, exclude_window_indexes ):
                        fastq_read = fastq_read.slice( None, rwindow_position )
                        break
                    rwindow_position -= options.window_step
        if options.keep_zero_length or len( fastq_read ):
            out.write( fastq_read )
        else:
            num_reads_excluded += 1
    out.close()
    if num_reads is None:
        print "No valid FASTQ reads could be processed."
    else:
        print "%i FASTQ reads were processed." % ( num_reads + 1 )
    if num_reads_excluded:
        print "%i reads of zero length were excluded from the output." % num_reads_excluded
Example #45
0
    writer = SffWriter(out_handle, xml=manifest)
    in_handle.seek(0) #start again after getting manifest
    count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename))
    out_handle.close()
    in_handle.close()
else:
    #Use Galaxy for FASTA, QUAL or FASTQ
    if seq_format.lower() in ["fasta", "csfasta"] \
    or seq_format.lower().startswith("qual"):
        from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
        reader = fastaReader(open(in_file, "rU"))
        writer = fastaWriter(open(out_file, "w"))
        marker = ">"
    elif seq_format.lower().startswith("fastq"):
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
        reader = fastqReader(open(in_file, "rU"))
        writer = fastqWriter(open(out_file, "w"))
        marker = "@"
    else:
        sys.exit("Unsupported file type %r" % seq_format)
    #Now do the renaming
    count = 0
    renamed = 0
    for record in reader:
        #The [1:] is because the fastaReader leaves the > on the identifier,
        #likewise the fastqReader leaves the @ on the identifier
        try:
            idn, descr = record.identifier[1:].split(None, 1)
        except ValueError:
            idn = record.identifier[1:]
            descr = None
Example #46
0
def main():
    # Parse Command Line
    try:
        tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:]
    except ValueError:
        stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
    try:
        columns = [int(arg) - 1 for arg in cols_arg.split(",")]
    except ValueError:
        stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg)

    if out_positive_file == "-" and out_negative_file == "-":
        stop_err("Neither output file requested")

    # Read tabular file and record all specified identifiers
    ids = set()
    handle = open(tabular_file, "rU")
    if len(columns) > 1:
        # General case of many columns
        for line in handle:
            if line.startswith("#"):
                # Ignore comments
                continue
            parts = line.rstrip("\n").split("\t")
            for col in columns:
                ids.add(parts[col])
        print "Using %i IDs from %i columns of tabular file" % (len(ids), len(columns))
    else:
        # Single column, special case speed up
        col = columns[0]
        for line in handle:
            if not line.startswith("#"):
                ids.add(line.rstrip("\n").split("\t")[col])
        print "Using %i IDs from tabular file" % (len(ids))
    handle.close()

    if seq_format.lower() == "sff":
        # Now write filtered SFF file based on IDs from BLAST file
        try:
            from Bio.SeqIO.SffIO import SffIterator, SffWriter
        except ImportError:
            stop_err("Requires Biopython 1.54 or later")

        try:
            from Bio.SeqIO.SffIO import ReadRocheXmlManifest
        except ImportError:
            # Prior to Biopython 1.56 this was a private function
            from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
        in_handle = open(in_file, "rb")  # must be binary mode!
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            manifest = None
        # This makes two passes through the SFF file, which isn't so efficient,
        # but it keeps the code simple.
        if out_positive_file != "-":
            out_handle = open(out_positive_file, "wb")
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0)  # start again after getting manifest
            pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids)
            out_handle.close()
        if out_negative_file != "-":
            out_handle = open(out_negative_file, "wb")
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0)  # start again
            neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids)
            out_handle.close()
        # And we're done
        in_handle.close()
        # At the time of writing, Galaxy doesn't show SFF file read counts,
        # so it is useful to put them in stdout and thus shown in job info.
        if out_positive_file != "-" and out_negative_file != "-":
            print "%i with and %i without specified IDs" % (pos_count, neg_count)
        elif out_positive_file != "-":
            print "%i with specified IDs" % pos_count
        elif out_negative_file != "-":
            print "%i without specified IDs" % neg_count
    elif seq_format.lower() == "fasta":
        # Write filtered FASTA file based on IDs from tabular file
        reader = fastaReader(open(in_file, "rU"))
        if out_positive_file != "-" and out_negative_file != "-":
            print "Generating two FASTA files"
            positive_writer = fastaWriter(open(out_positive_file, "w"))
            negative_writer = fastaWriter(open(out_negative_file, "w"))
            for record in reader:
                # The [1:] is because the fastaReader leaves the > on the identifier.
                if record.identifier and record.identifier.split()[0][1:] in ids:
                    positive_writer.write(record)
                else:
                    negative_writer.write(record)
            positive_writer.close()
            negative_writer.close()
        elif out_positive_file != "-":
            print "Generating matching FASTA file"
            positive_writer = fastaWriter(open(out_positive_file, "w"))
            for record in reader:
                # The [1:] is because the fastaReader leaves the > on the identifier.
                if record.identifier and record.identifier.split()[0][1:] in ids:
                    positive_writer.write(record)
            positive_writer.close()
        elif out_negative_file != "-":
            print "Generating non-matching FASTA file"
            negative_writer = fastaWriter(open(out_negative_file, "w"))
            for record in reader:
                # The [1:] is because the fastaReader leaves the > on the identifier.
                if not record.identifier or record.identifier.split()[0][1:] not in ids:
                    negative_writer.write(record)
            negative_writer.close()
    elif seq_format.lower().startswith("fastq"):
        # Write filtered FASTQ file based on IDs from tabular file
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter

        reader = fastqReader(open(in_file, "rU"))
        if out_positive_file != "-" and out_negative_file != "-":
            print "Generating two FASTQ files"
            positive_writer = fastqWriter(open(out_positive_file, "w"))
            negative_writer = fastqWriter(open(out_negative_file, "w"))
            for record in reader:
                # The [1:] is because the fastqReader leaves the @ on the identifier.
                if record.identifier and record.identifier.split()[0][1:] in ids:
                    positive_writer.write(record)
                else:
                    negative_writer.write(record)
            positive_writer.close()
            negative_writer.close()
        elif out_positive_file != "-":
            print "Generating matching FASTQ file"
            positive_writer = fastqWriter(open(out_positive_file, "w"))
            for record in reader:
                # The [1:] is because the fastqReader leaves the @ on the identifier.
                if record.identifier and record.identifier.split()[0][1:] in ids:
                    positive_writer.write(record)
            positive_writer.close()
        elif out_negative_file != "-":
            print "Generating non-matching FASTQ file"
            negative_writer = fastqWriter(open(out_negative_file, "w"))
            for record in reader:
                # The [1:] is because the fastqReader leaves the @ on the identifier.
                if not record.identifier or record.identifier.split()[0][1:] not in ids:
                    negative_writer.write(record)
            negative_writer.close()
    else:
        stop_err("Unsupported file type %r" % seq_format)
    "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
assert not re_illumina_r.match(
    "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")

count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0
in_handle = open(input_fastq)
if pairs_fastq:
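    # A single pairs file was given: both mates go through the same writer (interleaved)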
    pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format)
    pairs_r_writer = pairs_f_writer
else:
    pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format)
    pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format)
singles_writer = fastqWriter(open(singles_fastq, "w"), format)
last_template, buffered_reads = None, []

for record in fastqReader(in_handle, format):
    count += 1
    name = record.identifier.split(None, 1)[0]
    assert name[0] == "@", record.identifier  #Quirk of the Galaxy parser
    is_forward = False
    suffix = re_f.search(name)
    if suffix:
        #============
        #Forward read
        #============
        template = name[:suffix.start()]
        is_forward = True
    elif re_illumina_f.match(record.identifier):
        template = name  #No suffix
        is_forward = True
    if is_forward:
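The regular expressions referenced above are defined earlier in the original script; the definitions below are typical stand-ins (not the originals) just to make the forward-read checks concrete:

# Illustrative stand-in regexes for the suffix and Illumina 1.8+ naming checks
# used above (patterns assumed, not copied from the original tool):
import re

re_f = re.compile(r"/1$")                      # old-style ".../1" forward suffix
re_illumina_f = re.compile(r"^@\S+ 1:\S+$")    # CASAVA 1.8+ forward description

name = "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326/1"
assert re_f.search(name)
assert re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")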
Example #48
0
                        short_neg += 1
    
    in_handle = open(in_file, "rb")
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    in_handle.seek(0)
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    writer.write_file(process(SffIterator(in_handle)))
    #End of SFF code
elif seq_format.lower().startswith("fastq"):
    in_handle = open(in_file, "rU")
    out_handle = open(out_file, "w")
    reader = fastqReader(in_handle)
    writer = fastqWriter(out_handle)
    if forward:
        for record in reader:
            seq = record.sequence.upper()
            result = primer.search(seq)
            if result:
                #Forward primer, take everything after it
                cut = result.end()
                record.sequence = seq[cut:]
                if len(record.sequence) >= min_len:
                    record.quality = record.quality[cut:]
                    clipped += 1
                    writer.write(record)
                else:
                    short_clipped += 1
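A minimal stand-alone sketch of the forward-primer clipping step above (the primer pattern, read, and length threshold are invented for illustration):

# Hedged sketch: search for a forward primer and keep only the bases (and the
# matching quality values) after it, discarding reads that end up too short.
import re

primer = re.compile("ACGTACGT")   # invented primer pattern
seq = "NNACGTACGTTTTTGGGGCCCC"    # invented read
qual = "I" * len(seq)
min_len = 5

result = primer.search(seq)
if result:
    cut = result.end()
    clipped_seq = seq[cut:]
    clipped_qual = qual[cut:]
    if len(clipped_seq) >= min_len:
        print(clipped_seq)        # "TTTTGGGGCCCC"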
Example #49
0
count = 0
pairs = set()  # Will this scale OK?
forward = 0
reverse = 0
neither = 0

out_pairs = open(output_pairs, "w")
out_nonpairs = open(output_nonpairs, "w")

for input_fastq in input_fastq_filenames:
    if not os.path.isfile(input_fastq):
        sys.exit("Missing input FASTQ file %r" % input_fastq)
    in_handle = open(input_fastq)

    # Don't care about the FASTQ type really...
    for record in fastqReader(in_handle, "sanger"):
        count += 1
        name = record.identifier.split(None, 1)[0]
        assert name[0] == "@", record.identifier  # Quirk of the Galaxy parser
        name = name[1:]
        is_forward = False
        suffix = re_f.search(name)
        if suffix:
            # ============
            # Forward read
            # ============
            template = name[: suffix.start()]
            is_forward = True
        elif re_illumina_f.match(record.identifier):
            template = name  # No suffix
            is_forward = True