def main():
    """De-interlace one FASTQ file into mate-pair and singleton outputs.

    Positional command-line arguments (read from ``sys.argv``):
        1. interlaced input FASTQ file
        2. FASTQ format of the input; an empty string falls back to 'sanger'
        3. output file for the first mate of each pair
        4. output file for the second mate of each pair
        5. output file for unpaired reads that are first mates
        6. output file for unpaired reads that are second mates

    A summary of the work done is printed to standard output.
    """
    input_filename = sys.argv[1]
    fastq_format = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    # The input is opened twice: once for lookups by identifier and once for
    # sequential iteration.
    named_reader = fastqNamedReader(open(input_filename, 'rb'), format=fastq_format)
    mate1_out = fastqWriter(open(mate1_filename, 'wb'), format=fastq_format)
    mate2_out = fastqWriter(open(mate2_filename, 'wb'), format=fastq_format)
    single1_out = fastqWriter(open(single1_filename, 'wb'), format=fastq_format)
    single2_out = fastqWriter(open(single2_filename, 'wb'), format=fastq_format)
    reader = fastqReader(open(input_filename, 'rb'), format=fastq_format)
    joiner = fastqJoiner(fastq_format)

    i = None
    skip_count = 0
    # Identifiers of reads already written as the partner of an earlier read.
    found = set()
    try:
        for i, read in enumerate(reader):
            if read.identifier in found:
                # Already emitted together with its mate; nothing left to do.
                found.remove(read.identifier)
                continue

            mate1 = named_reader.get(read.identifier)
            mate2 = named_reader.get(joiner.get_paired_identifier(mate1))

            if mate2:
                # This is a mate pair
                found.add(mate2.identifier)
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                skip_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)
    finally:
        # Release every handle, including the sequential reader, even when
        # processing fails part-way through.
        reader.close()
        named_reader.close()
        mate1_out.close()
        mate2_out.close()
        single1_out.close()
        single2_out.close()

    if i is None:
        print("Your input file contained no valid FASTQ sequences.")
    else:
        if skip_count:
            print('There were %i reads with no mate.' % skip_count)
        # Floor division keeps the pair count an integer on every Python version.
        print('De-interlaced %s pairs of sequences.' % ((i - skip_count + 1) // 2))
Example #2
0
def main():
    """Split every read of a FASTQ file and write the halves to two files.

    Positional command-line arguments (read from ``sys.argv``):
        1. input FASTQ file
        2. FASTQ format; an empty string falls back to 'sanger'
        3. output file receiving the first read returned by the splitter
        4. output file receiving the second read returned by the splitter

    Reads for which the splitter does not return two reads are counted as
    skipped. A summary is printed to standard output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    output1_filename = sys.argv[3]
    output2_filename = sys.argv[4]

    splitter = fastqSplitter()
    out1 = fastqWriter(path=output1_filename, format=input_type)
    out2 = fastqWriter(path=output2_filename, format=input_type)
    reader = fastqReader(path=input_filename, format=input_type)

    i = None
    skip_count = 0
    # The context managers close the reader as well as both writers, also
    # when an exception interrupts the loop.
    with out1, out2, reader:
        for i, fastq_read in enumerate(reader):
            read1, read2 = splitter.split(fastq_read)
            if read1 and read2:
                out1.write(read1)
                out2.write(read2)
            else:
                skip_count += 1

    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        total = i + 1
        split_count = total - skip_count
        print('Split %s of %s reads (%.2f%%).' % (split_count, total, float(split_count) / float(total) * 100.0))
def main():
    """De-interlace one FASTQ file into mate-pair and singleton outputs.

    Positional command-line arguments (read from ``sys.argv``):
        1. interlaced input FASTQ file
        2. FASTQ format of the input; an empty string falls back to 'sanger'
        3. output file for the first mate of each pair
        4. output file for the second mate of each pair
        5. output file for unpaired reads that are first mates
        6. output file for unpaired reads that are second mates

    A summary of the work done is printed to standard output.
    """
    input_filename = sys.argv[1]
    fastq_format = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    # The input is opened twice: once for lookups by identifier and once for
    # sequential iteration.
    named_reader = fastqNamedReader(path=input_filename, format=fastq_format)
    mate1_out = fastqWriter(path=mate1_filename, format=fastq_format)
    mate2_out = fastqWriter(path=mate2_filename, format=fastq_format)
    single1_out = fastqWriter(path=single1_filename, format=fastq_format)
    single2_out = fastqWriter(path=single2_filename, format=fastq_format)
    reader = fastqReader(path=input_filename, format=fastq_format)
    joiner = fastqJoiner(fastq_format)

    i = None
    skip_count = 0
    # Identifiers of reads already written as the partner of an earlier read.
    found = set()

    # Context managers close all six handles, including the sequential
    # reader, even if processing fails part-way through.
    with named_reader, mate1_out, mate2_out, single1_out, single2_out, reader:
        for i, read in enumerate(reader):
            if read.identifier in found:
                found.remove(read.identifier)
                continue

            mate1 = named_reader.get(read.identifier)
            mate2 = named_reader.get(joiner.get_paired_identifier(mate1))

            if mate2:
                # This is a mate pair
                found.add(mate2.identifier)
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                skip_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)

    if i is None:
        print("Your input file contained no valid FASTQ sequences.")
    else:
        if skip_count:
            print('There were %i reads with no mate.' % skip_count)
        # Floor division: true division would report e.g. "5.0 pairs" on Python 3.
        print('De-interlaced %s pairs of sequences.' % ((i - skip_count + 1) // 2))
Example #4
0
def main():
    """De-interlace one FASTQ file into mate-pair and singleton outputs.

    Positional command-line arguments (read from ``sys.argv``):
        1. interlaced input FASTQ file
        2. FASTQ format of the input; an empty string falls back to 'sanger'
        3. output file for the first mate of each pair
        4. output file for the second mate of each pair
        5. output file for unpaired reads that are first mates
        6. output file for unpaired reads that are second mates

    A summary of the work done is printed to standard output.
    """
    input_filename = sys.argv[1]
    fastq_format = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    joiner = fastqJoiner(fastq_format)
    i = None
    skip_count = 0
    # Identifiers of reads already written as the partner of an earlier read.
    found = {}

    mate1_out = fastqWriter(path=mate1_filename, format=fastq_format)
    mate2_out = fastqWriter(path=mate2_filename, format=fastq_format)
    single1_out = fastqWriter(path=single1_filename, format=fastq_format)
    single2_out = fastqWriter(path=single2_filename, format=fastq_format)
    # reader1 looks reads up by identifier; reader2 walks the file in order.
    reader1 = fastqNamedReader(path=input_filename, format=fastq_format)
    reader2 = fastqReader(path=input_filename, format=fastq_format)

    with mate1_out, mate2_out, single1_out, single2_out, reader1, reader2:

        for i, read in enumerate(reader2):

            if read.identifier in found:
                del found[read.identifier]
                continue

            mate1 = reader1.get(read.identifier)

            mate2 = reader1.get(joiner.get_paired_identifier(mate1))

            if mate2:
                # This is a mate pair
                found[mate2.identifier] = None
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                skip_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)

    if i is None:
        print("Your input file contained no valid FASTQ sequences.")
    else:
        if skip_count:
            print('There were %i reads with no mate.' % skip_count)
        # Floor division: true division would report e.g. "5.0 pairs" on Python 3.
        print('De-interlaced %s pairs of sequences.' % ((i - skip_count + 1) // 2))
Example #5
0
def main():
    """Interlace two mate FASTQ files into a pairs file and a singles file.

    Positional command-line arguments (read from ``sys.argv``):
        1. first-mate FASTQ file
        2. format of the first file; an empty string falls back to 'sanger'
        3. second-mate FASTQ file
        4. format of the second file; an empty string falls back to 'sanger'
        5. output file for interlaced pairs
        6. output file for reads without a mate

    If the two formats differ a warning is printed and nothing is written.
    """
    argv = sys.argv
    mate1_filename = argv[1]
    mate1_type = argv[2] or 'sanger'
    mate2_filename = argv[3]
    mate2_type = argv[4] or 'sanger'
    outfile_pairs = argv[5]
    outfile_singles = argv[6]

    if mate1_type != mate2_type:
        print(
            "WARNING: You are trying to interlace files of two different types: %s and %s."
            % (mate1_type, mate2_type))
        return

    fastq_format = mate1_type
    joiner = fastqJoiner(fastq_format)

    pair_total = 0
    single_total = 0
    last_index1 = None
    last_index2 = None

    out_pairs = fastqWriter(path=outfile_pairs, format=fastq_format)
    out_singles = fastqWriter(path=outfile_singles, format=fastq_format)
    mate2_lookup = fastqNamedReader(path=mate2_filename, format=fastq_format)
    mate1_lookup = fastqNamedReader(path=mate1_filename, format=fastq_format)
    mate1_reads = fastqReader(path=mate1_filename, format=fastq_format)
    mate2_reads = fastqReader(path=mate2_filename, format=fastq_format)

    with out_pairs, out_singles, mate2_lookup, mate1_lookup, mate1_reads, mate2_reads:
        # First pass: every read of the first file is either paired with its
        # partner from the second file or written out as a single.
        for last_index1, first in enumerate(mate1_reads):
            partner = mate2_lookup.get(joiner.get_paired_identifier(first))
            if not partner:
                out_singles.write(first)
                single_total += 1
                continue
            out_pairs.write(first)
            out_pairs.write(partner)
            pair_total += 1

        # Second pass: reads of the second file whose partner is missing from
        # the first file are singles as well.
        for last_index2, second in enumerate(mate2_reads):
            if mate1_lookup.get(joiner.get_paired_identifier(second)):
                continue
            out_singles.write(second)
            single_total += 1

    if last_index1 is None and last_index2 is None:
        print("Your input files contained no valid FASTQ sequences.")
    else:
        print('There were %s single reads.' % (single_total))
        print('Interlaced %s pairs of sequences.' % (pair_total))
Example #6
0
def main():
    """Interlace two mate FASTQ files into a pairs file and a singles file.

    Positional command-line arguments (read from ``sys.argv``):
        1. first-mate FASTQ file
        2. format of the first file; an empty string falls back to 'sanger'
        3. second-mate FASTQ file
        4. format of the second file; an empty string falls back to 'sanger'
        5. output file for interlaced pairs
        6. output file for reads without a mate

    If the two formats differ a warning is printed and nothing is written.
    """
    mate1_filename = sys.argv[1]
    mate1_type = sys.argv[2] or 'sanger'
    mate2_filename = sys.argv[3]
    mate2_type = sys.argv[4] or 'sanger'
    outfile_pairs = sys.argv[5]
    outfile_singles = sys.argv[6]

    if mate1_type != mate2_type:
        print("WARNING: You are trying to interlace files of two different types: %s and %s." % (mate1_type, mate2_type))
        return

    fastq_format = mate1_type
    joiner = fastqJoiner(fastq_format)
    out_pairs = fastqWriter(open(outfile_pairs, 'wb'), format=fastq_format)
    out_singles = fastqWriter(open(outfile_singles, 'wb'), format=fastq_format)

    single_total = 0
    pair_total = 0

    # First pass: walk the first file and look each partner up in the second.
    mate2_input = fastqNamedReader(open(mate2_filename, 'rb'), format=fastq_format)
    last_index1 = None
    for last_index1, first in enumerate(fastqReader(open(mate1_filename, 'rb'), format=fastq_format)):
        partner = mate2_input.get(joiner.get_paired_identifier(first))
        if not partner:
            out_singles.write(first)
            single_total += 1
            continue
        out_pairs.write(first)
        out_pairs.write(partner)
        pair_total += 1

    # Second pass: reads of the second file with no partner in the first
    # file are singles as well.
    mate1_input = fastqNamedReader(open(mate1_filename, 'rb'), format=fastq_format)
    last_index2 = None
    for last_index2, second in enumerate(fastqReader(open(mate2_filename, 'rb'), format=fastq_format)):
        if mate1_input.get(joiner.get_paired_identifier(second)):
            continue
        out_singles.write(second)
        single_total += 1

    if last_index1 is None and last_index2 is None:
        print("Your input files contained no valid FASTQ sequences.")
    else:
        print('There were %s single reads.' % (single_total))
        print('Interlaced %s pairs of sequences.' % (pair_total))

    mate1_input.close()
    mate2_input.close()
    out_pairs.close()
    out_singles.close()
Example #7
0
def main():
    """Filter a FASTQ file with a user-supplied Python script.

    Positional command-line arguments (read from ``sys.argv``):
        1. input FASTQ file
        2. script that decides whether a read is kept
        3. output FASTQ file
        4. directory to create for the saved copy of the script
        5. FASTQ format; an empty string falls back to 'sanger'

    The script is run once per read with ``fastq_read`` and ``ret_val``
    (initially False) as its local names; the read is written to the output
    when the script leaves a true value in ``ret_val``. A summary is printed
    to standard output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename,
                os.path.join(additional_files_path, 'debug.txt'))

    # Compile the script once up front instead of re-reading and re-parsing
    # the file for every read. A syntax error in the script therefore
    # surfaces before any read is processed.
    # NOTE(review): the script is arbitrary code and runs with the
    # privileges of this process; only trusted scripts should reach here.
    with open(script_filename) as script_handle:
        filter_code = compile(script_handle.read(), script_filename, 'exec')

    out = fastqWriter(open(output_filename, 'wb'), format=input_type)

    i = None
    reads_kept = 0
    for i, fastq_read in enumerate(
            fastqReader(open(input_filename), format=input_type)):
        local = {'fastq_read': fastq_read, 'ret_val': False}
        # Fresh globals per read, exactly as the per-read execfile call had.
        exec(filter_code, {}, local)
        if local['ret_val']:
            out.write(fastq_read)
            reads_kept += 1
    out.close()
    if i is None:
        print("Your file contains no valid fastq reads.")
    else:
        print('Kept %s of %s reads (%.2f%%).' % (
            reads_kept, i + 1, float(reads_kept) / float(i + 1) * 100.0))
def main():
    """Rewrite a FASTQ file in another FASTQ format and report on the input.

    Positional command-line arguments (read from ``sys.argv``):
        1. input FASTQ file
        2. input FASTQ format
        3. output FASTQ file
        4. output FASTQ format
        5. quality encoding to force on output; the string 'None' disables it
        6. the literal 'summarize_input' to collect and print input statistics
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    aggregator = fastqAggregator()
    out = fastqWriter(open(output_filename, 'wb'), format=output_type, force_quality_encoding=force_quality_encoding)
    read_count = None
    # The verbose reader is only used when a summary was requested.
    reader_class = fastqVerboseErrorReader if summarize_input else fastqReader
    reads = reader_class(open(input_filename), format=input_type, apply_galaxy_conventions=True)
    for read_count, fastq_read in enumerate(reads):
        if summarize_input:
            aggregator.consume_read(fastq_read)
        out.write(fastq_read)
    out.close()

    if read_count is None:
        print("No valid FASTQ reads were provided.")
        return

    print("Groomed %i %s reads into %s reads." % (read_count + 1, input_type, output_type))
    if input_type != output_type and 'solexa' in [input_type, output_type]:
        print("Converted between Solexa and PHRED scores.")
    if summarize_input:
        print("Based upon quality and sequence, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # repr() is used because a raw \x00 (null) truncates the info shown in galaxy.
        lowest, highest = ascii_range[0], ascii_range[1]
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(lowest), ord(lowest), repr(highest), ord(highest)))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))
def main():
    """Join each read of one FASTQ file with its partner from a second file.

    Positional command-line arguments (read from ``sys.argv``):
        1. first FASTQ file
        2. format of the first file; an empty string falls back to 'sanger'
        3. second FASTQ file
        4. format of the second file; an empty string falls back to 'sanger'
        5. output FASTQ file

    Reads whose partner is not found in the second file are skipped. A
    warning is printed when the formats differ, but processing continues.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    input2 = fastqNamedReader(open(input2_filename, 'rb'), input2_type)
    joiner = fastqJoiner(input1_type)
    out = fastqWriter(open(output_filename, 'wb'), format=input1_type)

    last_index = None
    skip_count = 0
    for last_index, fastq_read in enumerate(fastqReader(open(input1_filename, 'rb'), format=input1_type)):
        fastq_paired = input2.get(joiner.get_paired_identifier(fastq_read))
        if fastq_paired is not None:
            out.write(joiner.join(fastq_read, fastq_paired))
        else:
            skip_count += 1
    out.close()

    if last_index is None:
        print("Your file contains no valid FASTQ reads.")
        return

    print(input2.has_data())
    total = last_index + 1
    joined = last_index - skip_count + 1
    print('Joined %s of %s read pairs (%.2f%%).' % (joined, total, float(joined) / float(total) * 100.0))
Example #10
0
def main():
    """Filter a FASTQ file with a user-supplied Python script.

    Positional command-line arguments (read from ``sys.argv``):
        1. input FASTQ file
        2. script that decides whether a read is kept
        3. output FASTQ file
        4. directory to create for the saved copy of the script
        5. FASTQ format; an empty string falls back to 'sanger'

    The script is run once per read with ``fastq_read`` and ``ret_val``
    (initially False) as its local names; the read is written to the output
    when the script leaves a true value in ``ret_val``. A summary is printed
    to standard output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # Compile the script once up front instead of re-reading and re-parsing
    # the file for every read. A syntax error in the script therefore
    # surfaces before any read is processed.
    # NOTE(review): the script is arbitrary code and runs with the
    # privileges of this process; only trusted scripts should reach here.
    with open(script_filename) as script_handle:
        filter_code = compile(script_handle.read(), script_filename, 'exec')

    out = fastqWriter(open(output_filename, 'wb'), format=input_type)

    i = None
    reads_kept = 0
    for i, fastq_read in enumerate(fastqReader(open(input_filename), format=input_type)):
        local = {'fastq_read': fastq_read, 'ret_val': False}
        # Fresh globals per read, exactly as the per-read execfile call had.
        exec(filter_code, {}, local)
        if local['ret_val']:
            out.write(fastq_read)
            reads_kept += 1
    out.close()
    if i is None:
        print("Your file contains no valid fastq reads.")
    else:
        print('Kept %s of %s reads (%.2f%%).' % (reads_kept, i + 1, float(reads_kept) / float(i + 1) * 100.0))
Example #11
0
def main():
    """Rewrite the reads of a FASTQ file through a user-supplied module.

    Positional command-line arguments (read from ``sys.argv``):
        1. input FASTQ file
        2. Python script providing ``match_and_manipulate_read(fastq_read)``
        3. output FASTQ file
        4. directory to create for the saved copy of the script
        5. FASTQ format; an empty string falls back to 'sanger'

    Each read is passed to ``match_and_manipulate_read``; a true result is
    written to the output, and a result that differs from the input read is
    counted as manipulated. A summary is printed to standard output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # imp.load_module does not close the file it is given, so the handle is
    # managed here.
    # NOTE(review): the script is arbitrary code executed in this process.
    with open(script_filename) as script_handle:
        fastq_manipulator = imp.load_module('fastq_manipulator', script_handle, script_filename, ('', 'r', imp.PY_SOURCE))
    i = None
    reads_manipulated = 0

    writer = fastqWriter(path=output_filename, format=input_type)
    reader = fastqReader(path=input_filename, format=input_type)
    with writer, reader:
        for i, fastq_read in enumerate(reader):
            new_read = fastq_manipulator.match_and_manipulate_read(fastq_read)
            if new_read:
                writer.write(new_read)
            if new_read != fastq_read:
                reads_manipulated += 1

    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        print('Manipulated %s of %s reads (%.2f%%).' % (reads_manipulated, i + 1, float(reads_manipulated) / float(i + 1) * 100.0))
Example #12
0
def main():
    """Join each read of one FASTQ file with its partner from a second file.

    Positional command-line arguments (read from ``sys.argv``):
        1. first FASTQ file
        2. format of the first file; an empty string falls back to 'sanger'
        3. second FASTQ file
        4. format of the second file; an empty string falls back to 'sanger'
        5. output FASTQ file

    Reads whose partner is not found in the second file are skipped. A
    warning is printed when the formats differ, but processing continues.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (
            input1_type, input2_type))

    input2 = fastqNamedReader(open(input2_filename, 'rb'), input2_type)
    joiner = fastqJoiner(input1_type)
    out = fastqWriter(open(output_filename, 'wb'), format=input1_type)

    last_index = None
    skip_count = 0
    reads = fastqReader(open(input1_filename, 'rb'), format=input1_type)
    for last_index, fastq_read in enumerate(reads):
        fastq_paired = input2.get(joiner.get_paired_identifier(fastq_read))
        if fastq_paired is not None:
            out.write(joiner.join(fastq_read, fastq_paired))
        else:
            skip_count += 1
    out.close()

    if last_index is None:
        print("Your file contains no valid FASTQ reads.")
        return

    print(input2.has_data())
    total = last_index + 1
    joined = last_index - skip_count + 1
    print('Joined %s of %s read pairs (%.2f%%).' % (
        joined, total, float(joined) / float(total) * 100.0))
Example #13
0
def main():
    """Filter a FASTQ file with a function defined by a user-supplied script.

    Positional command-line arguments (read from ``sys.argv``):
        1. input FASTQ file
        2. script that defines ``fastq_read_pass_filter(fastq_read)``
        3. output FASTQ file
        4. directory to create for the saved copy of the script
        5. FASTQ format; an empty string falls back to 'sanger'

    The script is executed once into this module's globals; every read for
    which ``fastq_read_pass_filter`` returns a true value is written out.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Keep a copy of the script for later debugging/verification
    os.mkdir(additional_files_path)
    debug_copy = os.path.join(additional_files_path, 'debug.txt')
    shutil.copy(script_filename, debug_copy)

    # Dan, Others: Can we simply drop the "format=input_type" here since it is specified in reader.
    # This optimization would cut runtime roughly in half (for my test case anyway). -John
    out = fastqWriter(path=output_filename, format=input_type)

    last_index = None
    kept = 0
    execfile(script_filename, globals())
    reads = fastqReader(path=input_filename, format=input_type)
    for last_index, fastq_read in enumerate(reads):
        # fastq_read_pass_filter is defined by script_filename
        if fastq_read_pass_filter(fastq_read):  # NOQA
            out.write(fastq_read)
            kept += 1
    out.close()

    if last_index is None:
        print("Your file contains no valid fastq reads.")
        return

    total = last_index + 1
    print('Kept %s of %s reads (%.2f%%).' %
          (kept, total, float(kept) / float(total) * 100.0))
Example #14
0
def main():
    """Trim a fixed or percentage offset from both ends of every FASTQ read.

    Positional command-line arguments (read from ``sys.argv``):
        1. input FASTQ file
        2. output FASTQ file
        3. offset to remove from the left end
        4. offset to remove from the right end
        5. the literal 'offsets_percent' when offsets are percentages of the
           read length rather than absolute counts
        6. FASTQ format; an empty string falls back to 'sanger'
        7. the literal 'keep_zero_length' to also write reads that become empty
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    left_offset = sys.argv[3]
    right_offset = sys.argv[4]
    percent_offsets = sys.argv[5] == 'offsets_percent'
    input_type = sys.argv[6] or 'sanger'
    keep_zero_length = sys.argv[7] == 'keep_zero_length'

    out = fastqWriter(open(output_filename, 'wb'), format=input_type)
    excluded = 0
    num_reads = None
    for num_reads, fastq_read in enumerate(fastqReader(open(input_filename), format=input_type)):
        if not percent_offsets:
            left = int(left_offset)
            right = int(right_offset)
        else:
            left = int(round(float(left_offset) / 100.0 * float(len(fastq_read))))
            right = int(round(float(right_offset) / 100.0 * float(len(fastq_read))))
        # The right offset becomes a negative slice end; a non-positive value
        # means "keep everything up to the end of the read".
        right = -right if right > 0 else None
        trimmed = fastq_read.slice(left, right)
        if keep_zero_length or len(trimmed):
            out.write(trimmed)
        else:
            excluded += 1
    out.close()

    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
    if excluded:
        print("%i reads of zero length were excluded from the output." % excluded)
def main():
    """Rewrite the reads of a FASTQ file through a user-supplied module.

    Positional command-line arguments (read from ``sys.argv``):
        1. input FASTQ file
        2. Python script providing ``match_and_manipulate_read(fastq_read)``
        3. output FASTQ file
        4. directory to create for the saved copy of the script
        5. FASTQ format; an empty string falls back to 'sanger'

    Each read is passed to ``match_and_manipulate_read``; a true result is
    written to the output, and a result that differs from the input read is
    counted as manipulated. A summary is printed to standard output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # imp.load_module does not close the file it is given, so the handle is
    # managed here.
    # NOTE(review): the script is arbitrary code executed in this process.
    with open(script_filename) as script_handle:
        fastq_manipulator = imp.load_module('fastq_manipulator', script_handle, script_filename, ('', 'r', imp.PY_SOURCE))

    out = fastqWriter(path=output_filename, format=input_type)
    reader = fastqReader(path=input_filename, format=input_type)

    i = None
    reads_manipulated = 0
    # Context managers close the reader too, and the writer even on errors.
    with out, reader:
        for i, fastq_read in enumerate(reader):
            new_read = fastq_manipulator.match_and_manipulate_read(fastq_read)
            if new_read:
                out.write(new_read)
            if new_read != fastq_read:
                reads_manipulated += 1

    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        print('Manipulated %s of %s reads (%.2f%%).' % (reads_manipulated, i + 1, float(reads_manipulated) / float(i + 1) * 100.0))
def main():
    """Mask bases of a FASTQ file according to their quality score.

    Usage: ``prog [options] input_file output_file``. For every base whose
    decimal quality score satisfies the selected comparison against the
    threshold, the base is either lowercased (``--lowercase``) or passed to a
    ``BaseReplacer`` built from the mask character. The number of processed
    reads is printed to standard output.
    """
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice', default='sanger', choices=('sanger', 'solexa', 'illumina', 'sanger.gz', 'solexa.gz', 'illumina.gz', 'sanger.bz2', 'solexa.bz2', 'illumina.bz2'), help='FASTQ variant type')
    parser.add_option('-m', '--mask_character', dest='mask_character', default='N', help='Mask Character to use')
    parser.add_option('-c', '--score_comparison', type="choice", dest='score_comparison', default='le', choices=('gt', 'ge', 'eq', 'lt', 'le', 'ne'), help='Mask base when score is')
    parser.add_option('-s', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score')
    parser.add_option("-l", "--lowercase", action="store_true", dest="lowercase", default=False, help="Use lowercase masking")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    score_comparer = get_score_comparer(options.score_comparison)

    if options.lowercase:
        base_masker = str.lower
    else:
        base_masker = BaseReplacer(options.mask_character)

    out = fastqWriter(path=args[1], format=options.format)
    reader = fastqReader(path=args[0], format=options.format)

    num_reads = None
    # The writer was previously never closed; the context managers close the
    # output and the input on every exit path.
    with out, reader:
        for num_reads, fastq_read in enumerate(reader):
            sequence_list = list(fastq_read.sequence)
            for i, quality_score in enumerate(fastq_read.get_decimal_quality_scores()):
                if score_comparer(quality_score, options.quality_score):
                    sequence_list[i] = base_masker(sequence_list[i])
            fastq_read.sequence = "".join(sequence_list)
            out.write(fastq_read)

    if num_reads is not None:
        print("Processed %i %s reads." % (num_reads + 1, options.format))
    else:
        print("No valid FASTQ reads were provided.")
Example #17
0
def main():
    """Filter a FASTQ file with a function defined by a user-supplied script.

    Positional command-line arguments (read from ``sys.argv``):
        1. input FASTQ file
        2. script that defines ``fastq_read_pass_filter(fastq_read)``
        3. output FASTQ file
        4. directory to create for the saved copy of the script
        5. FASTQ format; an empty string falls back to 'sanger'

    The script is executed once into this module's globals; every read for
    which ``fastq_read_pass_filter`` returns a true value is written out.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Keep a copy of the script for later debugging/verification
    os.mkdir(additional_files_path)
    debug_copy = os.path.join(additional_files_path, 'debug.txt')
    shutil.copy(script_filename, debug_copy)

    # Dan, Others: Can we simply drop the "format=input_type" here since it is specified in reader.
    # This optimization would cut runtime roughly in half (for my test case anyway). -John
    out = fastqWriter(open(output_filename, 'wb'), format=input_type)

    last_index = None
    kept = 0
    execfile(script_filename, globals())
    reads = fastqReader(open(input_filename), format=input_type)
    for last_index, fastq_read in enumerate(reads):
        # fastq_read_pass_filter is defined by script_filename
        if fastq_read_pass_filter(fastq_read):
            out.write(fastq_read)
            kept += 1
    out.close()

    if last_index is None:
        print("Your file contains no valid fastq reads.")
        return

    total = last_index + 1
    print('Kept %s of %s reads (%.2f%%).' % (kept, total, float(kept) / float(total) * 100.0))
Example #18
0
 def fastq_filter(in_file, out_file, iterator_filter):
     """Write the records selected by ``iterator_filter`` to ``out_file``.

     ``iterator_filter`` is called with a ``fastqReader`` over ``in_file``
     and every record it yields is written to ``out_file``.

     Returns the number of records written.
     """
     count = 0
     #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
     # NOTE(review): the "U" mode flag is rejected by recent Python 3
     # releases - confirm which interpreter this targets before changing it.
     reader = fastqReader(open(in_file, "rU"))
     try:
         writer = fastqWriter(open(out_file, "w"))
         try:
             for record in iterator_filter(reader):
                 count += 1
                 writer.write(record)
         finally:
             # Close the output even when the filter raises part-way through.
             writer.close()
     finally:
         reader.close()
     return count
Example #19
0
 def fastq_filter(in_file, out_file, iterator_filter):
     """Write the records selected by ``iterator_filter`` to ``out_file``.

     ``iterator_filter`` is called with a ``fastqReader`` over ``in_file``
     and every record it yields is written to ``out_file``.

     Returns the number of records written.
     """
     count = 0
     #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
     # NOTE(review): the "U" mode flag is rejected by recent Python 3
     # releases - confirm which interpreter this targets before changing it.
     reader = fastqReader(open(in_file, "rU"))
     try:
         writer = fastqWriter(open(out_file, "w"))
         try:
             for record in iterator_filter(reader):
                 count += 1
                 writer.write(record)
         finally:
             # Close the output even when the filter raises part-way through.
             writer.close()
     finally:
         reader.close()
     return count
Example #20
0
def main():
    """Rewrite a FASTQ file in another FASTQ format and report on the input.

    Positional command-line arguments (read from ``sys.argv``):
        1. input FASTQ file
        2. input FASTQ format
        3. output FASTQ file
        4. output FASTQ format
        5. quality encoding to force on output; the string 'None' disables it
        6. the literal 'summarize_input' to collect and print input statistics
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    aggregator = fastqAggregator()
    out = fastqWriter(path=output_filename,
                      format=output_type,
                      force_quality_encoding=force_quality_encoding)
    read_count = None
    # The verbose reader is only used when a summary was requested.
    reader_class = fastqVerboseErrorReader if summarize_input else fastqReader
    reads = reader_class(path=input_filename,
                         format=input_type,
                         apply_galaxy_conventions=True)
    for read_count, fastq_read in enumerate(reads):
        if summarize_input:
            aggregator.consume_read(fastq_read)
        out.write(fastq_read)
    out.close()

    if read_count is None:
        print("No valid FASTQ reads were provided.")
        return

    print("Groomed %i %s reads into %s reads." %
          (read_count + 1, input_type, output_type))
    if input_type != output_type and 'solexa' in [input_type, output_type]:
        print("Converted between Solexa and PHRED scores.")
    if summarize_input:
        valid_formats = ", ".join(aggregator.get_valid_formats()) or "None"
        print(
            "Based upon quality and sequence, the input data is valid for: %s"
            % (valid_formats))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # repr() is used because a raw \x00 (null) truncates the info shown in galaxy.
        lowest, highest = ascii_range[0], ascii_range[1]
        print(
            "Input ASCII range: %s(%i) - %s(%i)" %
            (repr(lowest), ord(lowest), repr(highest), ord(highest)))
        print("Input decimal range: %i - %i" %
              (decimal_range[0], decimal_range[1]))
Example #21
0
def main():
    """Join each read of one FASTQ file with its partner from a second file.

    Positional command-line arguments (read from ``sys.argv``):
        1. first FASTQ file
        2. format of the first file; an empty string falls back to 'sanger'
        3. second FASTQ file
        4. format of the second file; an empty string falls back to 'sanger'
        5. output FASTQ file
        6. identifier style; 'new' selects ``FastqJoiner`` with a sniffed
           separator, anything else (default 'old') uses ``fq.fastqJoiner``
        7. ``paste`` value handed to the joiner (default empty string)

    Reads whose partner is not found in the second file are skipped. A
    warning is printed when the formats differ, but processing continues.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    fastq_style = sys.argv[6] or 'old'

    paste = sys.argv[7] or ''
    # --
    if input1_type != input2_type:
        print(
            "WARNING: You are trying to join files of two different types: %s and %s."
            % (input1_type, input2_type))

    if fastq_style == 'new':
        sep = sniff_sep(input1_filename)
        joiner = FastqJoiner(input1_type, sep=sep, paste=paste)
    else:
        joiner = fq.fastqJoiner(input1_type, paste=paste)
    # --
    i = None
    skip_count = 0

    writer = fq.fastqWriter(path=output_filename, format=input1_type)
    reader1 = fq.fastqReader(path=input1_filename, format=input1_type)
    reader2 = fq.fastqNamedReader(path=input2_filename, format=input2_type)

    with writer, reader1, reader2:
        for i, fastq_read in enumerate(reader1):
            identifier = joiner.get_paired_identifier(fastq_read)
            fastq_paired = reader2.get(identifier)
            if fastq_paired is None:
                skip_count += 1
            else:
                writer.write(joiner.join(fastq_read, fastq_paired))

        # this indent is correct: we still need access to reader2
        if i is None:
            print("Your file contains no valid FASTQ reads.")
        else:
            print(reader2.has_data())
            total = i + 1
            joined = total - skip_count
            # float() keeps the percentage from being truncated by integer
            # division when this runs under Python 2.
            print('Joined %s of %s read pairs (%.2f%%).' %
                  (joined, total, float(joined) / float(total) * 100.0))
Example #22
0
def main():
    """Combine a FASTA file and a quality-score file into one FASTQ file.

    Positional command-line arguments (read from ``sys.argv``):
        1. FASTA file with the sequences
        2. FASTA type; an empty string falls back to 'fasta'
        3. quality file, or the string 'None' to use generated scores
        4. quality type; an empty string falls back to 'qualsanger'
        5. output FASTQ file
        6. quality encoding to force on output; the string 'None' disables it

    Sequences without a matching quality entry are skipped and counted.
    """
    # Read command line arguments
    fasta_filename = sys.argv[1]
    # should always be fasta or csfasta? what if txt?
    fasta_type = sys.argv[2] or 'fasta'
    qual_filename = sys.argv[3]
    qual_type = sys.argv[4] or 'qualsanger'  # qual454 qualsolid
    output_filename = sys.argv[5]
    force_quality_encoding = sys.argv[6]
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    # Colour-space input always wins; otherwise the quality type picks the
    # FASTQ variant, with 'sanger' as the fallback.
    if fasta_type == 'csfasta' or qual_type == 'qualsolid':
        fastq_format = 'cssanger'
    else:
        fastq_format = {
            'qualsolexa': 'solexa',
            'qualillumina': 'illumina',
        }.get(qual_type, 'sanger')

    out = fastqWriter(path=output_filename,
                      format=fastq_format,
                      force_quality_encoding=force_quality_encoding)
    if qual_filename != 'None':
        qual_input = fastaNamedReader(open(qual_filename, 'rt'))
    else:
        qual_input = fastqFakeFastaScoreReader(
            fastq_format, quality_encoding=force_quality_encoding)

    fastq_combiner = fastqCombiner(fastq_format)
    last_index = None
    skip_count = 0
    for last_index, sequence in enumerate(fastaReader(open(fasta_filename, 'rt'))):
        quality = qual_input.get(sequence)
        if not quality:
            skip_count += 1
            continue
        out.write(fastq_combiner.combine(sequence, quality))
    out.close()

    if last_index is None:
        print("Your file contains no valid FASTA sequences.")
        return

    print(qual_input.has_data())
    total = last_index + 1
    combined = last_index - skip_count + 1
    print('Combined %s of %s sequences with quality scores (%.2f%%).' %
          (combined, total, float(combined) / float(total) * 100.0))
def main():
    """Combine a FASTA file and its quality scores into a FASTQ file.

    Positional command line arguments (``sys.argv[1:]``):

    1. FASTA input filename.
    2. FASTA type; an empty string means ``fasta``.
    3. Quality filename, or the literal ``None`` to use
       ``fastqFakeFastaScoreReader`` instead of a quality file.
    4. Quality type; an empty string means ``qualsanger``.
    5. FASTQ output filename.
    6. Forced quality encoding, or the literal ``None`` to force nothing.

    Sequences for which the quality source returns nothing are skipped and
    counted; a summary line is printed to stdout at the end.
    """
    # Read command line arguments
    fasta_filename = sys.argv[1]
    # should always be fasta or csfasta? what if txt?
    fasta_type = sys.argv[2] or 'fasta'
    qual_filename = sys.argv[3]
    qual_type = sys.argv[4] or 'qualsanger'  # qual454 qualsolid
    output_filename = sys.argv[5]
    force_quality_encoding = sys.argv[6]
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    # Pick the FASTQ variant to write from the input types.
    if fasta_type == 'csfasta' or qual_type == 'qualsolid':
        fastq_format = 'cssanger'
    elif qual_type == 'qualsolexa':
        fastq_format = 'solexa'
    elif qual_type == 'qualillumina':
        fastq_format = 'illumina'
    else:
        fastq_format = 'sanger'

    out = fastqWriter(open(output_filename, 'wb'),
                      format=fastq_format,
                      force_quality_encoding=force_quality_encoding)
    qual_handle = None
    try:
        try:
            if qual_filename == 'None':
                qual_input = fastqFakeFastaScoreReader(
                    fastq_format, quality_encoding=force_quality_encoding)
            else:
                qual_handle = open(qual_filename, 'rb')
                qual_input = fastaNamedReader(qual_handle)

            fastq_combiner = fastqCombiner(fastq_format)
            i = None
            skip_count = 0
            fasta_handle = open(fasta_filename, 'rb')
            try:
                for i, sequence in enumerate(fastaReader(fasta_handle)):
                    quality = qual_input.get(sequence)
                    if quality:
                        fastq_read = fastq_combiner.combine(sequence, quality)
                        out.write(fastq_read)
                    else:
                        skip_count += 1
            finally:
                fasta_handle.close()
        finally:
            # Close the output even if reading or combining failed.
            out.close()

        # Single-argument print(...) calls behave the same as the print
        # statement on Python 2 and also parse on Python 3.
        if i is None:
            print("Your file contains no valid FASTA sequences.")
        else:
            # has_data() is queried while the quality file is still open.
            print(qual_input.has_data())
            print('Combined %s of %s sequences with quality scores (%.2f%%).' %
                  (i - skip_count + 1, i + 1,
                   float(i - skip_count + 1) / float(i + 1) * 100.0))
    finally:
        if qual_handle is not None:
            qual_handle.close()
Example #24
0
def partition(input_filename, temp_output_filename, fileCount,
              quality_encoding, verbose):
    """Convert one FASTQ partition and write it to a temporary file.

    The input format is taken from ``ARGV[1]`` and the output format from
    ``ARGV[3]``. Every read of ``input_filename`` is written to
    ``temp_output_filename``.

    Args:
        input_filename: Path of the FASTQ partition to read.
        temp_output_filename: Path of the converted output file. When
            ``verbose`` is true a pickled ``fastqAggregator`` is also written
            to ``temp_output_filename + "_summary"``.
        fileCount: Not used inside this function.
        quality_encoding: Forced quality encoding; the literal string
            ``'None'`` means no forced encoding.
        verbose: When true, use ``fastqVerboseErrorReader`` and aggregate
            every read for the summary file.
    """
    input_type = ARGV[1]
    output_type = ARGV[3]
    force_quality_encoding = quality_encoding
    summarize_input = verbose
    if force_quality_encoding == 'None':
        force_quality_encoding = None
    aggregator = fastqAggregator()
    temp_process_file = fastqWriter(
        open(temp_output_filename, 'wb'),
        format=output_type,
        force_quality_encoding=force_quality_encoding)
    read_count = None
    if summarize_input:
        reader = fastqVerboseErrorReader
    else:
        reader = fastqReader
    try:
        input_handle = open(input_filename, 'rb')
        try:
            for read_count, fastq_read in enumerate(
                    reader(input_handle,
                           format=input_type,
                           apply_galaxy_conventions=True)):
                if summarize_input:
                    aggregator.consume_read(fastq_read)
                temp_process_file.write(fastq_read)
        finally:
            input_handle.close()
    finally:
        # Close the output even when a read fails to parse or convert.
        temp_process_file.close()

    if read_count is not None:
        if input_type != output_type and 'solexa' in [input_type, output_type]:
            print("Converted between Solexa and PHRED scores.")
        if summarize_input:
            # Pickle data must go to a binary-mode file.
            with open(temp_output_filename + "_summary",
                      'wb') as summaryLogFile:
                pickle.dump(aggregator, summaryLogFile)
    else:
        print("No valid FASTQ reads were provided.")
Example #25
0
def main():
    """Trim a fixed or percentage offset from both ends of every FASTQ read.

    Positional command line arguments (``sys.argv[1:]``): input filename,
    output filename, left offset, right offset, offset mode
    (``offsets_percent`` selects percentages of the read length), FASTQ
    type (empty means ``sanger``) and the zero-length policy
    (``keep_zero_length`` keeps reads that were trimmed to nothing).
    """
    argv = sys.argv
    input_filename = argv[1]
    output_filename = argv[2]
    left_offset = argv[3]
    right_offset = argv[4]
    percent_offsets = argv[5] == 'offsets_percent'
    input_type = argv[6] or 'sanger'
    keep_zero_length = argv[7] == 'keep_zero_length'

    writer = fastqWriter(path=output_filename, format=input_type)
    excluded = 0
    last_index = None
    reads = fastqReader(path=input_filename, format=input_type)
    for last_index, read in enumerate(reads):
        if not percent_offsets:
            start = int(left_offset)
            trim_right = int(right_offset)
        else:
            # Offsets are percentages of this read's length.
            start = int(round(float(left_offset) / 100.0 * float(len(read))))
            trim_right = int(
                round(float(right_offset) / 100.0 * float(len(read))))
        # A right offset of zero means "keep everything up to the end".
        stop = None if trim_right == 0 else -trim_right
        trimmed = read.slice(start, stop)
        if not keep_zero_length and not len(trimmed):
            excluded += 1
            continue
        writer.write(trimmed)
    writer.close()

    if last_index is not None:
        print("%i fastq reads were processed." % (last_index + 1))
    else:
        print("No valid fastq reads could be processed.")
    if excluded:
        print("%i reads of zero length were excluded from the output." %
              excluded)
def main():
    """Join each read of one FASTQ file with its mate from a second file.

    Positional command line arguments (``sys.argv[1:]``):

    1. First FASTQ filename.
    2. First FASTQ type; an empty string means ``sanger``.
    3. Second FASTQ filename.
    4. Second FASTQ type; an empty string means ``sanger``.
    5. Output filename (written in the first file's type).
    6. Identifier style; ``new`` selects ``FastqJoiner`` with a sniffed
       separator, anything else (empty means ``old``) selects
       ``fq.fastqJoiner``.
    7. Paste string handed to the joiner; may be empty.

    Reads of the first file without a mate in the second file are skipped
    and counted; a summary line is printed to stdout at the end.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    fastq_style = sys.argv[6] or 'old'

    paste = sys.argv[7] or ''
    # --
    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    if fastq_style == 'new':
        sep = sniff_sep(input1_filename)
        joiner = FastqJoiner(input1_type, sep=sep, paste=paste)
    else:
        joiner = fq.fastqJoiner(input1_type, paste=paste)
    # --
    input2 = fq.fastqNamedReader(path=input2_filename, format=input2_type)
    out = fq.fastqWriter(path=output_filename, format=input1_type)
    i = None
    skip_count = 0
    try:
        for i, fastq_read in enumerate(
                fq.fastqReader(path=input1_filename, format=input1_type)):
            identifier = joiner.get_paired_identifier(fastq_read)
            fastq_paired = input2.get(identifier)
            if fastq_paired is None:
                skip_count += 1
            else:
                out.write(joiner.join(fastq_read, fastq_paired))
    finally:
        # Close the output even if reading or joining failed.
        out.close()

    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        print(input2.has_data())
        joined = i - skip_count + 1
        total = i + 1
        # float() keeps the percentage correct under Python 2 integer
        # division as well.
        print('Joined %s of %s read pairs (%.2f%%).' %
              (joined, total, float(joined) / float(total) * 100.0))
Example #27
0
    def run(self):
        """Copy every read from the input to the output and report the result.

        When ``self.summarize_input`` is set, the verbose-error reader is used
        and each read is also fed to a ``fastqAggregator``. The index of the
        last read (``None`` if there was none) and the aggregator are handed
        to ``self._print_output``.
        """
        summary = fastqAggregator()
        if self.summarize_input:
            make_reader = fastqVerboseErrorReader
        else:
            make_reader = fastqReader
        last_index = None

        sink = fastqWriter(
            path=self.output_filename,
            format=self.output_type,
            force_quality_encoding=self.force_quality_encoding)
        source = make_reader(
            fh=self.file_handle,
            path=self.input_filename,
            format=self.input_type,
            apply_galaxy_conventions=True,
            fix_id=self.fix_id)
        with sink, source:
            for last_index, record in enumerate(source):
                if self.summarize_input:
                    summary.consume_read(record)
                sink.write(record)

        self._print_output(last_index, summary)
Example #28
0
def main():
    """Convert a FASTQ file from one variant to another and report on it.

    Positional command line arguments (``sys.argv[1:]``): input filename,
    input type, output filename, output type, forced quality encoding (the
    literal ``None`` disables forcing), the summary switch
    (``summarize_input`` enables it) and an optional ``fix_id`` switch.
    """
    argv = sys.argv
    input_filename = argv[1]
    input_type = argv[2]
    output_filename = argv[3]
    output_type = argv[4]
    force_quality_encoding = argv[5]
    summarize_input = argv[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    # fix inconsistent identifiers (SRA data dumps); the switch is optional
    fix_id = len(argv) > 7 and argv[7] == 'fix_id'

    summary = fastqAggregator()
    writer = fastqWriter(path=output_filename,
                         format=output_type,
                         force_quality_encoding=force_quality_encoding)
    last_index = None
    make_reader = fastqVerboseErrorReader if summarize_input else fastqReader

    source = make_reader(path=input_filename,
                         format=input_type,
                         apply_galaxy_conventions=True,
                         fix_id=fix_id)
    for last_index, record in enumerate(source):
        if summarize_input:
            summary.consume_read(record)
        writer.write(record)
    writer.close()

    _print_output(last_index, input_type, output_type, summarize_input,
                  summary)
Example #29
0
    in_handle.close()
else:
    # Use Galaxy for FASTA, QUAL or FASTQ
    if seq_format.lower() in ["fasta", "csfasta"] or seq_format.lower().startswith(
        "qual"
    ):
        from galaxy_utils.sequence.fasta import fastaReader, fastaWriter

        reader = fastaReader(open(in_file, "rU"))
        writer = fastaWriter(open(out_file, "w"))
        marker = ">"
    elif seq_format.lower().startswith("fastq"):
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter

        reader = fastqReader(open(in_file, "rU"))
        writer = fastqWriter(open(out_file, "w"))
        marker = "@"
    else:
        sys.exit("Unsupported file type %r" % seq_format)
    # Now do the renaming
    count = 0
    renamed = 0
    for record in reader:
        # The [1:] is because the fastaReader leaves the > on the identifier,
        # likewise the fastqReader leaves the @ on the identifier
        try:
            idn, descr = record.identifier[1:].split(None, 1)
        except ValueError:
            idn = record.identifier[1:]
            descr = None
        if idn in rename:
# Patterns for read header lines whose description (the part after the first
# space) starts with "1:" (forward read) or "2:" (reverse read).
re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$")
re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$")
# Sanity checks run when this code executes: each pattern must accept a
# header of its own direction and reject one of the other direction.
assert re_illumina_f.match(
    "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")
assert re_illumina_r.match(
    "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
assert not re_illumina_f.match(
    "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
assert not re_illumina_r.match(
    "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")

# Running totals, all starting at zero.
count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0
in_handle = open(input_fastq)
if pairs_fastq:
    # One combined pairs file: forward and reverse mates share a writer.
    pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format)
    pairs_r_writer = pairs_f_writer
else:
    # Separate output files for forward and reverse mates.
    pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format)
    pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format)
singles_writer = fastqWriter(open(singles_fastq, "w"), format)
# NOTE(review): presumably the template currently being grouped and its
# pending reads -- confirm against the loop that consumes them.
last_template, buffered_reads = None, []

for record in fastqReader(in_handle, format):
    count += 1
    name = record.identifier.split(None, 1)[0]
    assert name[0] == "@", record.identifier  #Quirk of the Galaxy parser
    is_forward = False
    suffix = re_f.search(name)
    if suffix:
        #============
def _build_exclude_window_indexes(window_size, exclude_count):
    """Return every sorted list of 1..exclude_count distinct window indexes."""
    import itertools  # local import so the file's import block is untouched

    max_excluded = min(exclude_count, window_size)
    exclude_window_indexes = []
    for size in range(1, max_excluded + 1):
        # combinations() yields the subsets in the same lexicographic order
        # the previous hand-rolled enumeration produced.
        exclude_window_indexes.extend(
            list(combo)
            for combo in itertools.combinations(range(window_size), size))
    return exclude_window_indexes


def _trim_read(fastq_read, options, action, exclude_window_indexes):
    """Trim one read at each end listed in options.trim_ends and return it."""
    for trim_end in options.trim_ends:
        quality_list = fastq_read.get_decimal_quality_scores()
        if trim_end == '5':
            lwindow_position = 0  # left position of window
            while True:
                if lwindow_position >= len(quality_list):
                    # No window passed: the whole read is trimmed away.
                    fastq_read.sequence = ''
                    fastq_read.quality = ''
                    break
                window = quality_list[
                    lwindow_position:lwindow_position + options.window_size]
                if exclude_and_compare(action, window,
                                       options.score_comparison,
                                       options.quality_score,
                                       exclude_window_indexes):
                    fastq_read = fastq_read.slice(lwindow_position, None)
                    break
                lwindow_position += options.window_step
        else:
            rwindow_position = len(quality_list)  # right position of window
            while True:
                # left position of window
                lwindow_position = rwindow_position - options.window_size
                if rwindow_position <= 0 or lwindow_position < 0:
                    # No full window passed: the whole read is trimmed away.
                    fastq_read.sequence = ''
                    fastq_read.quality = ''
                    break
                window = quality_list[lwindow_position:rwindow_position]
                if exclude_and_compare(action, window,
                                       options.score_comparison,
                                       options.quality_score,
                                       exclude_window_indexes):
                    fastq_read = fastq_read.slice(None, rwindow_position)
                    break
                rwindow_position -= options.window_step
    return fastq_read


def main():
    """Trim FASTQ reads from their ends using a sliding quality window.

    Command line: ``[options] input_file output_file``. A window of
    ``--window_size`` scores moves inwards from the requested end(s) in
    steps of ``--window_step`` until ``exclude_and_compare`` accepts it; the
    read is cut at that position. Reads trimmed to zero length are dropped
    unless ``--keep_zero_length`` is given. Counts are printed to stdout.
    """
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice',
                      default='sanger',
                      choices=('sanger', 'cssanger', 'solexa', 'illumina'),
                      help='FASTQ variant type')
    parser.add_option('-s', '--window_size', type="int", dest='window_size',
                      default='1', help='Window size')
    parser.add_option('-t', '--window_step', type="int", dest='window_step',
                      default='1', help='Window step')
    parser.add_option('-e', '--trim_ends', type="choice", dest='trim_ends',
                      default='53', choices=('5', '3', '53', '35'),
                      help='Ends to Trim')
    parser.add_option('-a', '--aggregation_action', type="choice",
                      dest='aggregation_action', default='min',
                      choices=('min', 'max', 'sum', 'mean'),
                      help='Aggregate action for window')
    parser.add_option('-x', '--exclude_count', type="int",
                      dest='exclude_count', default='0',
                      help='Maximum number of bases to exclude from the window during aggregation')
    parser.add_option('-c', '--score_comparison', type="choice",
                      dest='score_comparison', default='>=',
                      choices=('>', '>=', '==', '<', '<=', '!='),
                      help='Keep read when aggregate score is')
    parser.add_option('-q', '--quality_score', type="float",
                      dest='quality_score', default='0', help='Quality Score')
    parser.add_option("-k", "--keep_zero_length", action="store_true",
                      dest="keep_zero_length", default=False,
                      help="Keep reads with zero length")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    if options.window_size < 1:
        parser.error('You must specify a strictly positive window size')

    if options.window_step < 1:
        parser.error('You must specify a strictly positive step size')

    # determine an exhaustive list of window indexes that can be excluded
    # from aggregation
    exclude_window_indexes = _build_exclude_window_indexes(
        options.window_size, options.exclude_count)

    out = fastqWriter(open(args[1], 'wb'), format=options.format)
    action = ACTION_METHODS[options.aggregation_action]

    num_reads = None
    num_reads_excluded = 0
    try:
        in_handle = open(args[0])
        try:
            for num_reads, fastq_read in enumerate(
                    fastqReader(in_handle, format=options.format)):
                fastq_read = _trim_read(fastq_read, options, action,
                                        exclude_window_indexes)
                if options.keep_zero_length or len(fastq_read):
                    out.write(fastq_read)
                else:
                    num_reads_excluded += 1
        finally:
            in_handle.close()
    finally:
        # Close the output even if a read fails to parse.
        out.close()

    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." %
              num_reads_excluded)
Example #32
0
    in_handle.seek(0) #start again after getting manifest
    count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename))
    out_handle.close()
    in_handle.close()
else:
    #Use Galaxy for FASTA, QUAL or FASTQ
    if seq_format.lower() in ["fasta", "csfasta"] \
    or seq_format.lower().startswith("qual"):
        from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
        reader = fastaReader(open(in_file, "rU"))
        writer = fastaWriter(open(out_file, "w"))
        marker = ">"
    elif seq_format.lower().startswith("fastq"):
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
        reader = fastqReader(open(in_file, "rU"))
        writer = fastqWriter(open(out_file, "w"))
        marker = "@"
    else:
        sys.exit("Unsupported file type %r" % seq_format)
    #Now do the renaming
    count = 0
    renamed = 0
    for record in reader:
        #The [1:] is because the fastaReader leaves the > on the identifier,
        #likewise the fastqReader leaves the @ on the identifier
        try:
            idn, descr = record.identifier[1:].split(None, 1)
        except ValueError:
            idn = record.identifier[1:]
            descr = None
        if idn in rename:
Example #33
0
    
    in_handle = open(in_file, "rb")
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    in_handle.seek(0)
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    writer.write_file(process(SffIterator(in_handle)))
    #End of SFF code
elif seq_format.lower().startswith("fastq"):
    in_handle = open(in_file, "rU")
    out_handle = open(out_file, "w")
    reader = fastqReader(in_handle)
    writer = fastqWriter(out_handle)
    if forward:
        for record in reader:
            seq = record.sequence.upper()
            result = primer.search(seq)
            if result:
                #Forward primer, take everything after it
                cut = result.end()
                record.sequence = seq[cut:]
                if len(record.sequence) >= min_len:
                    record.quality = record.quality[cut:]
                    clipped += 1
                    writer.write(record)
                else:
                    short_clipped += 1
            elif keep_negatives:
Example #34
0
def _write_filtered_records(reader, ids, positive_writer, negative_writer):
    """Send each record to the matching or the non-matching writer.

    Either writer may be None, in which case its records are dropped.
    """
    for record in reader:
        # The [1:] is because the readers leave the > or @ on the identifier.
        if record.identifier and record.identifier.split()[0][1:] in ids:
            if positive_writer is not None:
                positive_writer.write(record)
        elif negative_writer is not None:
            negative_writer.write(record)


def main():
    """Split a sequence file by whether each ID appears in a tabular file.

    Expects exactly six command line arguments: tabular filename, comma
    separated one-based column numbers, sequence filename, sequence format
    (``sff``, ``fasta`` or anything starting with ``fastq``), output
    filename for records with a listed ID and output filename for the other
    records. An output filename of ``-`` disables that output; at least one
    must be requested. Progress messages are printed to stdout and errors
    are reported through ``stop_err``.
    """
    # Parse Command Line
    try:
        (tabular_file, cols_arg, in_file, seq_format,
         out_positive_file, out_negative_file) = sys.argv[1:]
    except ValueError:
        stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
    try:
        columns = [int(arg) - 1 for arg in cols_arg.split(",")]
    except ValueError:
        stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg)

    want_positive = out_positive_file != "-"
    want_negative = out_negative_file != "-"
    if not want_positive and not want_negative:
        stop_err("Neither output file requested")

    # Read tabular file and record all specified identifiers
    ids = set()
    handle = open(tabular_file, "rU")
    try:
        if len(columns) > 1:
            # General case of many columns
            for line in handle:
                if line.startswith("#"):
                    # Ignore comments
                    continue
                parts = line.rstrip("\n").split("\t")
                for col in columns:
                    ids.add(parts[col])
            print("Using %i IDs from %i columns of tabular file" % (len(ids), len(columns)))
        else:
            # Single column, special case speed up
            col = columns[0]
            for line in handle:
                if not line.startswith("#"):
                    ids.add(line.rstrip("\n").split("\t")[col])
            print("Using %i IDs from tabular file" % (len(ids)))
    finally:
        handle.close()

    seq_format_lower = seq_format.lower()
    if seq_format_lower == "sff":
        # Now write filtered SFF file based on IDs from BLAST file
        try:
            from Bio.SeqIO.SffIO import SffIterator, SffWriter
        except ImportError:
            stop_err("Requires Biopython 1.54 or later")

        try:
            from Bio.SeqIO.SffIO import ReadRocheXmlManifest
        except ImportError:
            # Prior to Biopython 1.56 this was a private function
            from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
        in_handle = open(in_file, "rb")  # must be binary mode!
        try:
            try:
                manifest = ReadRocheXmlManifest(in_handle)
            except ValueError:
                manifest = None
            # This makes two passes though the SFF file with isn't so
            # efficient, but this makes the code simple.
            if want_positive:
                out_handle = open(out_positive_file, "wb")
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again after getting manifest
                pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids)
                out_handle.close()
            if want_negative:
                out_handle = open(out_negative_file, "wb")
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again
                neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids)
                out_handle.close()
        finally:
            # And we're done
            in_handle.close()
        # At the time of writing, Galaxy doesn't show SFF file read counts,
        # so it is useful to put them in stdout and thus shown in job info.
        if want_positive and want_negative:
            print("%i with and %i without specified IDs" % (pos_count, neg_count))
        elif want_positive:
            print("%i with specified IDs" % pos_count)
        elif want_negative:
            print("%i without specified IDs" % neg_count)
    elif seq_format_lower == "fasta" or seq_format_lower.startswith("fastq"):
        # Write filtered FASTA/FASTQ file based on IDs from tabular file
        if seq_format_lower == "fasta":
            make_reader, make_writer, label = fastaReader, fastaWriter, "FASTA"
        else:
            from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
            make_reader, make_writer, label = fastqReader, fastqWriter, "FASTQ"

        in_handle = open(in_file, "rU")
        try:
            reader = make_reader(in_handle)
            if want_positive and want_negative:
                print("Generating two %s files" % label)
            elif want_positive:
                print("Generating matching %s file" % label)
            elif want_negative:
                print("Generating non-matching %s file" % label)
            positive_writer = None
            negative_writer = None
            try:
                if want_positive:
                    positive_writer = make_writer(open(out_positive_file, "w"))
                if want_negative:
                    negative_writer = make_writer(open(out_negative_file, "w"))
                _write_filtered_records(reader, ids, positive_writer,
                                        negative_writer)
            finally:
                if positive_writer is not None:
                    positive_writer.close()
                if negative_writer is not None:
                    negative_writer.close()
        finally:
            in_handle.close()
    else:
        stop_err("Unsupported file type %r" % seq_format)
def main():
    """Mask the bases of FASTQ reads whose quality score meets a condition.

    Command line: ``[options] input_file output_file``. Each base whose
    decimal quality score satisfies ``--score_comparison`` against
    ``--quality_score`` is either lowercased (``--lowercase``) or passed
    through ``BaseReplacer`` built from ``--mask_character``. The number of
    processed reads is printed to stdout.
    """
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f',
                      '--format',
                      dest='format',
                      type='choice',
                      default='sanger',
                      choices=('sanger', 'solexa', 'illumina'),
                      help='FASTQ variant type')
    parser.add_option('-m',
                      '--mask_character',
                      dest='mask_character',
                      default='N',
                      help='Mask Character to use')
    parser.add_option('-c',
                      '--score_comparison',
                      type="choice",
                      dest='score_comparison',
                      default='le',
                      choices=('gt', 'ge', 'eq', 'lt', 'le', 'ne'),
                      help='Mask base when score is')
    parser.add_option('-s',
                      '--quality_score',
                      type="float",
                      dest='quality_score',
                      default='0',
                      help='Quality Score')
    parser.add_option("-l",
                      "--lowercase",
                      action="store_true",
                      dest="lowercase",
                      default=False,
                      help="Use lowercase masking")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    score_comparer = get_score_comparer(options.score_comparison)

    if options.lowercase:
        # Same result as string.lower(), which no longer exists on Python 3.
        def base_masker(base):
            return base.lower()
    else:
        base_masker = BaseReplacer(options.mask_character)

    out = fastqWriter(open(args[1], 'wb'), format=options.format)

    num_reads = None
    try:
        in_handle = open(args[0])
        try:
            for num_reads, fastq_read in enumerate(
                    fastqReader(in_handle, format=options.format)):
                sequence_list = list(fastq_read.sequence)
                for i, quality_score in enumerate(
                        fastq_read.get_decimal_quality_scores()):
                    if score_comparer(quality_score, options.quality_score):
                        sequence_list[i] = base_masker(sequence_list[i])
                fastq_read.sequence = "".join(sequence_list)
                out.write(fastq_read)
        finally:
            in_handle.close()
    finally:
        # The writer was previously never closed; close it explicitly so the
        # output is flushed even if a read fails to parse.
        out.close()

    if num_reads is not None:
        print("Processed %i %s reads." % (num_reads + 1, options.format))
    else:
        print("No valid FASTQ reads were provided.")
# Patterns for read header lines whose description (the part after the first
# space) starts with "1:" (forward read) or "2:" (reverse read).
re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$")
re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$")
# Sanity checks run when this code executes: each pattern must accept a
# header of its own direction and reject one of the other direction.
assert re_illumina_f.match(
    "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")
assert re_illumina_r.match(
    "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
assert not re_illumina_f.match(
    "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
assert not re_illumina_r.match(
    "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")

# Running totals, all starting at zero.
count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0
in_handle = open(input_fastq)
if pairs_fastq:
    # One combined pairs file: forward and reverse mates share a writer.
    pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format)
    pairs_r_writer = pairs_f_writer
else:
    # Separate output files for forward and reverse mates.
    pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format)
    pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format)
singles_writer = fastqWriter(open(singles_fastq, "w"), format)
# NOTE(review): presumably the template currently being grouped and its
# pending reads -- confirm against the loop that consumes them.
last_template, buffered_reads = None, []

for record in fastqReader(in_handle, format):
    count += 1
    name = record.identifier.split(None, 1)[0]
    assert name[0] == "@", record.identifier  #Quirk of the Galaxy parser
    is_forward = False
    suffix = re_f.search(name)
    if suffix:
        #============
            ids.add(parts[col])
    print "Using %i IDs from %i columns of tabular file" % (len(ids), len(columns))
else:
    # Single column, special case speed up
    col = columns[0]
    for line in handle:
        if not line.startswith("#"):
            ids.add(line.rstrip("\n").split("\t")[col])
    print "Using %i IDs from tabular file" % (len(ids))
handle.close()

# Write filtered FASTQ file based on IDs from tabular file
reader = fastqReader(open(in_file, "rU"))
if out_positive_file != "-" and out_negative_file != "-":
    print "Generating two FASTQ files"
    positive_writer = fastqWriter(open(out_positive_file, "w"))
    negative_writer = fastqWriter(open(out_negative_file, "w"))
    for record in reader:
        # The [1:] is because the fastaReader leaves the @ on the identifer.
        if record.identifier and record.identifier.split()[0][1:] in ids:
            positive_writer.write(record)
        else:
            negative_writer.write(record)
    positive_writer.close()
    negative_writer.close()
elif out_positive_file != "-":
    print "Generating matching FASTQ file"
    positive_writer = fastqWriter(open(out_positive_file, "w"))
    for record in reader:
        # The [1:] is because the fastaReader leaves the @ on the identifer.
        if record.identifier and record.identifier.split()[0][1:] in ids:
def _build_exclude_window_indexes(exclude_count, window_size):
    """Enumerate the window positions that may be dropped before aggregation.

    Returns a list of sorted index lists: every combination of 1 up to
    ``min(exclude_count, window_size)`` distinct positions drawn from
    ``range(window_size)``, ordered by combination size and then
    lexicographically. The result is empty when ``exclude_count`` is zero
    or negative.
    """
    # Local import keeps this block self-contained.
    import itertools

    largest_exclusion = min(exclude_count, window_size)
    return [
        list(combination)
        for size in range(1, largest_exclusion + 1)
        for combination in itertools.combinations(range(window_size), size)
    ]


def _trim_read_end(fastq_read, trim_end, window_passes, window_size,
                   window_step):
    """Trim one end of ``fastq_read`` and return the resulting read.

    ``trim_end`` is ``'5'`` to slide the window rightwards from the start of
    the read; any other value slides it leftwards from the end.
    ``window_passes`` is called with the quality scores inside the current
    window and returns a truthy value once the window is acceptable; the
    read is then sliced at that window's outer edge. If no window is
    accepted, the read's sequence and quality are blanked in place.
    """
    quality_list = fastq_read.get_decimal_quality_scores()
    if trim_end == '5':
        left = 0
        # The final windows may be shorter than window_size; they are still
        # evaluated, exactly as slicing past the end of the list allows.
        while left < len(quality_list):
            if window_passes(quality_list[left:left + window_size]):
                return fastq_read.slice(left, None)
            left += window_step
    else:
        right = len(quality_list)
        # Unlike the 5' side, only full-width windows are evaluated here.
        while right > 0 and right - window_size >= 0:
            if window_passes(quality_list[right - window_size:right]):
                return fastq_read.slice(None, right)
            right -= window_step

    # No acceptable window was found: nothing of the read is kept.
    fastq_read.sequence = ''
    fastq_read.quality = ''
    return fastq_read


def main():
    """Trim FASTQ reads by sliding-window quality and write the result.

    Command line: ``[options] input_file output_file``. Each read is trimmed
    at the ends listed in ``--trim_ends``, in the order given. A window's
    scores are aggregated with ``--aggregation_action`` (optionally ignoring
    up to ``--exclude_count`` positions) and compared against
    ``--quality_score`` using ``--score_comparison``. Reads that end up with
    zero length are dropped unless ``--keep_zero_length`` is set. A summary
    of processed and excluded reads is printed at the end.
    """
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f',
                      '--format',
                      dest='format',
                      type='choice',
                      default='sanger',
                      choices=('sanger', 'cssanger', 'solexa', 'illumina',
                               'sanger.gz', 'cssanger.gz', 'solexa.gz',
                               'illumina.gz', 'sanger.bz2', 'cssanger.bz2',
                               'solexa.bz2', 'illumina.bz2'),
                      help='FASTQ variant type')
    parser.add_option('-s',
                      '--window_size',
                      type="int",
                      dest='window_size',
                      default=1,
                      help='Window size')
    parser.add_option('-t',
                      '--window_step',
                      type="int",
                      dest='window_step',
                      default=1,
                      help='Window step')
    parser.add_option('-e',
                      '--trim_ends',
                      type="choice",
                      dest='trim_ends',
                      default='53',
                      choices=('5', '3', '53', '35'),
                      help='Ends to Trim')
    parser.add_option('-a',
                      '--aggregation_action',
                      type="choice",
                      dest='aggregation_action',
                      default='min',
                      choices=('min', 'max', 'sum', 'mean'),
                      help='Aggregate action for window')
    parser.add_option(
        '-x',
        '--exclude_count',
        type="int",
        dest='exclude_count',
        default=0,
        help=
        'Maximum number of bases to exclude from the window during aggregation'
    )
    parser.add_option('-c',
                      '--score_comparison',
                      type="choice",
                      dest='score_comparison',
                      default='>=',
                      choices=('>', '>=', '==', '<', '<=', '!='),
                      help='Keep read when aggregate score is')
    parser.add_option('-q',
                      '--quality_score',
                      type="float",
                      dest='quality_score',
                      default=0.0,
                      help='Quality Score')
    parser.add_option("-k",
                      "--keep_zero_length",
                      action="store_true",
                      dest="keep_zero_length",
                      default=False,
                      help="Keep reads with zero length")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    if options.window_size < 1:
        parser.error('You must specify a strictly positive window size')

    if options.window_step < 1:
        parser.error('You must specify a strictly positive step size')

    # Exhaustive list of window indexes that can be excluded from aggregation.
    exclude_window_indexes = _build_exclude_window_indexes(
        options.exclude_count, options.window_size)
    action = ACTION_METHODS[options.aggregation_action]

    def window_passes(window_scores):
        # Bind the per-run settings so the trimming helper only supplies
        # the scores of the window under test.
        return exclude_and_compare(action, window_scores,
                                   options.score_comparison,
                                   options.quality_score,
                                   exclude_window_indexes)

    num_reads = None
    num_reads_excluded = 0
    out = fastqWriter(path=args[1], format=options.format)
    try:
        for num_reads, fastq_read in enumerate(
                fastqReader(path=args[0], format=options.format)):
            # Each end is trimmed against the read as left by the
            # previous end's trimming.
            for trim_end in options.trim_ends:
                fastq_read = _trim_read_end(fastq_read, trim_end,
                                            window_passes,
                                            options.window_size,
                                            options.window_step)
            if options.keep_zero_length or len(fastq_read):
                out.write(fastq_read)
            else:
                num_reads_excluded += 1
    finally:
        # Close the output even when reading or trimming raises.
        out.close()

    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." %
              num_reads_excluded)
Example #39
0
    in_handle = open(in_file, "rb")
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    in_handle.seek(0)
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    writer.write_file(process(SffIterator(in_handle)))
    #End of SFF code
elif seq_format.lower().startswith("fastq"):
    in_handle = open(in_file, "rU")
    out_handle = open(out_file, "w")
    reader = fastqReader(in_handle)
    writer = fastqWriter(out_handle)
    if forward:
        for record in reader:
            seq = record.sequence.upper()
            result = primer.search(seq)
            if result:
                #Forward primer, take everything after it
                cut = result.end()
                record.sequence = seq[cut:]
                if len(record.sequence) >= min_len:
                    record.quality = record.quality[cut:]
                    clipped += 1
                    writer.write(record)
                else:
                    short_clipped += 1
            elif keep_negatives:
Example #40
0
    #And we're done
    in_handle.close()
    #At the time of writing, Galaxy doesn't show SFF file read counts,
    #so it is useful to put them in stdout and thus shown in job info.
    print "%i with and %i without specified IDs" % (pos_count, neg_count)
elif seq_format.lower()=="fasta":
    #Write filtered FASTA file based on IDs from tabular file
    pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
    print "%i with and %i without specified IDs" % (pos_count, neg_count)
elif seq_format.lower().startswith("fastq"):
    #Write filtered FASTQ file based on IDs from tabular file
    from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
    reader = fastqReader(open(in_file, "rU"))
    if out_positive_file is not None and out_negative_file is not None:
        print "Generating two FASTQ files"
        positive_writer = fastqWriter(open(out_positive_file, "w"))
        negative_writer = fastqWriter(open(out_negative_file, "w"))
        for record in reader:
            #The [1:] is because the fastqReader leaves the @ on the identifier.
            if record.identifier and clean_name(record.identifier.split()[0][1:]) in ids:                
                positive_writer.write(record)
            else:
                negative_writer.write(record)
        positive_writer.close()
        negative_writer.close()
    elif out_positive_file is not None:
        print "Generating matching FASTQ file"
        positive_writer = fastqWriter(open(out_positive_file, "w"))
        for record in reader:
            #The [1:] is because the fastqReader leaves the @ on the identifier.
            if record.identifier and clean_name(record.identifier.split()[0][1:]) in ids: