Exemple #1
0
def mergeLabelPair(fastqIn1, fastqIn2, fastqOut, label1=':1', label2=':2'):
    ''' Function merges two paired FASTQ files into a single FASTQ file
    in which read one and read two entries are interleaved. The supplied
    labels are appended to the read identifier, i.e. the text preceding
    the first space of the header line. Function takes five arguments:
    
    1)  fastqIn1 - Read one FASTQ file(s). Either a string or a list of
        strings.
    2)  fastqIn2 - Read two FASTQ file(s). Either a string or a list of
        strings.
    3)  fastqOut - Output FASTQ file.
    4)  label1 - Label appended to the identifier of every read1 entry.
    5)  label2 - Label appended to the identifier of every read2 entry.
    
    Reads are paired by their position in the input. An IOError is raised
    if the identifiers of a pair of reads differ. If the inputs contain
    different numbers of reads the surplus reads of the longer input are
    silently ignored. Any input and output processes that were opened are
    closed before the function exits, even when an error is raised.
    
    '''
    # itertools.izip only exists in Python 2; the built-in zip of Python 3
    # is equally lazy and is used as a fallback
    lazyZip = getattr(itertools, 'izip', zip)
    output = input1 = input2 = None
    try:
        # Open input and output process
        output = writeFile.writeFileProcess(fastqOut)
        input1 = fastqExtract.readFastqProcess(fastqIn1)
        input2 = fastqExtract.readFastqProcess(fastqIn2)
        # Extract labelled reads and save to output
        for read1, read2 in lazyZip(input1, input2):
            # Separate header line from the remainder of each read and
            # split the identifier from any trailing description
            read1Header, read1Remainder = read1.split('\n', 1)
            read1Header = read1Header.split(' ', 1)
            read2Header, read2Remainder = read2.split('\n', 1)
            read2Header = read2Header.split(' ', 1)
            # Paired reads must share an identical identifier
            if read1Header[0] != read2Header[0]:
                raise IOError('Input FASTQ files contain unmatched reads')
            # Label identifiers and write read1 followed by read2
            read1Header[0] += label1
            read2Header[0] += label2
            output.add('%s\n%s\n%s\n%s\n' % (
                ' '.join(read1Header),
                read1Remainder,
                ' '.join(read2Header),
                read2Remainder
            ))
    finally:
        # Close pipes and processes, in the original order, regardless of
        # whether the merge succeeded. Objects that failed to open are
        # still None and are skipped.
        for process in (input1, input2, output):
            if process is not None:
                process.close()
Exemple #2
0
def mergeLabelTrimPair(fastqIn1, fastqIn2, trimSeq, fastqOut, minLength = 20,
    label1=':1', label2=':2'):
    ''' Function merges two paired FASTQ files into a single FASTQ file.
    FASTQ entries are trimmed to not extend beyond the first occurrence
    of a supplied trim sequence; the trim sequence itself is retained.
    Any pair of reads for which one of the reads contains the trim
    sequence and would be shorter than the supplied minimum length after
    trimming is discarded. Reads lacking the trim sequence are never
    checked against the minimum length. Specified labels are added to the
    end of the read identifiers, i.e. the text preceding the first space
    of the header line. Function takes seven arguments:
    
    1)  fastqIn1 - Read one FASTQ file(s). Either a string or a list of
        strings.
    2)  fastqIn2 - Read two FASTQ file(s). Either a string or a list of
        strings.
    3)  trimSeq - Sequence at which to terminate reads. An empty string
        results in no reads being trimmed or discarded.
    4)  fastqOut - Output FASTQ file.
    5)  minLength - Minimum length of trimmed reads to be output.
    6)  label1 - Label to add to read1 entries. If empty or None the
        read1 headers are left unchanged.
    7)  label2 - Label to add to read2 entries. If empty or None the
        read2 headers are left unchanged.
    
    Function returns a dictionary containing the following elements

    1)  total - Total number of read pairs.
    2)  short - Number of pairs with at least one read too short.
    3)  trim1 - Number of acceptable pairs with read1 trimmed.
    4)  trim2 - Number of acceptable pairs with read2 trimmed.
    
    Reads are paired by their position in the input; read identifiers are
    not compared. If the inputs contain different numbers of reads the
    surplus reads of the longer input are silently ignored. Any input and
    output processes that were opened are closed before the function
    exits, even when an error is raised.
    '''
    # Create output dictionary and key variables
    metrics = {'total' : 0, 'short': 0, 'trim1': 0, 'trim2' : 0}
    seqLength = len(trimSeq)
    # itertools.izip only exists in Python 2; the built-in zip of Python 3
    # is equally lazy and is used as a fallback
    lazyZip = getattr(itertools, 'izip', zip)
    input1 = input2 = output = None
    try:
        # Open input and output process
        input1 = fastqExtract.readFastqProcess(fastqIn1)
        input2 = fastqExtract.readFastqProcess(fastqIn2)
        output = writeFile.writeFileProcess(fastqOut)
        # Extract labelled reads and save to output
        for read1, read2 in lazyZip(input1, input2):
            # Count total reads
            metrics['total'] += 1
            # Split reads into lines: element 0 is the header, element 1
            # the sequence and element 3 the quality string
            read1 = read1.split('\n')
            read2 = read2.split('\n')
            read1Loc = read1[1].find(trimSeq)
            read2Loc = read2[1].find(trimSeq)
            # Set trim length for read1 and count and skip if too short.
            # A trim length of None (sequence absent) or zero (empty trim
            # sequence) means the read is left untouched.
            if read1Loc == -1:
                read1Trim = None
            else:
                read1Trim = read1Loc + seqLength
            if read1Trim and read1Trim < minLength:
                metrics['short'] += 1
                continue
            # Set trim length for read2 and count and skip if too short
            if read2Loc == -1:
                read2Trim = None
            else:
                read2Trim = read2Loc + seqLength
            if read2Trim and read2Trim < minLength:
                metrics['short'] += 1
                continue
            # Trim sequence and quality if the trim sequence does not
            # already terminate the read
            if read1Trim and read1Trim < len(read1[1]):
                read1[1] = read1[1][:read1Trim]
                read1[3] = read1[3][:read1Trim]
                metrics['trim1'] += 1
            if read2Trim and read2Trim < len(read2[1]):
                read2[1] = read2[1][:read2Trim]
                read2[3] = read2[3][:read2Trim]
                metrics['trim2'] += 1
            # Label reads
            if label1:
                read1Head = read1[0].split(' ', 1)
                read1Head[0] += label1
                read1[0] = ' '.join(read1Head)
            if label2:
                read2Head = read2[0].split(' ', 1)
                read2Head[0] += label2
                read2[0] = ' '.join(read2Head)
            # Save to file
            output.add('%s\n%s\n' % (
                '\n'.join(read1),
                '\n'.join(read2)
            ))
    finally:
        # Close input and output objects, in the original order,
        # regardless of whether the merge succeeded. Objects that failed
        # to open are still None and are skipped.
        for process in (input1, input2, output):
            if process is not None:
                process.close()
    # Return metrics
    return metrics