Beispiel #1
0
def clean_fastqc_output(param, readfile):
    """
    Delete the unwanted FastQC output files and collect basic statistics.

    The function works on explicit paths and never changes the current
    working directory, so a failure half-way through cannot leave the
    process in the FastQC report directory.

    Takes two arguments :
        - param [dict] : dictionary containing all parameters; only
                         param['output'] (the output directory) is read
        - readfile : fastq file used by FastQC; it is passed to
                     ce.get_file_prefix() to obtain the report prefix

    Returns one argument :
        - [dict] : quality control information read from
                   'fastqc_data.txt'. Possible keys are 'filename',
                   'encoding', 'total_sequence', 'sequence_length' and
                   'GC_perc'; a key is only present when the matching line
                   exists in the file. All values are strings.

    Raises :
        - OSError (or a subclass) when the '<prefix>_fastqc.zip' archive
          cannot be removed or when 'fastqc_data.txt' cannot be opened.
    """

    # Imported here so the function does not rely on a module-level import
    # of shutil being present in the file.
    import shutil

    # Locate the FastQC files -------------------------------------------------

    # prefix of readfile, used by FastQC to name its output files
    prefix = ce.get_file_prefix(readfile)

    archive = '{0}/Fastqc/{1}_fastqc.zip'.format(param['output'], prefix)
    html_report = '{0}/Fastqc/{1}_fastqc.html'.format(param['output'], prefix)
    report_dir = '{0}/Fastqc/{1}_fastqc'.format(param['output'], prefix)

    # Delete unwanted files ----------------------------------------------------

    # the archive must exist: a failure here is reported to the caller
    os.remove(archive)

    # Move the html report inside the report directory. This step is
    # best-effort (a missing report is not an error); the destination is a
    # full file path so an existing file is overwritten like 'mv' would do.
    if os.path.isfile(html_report) and os.path.isdir(report_dir):
        shutil.move(html_report,
                    os.path.join(report_dir, os.path.basename(html_report)))

    # Remove the leftovers (files or directories); missing entries are
    # silently skipped, exactly like the previous 'rm -r' call whose exit
    # status was ignored.
    for leftover in ('fastqc_report.html', 'fastqc.fo', 'summary.txt',
                     'Icons'):
        target = os.path.join(report_dir, leftover)
        if os.path.isdir(target) and not os.path.islink(target):
            shutil.rmtree(target)
        elif os.path.lexists(target):
            os.remove(target)

    # Get Quality control information about readfile ---------------------------

    # label found at the start of a line (before the first tab) -> result key
    wanted = {
        'Filename': 'filename',
        'Encoding': 'encoding',
        'Total Sequences': 'total_sequence',
        'Sequence length': 'sequence_length',
        '%GC': 'GC_perc',
    }

    info = {}

    with open(os.path.join(report_dir, 'fastqc_data.txt'), 'rt') as handle:
        for line in handle:
            # "<label>\t<value>": everything after the first tab is the value
            label, tab, value = line.rstrip('\n').partition('\t')
            if tab and label in wanted:
                info[wanted[label]] = value

    return info
Beispiel #2
0
def commandline_input_output(param, cmd, nb, inout):
    """
    Append the layout keyword, the thread option and the input/output file
    names to 'cmd', for the first (nb == 0) or the second (nb == 1) trimming
    step.

    Takes 4 arguments :
        - param [dict] : dictionary containing all parameters; the keys
                         'layout', 'threads', 'input', 'output' and, when
                         present, 'compress' are read
        - cmd [string] : base command line of the program (trimmomatic)
        - nb [integer] : number of already executed trimming commands
        - inout [dict] : dictionary containing all generated files on the
                         user's computer

    Returns two arguments:
        - cmd [string] : the command line with the input and output files
        - inout [dict] : the same dictionary; on the first step it is
                         updated with 'input', 'trimmed' and, for paired-end
                         data, 'single'

    When the layout is neither 'SE' nor 'PE', cmd and inout are returned
    unchanged. When nb is neither 0 nor 1, only the layout keyword and the
    thread option are appended.
    """

    layout = param['layout']

    # SINGLE-END DATA ----------------------------------------------------------

    if layout == 'SE':
        cmd += ' SE' + ' -threads {0}'.format(param['threads'])

        # output filename built from the prefix of the input file
        stem = ce.get_file_prefix(param['input'][0])
        trimmed = "{0}/trimmed_{1}.fastq".format(param['output'], stem)

        # add the compression format if chosen
        if 'compress' in param:
            trimmed = "{0}{1}".format(trimmed, param['compress'])

        if nb == 0:
            # first trimming: read the user's input file and record the
            # input and output files in inout
            inout['input'] = param['input']
            inout['trimmed'] = trimmed
            cmd += ' {0} {1}'.format(param['input'][0], trimmed)
        elif nb == 1:
            # second trimming: the input is the temporary file of step 1
            cmd += ' {0} {1}'.format(inout['tmp'], trimmed)

        return cmd, inout

    # UNKNOWN LAYOUT -----------------------------------------------------------

    if not (layout == 'PE'):
        return cmd, inout

    # PAIRED-END DATA ----------------------------------------------------------

    cmd += ' PE' + ' -threads {0} '.format(param['threads'])

    # output filenames built from the prefixes of both input files
    stems = [ce.get_file_prefix(param['input'][idx]) for idx in (0, 1)]
    trimmed = ["{0}/trimmed_{1}.fastq".format(param['output'], stem)
               for stem in stems]
    single = ["{0}/single_{1}.fastq".format(param['output'], stem)
              for stem in stems]

    # add the compression format if chosen
    if 'compress' in param:
        extension = "{0}".format(param['compress'])
        trimmed = [name + extension for name in trimmed]
        single = [name + extension for name in single]

    # order in which the output files are written on the command line
    outputs = (trimmed[0], single[0], trimmed[1], single[1])

    if nb == 0:
        # first trimming: read the user's input files and record the input
        # and output files in inout
        cmd += ' {0} {1} {2} {3} {4} {5}'.format(param['input'][0],
                                                 param['input'][1],
                                                 *outputs)
        inout['input'] = param['input']
        inout['trimmed'] = tuple(trimmed)
        inout['single'] = tuple(single)
    elif nb == 1:
        # second trimming: the inputs are the temporary files of step 1
        # (no leading space: the thread option above already ends with one)
        cmd += '{0} {1} {2} {3} {4} {5}'.format(inout['tmp'][0],
                                                inout['tmp'][1],
                                                *outputs)

    return cmd, inout
Beispiel #3
0
def change_output_as_input(inout, param):
    """
    Turn the step 1 output files into the step 2 input files.

    The trimmed file(s) of step 1 are renamed to temporary file(s) located
    in param['output'] and recorded in inout['tmp']. For paired-end data the
    step 1 singleton files are deleted as well.

    Takes 2 arguments :
        - inout [dict] : dictionary containing all generated files on the
                         user's working directory; 'trimmed' (and 'single'
                         for paired-end data) are read, 'tmp' is written
        - param [dict] : dictionary containing all parameters; the keys
                         'layout', 'input', 'output' and, when present,
                         'compress' are read

    Returns one argument:
        inout [dict] : the same dictionary, containing the new file names
                       (a string for single-end data, a 2-tuple otherwise)

    Raises :
        - OSError (or a subclass) when a file cannot be renamed or removed.
    """

    # Compression suffix. It is appended exactly as commandline_input_output
    # appends it to the trimmed files (no extra dot), in BOTH layouts, so a
    # temporary file always keeps the extension of the file it was renamed
    # from. The paired-end branch used to add an extra '.'.
    if 'compress' in param:
        suffix = '{0}'.format(param['compress'])
    else:
        suffix = ''

    # SINGLE-END ---------------------------------------------------------------

    if param['layout'] == 'SE':

        # temporary filename built from the prefix of the input file
        prefix = ce.get_file_prefix(param['input'][0])
        tmp = '{0}/tmp{1}.fastq'.format(param['output'], prefix) + suffix

        # rename the step 1 trimmed file and record the temporary file
        os.rename(inout['trimmed'], tmp)
        inout['tmp'] = tmp

    # PAIRED-END (any layout other than 'SE') ----------------------------------

    else:

        # temporary filenames built from the prefixes of both input files
        prefix_1 = ce.get_file_prefix(param['input'][0])
        prefix_2 = ce.get_file_prefix(param['input'][1])

        tmp_1 = '{0}/tmp{1}.fastq'.format(param['output'], prefix_1) + suffix
        tmp_2 = '{0}/tmp{1}.fastq'.format(param['output'], prefix_2) + suffix

        # rename the step 1 trimmed files and record the temporary files
        os.rename(inout['trimmed'][0], tmp_1)
        os.rename(inout['trimmed'][1], tmp_2)
        inout['tmp'] = tmp_1, tmp_2

        # delete the step 1 singleton read files
        os.remove(inout['single'][0])
        os.remove(inout['single'][1])

    return inout