Exemple #1
0
def intersect_fastq(fastq_files, out_file, remove_temp_output=None):
    """ Intersects multiple fastq files with one another. Includes only the reads (4
    lines long each) that are common to all the files. Writes these reads to the
    output file specified in out_file.

    Reads are matched on the first line of each four line record. The reads
    written are taken from the first file, in the order they appear there.

    Args:
        fastq_files: list of paths of the fastq files to intersect
        out_file: path of the file to write
        remove_temp_output: if true and only one file is provided, that file
            is moved to out_file instead of being copied
    """

    # optimize for the common case, where we are intersecting 1 file
    if len(fastq_files) == 1:
        if remove_temp_output:
            shutil.move(fastq_files[0], out_file)
        else:
            shutil.copyfile(fastq_files[0], out_file)
        return

    # find the ids present in every file; the ids of each file are collected
    # in a set first so an id repeated inside one file counts once for that
    # file and cannot stand in for a file that does not contain it
    common_ids = None
    for fname in fastq_files:
        file_ids = {
            lines[0] for lines in utilities.read_file_n_lines(fname, 4)
        }
        if common_ids is None:
            common_ids = file_ids
        else:
            common_ids &= file_ids

    with open(out_file, "w") as file_handle:
        # read through one of the files, writing out each sequence that
        # is found in all of the files
        for lines in utilities.read_file_n_lines(fastq_files[0], 4):
            if lines[0] in common_ids:
                file_handle.write("".join(lines))
Exemple #2
0
def write_tagged_sequences_from_fastq(input_fastq, bmtagger_output,
                                      output_fastq, verbose):
    """ Find the sequences bmtagger has tagged as contaminates from the extract output file

    Every read (4 lines) of input_fastq whose first line does not appear as
    the first line of a read in bmtagger_output is counted as tagged and
    written to output_fastq. The total is logged and, if verbose is set,
    also printed. Exits with an error message if output_fastq can not be
    opened for writing.
    """

    # store all of the sequences bmtagger has not tagged as contaminates
    untagged_sequences = {
        lines[0] for lines in utilities.read_file_n_lines(bmtagger_output, 4)
    }

    try:
        file_handle_write = open(output_fastq, "w")
    except EnvironmentError:
        sys.exit("ERROR: Unable to open file: " + output_fastq)

    tagged_sequences = 0
    # the with block closes the output so all reads are flushed to disk
    # before the function returns, even if reading the input fails
    with file_handle_write:
        for lines in utilities.read_file_n_lines(input_fastq, 4):
            # check if the sequence was identified by bmtagger
            if lines[0] not in untagged_sequences:
                tagged_sequences += 1
                file_handle_write.write("".join(lines))

    # log the number of sequences
    message = "Total contaminate sequences in file ( " + output_fastq + " ): " + str(
        tagged_sequences)
    logger.info(message)
    if verbose:
        print(message)
Exemple #3
0
def remove_repeats_from_fastq(input_fastq, trf_output, output_fastq):
    """ Remove the sequences from TRF that contain repeats from the output files

    Each line of trf_output starting with "@" is taken as the id of a
    sequence with repeats. Every read (4 lines) of input_fastq whose first
    line is not one of these ids is written to output_fastq; the number of
    reads left out is logged. Exits with an error message if output_fastq
    can not be opened for writing.
    """

    sequences_with_repeats = set()
    try:
        with open(trf_output) as file_handle:
            for line in file_handle:
                # sequences start with "@"
                if line.startswith("@"):
                    sequences_with_repeats.add(line)
    except EnvironmentError:
        # best effort: if the TRF output can not be read, continue with the
        # ids gathered so far (none if the file could not be opened)
        pass

    try:
        file_handle_write = open(output_fastq, "w")
    except EnvironmentError:
        sys.exit("ERROR: Unable to open file: " + output_fastq)

    removed_sequences = 0
    # the with block closes the output so all reads are flushed to disk
    # before the function returns, even if reading the input fails
    with file_handle_write:
        for lines in utilities.read_file_n_lines(input_fastq, 4):
            # check if the sequence was identified by TRF
            if lines[0] in sequences_with_repeats:
                removed_sequences += 1
            else:
                file_handle_write.write("".join(lines))

    # log the number of sequences removed for repeats
    logger.info("Total number of sequences with repeats removed from file ( " +
                input_fastq + " ): " + str(removed_sequences))
Exemple #4
0
    def test_read_in_file_n_lines(self):
        """
        Check the sequences obtained when reading a file four lines at a time
        """

        observed = []
        for record in utilities.read_file_n_lines(cfg.merge_files[0], 4):
            # keep the first line of each record without its end-line
            # and without the leading fastq "@" character
            identifier = record[0].rstrip()
            observed.append(identifier[1:])

        observed_sorted = sorted(observed)
        expected_sorted = sorted(cfg.merge_files_1_sequences)

        self.assertEqual(observed_sorted, expected_sorted)
Exemple #5
0
    def test_intersect_fastq(self):
        """
        Test the intersect_fastq function

        Intersects the files in cfg.merge_files into a temp file and checks
        the sequence ids written against cfg.merge_files_sequences_intersect
        """
        import os

        file_handle, temp_output_file = tempfile.mkstemp(
            prefix="kneaddata_test")
        # mkstemp returns an open OS-level descriptor; only the path is used
        # here so close it right away instead of leaking it
        os.close(file_handle)

        try:
            run.intersect_fastq(cfg.merge_files, temp_output_file)

            # read in the sequences from the fastq output file
            # Get the sequences from the file, removing end-lines and the fastq "@" character
            sequences = [
                lines[0].rstrip()[1:]
                for lines in utilities.read_file_n_lines(temp_output_file, 4)
            ]
        finally:
            # remove the temp output file, also when the intersect fails
            utils.remove_temp_file(temp_output_file)

        self.assertEqual(sorted(sequences),
                         sorted(cfg.merge_files_sequences_intersect))
Exemple #6
0
def run_trf(input,trf_path,trf_options,nproc,output,verbose=True):
    """ Run trf with the options provided

    With a single process trf is run directly on the input file. Otherwise
    the input is split, two lines per record, into one temp file per process
    (created in the directory of the output file), trf is run on each piece,
    the results of the pieces are concatenated in order into the output file
    and the temp files are removed.

    Args:
        input: path of the file to run trf on
        trf_path: trf executable, used as the first item of each command
        trf_options: string of trf options delimited by single spaces
        nproc: number of processes to use
        output: path of the final output file
        verbose: passed on to utilities.start_processes
    """

    commands=[]

    # check for one process and if so just run trf directly
    if nproc == 1:
        commands.append([[trf_path,input]+trf_options.split(" "),"trf",[input],[output],output])
        utilities.start_processes(commands,nproc,verbose)
        return

    # get the total number of lines in the input
    with open(input) as file_handle:
        total_lines = sum(1 for _ in file_handle)

    # create one temp input file, and the name of its trf output, per process
    tempfile_list=[]
    datfile_list=[]
    for i in range(int(nproc)):
        file_out, new_file = tempfile.mkstemp(prefix=os.path.basename(output)+'_'+str(i)+'_temp_trf_output',dir=os.path.dirname(output))
        os.close(file_out)
        tempfile_list.append(new_file)
        datfile_list.append(new_file+".".join(trf_options.split("-")[0].split(" "))+"dat")

    # write the input file into all temp output files
    lines_per_file = total_lines // int(nproc)
    last_file_number = len(tempfile_list) - 1
    output_file_number = 0
    lines_written = 0
    file_handle_write = None
    try:
        for read_line in utilities.read_file_n_lines(input,2):
            if not file_handle_write:
                file_handle_write = open(tempfile_list[output_file_number],"wt")
            file_handle_write.write("".join(read_line))

            lines_written += 2
            # move on to the next temp file once this one has its share; the
            # last temp file takes everything that is left so the index can
            # never run past the end of the list (e.g. when the final record
            # is short but still counted as two lines)
            if lines_written > lines_per_file and output_file_number < last_file_number:
                file_handle_write.close()
                lines_written = 0
                output_file_number += 1
                file_handle_write = open(tempfile_list[output_file_number],"wt")
    finally:
        # no handle is opened when the input has no records
        if file_handle_write:
            file_handle_write.close()

    # run commands
    for i, (temp_in, temp_out) in enumerate(zip(tempfile_list, datfile_list)):
        trf_command=[trf_path,temp_in]+trf_options.split(" ")
        commands.append([trf_command,"trf{}".format(i),[temp_in],[temp_out],temp_out])

    utilities.start_processes(commands,nproc,verbose)

    # merge all of the outputs to the final output file
    with open(output,"w") as file_write:
        for datfile in datfile_list:
            with open(datfile) as file_read:
                file_write.writelines(file_read)

    # remove temp files
    for filename in tempfile_list+datfile_list:
        try:
            os.remove(filename)
        except EnvironmentError:
            print("Unable to remove temp file: " + filename)