def concatenate(aligned_loci_dir, output_dir, order_of_loci,
                number_of_gaps_between_loci, log_dir):

    '''
    Concatenate the alignments of individual loci.

        aligned_loci_dir - directory containing per-locus phylip alignments
            (files whose extension starts with "phy").
        output_dir - directory that will receive "concatenated.phy".
        order_of_loci - iterable of locus names; defines column order of the
            concatenated alignment and of the log files.
        number_of_gaps_between_loci - the number of gaps to insert between
            individual alignments.
        log_dir - directory that receives the presence/partition CSV logs.
    '''

    print('\nConcatenate the alignments of individual loci.')

    import os
    from Bio import AlignIO
    import krio
    import krbioio
    import kralign
    import copy

    ps = os.path.sep

    print('\n\tPreparing output directory "', output_dir, '"', sep='')
    krio.prepare_directory(output_dir)
    file_list = krio.parse_directory(aligned_loci_dir, ' ')

    order_list = [x.strip() for x in order_of_loci]

    # Start with the locus names as placeholders; each placeholder is
    # replaced by an (alignment, locus_name) tuple when the matching
    # alignment file is found, so list position preserves locus order.
    alignments = [x.strip() for x in order_of_loci]
    for f in file_list:
        if not f['ext'].startswith('phy'):
            continue
        file_name = f['name'].split('_trimal')[0]
        aln = AlignIO.read(f['path'], "phylip-relaxed")
        if aln:
            i = alignments.index(file_name)
            alignments[i] = (aln, file_name)

    # Drop loci for which no alignment file was found. The original code
    # removed items from `alignments` while iterating over it, which skips
    # the element following each removal; building a filtered list is safe.
    alignments = [a for a in alignments if isinstance(a, tuple)]

    print('\n\tProducing concatenated alignment.')
    if alignments:

        # Produce a taxon-by-locus presence/absence matrix in one pass:
        # a row is created the first time a taxon id is seen, and the cell
        # for the current locus is flagged '1'.
        presence_list = ['0'] * len(order_list)
        length_list = list()
        matrix = dict()
        for a in alignments:
            length_list.append(str(a[0].get_alignment_length()))
            idx = order_list.index(a[1])  # constant per alignment; hoisted
            for s in a[0]:
                if s.id not in matrix:
                    matrix[s.id] = copy.copy(presence_list)
                matrix[s.id][idx] = '1'

        # Write the presence/absence log. Text mode + context manager so
        # the handle is closed even if a write fails.
        matrix_output_file = log_dir + ps + '06-locus-presence' + '.csv'
        with open(matrix_output_file, 'w') as f:
            f.write('taxon' + ',' + 'count' + ',' +
                    ','.join(order_list) + '\n')
            f.write('' + ',' + '' + ',' + ','.join(length_list) + '\n')
            for key in matrix.keys():
                f.write(key + ',' + str(matrix[key].count('1')) + ',' +
                        ','.join(matrix[key]) + '\n')

        # Concatenate the raw alignments and log the partition boundaries,
        # both as CSV and in RAxML partition-file syntax.
        partitions_output_file = log_dir + ps + '06-locus-partitions' + '.csv'
        raxml_partitions_output_file = (log_dir + ps +
                                        '06-locus-partitions-raxml')
        raw_alignments = [a[0] for a in alignments]
        concatenated = kralign.concatenate(
            raw_alignments, int(number_of_gaps_between_loci))
        cat_aln = concatenated[0]
        cat_partitions = concatenated[1]
        with open(partitions_output_file, 'w') as f_part, \
                open(raxml_partitions_output_file, 'w') as f_part_raxml:
            f_part.write('locus,start,end\n')
            for i, part in enumerate(cat_partitions):
                raxml_part_line = ('DNA, ' + order_list[i] + ' = ' +
                                   str(part[0]) + '-' + str(part[1]) + '\n')
                f_part_raxml.write(raxml_part_line)
                part_line = (order_list[i] + ',' + str(part[0]) + ',' +
                             str(part[1]) + '\n')
                f_part.write(part_line)

        concatenated_output_file = output_dir + ps + 'concatenated' + '.phy'
        krbioio.write_alignment_file(cat_aln, concatenated_output_file,
                                     'phylip-relaxed')
def split_fastq_file(pieces, output_dir, forward_reads_file_path, reverse_reads_file_path=None, log_func=None, log_file_path=None): import os import krio msg = 'Splitting FASTQ file into ' + str(pieces) + ' pieces.' print(msg) if log_func and log_file_path: log_func(msg, log_file_path) krio.prepare_directory(output_dir) print('Counting reads, this may take some time...') num_lines = krio.num_lines_in_file(forward_reads_file_path, print_every=400000) msg = 'There are ' + str(num_lines / 4) + ' records.' print(msg) if log_func and log_file_path: log_func(msg, log_file_path) records_per_file = num_lines / 4 / pieces forward_file_handles = list() reverse_file_handles = list() for piece in range(0, pieces): handle = open(output_dir + os.path.sep + 'f_' + str(piece + 1) + '.fastq', 'wa') forward_file_handles.append(handle) if reverse_reads_file_path: handle = open(output_dir + os.path.sep + 'r_' + str(piece + 1) + '.fastq', 'wa') reverse_file_handles.append(handle) forward_file_handles.reverse() reverse_file_handles.reverse() msg = '\nSplitting forward reads.\n' print(msg) if log_func and log_file_path: log_func(msg, log_file_path) with open(forward_reads_file_path) as f: write_handle = None lines_written = 0 for i, l in enumerate(f): if (len(forward_file_handles) and ((float(i) / 4) % records_per_file == 0)): if lines_written != 0: msg = '\tWritten ' + str(lines_written / 4) + ' records.' print(msg) if log_func and log_file_path: log_func(msg, log_file_path) lines_written = 0 msg = ('\t' + str(len(forward_file_handles)) + ' files remaining.') print(msg) if log_func and log_file_path: log_func(msg, log_file_path) write_handle = forward_file_handles.pop() write_handle.write(l) lines_written = lines_written + 1 if num_lines == i + 1: msg = '\tWritten ' + str(lines_written / 4) + ' records.' 
print(msg) if log_func and log_file_path: log_func(msg, log_file_path) if reverse_reads_file_path: msg = '\nSplitting reverse reads.\n' print(msg) if log_func and log_file_path: log_func(msg, log_file_path) with open(reverse_reads_file_path) as f: write_handle = None lines_written = 0 for i, l in enumerate(f): if (len(reverse_file_handles) and ((float(i) / 4) % records_per_file == 0)): if lines_written != 0: msg = '\tWritten ' + str(lines_written / 4) + ' records.' print(msg) if log_func and log_file_path: log_func(msg, log_file_path) lines_written = 0 msg = ('\t' + str(len(reverse_file_handles)) + ' files remaining.') print(msg) if log_func and log_file_path: log_func(msg, log_file_path) write_handle = reverse_file_handles.pop() write_handle.write(l) lines_written = lines_written + 1 if num_lines == i + 1: msg = '\tWritten ' + str(lines_written / 4) + ' records.' print(msg) if log_func and log_file_path: log_func(msg, log_file_path)