コード例 #1
0
def concatenate(
    aligned_loci_dir,
    output_dir,
    order_of_loci,
    number_of_gaps_between_loci,
    log_dir
):

    '''
    Concatenate the alignments of individual loci.

    aligned_loci_dir - directory listed with krio.parse_directory; every file
        whose extension starts with 'phy' is read as a relaxed PHYLIP
        alignment. The locus name is the file name up to '_trimal'.
    output_dir - directory prepared with krio.prepare_directory; receives
        'concatenated.phy'.
    order_of_loci - iterable of locus names (whitespace is stripped). It
        fixes both the column order of the presence matrix and the order in
        which alignments are concatenated.
    number_of_gaps_between_loci - the number of gaps to insert between
        individual alignments (converted with int()).
    log_dir - directory that receives '06-locus-presence.csv',
        '06-locus-partitions.csv' and '06-locus-partitions-raxml'.

    Loci listed in order_of_loci for which no alignment was found are left
    out of the concatenation; their columns in the presence matrix stay '0'
    and their length cell stays empty. Nothing is written when no alignment
    was loaded at all.

    Raises ValueError when an alignment file maps to a locus name that is not
    (or no longer) available in order_of_loci.
    '''

    print('\nConcatenate the alignments of individual loci.')

    import os

    from Bio import AlignIO

    import krio
    import krbioio
    import kralign

    print('\n\tPreparing output directory "', output_dir, '"', sep='')
    krio.prepare_directory(output_dir)

    file_list = krio.parse_directory(aligned_loci_dir, ' ')

    order_list = [x.strip() for x in order_of_loci]

    # One slot per requested locus; None means "no alignment found".
    slots = [None] * len(order_list)

    for f in file_list:
        if not f['ext'].startswith('phy'):
            continue

        file_name = f['name'].split('_trimal')[0]

        aln = AlignIO.read(f['path'], "phylip-relaxed")
        if not aln:
            continue

        # Fill the first still-empty slot that carries this locus name.
        for i, locus in enumerate(order_list):
            if locus == file_name and slots[i] is None:
                slots[i] = aln
                break
        else:
            raise ValueError(
                'No free entry for locus "' + file_name +
                '" in order_of_loci (file: ' + str(f['path']) + ').')

    # Keep only the loci that actually have an alignment, remembering the
    # column each one occupies in order_list. Building a new list (instead of
    # removing entries while iterating) guarantees no placeholder survives.
    loaded = [(i, order_list[i], aln)
              for i, aln in enumerate(slots) if aln is not None]

    print('\n\tProducing concatenated alignment.')
    if not loaded:
        return

    # Produce presence/absence matrix
    matrix = dict()
    length_list = [''] * len(order_list)
    for column, _, aln in loaded:
        # Lengths are stored by column so they stay under the right header
        # even when some loci are missing.
        length_list[column] = str(aln.get_alignment_length())
        for s in aln:
            if s.id not in matrix:
                matrix[s.id] = ['0'] * len(order_list)
            matrix[s.id][column] = '1'

    # NOTE(review): the files below are opened in binary mode and written
    # with str, which relies on Python 2 string semantics; switch to text
    # mode if this module is ported to Python 3.
    matrix_output_file = os.path.join(log_dir, '06-locus-presence.csv')
    with open(matrix_output_file, 'wb') as f_matrix:
        f_matrix.write('taxon,count,' + ','.join(order_list) + '\n')
        f_matrix.write(',,' + ','.join(length_list) + '\n')
        for taxon, presence in matrix.items():
            f_matrix.write(taxon + ',' + str(presence.count('1')) + ',' +
                           ','.join(presence) + '\n')

    # Concatenate
    locus_names = [name for _, name, _ in loaded]
    raw_alignments = [aln for _, _, aln in loaded]
    # kralign.concatenate is used as returning (alignment, partitions);
    # presumably one (start, end) partition per input alignment, in order.
    concatenated = kralign.concatenate(raw_alignments,
                                       int(number_of_gaps_between_loci))
    cat_aln = concatenated[0]
    cat_partitions = concatenated[1]

    partitions_output_file = os.path.join(log_dir, '06-locus-partitions.csv')
    raxml_partitions_output_file = os.path.join(log_dir,
                                                '06-locus-partitions-raxml')
    with open(partitions_output_file, 'wb') as f_part:
        with open(raxml_partitions_output_file, 'wb') as f_part_raxml:
            f_part.write('locus,start,end\n')
            for i, part in enumerate(cat_partitions):
                # Name the partition after the alignment it came from, not
                # after position i of the full locus list.
                locus = locus_names[i]
                start = str(part[0])
                end = str(part[1])
                f_part_raxml.write(
                    'DNA, ' + locus + ' = ' + start + '-' + end + '\n')
                f_part.write(locus + ',' + start + ',' + end + '\n')

    concatenated_output_file = os.path.join(output_dir, 'concatenated.phy')
    krbioio.write_alignment_file(cat_aln, concatenated_output_file,
                                 'phylip-relaxed')
コード例 #2
0
ファイル: krbioio.py プロジェクト: karolisr/PhyloMill
def _split_reads_file(reads_file_path, handles, lines_per_file, report):
    '''
    Copy the lines of one FASTQ file into the open file objects in handles.

    Writing starts with handles[0] and moves on to the next handle every
    lines_per_file lines; once the last handle is reached it receives all
    remaining lines. Progress messages are passed to report(msg).
    '''
    next_handle = 0
    write_handle = None
    lines_written = 0
    with open(reads_file_path) as reads:
        for i, line in enumerate(reads):
            if next_handle < len(handles) and i % lines_per_file == 0:
                if lines_written != 0:
                    report('\tWritten ' + str(lines_written // 4) +
                           ' records.')
                    lines_written = 0
                report('\t' + str(len(handles) - next_handle) +
                       ' files remaining.')
                write_handle = handles[next_handle]
                next_handle += 1
            write_handle.write(line)
            lines_written += 1
    # Report the last piece after the loop, so the message does not depend
    # on this file having the same number of lines as any other file.
    if lines_written != 0:
        report('\tWritten ' + str(lines_written // 4) + ' records.')


def split_fastq_file(pieces, output_dir, forward_reads_file_path,
                     reverse_reads_file_path=None, log_func=None,
                     log_file_path=None):

    '''
    Split a FASTQ file (and optionally its reverse-read mate file) into
    pieces.

    pieces - number of output files per input file; must be at least 1.
    output_dir - directory prepared with krio.prepare_directory; receives
        'f_<n>.fastq' and, when reverse reads are given, 'r_<n>.fastq',
        with n running from 1 to pieces.
    forward_reads_file_path - FASTQ file to split. Its line count (four
        lines per record) determines how many records go into each piece.
    reverse_reads_file_path - optional second FASTQ file, split at the same
        record boundaries as the forward file.
    log_func, log_file_path - when both are given, every progress message is
        also passed to log_func(msg, log_file_path).

    Each piece receives floor(records / pieces) records (at least one) and
    the last piece additionally takes the remainder. When there are fewer
    records than pieces, the trailing files are created but stay empty.

    Raises ValueError when pieces is smaller than 1.
    '''

    import os
    import krio

    if pieces < 1:
        raise ValueError('pieces must be at least 1.')

    def report(msg):
        # Print a progress message and mirror it to the log, if configured.
        print(msg)
        if log_func and log_file_path:
            log_func(msg, log_file_path)

    report('Splitting FASTQ file into ' + str(pieces) + ' pieces.')

    krio.prepare_directory(output_dir)
    print('Counting reads, this may take some time...')
    num_lines = krio.num_lines_in_file(forward_reads_file_path,
                                       print_every=400000)
    # Floor division keeps the counts integral on Python 2 and Python 3.
    num_records = num_lines // 4
    report('There are ' + str(num_records) + ' records.')

    # Never let a piece size of zero reach the modulo in the split loop.
    records_per_file = max(1, num_records // pieces)
    lines_per_file = 4 * records_per_file

    forward_file_handles = list()
    reverse_file_handles = list()

    try:
        for piece in range(1, pieces + 1):
            forward_file_handles.append(open(
                os.path.join(output_dir, 'f_' + str(piece) + '.fastq'), 'w'))
            if reverse_reads_file_path:
                reverse_file_handles.append(open(
                    os.path.join(output_dir, 'r_' + str(piece) + '.fastq'),
                    'w'))

        report('\nSplitting forward reads.\n')
        _split_reads_file(forward_reads_file_path, forward_file_handles,
                          lines_per_file, report)

        if reverse_reads_file_path:
            report('\nSplitting reverse reads.\n')
            _split_reads_file(reverse_reads_file_path, reverse_file_handles,
                              lines_per_file, report)
    finally:
        # Close every output file so buffered data reaches the disk, even
        # when opening or splitting fails part-way through.
        for handle in forward_file_handles + reverse_file_handles:
            handle.close()