def bootstrap(args, alignment, tree, reps=100):
    bs_args = range_args(args, reps)
    bs_alignments = sample(alignment, reps)

    partition_jobs = [(partition, (bs_arg, bsa))
                      for bs_arg, bsa in zip(bs_args, bs_alignments)]
    part_results = mapPool(reps, partition_jobs)

    phylo_jobs = [(phylogeny, (bs_arg, pr[0], pr[1]))
                  for bs_arg, pr in zip(bs_args, part_results)]
    bs_trees = mapPool(reps, phylo_jobs)

    bs_tree = map_support(tree, bs_trees)

    return bs_tree
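
# Every example in this file funnels work through mapPool(workers, jobs, ...),
# where jobs is a list of (function, args) tuples. The project's real helper
# isn't shown; the sketch below is a minimal stand-in built on
# multiprocessing.Pool that matches the call sites (results come back in job
# order). The daemonic flag is accepted but ignored, since Pool workers are
# daemonic by default. Treat all of this as an assumption, not the original
# implementation.
from multiprocessing import Pool


def _apply(func, args):
    # Unpack one (function, args) job; top-level so it pickles for the pool.
    return func(*args)


def mapPool(threads, jobs, daemonic=False, chunksize=None):
    with Pool(threads) as pool:
        return pool.starmap(_apply, jobs, chunksize)
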
def partition_sites(seqs, args):
    # Transpose the sequences into per-site columns.
    sites = [[s[i] for s in seqs] for i in range(len(seqs[0]))]
    jobs = [(partition, (site, args.alphabet)) for site in sites]
    partitions = mapPool(args.threads, jobs, daemonic=True, chunksize=10000)

    # Collapse duplicate site patterns; each site is then indexed by its pattern.
    patterns = list(set(partitions))
    partitions = [patterns.index(part) for part in partitions]
    return partitions, patterns
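
# Hypothetical illustration of the (partitions, patterns) encoding returned
# above, with tuples standing in for whatever hashable value `partition`
# produces: patterns holds each distinct pattern once, and partitions[i] is
# the index of site i's pattern, so patterns[partitions[i]] recovers site i.
_sites = [('A', 'A'), ('C', 'G'), ('A', 'A')]
_patterns = list(set(_sites))                       # two distinct patterns
_partitions = [_patterns.index(s) for s in _sites]  # e.g. [0, 1, 0]
assert [_patterns[i] for i in _partitions] == _sites
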
def enumerate_minimal_covers(clique_matrix, threads=1):
    m, n = clique_matrix.shape
    elements = get_duals(clique_matrix, n)
    elements = reduce_elements(elements, m, n)
    elements = order_minimals(elements)
    islands = split_disconnected(elements)
    parts = [(minimal_covers, island) for island in islands]
    partial_covers, partial_chains = list(zip(*mapPool(threads, parts)))
    covers = merge_disconnected(partial_covers)
    return covers
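
# The shape of the computation above, for orientation: reduce the cover
# problem to its dual elements, split those into disconnected islands that
# share no elements, enumerate minimal covers per island in parallel, then
# merge the per-island covers (a cross product across islands). The helper
# implementations are not shown in this example.
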
def calculate_rates(patterns, pattern_counts, nMinusOne, num_invariants,
                    invariant_index, partitions, args):
    # Parallelize this step; it can take a very long time.
    jobs = [(score_conflict, (pat, patterns, pattern_counts, nMinusOne,
                              num_invariants)) for pat in patterns]
    pattern_conflicts = mapPool(args.threads,
                                jobs,
                                daemonic=True,
                                chunksize=100)

    # Invariant sites have no conflict by definition, and the calculation
    # above doesn't account for them.
    pattern_conflicts[invariant_index] = 0
    pattern_rates = [1. - c for c in pattern_conflicts]
    # Expand pattern_rates into the places where each pattern occurs in partitions.
    rates = [pattern_rates[i] for i in partitions]
    return pattern_rates, rates
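
# Worked illustration of the expansion step (values hypothetical):
# pattern_rates = [0.9, 0.4] with partitions = [0, 1, 0, 0] yields
# rates = [0.9, 0.4, 0.9, 0.9] -- one rate per alignment site.
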
# Example 5
def multiple_alignment(args, fastas):
    basedir = os.getcwd()
    alignment = os.path.join(basedir, '2_alignment', args.output + '.fasta')
    os.chdir('2_alignment')

    if not os.path.isfile(alignment) or args.force:
        if args.force:
            unaligned_fastas = fastas
        else:
            unaligned_fastas = [
                fasta for fasta in fastas
                if not os.path.isfile(trim_name(fasta))
            ]

        if unaligned_fastas:
            # Split the remaining fastas into four roughly equal batches.
            chunk_size = len(unaligned_fastas) // 4 + 1
            chunks = [
                unaligned_fastas[i:i + chunk_size]
                for i in range(0, 4 * chunk_size, chunk_size)
            ]
            # Run this script with list of fastas as args
            jobs = [(submit_alignment_batch, [
                '{} {} {}'.format(sys.executable, __file__, ' '.join(chunk))
            ]) for chunk in chunks]
            IDs = mapPool(4, jobs)
            outfiles = ['mafft_' + str(ID) + '.out' for ID in IDs]
            errfiles = ['mafft_' + str(ID) + '.err' for ID in IDs]
        else:
            outfiles = []
            errfiles = []

        # Intermediate files from the alignment process.
        aligned = [align_name(fasta) for fasta in fastas]
        # The output files from the alignment process.
        aligned_trimmed = [trim_name(fasta) for fasta in fastas]

        concatenate_fasta(aligned_trimmed, alignment)

        cleanup(logs=outfiles + errfiles,
                trash=fastas + aligned + aligned_trimmed)

    os.chdir(basedir)
    return alignment
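
# Hypothetical invocation, assuming an argparse-style namespace carrying the
# two attributes multiple_alignment reads (output and force) and an existing
# 2_alignment/ subdirectory:
# from types import SimpleNamespace
# alignment = multiple_alignment(SimpleNamespace(output='run1', force=False),
#                                ['geneA.fasta', 'geneB.fasta'])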
# Example 6
def estimate_rates(count_blocks, rates, threads=1):
    estimations = [(estimate_rate, count_part + (rates,)) for count_part in count_blocks]
    estimates = mapPool(threads, estimations)
    header = ['start', 'end', 'length', 'E'] + [str(rate) for rate in rates]
    return pd.DataFrame(estimates, columns=header)    
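
# Implied by columns=header above: each estimate_rate result is a row of the
# form (start, end, length, E, <one value per rate in `rates`>).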
# Example 7
def has_min_states(seqlist):
    # The def line was truncated in this scraped example; the name above is a
    # reconstruction. Returns True once the sequences collectively contain at
    # least 4 distinct characters, False otherwise (or on an empty sequence).
    ustates = set()
    for seq in seqlist:
        if len(seq) < 1:
            return False
        seq = seq.upper()
        ustates = ustates.union(set(seq))
        if len(ustates) >= 4:
            return True
    return False
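
# Usage of the reconstructed helper above (name hypothetical):
# has_min_states(['AC', 'GT'])  -> True   (4 states across the two sequences)
# has_min_states(['AAAA'])      -> False  (only 1 state)
# has_min_states(['ACG', ''])   -> False  (an empty sequence short-circuits)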


def cleanup(logs=(), trash=()):  # immutable defaults avoid the mutable-default pitfall
    try:
        os.mkdir('logs')
    except OSError:
        pass
    for log in logs:
        os.rename(log, 'logs/' + log)
    for f in trash:
        try:
            os.remove(f)
        except OSError:
            pass


if __name__ == '__main__':
    fastas = sys.argv[1:]

    calls = [(align_trim, [fasta]) for fasta in fastas]
    mapPool(20, calls, daemonic=True, chunksize=50)  # results are all None; run for side effects
# Example 8
def main(tree,
         reference,
         outgroup,
         output,
         segment_files,
         seqs_files,
         step=7000,
         window_size=35000,
         nthreads=1,
         centromeres=None,
         tracks=None):
    if output:
        outfile = output + '.BRAG.stats'
        print('Writing messages to {}'.format(outfile))
        log = open(outfile, 'w')
    else:
        output = 'rearrangement_analysis'
        print('Writing messages to stdout\nWriting results to {}*'.format(
            output))
        log = sys.stdout

    clock = timer()
    log.write('Reading input. . .\n')

    tree = Tree(tree)
    root(tree, outgroup)

    reference_genome_file = infer_reference(seqs_files)
    table_jobs = [(segment_tables,
                   (reference, segment_file, seqs_file, reference_genome_file))
                  for segment_file, seqs_file in zip(segment_files, seqs_files)]
    tables = mapPool(nthreads, table_jobs)

    order = tree_order(reference, tree)
    tables.sort(key=lambda t: order.index(t[0]))  # order tables by query, following the tree
    queries, rscaffolds, qscaffolds, os_tabs = list(zip(*tables))
    rscaffolds = rscaffolds[0]  # all have same reference
    N = rscaffolds.iloc[-1].abs_pos  # end position == reference genome size

    coverages = [
        np.sum(os_tab.rend - (os_tab.rstart - 1)) / float(N)
        for os_tab in os_tabs
    ]
    coverage_stats = describe(coverages)
    log.write('{} genomes aligned to {}.\n'.format(coverage_stats.nobs,
                                                   reference))
    log.write('Minimum coverage:\t{}\n'.format(coverage_stats.minmax[0]))
    log.write('Mean coverage:\t{}\n'.format(coverage_stats.mean))
    log.write('Maximum coverage:\t{}\n'.format(coverage_stats.minmax[1]))
    log.write('SD coverage:\t{}\n'.format(coverage_stats.variance**0.5))
    log.write('Cumulative coverage:\t{}\n'.format(
        cumulative_coverage(os_tabs, N)))
    log.write(clock.report() + '\n\n')

    log.write(
        'Plotting coverage of alignments and histograms of OS and qbreak lengths. . .\n'
    )
    degrading_coverage(coverages, os_tabs, N,
                       output + '_coverage_survival_curve')
    hist_jobs = [(OS_length_hist, (reference, query, os_tab))
                 for query, _rscaf, _qscaf, os_tab in tables]
    mapPool(nthreads, hist_jobs)
    log.write(clock.report() + '\n')

    certain_out = output + '_certain'
    uncertain_out = output + '_uncertain'

    log.write('\nEstimating break rates. . .\n\n')
    if not (os.path.isfile(uncertain_out + '_rates.tab')
            and os.path.isfile(certain_out + '_rates.tab')
            and os.path.isfile(uncertain_out + '.log')
            and os.path.isfile(certain_out + '.log')):
        adj_jobs = [(map_breakpoints, [os_tab]) for os_tab in os_tabs]
        uncertain_adj_coords = mapPool(nthreads, adj_jobs)
        certain_adj_coords = [[coord for coord in coords if coord[2]]
                              for coords in uncertain_adj_coords]
        uncertain_adj_coords = list(zip(queries, uncertain_adj_coords))
        certain_adj_coords = list(zip(queries, certain_adj_coords))

        br.set_reference(tree & reference, N)

    log.write('Calculating Uncertain (True or False qbreaks) Break Rates:\n')
    if not (os.path.isfile(uncertain_out + '_rates.tab')
            and os.path.isfile(uncertain_out + '.log')):
        uncertain_estimates = br.break_rate(uncertain_adj_coords,
                                            output=uncertain_out,
                                            threads=nthreads)
    else:
        uncertain_estimates = pd.read_csv(uncertain_out + '_rates.tab',
                                          sep='\t')
    with open(uncertain_out + '.log') as f:
        log.write(f.read())

    log.write('\nCalculating Certain (True qbreaks only) Break Rates:\n')
    if not (os.path.isfile(certain_out + '_rates.tab')
            and os.path.isfile(certain_out + '.log')):
        certain_estimates = br.break_rate(certain_adj_coords,
                                          output=certain_out,
                                          threads=nthreads)
    else:
        certain_estimates = pd.read_csv(certain_out + '_rates.tab', sep='\t')
    with open(certain_out + '.log') as f:
        log.write(f.read())

    if not os.path.isfile(uncertain_out + '_rate_windows.txt'):
        uncertain_rate_windows = rate_windows(uncertain_estimates,
                                              N,
                                              step=step,
                                              window_size=window_size)
        uncertain_rate_windows.to_csv(uncertain_out + '_rate_windows.txt',
                                      sep='\t',
                                      index=False)
    else:
        uncertain_rate_windows = pd.read_csv(uncertain_out +
                                             '_rate_windows.txt',
                                             sep='\t',
                                             header=0)

    if not os.path.isfile(certain_out + '_rate_windows.txt'):
        certain_rate_windows = rate_windows(certain_estimates,
                                            N,
                                            step=step,
                                            window_size=window_size)
        certain_rate_windows.to_csv(certain_out + '_rate_windows.txt',
                                    sep='\t',
                                    index=False)
    else:
        certain_rate_windows = pd.read_csv(certain_out + '_rate_windows.txt',
                                           sep='\t',
                                           header=0)
    log.write('\n' + clock.report() + '\n\n')

    log.write(
        'Processing centromeres & extra data tracks, if applicable. . .\n')
    # Mask Centromeres
    if centromeres:
        with open(centromeres) as f:
            centromeres = [
                list(map(int, line.split('#')[0].strip().split()))
                for line in f if line.split('#')[0].strip()
            ]
        abs_centromeres = [(rscaffolds.iloc[scaf_idx].abs_pos + start,
                            rscaffolds.iloc[scaf_idx].abs_pos + stop)
                           for scaf_idx, start, stop in centromeres]
        certain_rate_windows = mask(certain_rate_windows, abs_centromeres)
        uncertain_rate_windows = mask(uncertain_rate_windows, abs_centromeres)
    else:
        abs_centromeres = []

    # Mask scaffold edges
    scaffold_boundaries = [(x, x) for x in rscaffolds.abs_pos]
    certain_rate_windows = mask(certain_rate_windows,
                                scaffold_boundaries,
                                inclusive=False)
    uncertain_rate_windows = mask(uncertain_rate_windows,
                                  scaffold_boundaries,
                                  inclusive=False)

    # Add in extra data tracks and mask
    if tracks:
        tracks = pd.read_csv(tracks, sep='\t')
        tracks.sort_values('start', inplace=True)
        tracks = mask(tracks, scaffold_boundaries, inclusive=False)
    else:
        tracks = []
    track_labels = [
        label for label in list(tracks) if label not in ['start', 'end']
    ]
    log.write(clock.report() + '\n')

    # Plot Figures
    log.write('\n')
    log.write(
        'Plotting break rates calculated with "True" qbreaks ("certain", lower bound estimate)\n'
    )
    log.write(
        'against break rates calculated with "True" and "False" qbreaks ("uncertain", upper bound estimate).\n'
    )
    log.write('Output: ' + output + '_uncertainty\n')
    indexer = (certain_rate_windows['E'] != uncertain_rate_windows['E']) & (
        uncertain_rate_windows['E'] != 0)
    model = correlation_scatter(certain_rate_windows['E'].loc[indexer],
                                uncertain_rate_windows['E'].loc[indexer],
                                output + '_uncertainty')
    same = (certain_rate_windows['E'] == uncertain_rate_windows['E']).sum()
    num_windows = len(certain_rate_windows)
    log.write('{}/{} ({:.2f}%) windows have identical break rates\n'.format(
        same, num_windows, same / num_windows * 100))
    uncertain_over = (certain_rate_windows['E'] <
                      uncertain_rate_windows['E']).sum()
    report = '{}/{} ({:.2f}%) of non-identical windows have (True) < (True | False)\n'
    report = report.format(uncertain_over, num_windows - same,
                           uncertain_over / (num_windows - same) * 100)
    log.write(report)
    mean_ratio = (certain_rate_windows['E'].loc[indexer] /
                  uncertain_rate_windows['E'].loc[indexer]).mean()
    log.write(
        'Mean ratio of (True)/(True | False) when True != False: {}\n'.format(
            mean_ratio))
    log.write(
        '(True)/(True | False) = {:.4f}*(True | False) + {:.4f}; p={:.3E} R2={:.3E}\n'
        .format(model.slope, model.intercept, model.p_val, model.r2))
    log.write(clock.report() + '\n')

    if track_labels:
        log.write('\n')
        log.write(
            'Performing linear regression between extra data tracks and break rate.\n'
        )
        log.write('Output: ' + output + '_tracks-x-breakrate\n')
        track_results = track_correlation(certain_rate_windows, tracks,
                                          track_labels,
                                          output + '_tracks-x-breakrate')
        log.write(track_results.summary().as_text() + '\n')
        log.write(clock.report() + '\n')

    log.write('\n')
    log.write(
        'Plotting break rates and extra tracks along the reference genome.\n')
    log.write('Output: ' + output + '_brMap\n')
    plot_break_rate(N, queries, os_tabs, certain_estimates,
                    uncertain_estimates, certain_rate_windows,
                    uncertain_rate_windows, tracks, track_labels, rscaffolds,
                    abs_centromeres, step, output + '_brMap')
    log.write('BRAG Finished!\t{}\n'.format(clock.report()))
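
# Hypothetical invocation (all file names illustrative only):
# main('species.nwk', 'refStrain', 'outgroupStrain', 'brag_run',
#      ['refStrain_q1.segments', 'refStrain_q2.segments'],
#      ['refStrain_q1.seqs', 'refStrain_q2.seqs'],
#      nthreads=8, centromeres='centromeres.txt', tracks='tracks.tsv')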