Example #1
0
def calc_group_unique_allele_window(args):
    """Count the number of and relative rate of uniquely held alleles
       spatially along chromosomes (i.e. Lineage-specific rates)"""
    data = {}
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'codon':
        raise RuntimeError(
            "\n=====================\nERROR: MVF is not codon flavor!")
    annotations = {}
    coordinates = {}
    labels = mvf.get_sample_labels()[:]
    ncol = len(labels)
    current_contig = None
    current_position = 0
    counts = Counter()
    totals = Counter()
    args.start_contig = (args.start_contig
                         if args.start_contig is not None else 0)
    args.end_contig = (args.end_contig
                       if args.end_contig is not None else 100000000000)
    if args.output_align is True:
        outputalign = []
    if args.gff is not None:
        annotations, coordinates = (parse_gff_analysis(args.gff))
    if args.allele_groups is not None:
        args.allele_groups = procarg_allelegroups(args.allele_groups, mvf)
    if args.species_groups is None:
        args.species_groups = args.allele_groups
    else:
        args.species_groups = procarg_speciesgroups(args.species_groups, mvf)
    fieldtags = [
        'likelihood', 'bgdnds0', 'bgdnds1', 'bgdnds2a', 'bgdnds2b', 'fgdnds0',
        'fgdnds1', 'fgdnds2a', 'fgdnds2b', 'dndstree', 'errorstate'
    ]
    if args.branch_lrt is not None:
        with open(args.branch_lrt, 'w') as branchlrt:
            genealign = []
            branchlrt.write(
                "\t".join(['contig', 'ntaxa', 'alignlength', 'lrtscore'] +
                          ["null.{}".format(x) for x in fieldtags] +
                          ["test.{}".format(x)
                           for x in fieldtags] + ['tree']) + "\n")
    groups = args.allele_groups.values()
    if args.species_groups is not None:
        speciesgroups = args.species_groups.values()
    allsets = set([])
    for group in groups:
        allsets.update(group)
    allsets = list(sorted(allsets))
    speciesnames = args.species_groups.keys()
    speciesrev = {}
    if args.species_groups is not None:
        for species in args.species_groups:
            speciesrev.update([(x, species)
                               for x in args.species_groups[species]])
    if args.mincoverage is not None:
        if args.mincoverage < len(groups) * 2:
            raise RuntimeError("""
                Error: GroupUniqueAlleleWindow:
                --mincoverage cannot be lower than the twice the number
                of specified groups in --allele-groups
                """)
    genealign = []
    for contig, pos, allelesets in mvf:
        if not current_contig:
            current_contig = contig[:]
        if contig != current_contig or (args.windowsize > 0 and pos >
                                        current_position + args.windowsize):
            xkey = (
                current_contig,
                current_position,
            )
            data[xkey] = counts.copy()
            data[xkey].update([
                ('contig', (mvf.get_contig_labels(ids=current_contig)
                            if args.use_labels is True else current_contig)),
                ('position', current_position),
                ('nonsynyonymous_changes',
                 counts.get('nonsynonymous_changes', 0) or 0),
                ('synyonymous_changes', counts.get('synonymous_changes', 0)
                 or 0)
            ])
            data[xkey].update([
                ('ns_ratio',
                 (float(data[xkey].get('nonsynonymous_changes', 0)) /
                  (data[xkey].get('synonymous_changes', 1.0)))),
                ('annotation', annotations.get(data[xkey]['contig'], '.')),
                ('coordinates', coordinates.get(data[xkey]['contig'], '.'))
            ])
            if genealign:
                if (args.end_contig >= int(current_contig)) and (
                        args.start_contig <= int(current_contig)):
                    (pamlnull, pamltest, tree) = paml_branchsite(
                        genealign,
                        labels[:],
                        species=speciesnames,
                        speciesrev=speciesrev,
                        codemlpath=args.codeml_path,
                        raxmlpath=args.raxml_path,
                        pamltmp=args.paml_tmp,
                        target=args.target,
                        targetspec=args.num_target_species,
                        allsampletrees=args.all_sample_trees,
                        outgroup=args.outgroup)
                    lrtscore = -1
                    if (pamlnull.get('likelihood', -1) != -1
                            and pamltest.get('likelihood', -1) != -1):
                        lrtscore = 2 * (pamltest['likelihood'] -
                                        pamlnull['likelihood'])
                    with open(args.branch_lrt, 'a') as branchlrt:
                        branchlrt.write("\t".join([
                            str(x) for x in [
                                data[xkey]['contig'],
                                len(genealign),
                                len(genealign[0]) * 3, lrtscore
                            ] + [pamlnull.get(y, -1) for y in fieldtags] +
                            [pamltest.get(y, -1)
                             for y in fieldtags] + [str(tree).rstrip()]
                        ]) + "\n")
            genealign = None
            totals.add('genes_total')
            if counts.get('total_codons', 0) > 0:
                totals.add('genes_tested')
            if counts.get('total_nsyn_codons', 0) > 0:
                totals.add('genes_with_nsyn')
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            elif args.windowsize > 0:
                current_position += args.windowsize
            counts = Counter()
        proteins = allelesets[0]
        codons = allelesets[1:4]
        if len(proteins) == 1 and all(len(x) == 1 for x in codons):
            if proteins == '*' or ''.join(codons) in MLIB.stop_codons:
                continue
            counts.add('total_codons')
            totals.add('total_codons')
            if args.output_align is True:
                if not outputalign:
                    outputalign = [[''.join(codons)]
                                   for x in range(mvf.metadata['ncol'])]
                else:
                    for ialign, xalign in enumerate(outputalign):
                        xalign.append(''.join(codons))
            if args.branch_lrt is not None:
                if not genealign:
                    genealign = [[''.join(codons)] for x in range(ncol)]
                else:
                    for ialign in range(len(genealign)):
                        genealign[ialign].append(''.join(codons))
            continue
        if len(proteins) > 1:
            if allelesets[0][1] == '+':
                continue
        proteins = mvf.decode(proteins)
        if args.mincoverage is not None:
            if sum([int(x not in 'X-')
                    for x in proteins]) < (args.mincoverage):
                continue
        species_groups = [[proteins[i] for i in x if proteins[i] not in '-X']
                          for x in speciesgroups]
        if any(len(x) == 0 for x in species_groups):
            continue
        xcodons = [mvf.decode(x) for x in codons]
        codons = [''.join(x) for x in zip(*xcodons)]
        if any(codons[x] in MLIB.stop_codons for x in allsets):
            continue
        if any(
                any(x != species_groups[0][0] for x in y)
                for y in species_groups):
            totals.add('total_nsyn_codons')
            counts.add('total_nsyn_codons')
        totals.add('total_codons')
        totals.add('tested_codons')
        counts.add('total_codons')
        totals.add('variable_codons',
                   val=int(
                       sum([int(len(set(x) - set('X-')) > 1)
                            for x in xcodons]) > 0))
        if args.output_align is not None:
            if not outputalign:
                outputalign = [[x] for x in codons]
            else:
                for ialign in range(len(outputalign)):
                    outputalign[ialign].append(codons[ialign])
        if args.branch_lrt is not None:
            if not genealign:
                genealign = [[x] for x in codons]
            else:
                for ialign in range(len(codons)):
                    genealign[ialign].append(codons[ialign])
        nonsyn_change = False
        synon_change = False
        codon_groups = [
            set([
                codons[i] for i in x
                if '-' not in codons[i] and 'X' not in codons[i]
            ]) for x in groups
        ]
        protein_groups = None
        for i in range(len(codon_groups)):
            if any(base in codon for base in 'RYWKMS'
                   for codon in codon_groups[i]):
                codon_groups[i] = hapgroup(codon_groups[i])
        if all(
                grp1.isdisjoint(grp0)
                for grp0, grp1 in combinations(codon_groups, 2)):
            protein_groups = [
                set([
                    MLIB.codon_tables['full'][''.join(x)]
                    for x in codon_groups[i]
                ]) for i in range(len(codon_groups))
            ]
            if all(
                    grp1.isdisjoint(grp0)
                    for grp0, grp1 in combinations(protein_groups, 2)):
                nonsyn_change = True
            elif all(grp1 == grp0
                     for grp0, grp1 in combinations(protein_groups, 2)):
                synon_change = True
        if nonsyn_change:
            if args.verbose is True:
                print('NON', contig, pos, allelesets,
                      codon_groups, protein_groups, groups,
                      mvf.get_contig_labels(ids=contig))
            counts.add('nonsynonymous_changes')
            totals.add('nonsynonymous_changes')
        elif synon_change:
            if args.verbose is True:
                print('SYN', contig, pos, allelesets,
                      codon_groups, protein_groups, groups,
                      mvf.get_contig_labels(ids=contig))
            counts.add('synonymous_changes')
            totals.add('synonymous_changes')
    args.totals = totals
    # WRITE OUTPUT
    headers = [
        "contig", "position", "nonsynonymous_changes", "synonymous_changes",
        "ns_ratio", "nonsynonymous_total", "synonymous_total", "pvalue",
        "total_codons", "annotation", "coordinates"
    ]
    if args.windowsize == -1:
        headers.remove('position')
    if args.chi_test is None:
        headers.remove('pvalue')
    outfile = OutputFile(path=args.out, headers=headers)
    sorted_entries = sorted(
        [(data[k]['ns_ratio'], k)
         for k in data if data[k].get('nonsynonymous_changes', 0) > 0],
        reverse=True)
    for _, k in sorted_entries:
        outfile.write_entry(data[k])
    with open(args.out + '.total', 'w') as totalfile:
        for entry in args.totals.iter_sorted():
            totalfile.write(entry)
    if args.output_align is not None:
        with open(args.output_align, 'w') as alignfile:
            alignfile.write("\n".join([
                ">{}\n{}".format(mvf.metadata['labels'][i],
                                 ''.join(outputalign[i]))
                for i in range(len(outputalign))
            ]))
    return ''
Example #2
0
def calc_all_character_count_per_sample(args):
    """Count the number of and relative rate of certain bases
       spatially along chromosomes
    """
    args.qprint("Running CalcAllCharacterCountPerSample")
    mvf = MultiVariantFile(args.mvf, 'read')
    current_contig = None
    current_position = 0
    data_in_buffer = False
    # Set up sample indices
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = None
    data = dict((i, {}) for i in sample_indices)
    data_characters = [{} for i in sample_indices]
    for contig, pos, allelesets in mvf.iterentries(decode=False,
                                                   contig_ids=contig_ids):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            args.qprint("Processing contig {}".format(current_contig))
            for i in sample_indices:
                data[i][(current_contig, current_position)] = {
                    'contig': current_contig,
                    'position': current_position
                }
                data[i][(current_contig,
                         current_position)].update(data_characters[i])
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1 else
                                     args.windowsize)
            data_characters = [{} for i in sample_indices]
            data_in_buffer = False
        alleles = allelesets[0]
        if len(alleles) == 1:
            for i in sample_indices:
                data_characters[i][alleles[0]] = (
                    data_characters[i].get(alleles[0], 0) + 1)
        else:
            alleles = mvf.decode(alleles)
            for i in sample_indices:
                data_characters[i][alleles[i]] = (
                    data_characters[i].get(alleles[i], 0) + 1)
        data_in_buffer = True
    if data_in_buffer:
        for i in sample_indices:
            data[i][(current_contig, current_position)] = {
                'contig': current_contig,
                'position': current_position
            }
            data[i][(current_contig,
                     current_position)].update(data_characters[i])
    # WRITE OUTPUT
    all_chars = set([])
    for sampleid in data:
        for window in data[sampleid]:
            all_chars.update([
                x for x in data[sampleid][window]
                if x not in ('contig', 'position')
            ])
    headers = ['contig', 'position']
    headers.extend(list(sorted(all_chars)))
    outfile = OutputFile(path=args.out, headers=headers)

    for sampleid in sample_indices:
        outfile.write("#{}\n".format(sample_labels[sampleid]))
        sorted_entries = [(data[sampleid][k]['contig'],
                           data[sampleid][k]['position'], k)
                          for k in data[sampleid]]
        for _, _, k in sorted_entries:
            outfile.write_entry(data[sampleid][k], defaultvalue='0')
    return ''
Example #3
0
def calc_pairwise_distances(args):
    """Count the pairwise nucleotide distance between
       combinations of samples in a window
    """
    args.qprint("Running CalcPairwiseDistances")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF: Read")
    data = {}
    data_order = []
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    args.qprint("Calculating for sample columns: {}".format(
        list(sample_indices)))
    current_contig = None
    current_position = 0
    data_in_buffer = False
    sample_pairs = [tuple(x) for x in combinations(sample_indices, 2)]
    base_matches = dict((x, {}) for x in sample_pairs)
    all_match = {}
    if mvf.flavor == 'dna':
        allele_frames = (0, )
        args.data_type = 'dna'
    elif mvf.flavor == 'prot':
        allele_frames = (0, )
        args.data_type = 'dna'
    elif mvf.flavor == 'codon':
        if args.data_type == 'prot':
            allele_frames = (0, )
        else:
            allele_frames = (1, 2, 3)
            args.data_type = 'dna'
    args.qprint("MVF flavor is: {}".format(mvf.flavor))
    args.qprint("Data type is: {}".format(args.data_type))
    args.qprint("Ambiguous mode: {}".format(args.ambig))
    args.qprint("Processing MVF Records")
    pwdistance_function = get_pairwise_function(args.data_type, args.ambig)
    if args.emit_counts:
        outfile_emitcounts = open(args.out + ".pairwisecounts", 'w')
    for contig, pos, allelesets in mvf.iterentries(decode=None):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig, current_position)] = {
                'contig': current_contig,
                'position': current_position
            }
            data_order.append((current_contig, current_position))
            all_diff, all_total = pwdistance_function(all_match)
            for samplepair in base_matches:
                ndiff, ntotal = pwdistance_function(base_matches[samplepair])
                taxa = "{};{}".format(sample_labels[samplepair[0]],
                                      sample_labels[samplepair[1]])
                data[(current_contig, current_position)].update({
                    '{};ndiff'.format(taxa):
                    ndiff + all_diff,
                    '{};ntotal'.format(taxa):
                    ntotal + all_total,
                    '{};dist'.format(taxa):
                    zerodiv(ndiff + all_diff, ntotal + all_total)
                })
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
                if args.windowsize > 0:
                    while pos > current_position + args.windowsize - 1:
                        current_position += args.windowsize
            else:
                current_position += args.windowsize
            if args.emit_counts:
                args.qprint("Writing Full Count Table")
                for p0, p1 in base_matches:
                    outfile_emitcounts.write("#{}\t{}\t{}\t{}\n{}\n".format(
                        p0, p1, current_position, current_contig, "\n".join([
                            "{} {}".format(x,
                                           (base_matches[(p0, p1)].get(x, 0) +
                                            all_match.get(x, 0)))
                            for x in set(base_matches[(p0,
                                                       p1)]).union(all_match)
                        ])))
            base_matches = dict((x, {}) for x in sample_pairs)
            all_match = {}
            data_in_buffer = False
        for iframe in allele_frames:
            alleles = allelesets[iframe]
            if len(alleles) == 1:
                all_match["{0}{0}".format(alleles)] = (
                    all_match.get("{0}{0}".format(alleles), 0) + 1)
                data_in_buffer = True
                continue
            if alleles[1] == '+':
                if alleles[2] in 'X-':
                    continue
                samplepair = (0, int(alleles[3:]))
                if any(x not in sample_indices for x in samplepair):
                    continue
                basepair = "{0}{1}".format(alleles[0], alleles[2])
                base_matches[samplepair][basepair] = (
                    base_matches[samplepair].get(basepair, 0) + 1)
                data_in_buffer = True
                continue
            alleles = mvf.decode(alleles)
            valid_positions = [
                i for i, x in enumerate(alleles)
                if x not in 'X-' and i in sample_indices
            ]
            assert len(alleles) == 4
            assert alleles[0] not in 'X-', alleles
            assert alleles[1] not in 'X-', alleles
            for i, j in combinations(valid_positions, 2):
                samplepair = (i, j)
                basepair = "{0}{1}".format(alleles[i], alleles[j])
                base_matches[samplepair][basepair] = (
                    base_matches[samplepair].get(basepair, 0) + 1)
            data_in_buffer = True
        # print(base_matches)
    if data_in_buffer is True:
        print(sum(base_matches[samplepair].values()), base_matches[samplepair],
              samplepair)
        print(sum(all_match.values()), all_match)
        print(sum(base_matches[samplepair].values()) + sum(all_match.values()))
        # Check whether, windows, contigs, or total
        if args.windowsize == 0:
            current_contig = 'TOTAL'
            current_position = 0
        elif args.windowsize == -1:
            current_position = 0
        data[(current_contig, current_position)] = {
            'contig': current_contig,
            'position': current_position
        }
        data_order.append((current_contig, current_position))
        # print("All match")
        all_diff, all_total = pwdistance_function(all_match)
        print(all_diff, all_total)
        for samplepair in base_matches:
            ndiff, ntotal = pwdistance_function(base_matches[samplepair])
            taxa = "{};{}".format(sample_labels[samplepair[0]],
                                  sample_labels[samplepair[1]])
            data[(current_contig, current_position)].update({
                '{};ndiff'.format(taxa):
                ndiff + all_diff,
                '{};ntotal'.format(taxa):
                ntotal + all_total,
                '{};dist'.format(taxa):
                zerodiv(ndiff + all_diff, ntotal + all_total)
            })
        if args.emit_counts:
            args.qprint("Writing Full Count Table")
            for p0, p1 in base_matches:
                outfile_emitcounts.write("#{}\t{}\t{}\t{}\n{}\n".format(
                    p0, p1, current_position, current_contig, "\n".join([
                        "{} {}".format(x, (base_matches[(p0, p1)].get(x, 0) +
                                           all_match.get(x, 0)))
                        for x in set(base_matches[(p0, p1)]).union(all_match)
                    ])))
    args.qprint("Writing Output")
    headers = ['contig', 'position']
    for samplepair in sample_pairs:
        headers.extend([
            '{};{};{}'.format(sample_labels[samplepair[0]],
                              sample_labels[samplepair[1]], x)
            for x in ('ndiff', 'ntotal', 'dist')
        ])
    outfile = OutputFile(path=args.out, headers=headers)
    for okey in data_order:
        outfile.write_entry(data[okey])
    if args.emit_counts:
        outfile_emitcounts.close()
    return ''
Example #4
0
def calc_dstat_combinations(args):
    """Calculate genome-wide D-statstics for
       all possible trio combinations of samples
       and outgroups specified.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    sample_labels = mvf.get_sample_ids()
    if args.outgroup_indices is not None:
        outgroup_indices = [
            int(x) for x in args.outgroup_indices[0].split(",")
        ]
    elif args.outgroup_labels is not None:
        outgroup_indices = mvf.get_sample_indices(
            ids=args.outgroup_labels[0].split(","))
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = None
    if any(x in outgroup_indices for x in sample_indices):
        raise RuntimeError("Sample and Outgroup column lists cannot overlap.")
    for contig, _, allelesets in mvf:
        if contig not in contig_ids:
            continue
        alleles = mvf.decode(allelesets[0])
        for i, j, k in combinations(sample_indices, 3):
            for outgroup in outgroup_indices:
                subset = [alleles[x] for x in [i, j, k, outgroup]]
                if any(x not in 'ATGC' for x in subset):
                    continue
                if subset[-1] not in subset[:3]:
                    continue
                if len(set(subset)) != 2:
                    continue
                # [ABBA, BABA, BBAA]
                val = (0 + 1 * (subset[0] == subset[3]) + 2 *
                       (subset[1] == subset[3]) + 4 * (subset[2] == subset[3]))
                if val in (1, 2):
                    val -= 1
                elif val == 4:
                    val = 2
                else:
                    continue
                tetrad = (i, j, k, outgroup)
                if tetrad not in data:
                    data[tetrad] = {}
                if contig not in data[tetrad]:
                    data[tetrad][contig] = [0, 0, 0]
                data[tetrad][contig][val] += 1
    # WRITE OUTPUT
    headers = ['sample0', 'sample1', 'sample2', "outgroup"]
    for xcontig in contig_ids:
        headers.extend([
            '{}:abba'.format(xcontig), '{}:baba'.format(xcontig),
            '{}:bbaa'.format(xcontig), '{}:D'.format(xcontig)
        ])
    outfile = OutputFile(path=args.out, headers=headers)
    for i, j, k in combinations(sample_indices, 3):
        for outgroup in outgroup_indices:
            tetrad = tuple([i, j, k, outgroup])
            if tetrad not in data:
                continue
            entry = dict(('sample{}'.format(i), sample_labels[x])
                         for i, x in enumerate(tetrad[:3]))
            entry['outgroup'] = sample_labels[outgroup]
            for contig in contig_ids:
                if contig not in data[tetrad]:
                    entry.update(dict().fromkeys([
                        '{}:abba'.format(contig), '{}:baba'.format(contig),
                        '{}:bbaa'.format(contig), '{}:D'.format(contig)
                    ], '0'))
                else:
                    [abba, baba, bbaa] = data[tetrad][contig]
                    if abba > baba and abba > bbaa:

                        dstat = zerodiv(baba - bbaa, baba + bbaa)
                    elif baba > bbaa and baba > abba:
                        dstat = zerodiv(abba - bbaa, abba + bbaa)
                    else:
                        dstat = zerodiv(abba - baba, abba + baba)
                    entry.update([('{}:abba'.format(contig), abba),
                                  ('{}:baba'.format(contig), baba),
                                  ('{}:bbaa'.format(contig), bbaa),
                                  ('{}:D'.format(contig), dstat)])
            outfile.write_entry(entry)
    return ''
Example #5
0
def calc_character_count(args):
    """Count the number of and relative rate of certain bases
       spatially along chromosomes
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    current_contig = None
    current_position = 0
    all_match = 0
    all_total = 0
    data_in_buffer = False
    # Set up base matching from special words
    data_order = []

    def proc_special_word(argx):
        if argx == 'dna':
            argx = MLIB.validchars['dna']
        elif argx == 'dnaambig2':
            argx = MLIB.validchars['dna+ambig2']
        elif argx == 'dnaambig3':
            argx = MLIB.validchars['dna+ambig3']
        elif argx == 'dnaambigall':
            argx = MLIB.validchars['dna+ambigall']
        elif argx == 'prot':
            argx = MLIB.validchars['amino']
        return argx

    args.base_match = proc_special_word(args.base_match)
    args.base_total = proc_special_word(args.base_total)
    # Set up sample indices
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    # Set up contig ids
    if args.contig_ids is not None:
        contig_indices = mvf.get_contig_indices(
            ids=args.contig_ids[0].split(","))
    elif args.contig_labels is not None:
        contig_indices = mvf.get_contig_indices(
            labels=args.contig_labels[0].split(","))
    else:
        contig_indices = None
    match_counts = dict().fromkeys([sample_labels[i] for i in sample_indices],
                                   0)
    total_counts = dict().fromkeys([sample_labels[i] for i in sample_indices],
                                   0)
    for contig, pos, allelesets in mvf.iterentries(
            decode=False, contig_indices=contig_indices):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # if contig not in contig_ids:
        #   continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig, current_position)] = {
                'contig': current_contig,
                'position': current_position
            }
            data_order.append((current_contig, current_position))
            for k in match_counts:

                data[(current_contig, current_position)].update([
                    (k + '.match', match_counts[k] + all_match),
                    (k + '.total', total_counts[k] + all_total),
                    (k + '.prop', ((float(match_counts[k] + all_match) /
                                    float(total_counts[k] + all_total))
                                   if total_counts[k] + all_total > 0 else 0))
                ])
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1 else
                                     args.windowsize)
            match_counts = dict().fromkeys(
                [sample_labels[i] for i in sample_indices], 0)
            total_counts = dict().fromkeys(
                [sample_labels[i] for i in sample_indices], 0)
            all_total = 0
            all_match = 0
            data_in_buffer = False
        else:
            alleles = allelesets[0]
            if len(alleles) == 1:
                if args.base_match is None:
                    all_match += 1
                elif alleles in args.base_match:
                    all_match += 1
                if args.base_total is None:
                    all_total += 1
                elif alleles in args.base_total:
                    all_total += 1
            else:
                alleles = mvf.decode(alleles)
                for i in sample_indices:
                    if args.base_match is None:
                        match_counts[sample_labels[i]] += 1
                    elif alleles[i] in args.base_match:
                        match_counts[sample_labels[i]] += 1
                    if args.base_total is None:
                        total_counts[sample_labels[i]] += 1
                    elif alleles[i] in args.base_total:
                        total_counts[sample_labels[i]] += 1
            data_in_buffer = True
    if data_in_buffer:
        data[(current_contig, current_position)] = {
            'contig': current_contig,
            'position': current_position
        }
        data_order.append((current_contig, current_position))
        for k in match_counts:
            data[(current_contig, current_position)].update([
                (k + '.match', match_counts[k] + all_match),
                (k + '.total', total_counts[k] + all_total),
                (k + '.prop', ((float(match_counts[k] + all_match) /
                                float(total_counts[k] + all_total))
                               if total_counts[k] + all_total > 0 else 0))
            ])
    # WRITE OUTPUT
    headers = ['contig', 'position']
    for label in sample_labels:
        headers.extend([label + x for x in ('.match', '.total', '.prop')])
    outfile = OutputFile(path=args.out, headers=headers)
    for okey in data_order:
        outfile.write_entry(data[okey])
    return ''
Example #6
0
def calc_pattern_count(args):
    """Count biallelic patterns spatially along
       chromosomes (e.g,, for use in DFOIL or Dstats
       http://www.github.com/jbpease/dfoil).
       The last sample specified will determine the 'A'
       versus 'B' allele.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    current_contig = None
    current_position = 0
    sitepatterns = {}

    # sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    nsamples = len(sample_indices)
    for contig, pos, allelesets in mvf:
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig,
                  current_position)] = dict([('contig', current_contig),
                                             ('position', current_position)])
            data[(current_contig, current_position)].update(sitepatterns)
            sitepatterns = {}
            if contig != current_contig:
                current_position = 0
                current_contig = contig[:]
            else:
                current_position += (0 if args.windowsize == -1 else
                                     args.windowsize)
        if len(allelesets[0]) == 1:
            if allelesets[0] in 'ATGC':
                pattern = 'A' * nsamples
            else:
                continue
        elif allelesets[0][1] == '+':
            continue
        else:
            alleles = mvf.decode(allelesets[0])
            alleles = [alleles[x] for x in sample_indices]
            if any(x in alleles for x in 'X-RYKMWS'):
                continue
            if len(set(alleles)) > 2:
                continue
            pattern = ''.join(
                ['A' if x == alleles[-1] else 'B' for x in alleles[:-1]]) + 'A'
        sitepatterns[pattern] = sitepatterns.get(pattern, 0) + 1
    if sitepatterns:
        data[(current_contig,
              current_position)] = dict([('contig', current_contig),
                                         ('position', current_position)])
        data[(current_contig, current_position)].update(sitepatterns)
    # WRITE OUTPUT
    headers = ['contig', 'position']
    headers.extend(
        [MLIB.abpattern(x, nsamples) for x in range(0, 2**nsamples, 2)])
    outfile = OutputFile(path=args.out, headers=headers)
    outfile.write("#{}\n".format(",".join(
        mvf.get_sample_labels(sample_indices))))
    sorted_entries = sorted([(data[k]['contig'], data[k]['position'], k)
                             for k in data])
    for _, _, k in sorted_entries:
        outfile.write_entry(data[k])
    # WRITE LIST OUTPUT
    if args.output_lists is True:
        sorted_entries = sorted([(data[k]['contig'], data[k]['position'], k)
                                 for k in data])
        total_counts = {}
        for contig, pos, k in sorted_entries:
            outfilepath = "{}-{}-{}.counts.list".format(args.out, contig, pos)
            with open(outfilepath, 'w') as outfile:
                outfile.write("pattern,count\n")
                for pattern, pcount in sorted(data[k].items()):
                    if pattern in ['contig', 'position']:
                        continue
                    outfile.write("{},{}\n".format(pattern, pcount))
                    total_counts[pattern] = (total_counts.get(pattern, 0) +
                                             pcount)
        outfilepath = "{}-TOTAL.counts.list".format(args.out)
        with open(outfilepath, 'w') as outfile:
            outfile.write("pattern,count\n")
            for pattern, pcount in sorted(total_counts.items()):
                if pattern in ['contig', 'position']:
                    continue
                outfile.write("{},{}\n".format(pattern, pcount))
    return ''
Example #7
0
def mvf_join(args):
    """Main method"""
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the first file's metadata
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        else:
            args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    # Open each MVF file, read headers to make unified header
    transformers = []
    for mvfname in args.mvf:
        # This will create a dictionary of samples{old:new}, contigs{old:new}
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read')
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label
                    }
            if concatmvf.metadata['labels'].index(label) != i:
                transformer.set_label(
                    i, concatmvf.metadata['labels'].index(label))
        for contigid, contigdata in iter(mvf.metadata['contigs'].items()):
            if contigdata['label'] not in [
                    concatmvf.metadata['contigs'][x]['label']
                    for x in concatmvf.metadata['contigs']
            ]:
                newid = (contigid not in concatmvf.metadata['contigs']
                         and contigid or concatmvf.get_next_contig_id())
                concatmvf.metadata['contigs'][newid] = contigdata
            else:
                for concatid, concatdata in (
                        concatmvf.metadata['contigs'].items()):
                    if contigdata['label'] == concatdata['label']:
                        newid = concatid
                        break
            if newid != contigid:
                transformer.set_contig(contigid, newid)
        transformers.append(transformer)
    # Write output header
    concatmvf.write_data(concatmvf.get_header())
    # Now loop through each file
    entries = []
    nentries = 0
    for ifile, mvfname in enumerate(args.mvf):
        if not args.quiet:
            sys.stderr.write("Processing {} ...\n".format(mvfname))
        transformer = transformers[ifile]
        mvf = MultiVariantFile(mvfname, 'read')
        for contigid, pos, allelesets in mvf.iterentries(decode=False,
                                                         quiet=args.quiet):
            if transformer.labels:
                allelesets = [mvf.decode(x) for x in allelesets]
                for j, alleles in enumerate(allelesets):
                    allelesets[j] = concatmvf.encode(''.join([
                        x in transformer.labels
                        and alleles[transformer.labels[x]] or alleles[x]
                        for x in range(len(alleles))
                    ]))
            if transformer.contigs:
                contigid = (contigid in transformer['contigs']
                            and transformer['contigs'][contigid] or contigid)
            entries.append((contigid, pos, allelesets))
            nentries += 1
            if nentries == args.line_buffer:
                concatmvf.write_entries(entries)
                entries = []
                nentries = 0
        if entries:
            concatmvf.write_entries(entries)
            entries = []
            nentries = 0
        if not args.quiet:
            sys.stderr.write("done\n")
    return ''
Example #8
0
def translate_mvf(args):
    """Main method"""
    args.qprint("Running TranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff_genes, gene_order = parse_gff_exome(args)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.contig_data = dict(
         (
                i, dict((y, z)
                                       for (y, z) in gff_genes[x].items()
                                       if y not in ('cds', )))
                              for (i, x) in enumerate(gene_order))
    outmvf.contig_indices = list(range(len(gene_order)))
    outmvf.contig_ids = [gff_genes[x]['id']
                         for x in gene_order]
    outmvf.contig_labels = [gff_genes[x]['label']
                            for x in gene_order]
    outmvf.flavor = args.output_data
    outmvf.metadata.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        running_gene_index = -1
        for igene, gene in enumerate(gene_order):
            xcontiglabel = gff_genes[gene]['contig']
            xcontig = mvf.get_contig_indices(
                labels=gff_genes[gene]['contig'])
            if xcontig is None:
                print("Warning: contig {} not found".format(
                    gff_genes[gene]['contig']))
            xcontigid = mvf.get_contig_ids(indices=xcontig)[0]
            min_gene_coord = gff_genes[gene]['cds'][0][0]
            max_gene_coord = gff_genes[gene]['cds'][-1][1]
            mvf_entries = {}
            if not igene % 100:
                args.qprint("Processing gene {} on {}".format(
                    gene, xcontiglabel))
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                if pos < min_gene_coord:
                    continue
                if pos > max_gene_coord:
                    break
                mvf_entries[pos] = allelesets[0]
            reverse_strand = gff_genes[gene]['strand'] == '-'
            coords = []
            running_gene_index += 1
            for elem in gff_genes[gene]['cds']:
                coords.extend(list(range(elem[0], elem[1] + 1)))
            if reverse_strand:
                coords = coords[::-1]
            for codoncoord in range(0, len(coords), 3):
                alleles = tuple(mvf_entries.get(x, '-')
                                for x in coords[codoncoord:codoncoord + 3])
                if len(alleles) < 3:
                    alleles = tuple(list(alleles) + ['-'] * (3 - len(alleles)))
                if all(len(x) == 1 for x in alleles):
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand is True:
                        decoded_alleles = tuple(tuple(MLIB.complement_bases[y]
                                                      for y in mvf.decode(x))
                                                for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x) for x in alleles)
                    amino_acids = tuple(translate_single_codon(''.join(x))
                                        for x in zip(*decoded_alleles))
                    amino_acids = outmvf.encode(''.join(amino_acids))
                if args.output_data == 'protein':
                    entrybuffer.append((
                        (
                            xcontigid
                            if args.retain_contigs
                            else running_gene_index
                        ),
                        (
                            coords[codoncoord]
                            if args.retain_coords
                            else codoncoord
                        ),
                        (
                            amino_acids,
                        )
                    ))
                elif args.output_data == 'codon':
                    entrybuffer.append((
                        (
                            xcontigid
                            if args.retain_contigs
                            else running_gene_index
                        ),
                        (
                            coords[codoncoord]
                            if args.retain_coords
                            else codoncoord
                        ),
                        (
                            amino_acids,
                            alleles[0],
                            alleles[1],
                            alleles[2]
                        )
                    ))
                elif args.output_data == 'dna':
                    for j, elem in enumerate(
                            range(codoncoord,
                                  min(codoncoord + 3, len(coords)))):
                        entrybuffer.append((
                            (
                                xcontigid
                                if args.retain_contigs
                                else running_gene_index
                            ),
                            (
                                coords[elem]
                                if args.retain_coords
                                else elem + 1
                            ),
                            (
                                alleles[j],
                            )
                        ))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
        if entrybuffer:
            outmvf.write_entries(entrybuffer)
            entrybuffer = []
            nentry = 0
    return ''
Example #9
0
def legacy_translate_mvf(args):
    """Main method"""
    args.qprint("Running LegacyTranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff = parse_gff_legacy_translate(
            args.gff, args,
            parent_gene_pattern=args.parent_gene_pattern)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        args.qprint("Indexing GFF gene names.")
        # mvfid_to_gffname = outmvf.get_contig_reverse_dict()
        for xcontig in outmvf.get_contig_indices():
            mvf_entries = {}
            xcontiglabel = outmvf.get_contig_labels(indices=xcontig)[0]
            xcontigid = outmvf.get_contig_ids(indices=xcontig)[0]
            if xcontiglabel not in gff:
                if args.verbose:
                    print(
                        ("No entries in GFF, "
                         "skipping contig: index:{} id:{} label:{}").format(
                             xcontig, xcontigid, xcontiglabel))
                continue
            if not xcontig % 100:
                args.qprint("Processing contig: {} {}".format(
                    xcontigid, xcontiglabel))
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                mvf_entries[pos] = allelesets[0]
            for coords in sorted(gff[xcontiglabel]):
                reverse_strand = coords[3] == '-'
                alleles = (tuple(mvf_entries.get(x, '-')
                                 for x in coords[2::-1])
                           if reverse_strand is True
                           else tuple(mvf_entries.get(x, '-')
                                      for x in coords[0:3]))
                if all(len(x) == 1 for x in alleles):
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand is True:
                        decoded_alleles = tuple(tuple(MLIB.complement_bases[y]
                                                      for y in mvf.decode(x))
                                                for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x) for x in alleles)
                    amino_acids = tuple(translate_single_codon(''.join(x))
                                        for x in zip(*decoded_alleles))
                    # print("aminx", amino_acids)
                    amino_acids = outmvf.encode(''.join(amino_acids))
                # if all(x in '-X' for x in amino_acids):
                #    continue
                # print("amino", amino_acids)
                # print("translated", amino_acids, alleles)
                if args.output_data == 'protein':
                    entrybuffer.append((xcontig, coords[0], (amino_acids,)))
                else:
                    entrybuffer.append((
                        xcontigid, coords[0], (
                            amino_acids, alleles[0], alleles[1], alleles[2])))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
Example #10
0
def translate_mvf(args):
    """Main method"""
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        gff = parse_gff_translate(args.gff, args)
        if not args.quiet:
            print("gff_processed")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    entrybuffer = []
    nentry = 0
    if not args.gff:
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                for _, amino_acids, alleles in iter_codons(inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids, )))
                    else:
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids, alleles[0],
                                                   alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(inputbuffer, mvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append((current_contig, pos, (amino_acids, )))
                else:
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids, alleles[0],
                                               alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        mvf_entries = {}
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if contigid not in mvf_entries:
                mvf_entries[contigid] = {}
            mvf_entries[contigid][pos] = allelesets[0]
        for contigname in sorted(gff):
            contigid = mvf.get_contig_ids(labels=contigname)[0]
            for coords in sorted(gff[contigname]):
                reverse_strand = False
                if coords[3] == '-':
                    reverse_strand = True
                    alleles = [
                        mvf_entries[contigid].get(x, '-')
                        for x in coords[2::-1]
                    ]
                else:
                    alleles = [
                        mvf_entries[contigid].get(x, '-') for x in coords[0:3]
                    ]
                if all(len(x) == 1 for x in alleles):
                    if reverse_strand:
                        alleles = [MLIB.complement_bases[x] for x in alleles]
                    decoded_alleles = alleles
                    amino_acids = translate(''.join(alleles))[0]
                else:
                    if reverse_strand:
                        decoded_alleles = [[
                            MLIB.complement_bases[y] for y in mvf.decode(x)
                        ] for x in alleles]
                        alleles = [
                            mvf.encode(''.join(x)) for x in decoded_alleles
                        ]
                    else:
                        decoded_alleles = [mvf.decode(x) for x in alleles]
                    amino_acids = [
                        translate(''.join(x)) for x in zip(*decoded_alleles)
                    ]
                    amino_acids = mvf.encode(''.join(
                        [x[0] for x in amino_acids]))
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append((contigid, coords[0], (amino_acids, )))
                else:
                    entrybuffer.append(
                        (contigid, coords[0], (amino_acids, alleles[0],
                                               alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
Example #11
0
def calc_pairwise_distances(args):
    """Count the pairwise nucleotide distance between
       combinations of samples in a window
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in
                          args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    current_contig = None
    current_position = 0
    data_in_buffer = False
    sample_pairs = [tuple(x) for x in combinations(sample_indices, 2)]
    base_matches = dict([(x, {}) for x in sample_pairs])
    all_match = {}
    for contig, pos, allelesets in mvf:
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            while pos > current_position + args.windowsize - 1:
                current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig, current_position)] = {
                'contig': current_contig, 'position': current_position}
            if mvf.flavor == 'dna':
                all_diff, all_total = pairwise_distance_nuc(all_match)
            elif mvf.flavor == 'prot':
                all_diff, all_total = pairwise_distance_prot(all_match)
            for samplepair in base_matches:
                if mvf.flavor == 'dna':
                    ndiff, ntotal = pairwise_distance_nuc(
                        base_matches[samplepair])
                elif mvf.flavor == 'prot':
                    ndiff, ntotal = pairwise_distance_prot(
                        base_matches[samplepair])
                taxa = "{};{}".format(sample_labels[samplepair[0]],
                                      sample_labels[samplepair[1]])
                data[(current_contig, current_position)].update({
                    '{};ndiff'.format(taxa): ndiff + all_diff,
                    '{};ntotal'.format(taxa): ntotal + all_total,
                    '{};dist'.format(taxa): zerodiv(ndiff + all_diff,
                                                    ntotal + all_total)})
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
            else:
                current_position += args.windowsize
            base_matches = dict([(x, {}) for x in sample_pairs])
            all_match = {}
            data_in_buffer = False
        alleles = allelesets[0]
        if len(alleles) == 1:
            all_match["{}{}".format(alleles, alleles)] = (
                all_match.get("{}{}".format(alleles, alleles),
                              0) + 1)
            data_in_buffer = True
            continue
        if alleles[1] == '+':
            if 'X' in alleles or '-' in alleles:
                continue
            samplepair = (0, int(alleles[3:]))
            if any(x not in sample_indices for x in samplepair):
                continue
            basepair = "{}{}".format(alleles[0], alleles[2])
            base_matches[samplepair][basepair] = (
                base_matches[samplepair].get(basepair, 0) + 1)
            data_in_buffer = True
            continue
        alleles = mvf.decode(alleles)
        valid_positions = [i for i, x in enumerate(alleles)
                           if x not in 'X-']
        for i, j in combinations(valid_positions, 2):
            samplepair = (i, j)
            if any(x not in sample_indices for x in samplepair):
                continue
            basepair = "{}{}".format(alleles[i], alleles[j])
            base_matches[samplepair][basepair] = (
                base_matches[samplepair].get(basepair, 0) + 1)
        data_in_buffer = True
    if data_in_buffer is True:
        # Check whether, windows, contigs, or total
        if args.windowsize == 0:
            current_contig = 'TOTAL'
            current_position = 0
        elif args.windowsize == -1:
            current_position = 0
        data[(current_contig, current_position)] = {
            'contig': current_contig, 'position': current_position}
        if mvf.flavor == 'dna':
            all_diff, all_total = pairwise_distance_nuc(all_match)
        elif mvf.flavor == 'prot':
            all_diff, all_total = pairwise_distance_prot(all_match)
        for samplepair in base_matches:
            if mvf.flavor == 'dna':
                ndiff, ntotal = pairwise_distance_nuc(base_matches[samplepair])
            elif mvf.flavor == 'prot':
                ndiff, ntotal = pairwise_distance_prot(
                    base_matches[samplepair])
            taxa = "{};{}".format(sample_labels[samplepair[0]],
                                  sample_labels[samplepair[1]])
            data[(current_contig, current_position)].update({
                '{};ndiff'.format(taxa): ndiff + all_diff,
                '{};ntotal'.format(taxa): ntotal + all_total,
                '{};dist'.format(taxa): zerodiv(ndiff + all_diff,
                                                ntotal + all_total)})
    headers = ['contig', 'position']
    for samplepair in sample_pairs:
        headers.extend(['{};{};{}'.format(
            sample_labels[samplepair[0]],
            sample_labels[samplepair[1]],
            x) for x in ('ndiff', 'ntotal', 'dist')])
    outfile = OutputFile(path=args.out, headers=headers)
    sorted_entries = sorted([(
        data[k]['contig'], data[k]['position'], k)
                             for k in data])
    for _, _, k in sorted_entries:
        outfile.write_entry(data[k])
    return ''
Example #12
0
def calc_character_count(args):
    """Count the number of and relative rate of certain bases
       spatially along chromosomes
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    current_contig = None
    current_position = 0
    all_match = 0
    all_total = 0
    data_in_buffer = 0
    # Set up sample indices
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in
                          args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    match_counts = dict().fromkeys(
        [sample_labels[i] for i in sample_indices], 0)
    total_counts = dict().fromkeys(
        [sample_labels[i] for i in sample_indices], 0)
    for contig, pos, allelesets in mvf:
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage,
                             allelesets[0]) is False:
            continue
        if contig not in contig_ids:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            while pos > current_position + args.windowsize - 1:
                current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig, current_position)] = {
                'contig': current_contig, 'position': current_position}
            for k in match_counts:
                data[(current_contig, current_position)].update([
                    (k + '.match', match_counts[k] + all_match),
                    (k + '.total', total_counts[k] + all_total),
                    (k + '.prop', (
                        (float(match_counts[k] + all_match) /
                         float(total_counts[k] + all_total)) if
                        total_counts[k] + all_total > 0 else 0))])
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1
                                     else args.windowsize)
            match_counts = dict().fromkeys(
                [sample_labels[i] for i in sample_indices], 0)
            total_counts = dict().fromkeys(
                [sample_labels[i] for i in sample_indices], 0)
            all_total = 0
            all_match = 0
            data_in_buffer = 0
        else:
            alleles = allelesets[0]
            if len(alleles) == 1:
                if args.base_match is None:
                    all_match += 1
                elif alleles in args.base_match:
                    all_match += 1
                if args.base_total is None:
                    all_total += 1
                elif alleles in args.base_total:
                    all_total += 1
            else:
                alleles = mvf.decode(alleles)
                for i in sample_indices:
                    if args.base_match is None:
                        match_counts[sample_labels[i]] += 1
                    elif alleles[i] in args.base_match:
                        match_counts[sample_labels[i]] += 1
                    if args.base_total is None:
                        total_counts[sample_labels[i]] += 1
                    elif alleles[i] in args.base_total:
                        total_counts[sample_labels[i]] += 1
            data_in_buffer = 1
    if data_in_buffer:
        data[(current_contig, current_position)] = {
            'contig': current_contig, 'position': current_position}
        for k in match_counts:
            data[(current_contig, current_position)].update([
                (k + '.match', match_counts[k] + all_match),
                (k + '.total', total_counts[k] + all_total),
                (k + '.prop', ((float(match_counts[k] + all_match) /
                                float(total_counts[k] + all_total)) if
                               total_counts[k] + all_total > 0 else 0))])
    # WRITE OUTPUT
    headers = ['contig', 'position']
    for label in sample_labels:
        headers.extend([label + x for x in ('.match', '.total', '.prop')])
    outfile = OutputFile(path=args.out,
                         headers=headers)
    sorted_entries = sorted([(data[k]['contig'],
                              data[k]['position'], k)
                             for k in data])
    for _, _, k in sorted_entries:
        outfile.write_entry(data[k])
    return ''