Ejemplo n.º 1
0
def mvf2phy(args):
    """Main method"""
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--outdput-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    max_region_coord = dict((x, None) for x in mvf.get_contig_ids())
    if args.regions is not None:
        _, max_region_coord, _ = parse_regions_arg(args.regions,
                                                   mvf.get_contig_ids())
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    skipcontig = ''
    tmp_files = dict((fn,
                      open("{}-{}.tmp".format(fn, randint(1000000, 9999999)),
                           'w+', args.buffer)) for fn in sample_labels)
    labelwritten = dict.fromkeys(sample_labels, False)
    current_contig_id = None
    current_contig_start = 1
    current_contig_end = 1
    if args.partition is True:
        partprefix = "PROT" if args.output_data == "prot" else "DNA"
        partitionfile = open("{}.part".format(args.out), 'w')
    for contig, _, allelesets in mvf.iterentries(
            contig_ids=(mvf.get_contig_ids()
                        if args.regions is None else max_region_coord[:]),
            decode=True):
        if contig == skipcontig:
            continue
        if contig not in max_region_coord:
            skipcontig = contig[:]
            continue
        if current_contig_id is None:
            current_contig_id = contig[:]
        elif contig != current_contig_id:
            if args.partition is True:
                if current_contig_end > current_contig_start:
                    partitionfile.write("{}, {} = {}-{}\n".format(
                        partprefix,
                        mvf.get_contig_labels(ids=current_contig_id),
                        current_contig_start, current_contig_end - 1))
            current_contig_id = contig[:]
            # reset start as one position after end of last
            current_contig_start = current_contig_end
            current_contig_end = current_contig_end + 1
        for col, label in zip(sample_indices, sample_labels):
            if not labelwritten[label]:
                if args.label_type == 'long':
                    tmp_files[label].write("{}{}".format(
                        label[:100], " " * (100 - len(label[:100]))))
                elif args.label_type == 'short':
                    tmp_files[label].write("{}{}".format(
                        label[:20], " " * (20 - len(label[:20]))))
                labelwritten[label] = True
            if mvf.flavor == 'dna':
                tmp_files[label].write(allelesets[0][col] == 'X' and 'N'
                                       or allelesets[0][col])
                if label == sample_labels[0]:
                    current_contig_end += 1
            elif ((mvf.flavor == 'codon' and args.output_data == 'prot')
                  or (mvf.flavor == 'prot')):
                tmp_files[label].write(allelesets[0][col])
                if label == sample_labels[0]:
                    current_contig_end += 1
            elif mvf.flavor == 'codon':
                codon = [
                    "N" if allelesets[x][col] == 'X' else allelesets[x][col]
                    for x in (1, 2, 3)
                ]
                tmp_files[label].write(''.join(codon))
                if label == sample_labels[0]:
                    current_contig_end += 3
    first_file = True
    totalseqlen = 0
    with open(args.out, 'w') as outfile:
        for filehandler in tmp_files.values():
            # read first file to establish sequence length for phylip header
            if first_file is True:
                filehandler.seek(0, 0)
                buff = filehandler.read(args.buffer)
                while buff != '':
                    if " " in buff:
                        totalseqlen += len(buff.strip().split(" ")[-1])
                    else:
                        totalseqlen += len(buff.strip())
                    buff = filehandler.read(args.buffer)
                outfile.write("{} {}\n".format(len(sample_labels),
                                               totalseqlen))
                first_file = False
            filehandler.seek(0, 0)
            buff = filehandler.read(args.buffer)
            while buff != '':
                if first_file is True:
                    outfile.write("{} {}\n".format(len(sample_labels),
                                                   len(buff.split()[1])))
                    first_file = False
                outfile.write(buff)
                buff = filehandler.read(args.buffer)
            outfile.write("\n")
            filehandler.close()
            os.remove(os.path.join(args.temp_dir, filehandler.name))
    if args.partition is True:
        if current_contig_end > current_contig_start:
            partitionfile.write("{}, {} = {}-{}\n".format(
                partprefix, mvf.get_contig_labels(ids=current_contig_id),
                current_contig_start, current_contig_end - 1))
        partitionfile.close()
    return ''
Ejemplo n.º 2
0
def vcf2mvf(args=None):
    """Main method for vcf2mvf"""
    sepchars = dict([("TAB", "\t"), ("SPACE", " "), ("DBLSPACE", "  "),
                     ("COMMA", ","), ("MIXED", None)])
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    args.qprint("Opening input VCF: {}".format(args.vcf))
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    args.qprint("Establishing output MVF: {}".format(args.out))
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    mvf.notes.append(args.command_string)
    mvf.metadata['mvfversion'] = args.versionx
    # PROCESS CONTIG INFO
    args.qprint("Processing VCF headers.")
    vcfcontigs = vcf.metadata['contigs'].copy()
    args.qprint("{} contigs found.".format(len(vcfcontigs)))
    contig_translate = {}
    if args.contig_ids:
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            try:
                cid = int(cid)
            except ValueError:
                pass
            assert cvcf in [vcfcontigs[x]['label'] for x in vcfcontigs]
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] == cvcf:
                    contig_translate[cvcf] = [cid, cmvf]
                    if cid in mvf.metadata['contigs']:
                        raise RuntimeError(
                            'Contig id {} is not unique'.format(cid))
                    mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                    if cmvf in mvf.get_contig_labels():
                        raise RuntimeError(
                            'Contig label {} is not unique'.format(cmvf))
                    mvf.metadata['contigs'][cid]['label'] = cmvf[:]
    mvf.reset_max_contig()
    mvf.max_contig_index -= 1
    args.qprint("Processing contigs.")
    static_contig_ids = list(mvf.get_contig_ids())
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel not in static_contig_ids:
            newindex = mvf.get_next_contig_index()
            if ((is_int(vlabel) or len(vlabel) < 3)
                    and vlabel not in static_contig_ids):
                newid = vlabel[:]
            else:
                newid = str(newindex)
            mvf.contig_indices.append(newindex)
            mvf.contig_ids.append(newid)
            mvf.contig_data[newindex] = vcfcontigs[vcid].copy()
            static_contig_ids.append(newid)
            contig_translate[vlabel] = [newindex, vlabel]
    mvf.reset_max_contig()
    new_contigs = [(x, mvf.contig_data[x]['label'])
                   for x in mvf.contig_indices]
    if args.skip_contig_label_check is False:
        args.qprint("Checking contigs for label/id overlap errors.")
        xids = [x[0] for x in new_contigs]
        xlabels = [x[1] for x in new_contigs]
        xintersect = set(xids).intersection(xlabels)
        if xintersect:
            for i, (newid, newlabel) in enumerate(new_contigs):
                if i % 100 == 0:
                    args.qprint("{} contigs processed".format(i))
                if newid in xlabels[:i] or newid in xlabels[i + 1:]:
                    # if newid in xlabels:
                    # if xlabels.index(newid) != i:
                    raise RuntimeError("Error contig id {} is the same as"
                                       " the label for another contig"
                                       " ({})".format(newid,
                                                      xlabels.index(newid)))
                if newlabel in xids[:i] or newlabel in xids[i + 1:]:
                    # if newlabel in xids:
                    # if xids.index(newlabel) != i:
                    raise RuntimeError("Error contig label {} is the same"
                                       "as the id for another contig"
                                       "({})".format(newlabel,
                                                     xids.index(newlabel)))
    # PROCESS SAMPLE INFO
    args.qprint("Processing samples.")
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        newsample = [
            x.split(':') if ':' in tuple(x) else tuple([x, x])
            for x in args.sample_replace
        ]
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            labelmatched = False
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    labelmatched = j
                    break
            if labelmatched is not False:
                del unmatched[labelmatched]
    mvf.sample_indices = list(range(len(samplelabels)))
    mvf.sample_ids = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.sample_ids)
    mvf.max_sample_index = len(mvf.sample_ids)
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    args.qprint("Processing VCF entries.")
    for vcfrecord in vcf.iterentries(args):
        mvfstring = ''.join(vcfrecord['genotypes'])
        if args.filter_nonref_empty is True:
            if all(x in 'Xx-?' for x in mvfstring[1:]):
                continue
        mvf_alleles = encode_mvfstring(mvfstring)
        if args.out_flavor in ('dnaqual', ):
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
        if mvf_alleles:
            mvfentries.append(
                (contig_translate.get(vcfrecord['contig'])[0],
                 vcfrecord['coord'],
                 ((mvf_alleles,
                   qual_alleles) if args.out_flavor in ('dnaqual', ) else
                  (mvf_alleles, ))))
            nentry += 1
            if nentry == args.line_buffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries)
        mvfentries = []
    return ''
Ejemplo n.º 3
0
def mvf2fastagene(args):
    """Main method"""
    args.qprint("Indexing MVF")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=True)
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    if args.output_data is None:
        raise RuntimeError("--output-data required")
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    args.qprint("Beginning Entries.")
    if not os.path.exists(args.output_dir):
        args.qprint("Output Directory Created: {}".format(args.output_dir))
        os.mkdir(args.output_dir)
    else:
        args.qprint("Output Directory Exists Already: {}".format(
            args.output_dir))
    write_buffer = {}
    for targetcontig in mvf.get_contig_indices():
        contiglabel = mvf.get_contig_labels(indices=targetcontig)[0]
        args.qprint("Reading Contig {}: {}".format(targetcontig, contiglabel))
        write_buffer = dict((x, []) for x in sample_labels)
        data_in_buffer = False
        for _, _, allelesets in mvf.itercontigentries(targetcontig,
                                                      decode=True):
            for col, label in zip(sample_indices, sample_labels):
                if mvf.flavor == 'dna':
                    write_buffer[label].append('N' if allelesets[0][col] ==
                                               'X' else allelesets[0][col])
                    data_in_buffer = True
                elif mvf.flavor in ('codon', 'prot') and (args.output_data
                                                          == 'prot'):
                    write_buffer[label].append(allelesets[0][col])
                    data_in_buffer = True
                elif mvf.flavor == 'codon' and args.output_data == 'dna':
                    if args.choose_allele == 'random1':
                        codon = [
                            'N' if allelesets[x][col] == 'X' else
                            (MLIB.randomnuc(allelesets[x][col]) if
                             (allelesets[x][col]
                              in MLIB.validchars['dnaambig23']) else
                             allelesets[x][col]) for x in (1, 2, 3)
                        ]
                    else:
                        codon = [
                            'N' if allelesets[x][col] == 'X' else
                            allelesets[x][col] for x in (1, 2, 3)
                        ]
                    write_buffer[label].append(''.join(codon))
                    data_in_buffer = True
        if data_in_buffer:
            args.qprint("Writing Align")
            with open(os.path.join(args.output_dir, contiglabel + ".fa"),
                      'w') as outfile:
                for label in write_buffer:
                    if (mvf.flavor == 'codon'
                            and args.output_data in ('dna', 'prot')):
                        if ((mvf.contig_data[targetcontig].get('strand', '+')
                             == '-') and (args.ignore_strand is False)):
                            entryseq = ''.join(write_buffer[label][::-1])
                        else:
                            entryseq = ''.join(write_buffer[label])
                    else:
                        entryseq = ''.join(write_buffer[label])
                    outfile.write(">{}\n{}\n".format(label, entryseq))
                outfile.write("\b")

    return ''
Ejemplo n.º 4
0
def calc_group_unique_allele_window(args):
    """Count the number of and relative rate of uniquely held alleles
       spatially along chromosomes (i.e. Lineage-specific rates)"""
    data = {}
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'codon':
        raise RuntimeError(
            "\n=====================\nERROR: MVF is not codon flavor!")
    annotations = {}
    coordinates = {}
    labels = mvf.get_sample_labels()[:]
    ncol = len(labels)
    current_contig = None
    current_position = 0
    counts = Counter()
    totals = Counter()
    args.start_contig = (args.start_contig
                         if args.start_contig is not None else 0)
    args.end_contig = (args.end_contig
                       if args.end_contig is not None else 100000000000)
    if args.output_align is True:
        outputalign = []
    if args.gff is not None:
        annotations, coordinates = (parse_gff_analysis(args.gff))
    if args.allele_groups is not None:
        args.allele_groups = procarg_allelegroups(args.allele_groups, mvf)
    if args.species_groups is None:
        args.species_groups = args.allele_groups
    else:
        args.species_groups = procarg_speciesgroups(args.species_groups, mvf)
    fieldtags = [
        'likelihood', 'bgdnds0', 'bgdnds1', 'bgdnds2a', 'bgdnds2b', 'fgdnds0',
        'fgdnds1', 'fgdnds2a', 'fgdnds2b', 'dndstree', 'errorstate'
    ]
    if args.branch_lrt is not None:
        with open(args.branch_lrt, 'w') as branchlrt:
            genealign = []
            branchlrt.write(
                "\t".join(['contig', 'ntaxa', 'alignlength', 'lrtscore'] +
                          ["null.{}".format(x) for x in fieldtags] +
                          ["test.{}".format(x)
                           for x in fieldtags] + ['tree']) + "\n")
    groups = args.allele_groups.values()
    if args.species_groups is not None:
        speciesgroups = args.species_groups.values()
    allsets = set([])
    for group in groups:
        allsets.update(group)
    allsets = list(sorted(allsets))
    speciesnames = args.species_groups.keys()
    speciesrev = {}
    if args.species_groups is not None:
        for species in args.species_groups:
            speciesrev.update([(x, species)
                               for x in args.species_groups[species]])
    if args.mincoverage is not None:
        if args.mincoverage < len(groups) * 2:
            raise RuntimeError("""
                Error: GroupUniqueAlleleWindow:
                --mincoverage cannot be lower than the twice the number
                of specified groups in --allele-groups
                """)
    genealign = []
    for contig, pos, allelesets in mvf:
        if not current_contig:
            current_contig = contig[:]
        if contig != current_contig or (args.windowsize > 0 and pos >
                                        current_position + args.windowsize):
            xkey = (
                current_contig,
                current_position,
            )
            data[xkey] = counts.copy()
            data[xkey].update([
                ('contig', (mvf.get_contig_labels(ids=current_contig)
                            if args.use_labels is True else current_contig)),
                ('position', current_position),
                ('nonsynyonymous_changes',
                 counts.get('nonsynonymous_changes', 0) or 0),
                ('synyonymous_changes', counts.get('synonymous_changes', 0)
                 or 0)
            ])
            data[xkey].update([
                ('ns_ratio',
                 (float(data[xkey].get('nonsynonymous_changes', 0)) /
                  (data[xkey].get('synonymous_changes', 1.0)))),
                ('annotation', annotations.get(data[xkey]['contig'], '.')),
                ('coordinates', coordinates.get(data[xkey]['contig'], '.'))
            ])
            if genealign:
                if (args.end_contig >= int(current_contig)) and (
                        args.start_contig <= int(current_contig)):
                    (pamlnull, pamltest, tree) = paml_branchsite(
                        genealign,
                        labels[:],
                        species=speciesnames,
                        speciesrev=speciesrev,
                        codemlpath=args.codeml_path,
                        raxmlpath=args.raxml_path,
                        pamltmp=args.paml_tmp,
                        target=args.target,
                        targetspec=args.num_target_species,
                        allsampletrees=args.all_sample_trees,
                        outgroup=args.outgroup)
                    lrtscore = -1
                    if (pamlnull.get('likelihood', -1) != -1
                            and pamltest.get('likelihood', -1) != -1):
                        lrtscore = 2 * (pamltest['likelihood'] -
                                        pamlnull['likelihood'])
                    with open(args.branch_lrt, 'a') as branchlrt:
                        branchlrt.write("\t".join([
                            str(x) for x in [
                                data[xkey]['contig'],
                                len(genealign),
                                len(genealign[0]) * 3, lrtscore
                            ] + [pamlnull.get(y, -1) for y in fieldtags] +
                            [pamltest.get(y, -1)
                             for y in fieldtags] + [str(tree).rstrip()]
                        ]) + "\n")
            genealign = None
            totals.add('genes_total')
            if counts.get('total_codons', 0) > 0:
                totals.add('genes_tested')
            if counts.get('total_nsyn_codons', 0) > 0:
                totals.add('genes_with_nsyn')
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            elif args.windowsize > 0:
                current_position += args.windowsize
            counts = Counter()
        proteins = allelesets[0]
        codons = allelesets[1:4]
        if len(proteins) == 1 and all(len(x) == 1 for x in codons):
            if proteins == '*' or ''.join(codons) in MLIB.stop_codons:
                continue
            counts.add('total_codons')
            totals.add('total_codons')
            if args.output_align is True:
                if not outputalign:
                    outputalign = [[''.join(codons)]
                                   for x in range(mvf.metadata['ncol'])]
                else:
                    for ialign, xalign in enumerate(outputalign):
                        xalign.append(''.join(codons))
            if args.branch_lrt is not None:
                if not genealign:
                    genealign = [[''.join(codons)] for x in range(ncol)]
                else:
                    for ialign in range(len(genealign)):
                        genealign[ialign].append(''.join(codons))
            continue
        if len(proteins) > 1:
            if allelesets[0][1] == '+':
                continue
        proteins = mvf.decode(proteins)
        if args.mincoverage is not None:
            if sum([int(x not in 'X-')
                    for x in proteins]) < (args.mincoverage):
                continue
        species_groups = [[proteins[i] for i in x if proteins[i] not in '-X']
                          for x in speciesgroups]
        if any(len(x) == 0 for x in species_groups):
            continue
        xcodons = [mvf.decode(x) for x in codons]
        codons = [''.join(x) for x in zip(*xcodons)]
        if any(codons[x] in MLIB.stop_codons for x in allsets):
            continue
        if any(
                any(x != species_groups[0][0] for x in y)
                for y in species_groups):
            totals.add('total_nsyn_codons')
            counts.add('total_nsyn_codons')
        totals.add('total_codons')
        totals.add('tested_codons')
        counts.add('total_codons')
        totals.add('variable_codons',
                   val=int(
                       sum([int(len(set(x) - set('X-')) > 1)
                            for x in xcodons]) > 0))
        if args.output_align is not None:
            if not outputalign:
                outputalign = [[x] for x in codons]
            else:
                for ialign in range(len(outputalign)):
                    outputalign[ialign].append(codons[ialign])
        if args.branch_lrt is not None:
            if not genealign:
                genealign = [[x] for x in codons]
            else:
                for ialign in range(len(codons)):
                    genealign[ialign].append(codons[ialign])
        nonsyn_change = False
        synon_change = False
        codon_groups = [
            set([
                codons[i] for i in x
                if '-' not in codons[i] and 'X' not in codons[i]
            ]) for x in groups
        ]
        protein_groups = None
        for i in range(len(codon_groups)):
            if any(base in codon for base in 'RYWKMS'
                   for codon in codon_groups[i]):
                codon_groups[i] = hapgroup(codon_groups[i])
        if all(
                grp1.isdisjoint(grp0)
                for grp0, grp1 in combinations(codon_groups, 2)):
            protein_groups = [
                set([
                    MLIB.codon_tables['full'][''.join(x)]
                    for x in codon_groups[i]
                ]) for i in range(len(codon_groups))
            ]
            if all(
                    grp1.isdisjoint(grp0)
                    for grp0, grp1 in combinations(protein_groups, 2)):
                nonsyn_change = True
            elif all(grp1 == grp0
                     for grp0, grp1 in combinations(protein_groups, 2)):
                synon_change = True
        if nonsyn_change:
            if args.verbose is True:
                print('NON', contig, pos, allelesets,
                      codon_groups, protein_groups, groups,
                      mvf.get_contig_labels(ids=contig))
            counts.add('nonsynonymous_changes')
            totals.add('nonsynonymous_changes')
        elif synon_change:
            if args.verbose is True:
                print('SYN', contig, pos, allelesets,
                      codon_groups, protein_groups, groups,
                      mvf.get_contig_labels(ids=contig))
            counts.add('synonymous_changes')
            totals.add('synonymous_changes')
    args.totals = totals
    # WRITE OUTPUT
    headers = [
        "contig", "position", "nonsynonymous_changes", "synonymous_changes",
        "ns_ratio", "nonsynonymous_total", "synonymous_total", "pvalue",
        "total_codons", "annotation", "coordinates"
    ]
    if args.windowsize == -1:
        headers.remove('position')
    if args.chi_test is None:
        headers.remove('pvalue')
    outfile = OutputFile(path=args.out, headers=headers)
    sorted_entries = sorted(
        [(data[k]['ns_ratio'], k)
         for k in data if data[k].get('nonsynonymous_changes', 0) > 0],
        reverse=True)
    for _, k in sorted_entries:
        outfile.write_entry(data[k])
    with open(args.out + '.total', 'w') as totalfile:
        for entry in args.totals.iter_sorted():
            totalfile.write(entry)
    if args.output_align is not None:
        with open(args.output_align, 'w') as alignfile:
            alignfile.write("\n".join([
                ">{}\n{}".format(mvf.metadata['labels'][i],
                                 ''.join(outputalign[i]))
                for i in range(len(outputalign))
            ]))
    return ''
Ejemplo n.º 5
0
def legacy_translate_mvf(args):
    """Main method"""
    args.qprint("Running LegacyTranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff = parse_gff_legacy_translate(
            args.gff, args,
            parent_gene_pattern=args.parent_gene_pattern)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        args.qprint("Indexing GFF gene names.")
        # mvfid_to_gffname = outmvf.get_contig_reverse_dict()
        for xcontig in outmvf.get_contig_indices():
            mvf_entries = {}
            xcontiglabel = outmvf.get_contig_labels(indices=xcontig)[0]
            xcontigid = outmvf.get_contig_ids(indices=xcontig)[0]
            if xcontiglabel not in gff:
                if args.verbose:
                    print(
                        ("No entries in GFF, "
                         "skipping contig: index:{} id:{} label:{}").format(
                             xcontig, xcontigid, xcontiglabel))
                continue
            if not xcontig % 100:
                args.qprint("Processing contig: {} {}".format(
                    xcontigid, xcontiglabel))
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                mvf_entries[pos] = allelesets[0]
            for coords in sorted(gff[xcontiglabel]):
                reverse_strand = coords[3] == '-'
                alleles = (tuple(mvf_entries.get(x, '-')
                                 for x in coords[2::-1])
                           if reverse_strand is True
                           else tuple(mvf_entries.get(x, '-')
                                      for x in coords[0:3]))
                if all(len(x) == 1 for x in alleles):
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand is True:
                        decoded_alleles = tuple(tuple(MLIB.complement_bases[y]
                                                      for y in mvf.decode(x))
                                                for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x) for x in alleles)
                    amino_acids = tuple(translate_single_codon(''.join(x))
                                        for x in zip(*decoded_alleles))
                    # print("aminx", amino_acids)
                    amino_acids = outmvf.encode(''.join(amino_acids))
                # if all(x in '-X' for x in amino_acids):
                #    continue
                # print("amino", amino_acids)
                # print("translated", amino_acids, alleles)
                if args.output_data == 'protein':
                    entrybuffer.append((xcontig, coords[0], (amino_acids,)))
                else:
                    entrybuffer.append((
                        xcontigid, coords[0], (
                            amino_acids, alleles[0], alleles[1], alleles[2])))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
Ejemplo n.º 6
0
def infer_window_tree(args):
    """Main method"""
    # ESTABLISH FILE OBJECTS
    mvf = MultiVariantFile(args.mvf, 'read')
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    treefile = OutputFile(
        args.out,
        headers=[
            'contig',
            'windowstart',
            'windowsize',
            'tree',
            'topology',
            'topoid',
            # 'templabels', ### USED FOR DEBUGGING ###
            'alignlength',
            'aligndepth',
            'status'
        ])
    topofile = OutputFile(args.out + '.counts',
                          headers=['rank', 'topology', 'count'])
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    if not os.path.exists(args.temp_dir):
        os.mkdir(args.temp_dir)
    os.chdir(args.temp_dir)
    # SETUP PARAMS
    main_labels = mvf.get_sample_labels(sample_indices)
    if args.choose_allele in ['randomboth', 'majorminor']:
        main_labels = [label + x for x in ['a', 'b'] for label in main_labels]
    params = {
        'outgroups':
        args.raxml_outgroups or [],
        'rootwith':
        (args.root_with.split(',') if args.root_with is not None else None),
        'minsites':
        args.min_sites,
        'minseqcoverage':
        args.min_seq_coverage,
        'mindepth':
        args.min_depth,
        'raxmlpath':
        args.raxml_path,
        'raxmlopts':
        args.raxml_opts,
        'duplicateseq':
        args.duplicate_seq,
        'model':
        args.raxml_model,
        'bootstrap':
        args.bootstrap,
        'windowsize':
        args.windowsize,
        'chooseallele':
        args.choose_allele,
        'tempdir':
        args.temp_dir,
        'tempprefix':
        args.temp_prefix
    }
    # WINDOW START INTERATION
    verify_raxml(params)
    current_contig = ''
    current_position = 0
    window_data = None
    skip_contig = False
    topo_ids = {}
    topo_counts = {}
    for contig, pos, allelesets in mvf.iterentries(contigs=contig_ids,
                                                   subset=sample_indices,
                                                   quiet=args.quiet,
                                                   no_invariant=False,
                                                   no_ambig=False,
                                                   no_gap=False,
                                                   decode=True):
        if current_contig == contig:
            if skip_contig is True:
                continue
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            skip_contig = False
            if window_data is not None:
                entry = window_data.maketree_raxml(params)
                if entry['status'] != 'ok':
                    if args.output_empty:
                        treefile.write_entry(entry)
                    if args.windowsize != -1:
                        skip_contig = True
                else:
                    topo = entry["topology"]
                    topo_counts[topo] = topo_counts.get(topo, 0) + 1
                    if topo not in topo_ids:
                        topo_ids[topo] = (topo_ids
                                          and max(topo_ids.values()) + 1 or 0)
                    entry["topoid"] = topo_ids[topo]
                    treefile.write_entry(entry)
                current_position = (current_position + args.windowsize if
                                    (contig == current_contig
                                     and args.windowsize > 0) else 0)
            current_contig = contig[:]
            window_data = None
            window_data = WindowData(
                window_params={
                    'contigname': (mvf.get_contig_labels(
                        ids=current_contig) if args.output_contig_labels
                                   is not None else current_contig[:]),
                    "windowstart": (
                        '-1' if args.windowsize == -1 else current_position +
                        0),
                    "windowsize":
                    args.windowsize,
                    "labels":
                    main_labels[:]
                })
        # ADD ALLELES
        if mvf.flavor == 'dna':
            if args.choose_allele != 'none':
                allelesets[0] = hapsplit(allelesets[0], args.choose_allele)
            window_data.append_alleles(allelesets[0], mindepth=args.min_depth)
    # LAST LOOP
    if window_data:
        entry = window_data.maketree_raxml(params)
        if entry['status'] != 'ok':
            if args.output_empty:
                treefile.write_entry(entry)
        else:
            topo = entry["topology"]
            topo_counts[topo] = topo_counts.get(topo, 0) + 1
            if topo not in topo_ids:
                topo_ids[topo] = (max(topo_ids.values()) +
                                  1 if topo_ids else 0)
            entry["topoid"] = topo_ids[topo]
            treefile.write_entry(entry)
        window_data = None
    # END WINDOW ITERATION
    topo_list = sorted([(v, k) for k, v in topo_counts.items()], reverse=True)
    for rank, [value, topo] in enumerate(topo_list):
        topofile.write_entry({'rank': rank, 'count': value, 'topology': topo})
    return ''
Ejemplo n.º 7
0
def infer_window_tree(args):
    """Main method"""
    args.qprint("Running InferTree")
    # ESTABLISH FILE OBJECTS
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Read MVF File: {}".format(args.mvf))
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    treefile = OutputFile(
        args.out,
        headers=['contig', 'windowstart', 'windowsize', 'tree',
                 'topology', 'topoid',
                 # 'templabels', ### USED FOR DEBUGGING ###
                 'alignlength', 'aligndepth', 'status'])
    topofile = OutputFile(args.out + '.counts',
                          headers=['rank', 'topology', 'count'])
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in
                          args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    if not os.path.exists(args.temp_dir):
        os.mkdir(args.temp_dir)
    os.chdir(args.temp_dir)
    # SETUP PARAMS
    main_labels = mvf.get_sample_ids(sample_indices)
    if args.choose_allele in ['randomboth', 'majorminor']:
        main_labels = [label + x for x in ['a', 'b'] for label in main_labels]
    params = {
        'bootstrap': args.bootstrap,
        'chooseallele': args.choose_allele,
        'collapse_polytomies': args.collapse_polytomies,
        'duplicateseq': args.duplicate_seq,
        'engine': args.engine,
        'engine_path': args.engine_path,
        'engine_opts': args.engine_opts,
        'mindepth': args.min_depth,
        'minseqcoverage': args.min_seq_coverage,
        'minsites': args.min_sites,
        'model': args.model,
        'outgroups': (args.raxml_outgroups 
                      if args.raxml_outgroups is not None
                      else None),
        'rootwith': (args.root_with.split(',')
                     if args.root_with is not None
                    else []),
        'tempdir': args.temp_dir,
        'tempprefix': args.temp_prefix,
        'windowsize': args.windowsize,
        }
    # DEFAULT MODEL
    if params['model'] is None:
        if params['engine'] == 'raxml':
            params['model'] = 'GTRGAMMA'
        elif params['engine'] == 'raxml-ng':
            params['model'] = "GTR+G"
    # WINDOW START INTERATION
    verify_raxml(params)
    args.qprint("RAxML Found.")
    current_contig = None
    current_position = 0
    window_data = None
    # skip_contig = False
    topo_ids = {}
    topo_counts = {}
    args.qprint("Prcocessing Records")
    windowsizename = "window size={}".format(args.windowsize)
    if windowsizename == "window size=-1":
        windowsizename = "whole contig"
    elif windowsizename == "window size=0":
        windowsizename = "whole genome"
        window_data = WindowData(window_params={
            'contigname': 'all',
            "windowstart": 0,
            "windowsize": 0,
            "labels": main_labels[:]})
    for contig, pos, allelesets in mvf.iterentries(
            contig_ids=contig_ids, subset=sample_indices,
            no_invariant=False, no_ambig=False, no_gap=False, decode=True):
        # if current_contig == contig:
        #     if skip_contig is True:
        #         args.qprint("Skipping contig: {}".format(current_contig))
        #         continue
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            # skip_contig = False
            if window_data is not None:
                args.qprint(("Making tree for {} "
                             "at contig {} position {}").format(
                                 windowsizename,
                                 current_contig,
                                 current_position))
                entry = window_data.maketree_raxml(params)
                if entry['status'] != 'ok':
                    if args.output_empty:
                        treefile.write_entry(entry)
                    # if args.windowsize != -1:
                    #     skip_contig = True
                    args.qprint(
                        "TREE REJECTED with error code: {} ({})".format(
                            entry['status'], entry.get('comment', "None")))
                else:
                    args.qprint("Tree completed.")
                    topo = entry["topology"]
                    topo_counts[topo] = topo_counts.get(topo, 0) + 1
                    if topo not in topo_ids:
                        topo_ids[topo] = (max(topo_ids.values()) + 1
                                          if topo_ids else 0)
                    entry["topoid"] = topo_ids[topo]
                    treefile.write_entry(entry)
                current_position = current_position + args.windowsize if (
                    contig == current_contig and args.windowsize > 0) else 0
            current_contig = contig[:]
            window_data = None
            window_data = WindowData(window_params={
                'contigname': (mvf.get_contig_labels(ids=current_contig) if
                               args.output_contig_labels is not None else
                               current_contig[:]),
                "windowstart": ('-1' if args.windowsize == -1
                                else current_position + 0),
                "windowsize": args.windowsize,
                "labels": main_labels[:]})
        # ADD ALLELES
        if mvf.flavor == 'dna':
            if args.choose_allele != 'none':
                allelesets[0] = hapsplit(allelesets[0], args.choose_allele)
            window_data.append_alleles(allelesets[0], mindepth=args.min_depth)
        elif mvf.flavor == 'codon':
            for i in (1, 2, 3):
                if args.choose_allele != 'none':
                    allelesets[i] = hapsplit(allelesets[i], args.choose_allele)
                window_data.append_alleles(allelesets[i], mindepth=args.min_depth)
    # LAST LOOP
    if window_data:
        entry = window_data.maketree_raxml(params)
        if entry['status'] != 'ok':
            if args.output_empty:
                treefile.write_entry(entry)
        else:
            topo = entry["topology"]
            topo_counts[topo] = topo_counts.get(topo, 0) + 1
            if topo not in topo_ids:
                topo_ids[topo] = (
                    max(topo_ids.values()) + 1 if topo_ids else 0)
            entry["topoid"] = topo_ids[topo]
            treefile.write_entry(entry)
        window_data = None
    # END WINDOW ITERATION
    topo_list = sorted([(v, k) for k, v in topo_counts.items()],
                       reverse=True)
    for rank, [value, topo] in enumerate(topo_list):
        topofile.write_entry({'rank': rank, 'count': value, 'topology': topo})
    return ''
Ejemplo n.º 8
0
def vcf2mvf(args=None):
    """Main method for vcf2mvf"""
    sepchars = dict([("TAB", "\t"), ("SPACE", " "), ("DBLSPACE", "  "),
                     ("COMMA", ","), ("MIXED", None)])
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    vcfcontigs = vcf.metadata['contigs'].copy()
    contig_translate = {}
    if args.contig_ids:
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            try:
                cid = int(cid)
            except ValueError:
                pass
            assert cvcf in [vcfcontigs[x]['label'] for x in vcfcontigs]
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] == cvcf:
                    contig_translate[cvcf] = [cid, cmvf]
                    if cid in mvf.metadata['contigs']:
                        raise RuntimeError(
                            'Contig id {} is not unique'.format(cid))
                    mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                    if cmvf in mvf.get_contig_labels():
                        raise RuntimeError(
                            'Contig label {} is not unique'.format(cmvf))
                    mvf.metadata['contigs'][cid]['label'] = cmvf[:]
    mvf.reset_max_contig_id()
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel not in mvf.get_contig_labels():
            if ((is_int(vlabel) or len(vlabel) < 3)
                    and vlabel not in mvf.get_contig_ids()):
                newid = vlabel[:]
            else:
                newid = mvf.get_next_contig_id()
            mvf.metadata['contigs'][newid] = vcfcontigs[vcid].copy()
            contig_translate[vlabel] = [newid, vlabel]
    mvf.reset_max_contig_id()
    new_contigs = [(x, mvf.metadata['contigs'][x]['label'])
                   for x in mvf.metadata['contigs']]
    for i, (newid, newlabel) in enumerate(new_contigs):
        for j, (xid, xlabel) in enumerate(new_contigs):
            if i == j:
                continue
            if newid == xlabel:
                raise RuntimeError("Error contig id {} is the same as"
                                   " the label for another contig"
                                   " ({} {})".format(newid, xid, xlabel))
            if newlabel == xid:
                raise RuntimeError("Error contig label {} is the same"
                                   "as the id for another contig"
                                   "({} {})".format(newlabel, xid, xlabel))
    # PROCESS SAMPLE INFO
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        newsample = [
            x.split(':') if ':' in tuple(x) else tuple([x, x])
            for x in args.sample_replace
        ]
        unmatched = [x for x in enumerate(samplelabels)]
        for old, new in newsample:
            labelmatched = False
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    labelmatched = j
                    break
            if labelmatched is not False:
                del unmatched[labelmatched]
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    for vcfrecord in vcf.iterentries(args):
        # try:
        mvf_alleles = encode_mvfstring(''.join(vcfrecord['genotypes']))
        if args.out_flavor in ('dnaqual', ):
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
        if mvf_alleles:
            mvfentries.append(
                (contig_translate.get(vcfrecord['contig'])[0],
                 vcfrecord['coord'],
                 ((mvf_alleles,
                   qual_alleles) if args.out_flavor in ('dnaqual', ) else
                  (mvf_alleles, ))))
            nentry += 1
            if nentry == args.line_buffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
        # except Exception as exception:
    if mvfentries:
        mvf.write_entries(mvfentries)
        mvfentries = []
    return ''