Python MultiVariantFile Examples

Programming Language: Python

Namespace/Package Name: pylib.mvfbase

Class/Type: MultiVariantFile

Examples at hotexamples.com: 30

Python MultiVariantFile – 30 examples found. These are the top-rated real-world Python examples of pylib.mvfbase.MultiVariantFile, extracted from open-source projects. You can rate the examples to help improve their quality.

Frequently Used Methods

Show Hide

MultiVariantFile(30)

iterentries(21)

get_header(16)

write_data(16)

get_sample_indices(16)

write_entries(16)

get_sample_ids(15)

get_sample_labels(15)

get_contig_ids(13)

decode(12)

get_contig_labels(8)

metadata(7)

encode(6)

get_contig_indices(6)

read_index_file(4)

copy_headers_from(4)

flavor(4)

get_next_contig_index(3)

get_next_contig_id(3)

reset_max_contig(3)

reset_max_contig_id(3)

sample_ids(3)

sample_indices(3)

itercontigentries(3)

contig_data(2)

contig_labels(2)

reset_max_sample(2)

contig_ids(2)

max_sample_index(1)

copy_header(1)

reset_ncol(1)

sample_data(1)

Example #1

Show file

File: mvfanalysis.py Project: peaselab/mvftools

def calc_sample_coverage(args):
    """Count the non-gap, non-ambiguous characters of each sample per contig.

    Every entry of the MVF at ``args.mvf`` is read decoded, and a sample's
    tally for the entry's contig is incremented when its character in the
    first allele set is not one of ``X``, ``x`` or ``-``.  One row per
    contig is then written through ``OutputFile`` to ``args.out``.

    Arguments:
        args: parsed command-line namespace; reads ``mvf``, ``out``,
            ``sample_indices``, ``sample_labels``, ``contig_ids`` and
            ``contig_labels``.

    Returns:
        An empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    # Set up sample indices
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    # Set up contig ids (keyword form, matching the other analysis functions)
    if args.contig_ids is not None:
        contig_indices = mvf.get_contig_indices(
            ids=args.contig_ids[0].split(","))
    elif args.contig_labels is not None:
        contig_indices = mvf.get_contig_indices(
            labels=args.contig_labels[0].split(","))
    else:
        contig_indices = None
    for contig, _, allelesets in mvf.iterentries(contig_indices=contig_indices,
                                                 subset=sample_indices,
                                                 decode=True):
        if contig not in data:
            data[contig] = {label: 0 for label in sample_labels}
            data[contig]['contig'] = contig
        alleles = allelesets[0]
        # NOTE(review): sample_labels is indexed by the sample index (elem)
        # while the allele string is indexed by subset position (j); this
        # presumably only lines up when sample_indices is 0..n-1 -- confirm
        # what get_sample_ids(indices=...) returns for a subset.
        for j, elem in enumerate(sample_indices):
            data[contig][sample_labels[elem]] += int(alleles[j] not in 'Xx-')
    outfile = OutputFile(path=args.out,
                         headers=(["contig"] +
                                  [sample_labels[x] for x in sample_indices]))
    for row in data.values():
        outfile.write_entry(row)
    return ''

Example #2

Show file

def annotate_mvf(args):
    """Copy an MVF file, keeping entries according to a GFF annotation.

    The GFF at ``args.gff`` is parsed against the input contigs.  When
    ``args.nongenic_mode`` is False, only positions present in the
    annotation are kept and each is written under the value stored for it
    in the annotation; the output contig metadata is replaced by the
    parsed gene ids.  When ``args.nongenic_mode`` is True, only positions
    that are NOT annotated (nor within ``args.unmargin`` of an annotated
    position) are kept, under their original contig id.

    Arguments:
        args: parsed command-line namespace; reads ``mvf``, ``gff``,
            ``out``, ``overwrite``, ``quiet``, ``nongenic_mode``,
            ``unmargin`` and ``line_buffer``.

    Returns:
        An empty string.
    """
    source = MultiVariantFile(args.mvf, 'read')
    gff, geneids = parse_gff_annotate(args.gff, source.metadata['contigs'])
    if args.quiet is False:
        print("gff_processed")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(source.metadata)
    if args.nongenic_mode is False:
        outmvf.metadata['contigs'] = geneids
    outmvf.write_data(outmvf.get_header())
    pending = []
    for contigid, pos, allelesets in source.iterentries(decode=False):
        # Decide whether this position counts as annotated.
        is_annotated = False
        if contigid in gff:
            if pos in gff[contigid]:
                is_annotated = True
            elif args.nongenic_mode is True and args.unmargin > 0:
                # In nongenic mode a position near an annotated one is
                # treated as annotated as well.
                nearby = range(pos - args.unmargin, pos + args.unmargin + 1)
                is_annotated = any(xpos in gff[contigid] for xpos in nearby)
        # Pick the record to emit, or skip the entry entirely.
        if args.nongenic_mode is False and is_annotated:
            record = (gff[contigid][pos], pos, allelesets)
        elif args.nongenic_mode is True and not is_annotated:
            record = (contigid, pos, allelesets)
        else:
            continue
        pending.append(record)
        if len(pending) == args.line_buffer:
            outmvf.write_entries(pending)
            pending = []
    # Flush whatever is left in the buffer.
    if pending:
        outmvf.write_entries(pending)
    return ''

Example #3

Show file

def fasta2mvf(args):
    """Convert one or more FASTA files into a single MVF file.

    Each FASTA header is optionally split on the separators named in
    ``args.field_sep``; the contig and sample of a record are then taken
    from the file name, the header fields, or ``args.manual_coord``.  All
    sequences are held in memory, the sample matching ``args.ref_label``
    is moved to the first column, and one MVF entry is written per
    alignment position for which ``encode_mvfstring`` returns a non-empty
    string.

    Arguments:
        args: parsed command-line namespace; reads ``fasta``, ``out``,
            ``overwrite``, ``field_sep``, ``manual_coord``,
            ``contig_by_file``, ``contig_field``, ``sample_field``,
            ``ref_label``, ``flavor``, ``write_buffer`` and
            ``command_string``.  ``field_sep`` and ``manual_coord`` are
            replaced in place by their parsed forms.

    Returns:
        An empty string.

    Raises:
        AssertionError: if ``manual_coord`` and ``fasta`` differ in length.
    """
    sepchars = {
        "PIPE": "\\|", "TAB": "\\t", "SPACE": "\\s", "DBLSPACE": "\\s\\s",
        "COMMA": "\\,", "NONE": None, "AT": "\\@", "UNDER": "\\_",
        "DBLUNDER": "\\_\\_",
    }
    # After this block field_sep is either '' (no splitting) or a compiled
    # character-class pattern.
    if args.field_sep is None:
        args.field_sep = ''
    else:
        args.field_sep = re.compile("[{}]".format(''.join(
            [sepchars[x] for x in args.field_sep])))
    if args.manual_coord:
        assert len(args.manual_coord) == len(args.fasta)
        # "contig:start..stop" -> (contig, start, stop)
        args.manual_coord = [(x.split(':')[0],
                              int(x.split(":")[1].split('..')[0]),
                              int(x.split(':')[1].split('..')[1]))
                             for x in args.manual_coord]
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    fasta = {}
    # NOTE(review): never incremented, so every unplaced record lands on
    # contig "UNK0" -- confirm this is intended.
    current_contig = 0
    fsamples = []
    fcontigs = []
    for ifasta, fastapath in enumerate(args.fasta):
        print("Processing {}".format(fastapath))
        for header, seq in fasta_iter(fastapath):
            if args.field_sep != '':
                header = [str(x) for x in re.split(args.field_sep, header)]
            if args.contig_by_file is True:
                contig = os.path.basename(fastapath[:])
                if args.sample_field is None:
                    sample = header[:]
                else:
                    sample = header[args.sample_field]
            elif (len(header) < max(
                    args.contig_field if args.contig_field is not None else 0,
                    args.sample_field if args.sample_field is not None else 0)
                  or args.contig_field is None or args.sample_field is None):
                contig = "UNK{}".format(current_contig)
                sample = header[:]
            elif args.manual_coord:
                contig = args.manual_coord[ifasta][0]
                # sample_field is known to be set here (previous branch
                # handles None); without this assignment ``sample`` was
                # unbound or stale from the previous record.
                sample = header[args.sample_field]
            else:
                contig = header[args.contig_field]
                sample = header[args.sample_field]
            if contig not in fcontigs:
                fcontigs.append(contig)
                fasta[contig] = {}
            if sample not in fsamples:
                fsamples.append(sample)
            fasta[contig][sample] = (len(seq), seq)
    # Move the reference sample (first label containing ref_label) to the
    # front; index 0 is already in place so a falsy result needs no move.
    reflabel = None
    if args.ref_label:
        for i, samplename in enumerate(fsamples):
            if args.ref_label in samplename:
                reflabel = i
                break
    if reflabel:
        newref = fsamples.pop(reflabel)
        fsamples = [newref] + fsamples
    # Register contigs in the MVF header structures.
    for contig in fcontigs:
        new_index = mvf.get_next_contig_index()
        mvf.contig_indices.append(new_index)
        mvf.contig_ids.append(str(new_index))
        mvf.contig_labels.append(contig)
        mvf.contig_label_to_index[contig] = new_index
        mvf.contig_id_to_index[str(new_index)] = new_index
        mvf.contig_data[new_index] = {
            'label': contig,
            'id': str(new_index),
            'length': max([fasta[contig][x][0] for x in fasta[contig]])
        }
    # Register samples in the MVF header structures.
    mvf.metadata['labels'] = fsamples[:]
    for i, label in enumerate(fsamples[:]):
        mvf.sample_indices.append(i)
        mvf.sample_id_to_index[label] = i
        mvf.sample_ids.append(label)
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = 'fasta'
    # The command string is recorded in the metadata notes list, as the
    # MAF converter does; the metadata object itself is used as a mapping.
    mvf.metadata.notes.append(args.command_string)
    mvf.flavor = args.flavor
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    for cind, contig in enumerate(fcontigs):
        # NOTE(review): assumes get_next_contig_index() handed out
        # 1, 2, 3, ... in order, and entries are tagged with the 0-based
        # cind rather than that index -- confirm against write_entries.
        for pos in range(mvf.contig_data[cind + 1]['length']):
            # One column per sample; '-' pads missing or shorter sequences.
            mvf_alleles = encode_mvfstring(''.join(
                '-' if (samp not in fasta[contig]
                        or pos >= fasta[contig][samp][0])
                else fasta[contig][samp][1][pos]
                for samp in fsamples))
            if mvf_alleles:
                if args.flavor == 'dna':
                    mvf_alleles = ''.join(
                        ["X" if x in 'NX' else x for x in mvf_alleles])
                mvfentries.append((cind, pos + 1, (mvf_alleles, )))
                nentry += 1
                if nentry == args.write_buffer:
                    mvf.write_entries(mvfentries, encoded=True)
                    mvfentries = []
                    nentry = 0
    if mvfentries:
        # Same pre-encoded entries as the in-loop flush, so same flag.
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''

Example #4

Show file

def mvf2fasta(args):
    """Write the sequences stored in an MVF file out as a FASTA file.

    Entries of the MVF at ``args.mvf`` that fall inside the regions parsed
    from ``args.regions`` are appended, one character (or codon) per
    sample, to one temporary file per sample label; the temporary files
    are then concatenated into ``args.out``.

    Arguments:
        args: parsed command-line namespace; reads ``mvf``, ``out``,
            ``output_data``, ``regions``, ``sample_indices``,
            ``sample_labels``, ``label_type``, ``gene_mode`` and
            ``buffer``, and calls ``args.qprint``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``args.output_data`` asks for protein output from
            a dna/rna MVF, or dna/rna output from a prot MVF.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    regions, max_region_coord, regionlabel = parse_regions_arg(
        args.regions, mvf.contig_data)
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    skipcontig = None
    # One temporary file per sample accumulates that sample's sequence.
    tmp_files = dict(
        (fname, tempfile.NamedTemporaryFile(mode='w+', prefix=fname))
        for fname in sample_labels)
    labelwritten = dict.fromkeys(sample_labels, False)
    # Gene mode only: codons of the current contig, per sample label.
    write_buffer = {}
    current_contig = None
    data_written = False
    args.qprint("Regions determined. Reading entries.")
    for contig, pos, allelesets in mvf.iterentries(
            contig_indices=mvf.get_contig_indices(
                ids=list(max_region_coord.keys())),
            decode=True):
        if current_contig is None:
            # NOTE(review): this stores the result of get_contig_indices()
            # whereas later updates store the contig value itself --
            # confirm both are comparable with ``contig`` below.
            current_contig = mvf.get_contig_indices(ids=contig)
        if contig == skipcontig:
            continue
        # Past the last requested coordinate on this contig: skip the rest.
        if (contig not in max_region_coord) or (
                max_region_coord[contig] is not None
                and pos > max_region_coord[contig]):
            skipcontig = contig[:]
            continue
        # Keep the entry only if it lies in at least one requested region;
        # a None start/stop leaves that side unbounded.
        inregion = False
        for rcontig, rstart, rstop, _ in regions[contig]:
            if contig == rcontig:
                if rstart is None or pos >= rstart:
                    if rstop is None or pos <= rstop:
                        inregion = True
                        break
        if inregion is False:
            continue
        # NOTE(review): sample_labels is the full id list while
        # sample_indices may be a user-selected subset; zip pairs them
        # positionally -- confirm labels match their columns.
        for col, label in zip(sample_indices, sample_labels):
            if not labelwritten[label]:
                # The FASTA header line is written once per sample.
                # NOTE(review): xlabel is unbound if label_type is neither
                # 'long' nor 'short'.
                if args.label_type == 'long':
                    xlabel = "{} region={}".format(label, regionlabel)
                elif args.label_type == 'short':
                    xlabel = "{}".format(label)
                tmp_files[label].write(">{}\n".format(xlabel))
                labelwritten[label] = True
            if mvf.flavor == 'dna':
                # 'X' in the MVF is written as 'N' in the FASTA output.
                tmp_files[label].write("N" if allelesets[0][col] ==
                                       'X' else allelesets[0][col])
                data_written = True
            elif mvf.flavor in ('codon', 'prot') and (args.output_data
                                                      == 'prot'):
                tmp_files[label].write(allelesets[0][col])
                data_written = True
            elif mvf.flavor == 'codon' and args.output_data == 'dna':
                # Allele sets 1-3 supply the three characters of the codon.
                codon = [
                    "N" if allelesets[x][col] == 'X' else allelesets[x][col]
                    for x in (1, 2, 3)
                ]
                if not args.gene_mode:
                    tmp_files[label].write(''.join(codon))
                    data_written = True
                else:
                    # Contig changed: flush the buffered codons of the
                    # previous contig, reversing their order when its
                    # metadata marks the '-' strand.
                    if contig != current_contig:
                        if mvf.metadata['contigs'][current_contig].get(
                                'strand', "+") == '-':
                            write_buffer[label] = write_buffer[label][::-1]
                        tmp_files[label].write(''.join(write_buffer[label]))
                        data_written = True
                    if label not in write_buffer:
                        write_buffer[label] = []
                    write_buffer[label].append(''.join(codon))
        # After every sample has been handled, start a fresh buffer for
        # the new contig.
        if args.gene_mode and current_contig != contig:
            write_buffer = {}
            current_contig = contig[:]
    # Flush the codons buffered for the final contig.
    if write_buffer:
        for label in write_buffer:
            if mvf.metadata['contigs'][current_contig].get('strand',
                                                           "+") == '-':
                write_buffer[label] = write_buffer[label][::-1]
            tmp_files[label].write(''.join(write_buffer[label]))
            data_written = True
        write_buffer = {}
    if data_written is False:
        print("ERROR NO DATA WRITTEN")
    # Concatenate the per-sample temporary files into the output, reading
    # args.buffer characters at a time.
    with open(args.out, 'w') as outfile:
        for filehandler in tmp_files.values():
            filehandler.seek(0, 0)
            buff = filehandler.read(args.buffer)
            while buff:
                outfile.write(buff)
                buff = filehandler.read(args.buffer)
            outfile.write("\n")
            filehandler.close()
    return ''

Example #5

Show file

def filter_mvf(args):
    """Apply a chain of filter/transform/location actions to MVF entries.

    In normal mode every entry of the MVF at ``args.mvf`` is passed
    through the actions built from ``args.actions``; entries that survive
    are written to ``args.out``.  In test mode (``args.test`` set) a single
    "location alleles" string is run through the same actions, each step
    is reported on stdout, and the process exits.

    Arguments:
        args: parsed command-line namespace; reads ``more_help``, ``mvf``,
            ``out``, ``overwrite``, ``test``, ``test_nchar``, ``labels``,
            ``actions``, ``verbose`` and ``line_buffer``.  When
            ``args.labels`` is set, ``args.actions`` is rewritten in place
            with sample labels replaced by column indices.

    Returns:
        An empty string (normal mode only; help and test modes call
        ``sys.exit()``).

    Raises:
        RuntimeError: if no input/output file is given outside test mode,
            or if ``--labels`` is combined with test mode (no MVF is read
            there, so labels cannot be resolved).
    """
    if args.more_help is True:
        modulehelp()
        sys.exit()
    if args.mvf is None and args.test is None:
        raise RuntimeError("No input file specified with --mvf")
    if args.out is None and args.test is None:
        raise RuntimeError("No output file specified with --out")
    # Establish Input MVF (test mode works on a literal entry instead)
    mvf = None
    if args.test is not None:
        ncol = args.test_nchar or len(args.test.split()[1])
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    # Create Actionset
    if args.labels:
        if mvf is None:
            # Previously this path failed with a NameError on ``mvf``.
            raise RuntimeError(
                "--labels cannot be used with --test: no input MVF is read")
        labels = mvf.get_sample_labels()[:]
        label_actions = ('columns', 'collapsepriority', 'collapsemerge',
                         'allelegroup', 'notmultigroup')
        for i, action in enumerate(args.actions):
            arr = action.split(':')
            if arr[0] in label_actions:
                # Translate each comma-separated label into its column.
                for j in range(1, len(arr)):
                    arr[j] = ','.join(
                        [str(labels.index(x)) for x in arr[j].split(',')])
            args.actions[i] = ':'.join(arr)
    actionset = build_actionset(args.actions, ncol)
    # TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        # Parsed "contig:pos" form of loc, built once on first use so that
        # several location actions can run and loc stays printable.
        loc_fields = None
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, _ in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                if loc_fields is None:
                    loc_fields = loc.split(':')
                    loc_fields[1] = int(loc_fields[1])
                if actionfunc(loc_fields) is False:
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                sys.stdout.write("Location Pass\n")
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    # MAIN MODE
    # Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    # reprocess header if actions are used that filter columns
    column_changing = ('columns', 'collapsepriority', 'collapsemerge')
    if any(action[0] in column_changing for action in actionset):
        if args.labels:
            labels = outmvf.metadata['labels'][:]
        else:
            labels = list(outmvf.metadata['samples'])
        for actionname, _, _, actionarg in actionset:
            if actionname == 'columns':
                labels = [labels[x] for x in actionarg[0]]
            elif actionname in ('collapsepriority', 'collapsemerge'):
                # Every column after the first in the group is dropped.
                dropped = actionarg[0][1:]
                labels = [
                    label for x, label in enumerate(labels)
                    if x not in dropped
                ]
        if args.labels:
            oldindices = mvf.get_sample_indices(labels)
        else:
            oldindices = labels[:]
        newsamples = {}
        for i, _ in enumerate(labels):
            newsamples[i] = mvf.metadata['samples'][oldindices[i]]
        outmvf.metadata['samples'] = newsamples.copy()
        outmvf.metadata['labels'] = labels[:]
    outmvf.write_data(outmvf.get_header())
    # End header editing
    linebuffer = []
    nbuffer = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose is True:
            sys.stdout.write(" {} {}".format(alleles, linetype))
        for _, actiontype, actionfunc, _ in actionset:
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
            elif actiontype == 'location':
                if actionfunc([chrom, pos]) is False:
                    linefail = True
            if linefail:
                break
        if linefail is False and transformed:
            if linetype == 'full':
                alleles = mvf.encode(alleles)
            if not alleles:
                # Nothing left after re-encoding: treat as a failed line.
                linefail = True
        if linefail:
            # Failed entries (including ones emptied by re-encoding, which
            # used to be written anyway) never reach the output buffer.
            if args.verbose:
                sys.stdout.write("FAIL\n")
            continue
        nbuffer += 1
        linebuffer.append((chrom, pos, (alleles, )))
        if args.verbose:
            sys.stdout.write("{}\n".format(alleles))
        if nbuffer == args.line_buffer:
            outmvf.write_entries(linebuffer)
            linebuffer = []
            nbuffer = 0
    if linebuffer:
        outmvf.write_entries(linebuffer)
        linebuffer = []
    return ''

Example #6

Show file

File: mvfmaf.py Project: peaselab/mvftools

def maf2mvf(args):
    """Convert a MAF alignment into an MVF file.

    Sample labels are taken from ``args.sample_tags`` (the part of each
    comma-separated tag before ``:``), with ``args.ref_tag`` moved to the
    first column.  Every alignment block yielded by ``MultiAlignFile`` is
    walked column by column, and each column for which
    ``encode_mvfstring`` returns a non-empty string is written as an MVF
    entry on contig 1.

    Arguments:
        args: parsed command-line namespace; reads ``out``, ``overwrite``,
            ``sample_tags``, ``ref_tag``, ``line_buffer`` and
            ``command_string``, calls ``args.qprint``, and is passed whole
            to ``MultiAlignFile``.

    Returns:
        An empty string.

    Raises:
        IndexError: if ``args.ref_tag`` is not among the sample tags.
    """
    # ESTABLISH MAF
    args.qprint("Starting ConvertMAF2MVF")
    maf = MultiAlignFile(args)
    args.qprint("MAF Established")
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    args.qprint("MVF output initialized")
    # PROCESS SAMPLE INFO
    contig_translate = {1: 1}
    samplelabels = [s.split(':')[0] for s in args.sample_tags.split(',')]
    args.qprint("Sample tags processed: {}".format(samplelabels))
    if args.ref_tag not in samplelabels:
        raise IndexError("--ref-tag not in the tags listed in --sample-tags")
    # The reference sample always occupies the first column.
    samplelabels.remove(args.ref_tag)
    samplelabels.insert(0, args.ref_tag)
    mvf.sample_ids = samplelabels[:]
    mvf.sample_indices = list(range(len(mvf.sample_ids)))
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label, 'index': i}
    mvf.reset_max_sample()
    mvf.metadata['sourceformat'] = maf.metadata['sourceformat']
    mvf.metadata.notes.append(args.command_string)
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    args.qprint("MAF Headers Written")
    mvfentries = []
    nentry = 0
    total_entries = 0
    args.qprint("Begin data conversion")
    for pos, length, msa in maf:
        # Samples absent from this alignment block are filled with gaps.
        for sname in samplelabels:
            if sname not in msa:
                msa[sname] = '-' * length
        msa['contig'] = 1
        for i in range(length):
            mvf_alleles = encode_mvfstring(
                ''.join(msa[s][i].strip() for s in samplelabels))
            if mvf_alleles:
                mvfentries.append(
                    (contig_translate.get(msa['contig']),
                     pos + i, (mvf_alleles,)))
                nentry += 1
                if nentry == args.line_buffer:
                    total_entries += nentry
                    mvf.write_entries(mvfentries, encoded=True)
                    args.qprint("{} entries written".format(total_entries))
                    mvfentries = []
                    nentry = 0
    if mvfentries:
        total_entries += nentry
        # Same pre-encoded entries as the in-loop flush, so same flag.
        mvf.write_entries(mvfentries, encoded=True)
        args.qprint("{} entries written".format(total_entries))
    args.qprint("Complete.")
    return ''

Example #7

Show file

File: mvfanalysis.py Project: peaselab/mvftools

def calc_pairwise_distances(args):
    """Count pairwise character differences between samples per window.

    Entries of the MVF at ``args.mvf`` are tallied into per-pair counts of
    two-character strings (one character from each sample of the pair).
    Whenever ``same_window`` reports that an entry has left the current
    window, the counts are converted by the function returned from
    ``get_pairwise_function`` into ``ndiff``/``ntotal``/``dist`` columns
    for that window.  All rows are written through ``OutputFile`` to
    ``args.out``; with ``args.emit_counts`` the raw count tables are also
    written to ``args.out + ".pairwisecounts"``.

    Arguments:
        args: parsed command-line namespace; reads ``mvf``, ``out``,
            ``sample_indices``, ``sample_labels``, ``data_type``,
            ``ambig``, ``mincoverage``, ``windowsize`` and
            ``emit_counts``, and calls ``args.qprint``.
            ``args.data_type`` may be overwritten depending on the MVF
            flavor.

    Returns:
        An empty string.
    """
    args.qprint("Running CalcPairwiseDistances")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF: Read")
    data = {}
    data_order = []
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    args.qprint("Calculating for sample columns: {}".format(
        list(sample_indices)))
    current_contig = None
    current_position = 0
    data_in_buffer = False
    sample_pairs = [tuple(x) for x in combinations(sample_indices, 2)]
    # Counts of two-character strings seen for each sample pair in the
    # current window.
    base_matches = dict((x, {}) for x in sample_pairs)
    # Counts from invariant sites, which apply to every pair alike.
    all_match = {}
    if mvf.flavor == 'dna':
        allele_frames = (0, )
        args.data_type = 'dna'
    elif mvf.flavor == 'prot':
        allele_frames = (0, )
        # NOTE(review): a prot MVF is given data type 'dna' here --
        # confirm this is intended rather than 'prot'.
        args.data_type = 'dna'
    elif mvf.flavor == 'codon':
        if args.data_type == 'prot':
            allele_frames = (0, )
        else:
            allele_frames = (1, 2, 3)
            args.data_type = 'dna'
    args.qprint("MVF flavor is: {}".format(mvf.flavor))
    args.qprint("Data type is: {}".format(args.data_type))
    args.qprint("Ambiguous mode: {}".format(args.ambig))
    args.qprint("Processing MVF Records")
    pwdistance_function = get_pairwise_function(args.data_type, args.ambig)
    if args.emit_counts:
        outfile_emitcounts = open(args.out + ".pairwisecounts", 'w')
    for contig, pos, allelesets in mvf.iterentries(decode=None):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                # Advance to the window that contains the first position.
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            # The entry starts a new window: close out the current one.
            data[(current_contig, current_position)] = {
                'contig': current_contig,
                'position': current_position
            }
            data_order.append((current_contig, current_position))
            all_diff, all_total = pwdistance_function(all_match)
            for samplepair in base_matches:
                ndiff, ntotal = pwdistance_function(base_matches[samplepair])
                taxa = "{};{}".format(sample_labels[samplepair[0]],
                                      sample_labels[samplepair[1]])
                data[(current_contig, current_position)].update({
                    '{};ndiff'.format(taxa):
                    ndiff + all_diff,
                    '{};ntotal'.format(taxa):
                    ntotal + all_total,
                    '{};dist'.format(taxa):
                    zerodiv(ndiff + all_diff, ntotal + all_total)
                })
            # Emit the raw counts BEFORE the window cursor moves, so the
            # table is labelled with the window the counts belong to.
            if args.emit_counts:
                args.qprint("Writing Full Count Table")
                for p0, p1 in base_matches:
                    outfile_emitcounts.write("#{}\t{}\t{}\t{}\n{}\n".format(
                        p0, p1, current_position, current_contig, "\n".join([
                            "{} {}".format(x,
                                           (base_matches[(p0, p1)].get(x, 0) +
                                            all_match.get(x, 0)))
                            for x in set(base_matches[(p0,
                                                       p1)]).union(all_match)
                        ])))
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
                if args.windowsize > 0:
                    while pos > current_position + args.windowsize - 1:
                        current_position += args.windowsize
            else:
                current_position += args.windowsize
            base_matches = dict((x, {}) for x in sample_pairs)
            all_match = {}
            data_in_buffer = False
        for iframe in allele_frames:
            alleles = allelesets[iframe]
            if len(alleles) == 1:
                # Invariant site: one character shared by all samples.
                all_match["{0}{0}".format(alleles)] = (
                    all_match.get("{0}{0}".format(alleles), 0) + 1)
                data_in_buffer = True
                continue
            if alleles[1] == '+':
                # "<ref>+<char><column>": only the reference and one other
                # column are read from this encoding.
                if alleles[2] in 'X-':
                    continue
                samplepair = (0, int(alleles[3:]))
                if any(x not in sample_indices for x in samplepair):
                    continue
                basepair = "{0}{1}".format(alleles[0], alleles[2])
                base_matches[samplepair][basepair] = (
                    base_matches[samplepair].get(basepair, 0) + 1)
                data_in_buffer = True
                continue
            alleles = mvf.decode(alleles)
            valid_positions = [
                i for i, x in enumerate(alleles)
                if x not in 'X-' and i in sample_indices
            ]
            # NOTE(review): these asserts hard-code four sample columns
            # and non-missing first two columns; they look like debugging
            # checks and are skipped under ``python -O`` -- confirm
            # whether they should remain.
            assert len(alleles) == 4
            assert alleles[0] not in 'X-', alleles
            assert alleles[1] not in 'X-', alleles
            for i, j in combinations(valid_positions, 2):
                samplepair = (i, j)
                basepair = "{0}{1}".format(alleles[i], alleles[j])
                base_matches[samplepair][basepair] = (
                    base_matches[samplepair].get(basepair, 0) + 1)
            data_in_buffer = True
    if data_in_buffer is True:
        # Flush the final (still open) window.  Debug prints that used to
        # sit here read ``samplepair``, which may be unbound or may not be
        # a key of base_matches at this point, so they were removed.
        # Check whether, windows, contigs, or total
        if args.windowsize == 0:
            current_contig = 'TOTAL'
            current_position = 0
        elif args.windowsize == -1:
            current_position = 0
        data[(current_contig, current_position)] = {
            'contig': current_contig,
            'position': current_position
        }
        data_order.append((current_contig, current_position))
        all_diff, all_total = pwdistance_function(all_match)
        for samplepair in base_matches:
            ndiff, ntotal = pwdistance_function(base_matches[samplepair])
            taxa = "{};{}".format(sample_labels[samplepair[0]],
                                  sample_labels[samplepair[1]])
            data[(current_contig, current_position)].update({
                '{};ndiff'.format(taxa):
                ndiff + all_diff,
                '{};ntotal'.format(taxa):
                ntotal + all_total,
                '{};dist'.format(taxa):
                zerodiv(ndiff + all_diff, ntotal + all_total)
            })
        if args.emit_counts:
            args.qprint("Writing Full Count Table")
            for p0, p1 in base_matches:
                outfile_emitcounts.write("#{}\t{}\t{}\t{}\n{}\n".format(
                    p0, p1, current_position, current_contig, "\n".join([
                        "{} {}".format(x, (base_matches[(p0, p1)].get(x, 0) +
                                           all_match.get(x, 0)))
                        for x in set(base_matches[(p0, p1)]).union(all_match)
                    ])))
    args.qprint("Writing Output")
    headers = ['contig', 'position']
    for samplepair in sample_pairs:
        headers.extend([
            '{};{};{}'.format(sample_labels[samplepair[0]],
                              sample_labels[samplepair[1]], x)
            for x in ('ndiff', 'ntotal', 'dist')
        ])
    outfile = OutputFile(path=args.out, headers=headers)
    for okey in data_order:
        outfile.write_entry(data[okey])
    if args.emit_counts:
        outfile_emitcounts.close()
    return ''

Example #8

Show file

File: mvfanalysis.py Project: peaselab/mvftools

def calc_character_count(args):
    """Count the number of and relative rate of certain bases
       spatially along chromosomes.

    Entries of ``args.mvf`` are grouped into windows of
    ``args.windowsize``.  For every window and every selected sample the
    function tallies how many alleles are contained in ``args.base_match``
    and in ``args.base_total`` (``None`` means "count every allele") and
    writes one row per window to ``args.out`` with the columns
    ``<label>.match``, ``<label>.total`` and ``<label>.prop``.

    Fields read from ``args``: ``mvf``, ``out``, ``base_match``,
    ``base_total``, ``sample_indices``, ``sample_labels``, ``contig_ids``,
    ``contig_labels``, ``mincoverage`` and ``windowsize``.

    Side effect: ``args.base_match`` and ``args.base_total`` are replaced
    by the expanded character set when they hold one of the keywords
    handled by ``proc_special_word``.

    Returns:
        An empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    data_order = []
    current_contig = None
    current_position = 0
    # Counters shared by every sample; fed by one-character allele strings.
    all_match = 0
    all_total = 0
    data_in_buffer = False

    def proc_special_word(argx):
        """Expand a character-set keyword via MLIB.validchars."""
        if argx == 'dna':
            argx = MLIB.validchars['dna']
        elif argx == 'dnaambig2':
            argx = MLIB.validchars['dna+ambig2']
        elif argx == 'dnaambig3':
            argx = MLIB.validchars['dna+ambig3']
        elif argx == 'dnaambigall':
            argx = MLIB.validchars['dna+ambigall']
        elif argx == 'prot':
            argx = MLIB.validchars['amino']
        return argx

    def window_record(contig_id, position, matches, totals,
                      shared_match, shared_total):
        """Build the output row for one finished window."""
        record = {'contig': contig_id, 'position': position}
        for label in matches:
            nmatch = matches[label] + shared_match
            ntotal = totals[label] + shared_total
            record.update([
                (label + '.match', nmatch),
                (label + '.total', ntotal),
                (label + '.prop',
                 (float(nmatch) / float(ntotal)) if ntotal > 0 else 0)
            ])
        return record

    # Set up base matching from special words
    args.base_match = proc_special_word(args.base_match)
    args.base_total = proc_special_word(args.base_total)
    # Set up sample indices
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    # sample_labels is requested for exactly the selected indices, so pair
    # the two positionally instead of indexing sample_labels with the raw
    # sample index (which fails or mislabels for a subset/reordered
    # selection).
    # NOTE(review): assumes get_sample_ids(indices=...) returns the labels
    # in the order of the indices passed in - confirm against mvfbase.
    index_labels = list(zip(sample_indices, sample_labels))
    count_labels = [label for _, label in index_labels]
    # Set up contig ids
    if args.contig_ids is not None:
        contig_indices = mvf.get_contig_indices(
            ids=args.contig_ids[0].split(","))
    elif args.contig_labels is not None:
        contig_indices = mvf.get_contig_indices(
            labels=args.contig_labels[0].split(","))
    else:
        contig_indices = None
    match_counts = dict.fromkeys(count_labels, 0)
    total_counts = dict.fromkeys(count_labels, 0)
    for contig, pos, allelesets in mvf.iterentries(
            decode=False, contig_indices=contig_indices):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # Establish first contig and move to the window holding `pos`
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            # Close the finished window and start a fresh one.
            # NOTE(review): the entry that triggers the window change is
            # not tallied (counting only happens in the `else` branch
            # below) and the position advances by a single window; this
            # mirrors the original behaviour - confirm it is intended.
            window_key = (current_contig, current_position)
            data[window_key] = window_record(
                current_contig, current_position, match_counts,
                total_counts, all_match, all_total)
            data_order.append(window_key)
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1 else
                                     args.windowsize)
            match_counts = dict.fromkeys(count_labels, 0)
            total_counts = dict.fromkeys(count_labels, 0)
            all_total = 0
            all_match = 0
            data_in_buffer = False
        else:
            alleles = allelesets[0]
            if len(alleles) == 1:
                # A one-character allele string is tallied once in the
                # shared counters, which are added to every sample.
                if args.base_match is None or alleles in args.base_match:
                    all_match += 1
                if args.base_total is None or alleles in args.base_total:
                    all_total += 1
            else:
                alleles = mvf.decode(alleles)
                for i, label in index_labels:
                    if (args.base_match is None
                            or alleles[i] in args.base_match):
                        match_counts[label] += 1
                    if (args.base_total is None
                            or alleles[i] in args.base_total):
                        total_counts[label] += 1
            data_in_buffer = True
    # Flush the window that was still open when the input ended
    if data_in_buffer:
        window_key = (current_contig, current_position)
        data[window_key] = window_record(
            current_contig, current_position, match_counts,
            total_counts, all_match, all_total)
        data_order.append(window_key)
    # WRITE OUTPUT
    headers = ['contig', 'position']
    for label in sample_labels:
        headers.extend([label + x for x in ('.match', '.total', '.prop')])
    outfile = OutputFile(path=args.out, headers=headers)
    for okey in data_order:
        outfile.write_entry(data[okey])
    return ''

Example #9

Show file

def merge_mvf(args):
    """Merge the MVF files listed in ``args.mvf`` into ``args.out``.

    The output header is copied from ``args.main_header_file`` (the first
    input when unset).  Samples are matched across inputs by sample id and
    contigs by contig label; every input gets an ``MvfTransformer`` that
    records how its local sample columns and contig indices map onto the
    merged file.  Entries are then combined contig by contig and position
    by position, starting from an all-gap (``'-'``) row.

    Side effect: ``args.main_header_file`` is overwritten with the integer
    position of the header file within ``args.mvf``.

    Raises:
        RuntimeError: if ``args.main_header_file`` is not among the inputs,
            or if an incoming base (other than ``'-'``/``'X'``) differs
            from a non-gap base already merged for the same column.

    Returns:
        An empty string.
    """
    args.qprint("Running MergeMVF")
    if any(path.endswith('.gz') for path in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # The output header starts as a copy of one chosen input header
    args.qprint("Reading First File and Establishing Output")
    if not args.main_header_file:
        args.main_header_file = 0
    elif args.main_header_file in args.mvf:
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        raise RuntimeError("{} not found in files".format(
            args.main_header_file))
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.copy_header(first_mvf)
    # Read every input header and extend the unified header as needed
    transformers = []
    mvfmetadata = []
    inputfiles = []
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        # Maps this file's samples/contigs onto the merged numbering
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname,
                               'read',
                               contigindex=(not args.skip_index))
        if args.skip_index:
            mvf.read_index_file()
        mvf.reset_max_contig()
        mvfmetadata.append(mvf.metadata)
        for local_sindex, sample_id in enumerate(mvf.get_sample_ids()):
            if sample_id not in concatmvf.get_sample_ids():
                # Unknown sample: register it as a new output column
                merged_sindex = concatmvf.max_sample_index + 0
                concatmvf.max_sample_index += 1
                concatmvf.sample_indices.append(merged_sindex)
                concatmvf.sample_ids.append(sample_id)
                concatmvf.sample_data[merged_sindex] = {'id': sample_id}
                concatmvf.sample_id_to_index[sample_id] = merged_sindex
            transformer.set_label(local_sindex,
                                  concatmvf.sample_id_to_index[sample_id])
        for cindex in mvf.contig_indices:
            local_contig = mvf.contig_data[cindex]
            contig_label = local_contig['label']
            if contig_label in concatmvf.contig_label_to_index:
                new_cindex = concatmvf.contig_label_to_index[contig_label]
            else:
                # Keep the local id when it is still free in the output
                if local_contig['id'] not in concatmvf.contig_ids:
                    new_cindex = local_contig['id']
                else:
                    new_cindex = concatmvf.get_next_contig_index()
                concatmvf.contig_data[new_cindex] = local_contig.copy()
            transformer.set_contig(cindex, new_cindex)
        transformers.append(transformer)
        inputfiles.append(mvf)
    # Write output header
    args.qprint("Writing headers to merge output")
    concatmvf.reset_max_sample()
    concatmvf.notes.append(args.command_string)
    concatmvf.write_data(concatmvf.get_header())
    # Merge the entries of all inputs, one output contig at a time
    blank_entry = '-' * len(concatmvf.sample_indices)
    for cons_contig in concatmvf.contig_indices:
        merged_rows = {}
        args.qprint("Merging Contig Index: {}".format(cons_contig))
        for transformer, mvffile in zip(transformers, inputfiles):
            if cons_contig not in transformer.contigs:
                continue
            localcontig = transformer.contigs[cons_contig]
            if 'idx' not in mvffile.contig_data[localcontig]:
                print("not found")
                continue
            for _, pos, allelesets in mvffile.itercontigentries(localcontig,
                                                                decode=True):
                if pos not in merged_rows:
                    merged_rows[pos] = blank_entry[:]
                for column, base in enumerate(allelesets[0]):
                    xcoord = transformer.labels_rev[column]
                    present = merged_rows[pos][xcoord]
                    if present != '-':
                        # Column already filled: tolerate agreement or an
                        # uninformative incoming base, otherwise fail.
                        if present == base or base in '-X':
                            continue
                        raise RuntimeError(
                            ("Merging columns have two different bases: "
                             "{} {} {}").format(pos, present, base))
                    row = merged_rows[pos]
                    merged_rows[pos] = (row[:xcoord] + base +
                                        row[xcoord + 1:])
        if merged_rows:
            concatmvf.write_entries(
                ((cons_contig, coord, (entry, ))
                 for coord, entry in sorted(merged_rows.items())),
                encoded=False)
        args.qprint("Entries written for contig {}: {}".format(
            cons_contig, len(merged_rows)))
    return ''

Example #10

Show file

def merge_mvf(args):
    """Merge the MVF files listed in ``args.mvf`` into ``args.out``.

    The output metadata starts as a (shallow) copy of the metadata of
    ``args.main_header_file`` (the first input when unset).  Samples and
    contigs are matched across inputs by label; every input gets an
    ``MvfTransformer`` recording how its local sample columns and contig
    ids map onto the merged file.  Entries are then combined contig by
    contig and position by position, starting from an all-gap row.

    Side effect: ``args.main_header_file`` is overwritten with the integer
    position of the header file within ``args.mvf``.

    Raises:
        RuntimeError: if ``args.main_header_file`` is not among the inputs,
            or if an incoming base (other than ``'-'``/``'X'``) differs
            from a non-gap base already merged for the same column.

    Returns:
        An empty string.
    """
    args.qprint("Running MergeMVF")
    if any(path.endswith('.gz') for path in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # The output metadata is seeded from one chosen input file
    args.qprint("Reading First File and Establishing Output")
    if not args.main_header_file:
        args.main_header_file = 0
    elif args.main_header_file in args.mvf:
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        raise RuntimeError("{} not found in files".format(
            args.main_header_file))
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    # Read every input header and extend the unified header as needed
    transformers = []
    mvfmetadata = []
    # contig label -> contig id in the merged file
    concatmvf_reverse_contig = {
        info['label']: cid
        for cid, info in concatmvf.metadata['contigs'].items()}
    inputfiles = []
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        # Maps this file's samples/contigs onto the merged numbering
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname,
                               'read',
                               contigindex=(not args.skip_index))
        if args.skip_index:
            mvf.read_index_file()
        mvf.reset_max_contig_id()
        mvfmetadata.append(mvf.metadata)
        for local_col, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                # Unknown sample: append it as a new output column
                concatmvf.metadata['labels'].append(label)
                new_col = concatmvf.metadata['labels'].index(label)
                concatmvf.metadata['samples'][new_col] = {'label': label}
            transformer.set_label(
                local_col, concatmvf.metadata['labels'].index(label))
        for contigid, contigdata in mvf.metadata['contigs'].items():
            contig_label = contigdata['label']
            if contig_label in concatmvf_reverse_contig:
                newid = concatmvf_reverse_contig[contig_label]
            else:
                # Keep the local id when it is still free in the output
                if contigid not in concatmvf.metadata['contigs']:
                    newid = contigid
                else:
                    newid = concatmvf.get_next_contig_id()
                concatmvf.metadata['contigs'][newid] = contigdata
                concatmvf_reverse_contig[contig_label] = newid
            transformer.set_contig(contigid, newid)
        transformers.append(transformer)
        inputfiles.append(mvf)
    # Write output header
    args.qprint("Writing headers to merge output")
    concatmvf.reset_ncol()
    concatmvf.write_data(concatmvf.get_header())
    contigs = concatmvf.metadata['contigs']
    # Merge the entries of all inputs, one output contig at a time
    blank_entry = '-' * len(concatmvf.metadata['samples'])
    for current_contig in contigs:
        merged_rows = {}
        args.qprint("Merging Contig: {}".format(current_contig))
        for transformer, mvffile in zip(transformers, inputfiles):
            if current_contig not in transformer.contigs:
                continue
            localcontig = transformer.contigs[current_contig]
            for _, pos, allelesets in mvffile.itercontigentries(
                    localcontig, decode=True):
                if pos not in merged_rows:
                    merged_rows[pos] = blank_entry[:]
                for column, base in enumerate(allelesets[0]):
                    xcoord = transformer.labels_rev[column]
                    present = merged_rows[pos][xcoord]
                    if present != '-':
                        # Column already filled: tolerate agreement or an
                        # uninformative incoming base, otherwise fail.
                        if present == base or base in ('-', 'X'):
                            continue
                        raise RuntimeError(
                            "Merging columns have two different bases: {} {} {}"
                            .format(pos, present, base))
                    row = merged_rows[pos]
                    merged_rows[pos] = (row[:xcoord] + base +
                                        row[xcoord + 1:])
        # Written for every contig, even when no entries were merged
        concatmvf.write_entries(
            ((current_contig, coord, (entry, ))
             for coord, entry in sorted(merged_rows.items())),
            encoded=False)
        args.qprint("Entries written for contig {}: {}".format(
            current_contig, len(merged_rows)))
    return ''

Example #11

Show file

File: mvfwindowtree.py Project: luhuimeng/mvftools

def infer_window_tree(args):
    """Infer a tree with RAxML for each window of an MVF file.

    Entries of ``args.mvf`` are accumulated in a ``WindowData`` object.
    Whenever an entry no longer belongs to the current window (as decided
    by ``same_window``), the finished window is handed to
    ``maketree_raxml`` and the resulting row is written to ``args.out``.
    A ranked tally of the observed topologies is written to
    ``args.out + '.counts'``.

    Side effects: creates ``args.temp_dir`` when it does not exist and
    makes it the current working directory.

    Returns:
        An empty string.
    """
    # ESTABLISH FILE OBJECTS
    mvf = MultiVariantFile(args.mvf, 'read')
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    tree_headers = [
        'contig', 'windowstart', 'windowsize', 'tree', 'topology', 'topoid',
        # 'templabels', ### USED FOR DEBUGGING ###
        'alignlength', 'aligndepth', 'status'
    ]
    treefile = OutputFile(args.out, headers=tree_headers)
    topofile = OutputFile(args.out + '.counts',
                          headers=['rank', 'topology', 'count'])
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Output files are opened above, before the working directory changes
    if not os.path.exists(args.temp_dir):
        os.mkdir(args.temp_dir)
    os.chdir(args.temp_dir)
    # SETUP PARAMS
    main_labels = mvf.get_sample_labels(sample_indices)
    if args.choose_allele in ['randomboth', 'majorminor']:
        # Two sequences per sample: all 'a' labels, then all 'b' labels
        main_labels = [label + x for x in ['a', 'b'] for label in main_labels]
    if args.root_with is not None:
        root_with = args.root_with.split(',')
    else:
        root_with = None
    params = {
        'outgroups': args.raxml_outgroups or [],
        'rootwith': root_with,
        'minsites': args.min_sites,
        'minseqcoverage': args.min_seq_coverage,
        'mindepth': args.min_depth,
        'raxmlpath': args.raxml_path,
        'raxmlopts': args.raxml_opts,
        'duplicateseq': args.duplicate_seq,
        'model': args.raxml_model,
        'bootstrap': args.bootstrap,
        'windowsize': args.windowsize,
        'chooseallele': args.choose_allele,
        'tempdir': args.temp_dir,
        'tempprefix': args.temp_prefix
    }
    topo_ids = {}
    topo_counts = {}

    def record_tree(tree_entry):
        """Tally the topology, give it a numeric id and write the row."""
        topo = tree_entry["topology"]
        topo_counts[topo] = topo_counts.get(topo, 0) + 1
        if topo not in topo_ids:
            # Ids are handed out in order of first appearance, from 0
            topo_ids[topo] = (max(topo_ids.values()) + 1
                              if topo_ids else 0)
        tree_entry["topoid"] = topo_ids[topo]
        treefile.write_entry(tree_entry)

    # WINDOW START ITERATION
    verify_raxml(params)
    current_contig = ''
    current_position = 0
    window_data = None
    skip_contig = False
    for contig, pos, allelesets in mvf.iterentries(contigs=contig_ids,
                                                   subset=sample_indices,
                                                   quiet=args.quiet,
                                                   no_invariant=False,
                                                   no_ambig=False,
                                                   no_gap=False,
                                                   decode=True):
        # After a rejected window, skip entries of the current contig
        if skip_contig is True and current_contig == contig:
            continue
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            skip_contig = False
            if window_data is not None:
                entry = window_data.maketree_raxml(params)
                if entry['status'] == 'ok':
                    record_tree(entry)
                else:
                    if args.output_empty:
                        treefile.write_entry(entry)
                    if args.windowsize != -1:
                        skip_contig = True
                if contig == current_contig and args.windowsize > 0:
                    current_position = current_position + args.windowsize
                else:
                    current_position = 0
            current_contig = contig[:]
            if args.output_contig_labels is not None:
                contig_name = mvf.get_contig_labels(ids=current_contig)
            else:
                contig_name = current_contig[:]
            if args.windowsize == -1:
                window_start = '-1'
            else:
                window_start = current_position
            window_data = WindowData(
                window_params={
                    'contigname': contig_name,
                    "windowstart": window_start,
                    "windowsize": args.windowsize,
                    "labels": main_labels[:]
                })
        # ADD ALLELES (only entries of 'dna' flavor files are appended)
        if mvf.flavor == 'dna':
            if args.choose_allele != 'none':
                allelesets[0] = hapsplit(allelesets[0], args.choose_allele)
            window_data.append_alleles(allelesets[0], mindepth=args.min_depth)
    # LAST LOOP: process the window still open when the input ended
    if window_data:
        entry = window_data.maketree_raxml(params)
        if entry['status'] == 'ok':
            record_tree(entry)
        elif args.output_empty:
            treefile.write_entry(entry)
        window_data = None
    # END WINDOW ITERATION
    topo_list = sorted([(count, topo) for topo, count in topo_counts.items()],
                       reverse=True)
    for rank, (value, topo) in enumerate(topo_list):
        topofile.write_entry({'rank': rank, 'count': value, 'topology': topo})
    return ''

Example #12

Show file

def infer_window_tree(args):
    """Infer a tree for each window of an MVF file.

    Entries of ``args.mvf`` are accumulated in a ``WindowData`` object.
    Whenever an entry no longer belongs to the current window (as decided
    by ``same_window``), the finished window is handed to
    ``maketree_raxml`` and the resulting row is written to ``args.out``.
    A ranked tally of the observed topologies is written to
    ``args.out + '.counts'``.

    ``args.windowsize`` of ``-1`` is reported as "whole contig" and ``0``
    as "whole genome"; for ``0`` a single window named ``'all'`` is
    created before iteration starts.

    Side effects: creates ``args.temp_dir`` (including missing parent
    directories) and makes it the current working directory.

    Returns:
        An empty string.
    """
    args.qprint("Running InferTree")
    # ESTABLISH FILE OBJECTS
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Read MVF File: {}".format(args.mvf))
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    treefile = OutputFile(
        args.out,
        headers=['contig', 'windowstart', 'windowsize', 'tree',
                 'topology', 'topoid',
                 # 'templabels', ### USED FOR DEBUGGING ###
                 'alignlength', 'aligndepth', 'status'])
    topofile = OutputFile(args.out + '.counts',
                          headers=['rank', 'topology', 'count'])
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in
                          args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # makedirs(exist_ok=True) avoids the check-then-create race and also
    # works when the parent of the temp directory does not exist yet.
    os.makedirs(args.temp_dir, exist_ok=True)
    os.chdir(args.temp_dir)
    # SETUP PARAMS
    main_labels = mvf.get_sample_ids(sample_indices)
    if args.choose_allele in ['randomboth', 'majorminor']:
        # Two sequences per sample: all 'a' labels, then all 'b' labels
        main_labels = [label + x for x in ['a', 'b'] for label in main_labels]
    params = {
        'bootstrap': args.bootstrap,
        'chooseallele': args.choose_allele,
        'collapse_polytomies': args.collapse_polytomies,
        'duplicateseq': args.duplicate_seq,
        'engine': args.engine,
        'engine_path': args.engine_path,
        'engine_opts': args.engine_opts,
        'mindepth': args.min_depth,
        'minseqcoverage': args.min_seq_coverage,
        'minsites': args.min_sites,
        'model': args.model,
        'outgroups': args.raxml_outgroups,
        'rootwith': (args.root_with.split(',')
                     if args.root_with is not None
                     else []),
        'tempdir': args.temp_dir,
        'tempprefix': args.temp_prefix,
        'windowsize': args.windowsize,
        }
    # DEFAULT MODEL
    if params['model'] is None:
        if params['engine'] == 'raxml':
            params['model'] = 'GTRGAMMA'
        elif params['engine'] == 'raxml-ng':
            params['model'] = "GTR+G"
    topo_ids = {}
    topo_counts = {}

    def record_tree(tree_entry):
        """Tally the topology, give it a numeric id and write the row."""
        topo = tree_entry["topology"]
        topo_counts[topo] = topo_counts.get(topo, 0) + 1
        if topo not in topo_ids:
            # Ids are handed out in order of first appearance, from 0
            topo_ids[topo] = (max(topo_ids.values()) + 1
                              if topo_ids else 0)
        tree_entry["topoid"] = topo_ids[topo]
        treefile.write_entry(tree_entry)

    # WINDOW START ITERATION
    verify_raxml(params)
    args.qprint("RAxML Found.")
    current_contig = None
    current_position = 0
    window_data = None
    args.qprint("Processing Records")
    if args.windowsize == -1:
        windowsizename = "whole contig"
    elif args.windowsize == 0:
        windowsizename = "whole genome"
        # A single window spans the whole file, so create it up front
        window_data = WindowData(window_params={
            'contigname': 'all',
            "windowstart": 0,
            "windowsize": 0,
            "labels": main_labels[:]})
    else:
        windowsizename = "window size={}".format(args.windowsize)
    for contig, pos, allelesets in mvf.iterentries(
            contig_ids=contig_ids, subset=sample_indices,
            no_invariant=False, no_ambig=False, no_gap=False, decode=True):
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            if window_data is not None:
                args.qprint(("Making tree for {} "
                             "at contig {} position {}").format(
                                 windowsizename,
                                 current_contig,
                                 current_position))
                entry = window_data.maketree_raxml(params)
                if entry['status'] != 'ok':
                    if args.output_empty:
                        treefile.write_entry(entry)
                    args.qprint(
                        "TREE REJECTED with error code: {} ({})".format(
                            entry['status'], entry.get('comment', "None")))
                else:
                    args.qprint("Tree completed.")
                    record_tree(entry)
                # Advance within the same contig, restart at 0 otherwise
                current_position = current_position + args.windowsize if (
                    contig == current_contig and args.windowsize > 0) else 0
            current_contig = contig[:]
            window_data = WindowData(window_params={
                'contigname': (mvf.get_contig_labels(ids=current_contig) if
                               args.output_contig_labels is not None else
                               current_contig[:]),
                "windowstart": ('-1' if args.windowsize == -1
                                else current_position + 0),
                "windowsize": args.windowsize,
                "labels": main_labels[:]})
        # ADD ALLELES
        if mvf.flavor == 'dna':
            if args.choose_allele != 'none':
                allelesets[0] = hapsplit(allelesets[0], args.choose_allele)
            window_data.append_alleles(allelesets[0], mindepth=args.min_depth)
        elif mvf.flavor == 'codon':
            # Codon entries: allele sets 1-3 are each appended in turn
            for i in (1, 2, 3):
                if args.choose_allele != 'none':
                    allelesets[i] = hapsplit(allelesets[i], args.choose_allele)
                window_data.append_alleles(allelesets[i],
                                           mindepth=args.min_depth)
    # LAST LOOP: process the window still open when the input ended
    if window_data:
        entry = window_data.maketree_raxml(params)
        if entry['status'] != 'ok':
            if args.output_empty:
                treefile.write_entry(entry)
        else:
            record_tree(entry)
        window_data = None
    # END WINDOW ITERATION
    topo_list = sorted([(v, k) for k, v in topo_counts.items()],
                       reverse=True)
    for rank, [value, topo] in enumerate(topo_list):
        topofile.write_entry({'rank': rank, 'count': value, 'topology': topo})
    return ''

Example #13

Show file

File: mvfmaf.py Project: hj1994412/mvftools

def maf2mvf(args):
    """Convert a MAF alignment into an MVF file.

    Sample labels are taken from ``args.sample_tags`` (the text before the
    first ``':'`` of each tag) with ``args.ref_tag`` moved to the first
    column.  Each alignment column is encoded with ``encode_mvfstring``;
    columns for which the encoder returns an empty value are skipped.
    Entries are written to ``args.out`` in batches of ``args.line_buffer``.

    Raises:
        ValueError: if ``args.ref_tag`` is not among the sample labels.

    Returns:
        An empty string.
    """
    # ESTABLISH MAF
    maf = MultiAlignFile(args)
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS SAMPLE INFO
    contig_translate = {1: 1}
    samplelabels = [s.split(':')[0] for s in args.sample_tags]
    if args.ref_tag not in samplelabels:
        # list.remove() would raise the same exception type, but with a
        # message that does not say what is missing.
        raise ValueError(
            "reference tag {!r} not found in sample tags".format(
                args.ref_tag))
    # The reference sample always occupies the first column
    samplelabels.remove(args.ref_tag)
    samplelabels.insert(0, args.ref_tag)
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = maf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    for pos, length, msa in maf:
        # Samples absent from this alignment block become all-gap rows
        for sname in samplelabels:
            if sname not in msa:
                msa[sname] = '-' * length
        msa['contig'] = 1
        for i in range(length):
            mvf_alleles = encode_mvfstring(
                ''.join(msa[s][i].strip() for s in samplelabels))
            if mvf_alleles:
                mvfentries.append(
                    (contig_translate.get(msa['contig']),
                     pos + i, (mvf_alleles,)))
                if len(mvfentries) == args.line_buffer:
                    mvf.write_entries(mvfentries, encoded=True)
                    mvfentries = []
    if mvfentries:
        # The remaining entries are already encoded, exactly like the
        # buffered batches above, so pass the same flag explicitly.
        mvf.write_entries(mvfentries, encoded=True)
    return ''

Example #14

Show file

def mvf2fasta(args):
    """Export selected samples and regions of an MVF file as FASTA.

    Sequence data for each selected sample is first streamed into its own
    temporary file inside ``args.temp_dir``; once all entries have been
    read, the temporary files are concatenated into ``args.out`` and
    removed.

    Arguments:
        args: parsed options.  This function reads ``mvf``, ``out``,
            ``output_data``, ``regions``, ``sample_indices``,
            ``sample_labels``, ``label_type``, ``buffer``, ``temp_dir`` and
            ``quiet``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``args.output_data`` cannot be produced from the
            flavor of the input MVF.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    regions, max_region_coord, regionlabel = parse_regions_arg(
        args.regions, mvf.metadata['contigs'])
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Keep labels aligned with the selected columns.  Pairing the full label
    # list with a subset of indices would attach the wrong label to each
    # column and emit empty records for the unselected samples.
    all_labels = mvf.get_sample_labels()
    sample_labels = [all_labels[col] for col in sample_indices]
    skipcontig = ''
    tmp_files = {}
    try:
        for label in sample_labels:
            # Create the scratch file where it will later be removed from.
            tmp_path = os.path.join(
                args.temp_dir,
                "{}-{}.tmp".format(label, randint(1000000, 9999999)))
            tmp_files[label] = open(tmp_path, 'w+', args.buffer)
        labelwritten = dict.fromkeys(sample_labels, False)
        for contig, pos, allelesets in mvf.iterentries(
                contigs=list(max_region_coord),
                quiet=args.quiet,
                decode=True):
            if contig == skipcontig:
                continue
            # Past the last requested coordinate of this contig: ignore the
            # rest of it.
            if (contig not in max_region_coord) or (
                    max_region_coord[contig] is not None
                    and pos > max_region_coord[contig]):
                skipcontig = contig[:]
                continue
            inregion = any(
                contig == rcontig
                and (rstart is None or pos >= rstart)
                and (rstop is None or pos <= rstop)
                for rcontig, rstart, rstop, _ in regions)
            if not inregion:
                continue
            for col, label in zip(sample_indices, sample_labels):
                if not labelwritten[label]:
                    # Anything other than 'long' falls back to the bare
                    # label instead of leaving the header undefined.
                    if args.label_type == 'long':
                        xlabel = "{} region={}".format(label, regionlabel)
                    else:
                        xlabel = "{}".format(label)
                    tmp_files[label].write(">{}\n".format(xlabel))
                    labelwritten[label] = True
                if mvf.flavor == 'dna':
                    # 'X' in the MVF is written as 'N' in the FASTA output.
                    tmp_files[label].write(
                        "N" if allelesets[0][col] == 'X'
                        else allelesets[0][col])
                elif mvf.flavor in ('codon', 'prot') and (
                        args.output_data == 'prot'):
                    tmp_files[label].write(allelesets[0][col])
                elif mvf.flavor == 'codon' and args.output_data == 'dna':
                    # Allele sets 1-3 are read as the three codon bases.
                    codon = [
                        "N" if allelesets[x][col] == 'X'
                        else allelesets[x][col]
                        for x in (1, 2, 3)
                    ]
                    tmp_files[label].write(''.join(codon))
        with open(args.out, 'w') as outfile:
            for filehandler in tmp_files.values():
                filehandler.seek(0, 0)
                buff = filehandler.read(args.buffer)
                while buff:
                    outfile.write(buff)
                    buff = filehandler.read(args.buffer)
                outfile.write("\n")
    finally:
        # Always release and delete the scratch files, even after an error.
        for filehandler in tmp_files.values():
            filehandler.close()
            os.remove(filehandler.name)
    return ''

Example #15

Show file

def translate_mvf(args):
    """Translate a DNA-flavor MVF into protein or codon entries.

    Without ``args.gff`` the entries are grouped per contig and each group
    is passed to ``iter_codons``.  With ``args.gff`` the whole MVF is loaded
    into memory and every codon listed in the parsed GFF is looked up by
    coordinate (reversed and complemented for '-' strand codons), translated
    and written.  Codons whose amino-acid string consists only of '-' and
    'X' are skipped in both modes.

    Arguments:
        args: parsed options.  This function reads ``mvf``, ``out``,
            ``overwrite``, ``gff``, ``quiet``, ``output_data`` and
            ``line_buffer``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if the input MVF is not flavor 'dna'.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        gff = parse_gff_translate(args.gff, args)
        if not args.quiet:
            print("gff_processed")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    entrybuffer = []

    def buffer_entry(contigid, position, amino_acids, alleles):
        """Queue one translated entry; write out every line_buffer entries."""
        if args.output_data == 'protein':
            entrybuffer.append((contigid, position, (amino_acids, )))
        else:
            entrybuffer.append(
                (contigid, position,
                 (amino_acids, alleles[0], alleles[1], alleles[2])))
        if len(entrybuffer) == args.line_buffer:
            outmvf.write_entries(entrybuffer)
            del entrybuffer[:]

    def translate_contig(contigid, contig_entries):
        """Translate the buffered entries of one contig codon by codon."""
        # NOTE(review): assumes the first value yielded by iter_codons is the
        # position of the codon -- confirm against iter_codons.  The previous
        # code discarded it and stamped every codon with the position of
        # whatever entry the outer loop had read last.
        for codon_pos, amino_acids, alleles in iter_codons(
                contig_entries, mvf):
            if all(x in '-X' for x in amino_acids):
                continue
            buffer_entry(contigid, codon_pos, amino_acids, alleles)

    if not args.gff:
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid != current_contig:
                # Contig changed: translate what was collected so far.
                translate_contig(current_contig, inputbuffer)
                inputbuffer = []
                current_contig = contigid[:]
            inputbuffer.append((pos, allelesets))
        if inputbuffer:
            translate_contig(current_contig, inputbuffer)
    else:
        # Index the first allele set of every entry by contig and position.
        mvf_entries = {}
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            mvf_entries.setdefault(contigid, {})[pos] = allelesets[0]
        for contigname in sorted(gff):
            contigid = mvf.get_contig_ids(labels=contigname)[0]
            # A contig annotated in the GFF may have no MVF entries at all;
            # treat it like any other run of missing positions ('-').
            contig_entries = mvf_entries.get(contigid, {})
            for coords in sorted(gff[contigname]):
                reverse_strand = coords[3] == '-'
                # '-' strand codons are read in reverse coordinate order.
                codon_coords = coords[2::-1] if reverse_strand else coords[0:3]
                alleles = [contig_entries.get(x, '-') for x in codon_coords]
                if all(len(x) == 1 for x in alleles):
                    # Single-character alleles: translate the codon directly.
                    if reverse_strand:
                        alleles = [MLIB.complement_bases[x] for x in alleles]
                    amino_acids = translate(''.join(alleles))[0]
                else:
                    # Otherwise decode each position, translate the codon of
                    # every sample, then re-encode the result.
                    if reverse_strand:
                        decoded_alleles = [[
                            MLIB.complement_bases[y] for y in mvf.decode(x)
                        ] for x in alleles]
                        alleles = [
                            mvf.encode(''.join(x)) for x in decoded_alleles
                        ]
                    else:
                        decoded_alleles = [mvf.decode(x) for x in alleles]
                    amino_acids = [
                        translate(''.join(x)) for x in zip(*decoded_alleles)
                    ]
                    amino_acids = mvf.encode(''.join(
                        [x[0] for x in amino_acids]))
                if all(x in '-X' for x in amino_acids):
                    continue
                buffer_entry(contigid, coords[0], amino_acids, alleles)
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
    return ''

Example #16

Show file

File: mvfanalysis.py Project: peaselab/mvftools

def calc_dstat_combinations(args):
    """Calculate per-contig D-statistics for every sample trio and outgroup.

    For each MVF entry and each (trio, outgroup) combination, sites where
    all four alleles are in 'ATGC', exactly two distinct alleles are present
    and the outgroup allele occurs in the trio are classified as ABBA, BABA
    or BBAA and counted per contig.  One output row is written per
    combination that has at least one counted site.

    Arguments:
        args: parsed options.  This function reads ``mvf``, ``out``,
            ``outgroup_indices``, ``outgroup_labels``, ``sample_indices``,
            ``sample_labels``, ``contig_ids`` and ``contig_labels``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if no outgroup is specified, or if the sample and
            outgroup columns overlap.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    sample_labels = mvf.get_sample_ids()
    if args.outgroup_indices is not None:
        outgroup_indices = [
            int(x) for x in args.outgroup_indices[0].split(",")
        ]
    elif args.outgroup_labels is not None:
        outgroup_indices = mvf.get_sample_indices(
            ids=args.outgroup_labels[0].split(","))
    else:
        # Previously this fell through and failed later with an unbound name.
        raise RuntimeError(
            "No outgroup specified: provide outgroup indices or labels.")
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        # No filter requested: use every contig encountered in the file.
        contig_ids = None
    if any(x in outgroup_indices for x in sample_indices):
        raise RuntimeError("Sample and Outgroup column lists cannot overlap.")
    observed_contigs = []
    observed = set()
    for contig, _, allelesets in mvf:
        if contig_ids is None:
            if contig not in observed:
                observed.add(contig)
                observed_contigs.append(contig)
        elif contig not in contig_ids:
            continue
        alleles = mvf.decode(allelesets[0])
        for i, j, k in combinations(sample_indices, 3):
            for outgroup in outgroup_indices:
                subset = [alleles[x] for x in [i, j, k, outgroup]]
                if any(x not in 'ATGC' for x in subset):
                    continue
                if subset[-1] not in subset[:3]:
                    continue
                if len(set(subset)) != 2:
                    continue
                # Bit mask of which trio members match the outgroup; only a
                # single match is a counted pattern.
                # Counter slots: [ABBA, BABA, BBAA]
                val = (0 + 1 * (subset[0] == subset[3]) + 2 *
                       (subset[1] == subset[3]) + 4 * (subset[2] == subset[3]))
                if val in (1, 2):
                    val -= 1
                elif val == 4:
                    val = 2
                else:
                    continue
                tetrad = (i, j, k, outgroup)
                if tetrad not in data:
                    data[tetrad] = {}
                if contig not in data[tetrad]:
                    data[tetrad][contig] = [0, 0, 0]
                data[tetrad][contig][val] += 1
    if contig_ids is None:
        contig_ids = observed_contigs
    # WRITE OUTPUT
    headers = ['sample0', 'sample1', 'sample2', "outgroup"]
    for xcontig in contig_ids:
        headers.extend([
            '{}:abba'.format(xcontig), '{}:baba'.format(xcontig),
            '{}:bbaa'.format(xcontig), '{}:D'.format(xcontig)
        ])
    outfile = OutputFile(path=args.out, headers=headers)
    for i, j, k in combinations(sample_indices, 3):
        for outgroup in outgroup_indices:
            tetrad = (i, j, k, outgroup)
            if tetrad not in data:
                continue
            entry = dict(('sample{}'.format(slot), sample_labels[x])
                         for slot, x in enumerate(tetrad[:3]))
            entry['outgroup'] = sample_labels[outgroup]
            for contig in contig_ids:
                if contig not in data[tetrad]:
                    entry.update(dict.fromkeys([
                        '{}:abba'.format(contig), '{}:baba'.format(contig),
                        '{}:bbaa'.format(contig), '{}:D'.format(contig)
                    ], '0'))
                else:
                    abba, baba, bbaa = data[tetrad][contig]
                    # D is computed from the two patterns that are not the
                    # strict maximum; without a strict ABBA or BABA maximum
                    # it is computed from ABBA and BABA.
                    # (zerodiv is presumably a zero-safe division -- verify.)
                    if abba > baba and abba > bbaa:
                        dstat = zerodiv(baba - bbaa, baba + bbaa)
                    elif baba > bbaa and baba > abba:
                        dstat = zerodiv(abba - bbaa, abba + bbaa)
                    else:
                        dstat = zerodiv(abba - baba, abba + baba)
                    entry.update([('{}:abba'.format(contig), abba),
                                  ('{}:baba'.format(contig), baba),
                                  ('{}:bbaa'.format(contig), bbaa),
                                  ('{}:D'.format(contig), dstat)])
            outfile.write_entry(entry)
    return ''

Example #17

Show file

File: mvfanalysis.py Project: peaselab/mvftools

def calc_pattern_count(args):
    """Tally biallelic A/B site patterns per window along each contig.

    Only sites that pass the minimum-coverage check, contain nothing but
    A/C/G/T and have at most two distinct alleles are counted.  The allele
    of the last selected sample is labelled 'A', any other allele 'B'.
    One row per window is written to ``args.out``; when
    ``args.output_lists`` is True a ``pattern,count`` list is also written
    for every window plus one for the grand total.

    Arguments:
        args: parsed options.  This function reads ``mvf``, ``out``,
            ``sample_indices``, ``sample_labels``, ``mincoverage``,
            ``windowsize`` and ``output_lists``.

    Returns:
        An empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    windows = {}
    window_contig = None
    window_start = 0
    counts = {}
    if args.sample_indices is not None:
        sample_indices = [
            int(token) for token in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    nsamples = len(sample_indices)
    for contig, pos, allelesets in mvf.iterentries(decode=True,
                                                   subset=sample_indices):
        alleles = allelesets[0]
        # Skip sites below the minimum coverage.
        if check_mincoverage(args.mincoverage, alleles) is False:
            continue
        # First usable entry: start in the window that contains it.
        if window_contig is None:
            window_contig = contig[:]
            if args.windowsize > 0:
                while pos > window_start + args.windowsize - 1:
                    window_start += args.windowsize
        # Entry lies outside the current window: store it and move on.
        if not same_window((window_contig, window_start),
                           (contig, pos), args.windowsize):
            record = {'contig': window_contig, 'position': window_start}
            record.update(counts)
            windows[(window_contig, window_start)] = record
            counts = {}
            if contig != window_contig:
                window_start = 0
                window_contig = contig[:]
            elif args.windowsize != -1:
                window_start += args.windowsize
        observed = set(alleles)
        if observed - set("ACGT") or len(observed) > 2:
            continue
        pattern = ''.join(
            'A' if base == alleles[-1] else 'B'
            for base in alleles[:-1]) + 'A'
        counts[pattern] = counts.get(pattern, 0) + 1
    # Store whatever is left in the last window.
    if counts:
        record = {'contig': window_contig, 'position': window_start}
        record.update(counts)
        windows[(window_contig, window_start)] = record
    # WRITE OUTPUT
    headers = ['contig', 'position'] + [
        MLIB.abpattern(code, nsamples) for code in range(0, 2**nsamples, 2)]
    outfile = OutputFile(path=args.out, headers=headers)
    outfile.write("#{}\n".format(",".join(mvf.get_sample_ids(sample_indices))))
    sorted_entries = sorted(
        (windows[key]['contig'], windows[key]['position'], key)
        for key in windows)
    for _, _, key in sorted_entries:
        outfile.write_entry(windows[key])
    # WRITE LIST OUTPUT
    if args.output_lists is True:
        total_counts = {}
        for contig, pos, key in sorted_entries:
            outfilepath = "{}-{}-{}.counts.list".format(args.out, contig, pos)
            with open(outfilepath, 'w') as outfile:
                outfile.write("pattern,count\n")
                for pattern, pcount in sorted(windows[key].items()):
                    # The bookkeeping fields are not patterns.
                    if pattern in ('contig', 'position'):
                        continue
                    outfile.write("{},{}\n".format(pattern, pcount))
                    total_counts[pattern] = (
                        total_counts.get(pattern, 0) + pcount)
        outfilepath = "{}-TOTAL.counts.list".format(args.out)
        with open(outfilepath, 'w') as outfile:
            outfile.write("pattern,count\n")
            for pattern, pcount in sorted(total_counts.items()):
                if pattern in ('contig', 'position'):
                    continue
                outfile.write("{},{}\n".format(pattern, pcount))
    return ''

Example #18

Show file

def verify_mvf(args):
    """Scan an MVF file and print every entry that fails validation.

    For 'dna' and 'protein' files the first allele string of each entry is
    checked for format and character validity, and the entry location is
    checked against the contig table of the header and against the
    location of the preceding entry.  Offending entries are printed together
    with the list of problems found.  'codon' files are not checked yet.

    Arguments:
        args: parsed options.  This function reads ``mvf`` and calls
            ``args.qprint`` for status messages.

    Returns:
        An empty string.
    """
    args.qprint("Running VerifyMVF")
    mvf = MultiVariantFile(args.mvf, 'read')
    contigs = mvf.metadata['contigs']
    ncol = mvf.metadata['ncol']
    previous_location = (None, None)
    mvftype = mvf.metadata['mvftype']
    if mvftype in ('dna', 'protein'):
        if mvftype == 'protein':
            valid_bases = 'ACDEFGHIKLMNPQRSTVWY'
            valid_characters = 'ACDEFGHIKLMNPQRSTVWYX-'
        else:
            valid_bases = 'ATGCKMRSWY'
            valid_characters = 'ATGCKMRSWYX-'
        for contigid, pos, allelesets in mvf:
            alleles = allelesets[0]
            # A leading '@' is stripped before checking; it suppresses the
            # "empty reference" check.
            nonref = alleles[:1] == '@'
            if nonref:
                alleles = alleles[1:]
            #  CHECK ALLELES
            try:
                errmsg = _allele_format_errors(
                    alleles, nonref, valid_bases, valid_characters, ncol)
            except (IndexError, ValueError):
                # Empty or truncated strings (e.g. "A+" or "AT+") and
                # non-numeric sample numbers are reported, not fatal.
                errmsg = ["invalid format"]
            #  CHECK POSITION
            if contigid not in contigs:
                errmsg.append("invalid contigid")
            elif pos > contigs[contigid]['length']:
                errmsg.append("invalid position on contig")
            elif (contigid == previous_location[0]
                  and pos <= previous_location[1]):
                errmsg.append("position out of order")
            # Track the immediately preceding entry so that every
            # out-of-order position within a contig is detected.
            previous_location = (contigid, pos)
            #  PRINT MESSAGES
            if errmsg:
                print(contigid, pos, allelesets, errmsg)
    elif mvftype == 'codon':
        args.qprint("codon checking coming soon")
    return ''


def _allele_format_errors(alleles, nonref, valid_bases, valid_characters,
                          ncol):
    """Return the list of problems found in one MVF allele string.

    Arguments:
        alleles: allele string with any leading '@' already removed.
        nonref: True when the original string started with '@'.
        valid_bases: characters accepted for a single-character entry.
        valid_characters: characters accepted elsewhere.
        ncol: number of sample columns; larger sample numbers are rejected.

    Raises:
        IndexError, ValueError: if the string is empty, too short for its
            form, or carries a non-numeric sample number.
    """
    errmsg = []
    if len(alleles) == 1:
        if alleles in 'X-':
            errmsg.append("no data")
        elif alleles not in valid_bases:
            errmsg.append("invalid alleles")
    elif len(alleles) == 2:
        if alleles[0] == alleles[1]:
            errmsg.append("invalid format")
        elif alleles[0] == '-' and not nonref:
            errmsg.append("empty reference")
        elif (alleles[0] not in valid_characters
              or alleles[1] not in valid_characters):
            errmsg.append("invalid alleles")
    elif alleles[1] == '+':
        # Form "<char>+<char><sample number>".
        if alleles[2] == '-':
            errmsg.append("invalid format")
        elif alleles[0] == '-' and not nonref:
            errmsg.append("empty reference")
        elif (alleles[0] not in valid_characters
              or alleles[2] not in valid_characters):
            errmsg.append("invalid alleles")
        elif int(alleles[3:]) > ncol:
            errmsg.append("invalid sample number")
    elif alleles[2] == '+':
        # Form "<char><char>+<char><sample number>".
        if alleles[0] == alleles[1] and alleles[0] == alleles[3]:
            errmsg.append("invalid format")
        elif any(alleles[x] not in valid_characters for x in (0, 1, 3)):
            errmsg.append("invalid alleles")
        elif int(alleles[4:]) > ncol:
            errmsg.append("invalid sample number")
    else:
        # Fully spelled-out allele string.  The empty-reference check is
        # made once and honours the '@' flag like the other forms.
        if alleles[0] == '-' and not nonref:
            errmsg.append("empty reference")
        if any(x not in valid_characters for x in alleles):
            errmsg.append("invalid alleles")
    return errmsg

Example #19

Show file

File: mvfanalysis.py Project: peaselab/mvftools

def calc_all_character_count_per_sample(args):
    """Count, per sample and per window, how often each character occurs.

    Entries are read in file order and grouped into windows of
    ``args.windowsize`` along each contig.  For every selected sample the
    number of occurrences of each allele character is accumulated per
    window and written to ``args.out``, one '#<sample>' section per sample.

    Arguments:
        args: parsed options.  This function reads ``mvf``, ``out``,
            ``sample_indices``, ``sample_labels``, ``contig_ids``,
            ``contig_labels``, ``mincoverage`` and ``windowsize``, and calls
            ``args.qprint`` for status messages.

    Returns:
        An empty string.
    """
    args.qprint("Running CalcAllCharacterCountPerSample")
    mvf = MultiVariantFile(args.mvf, 'read')
    current_contig = None
    current_position = 0
    data_in_buffer = False
    # Set up sample indices
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = None
    data = dict((i, {}) for i in sample_indices)
    # Per-window counters are keyed by sample index.  A positional list
    # breaks as soon as the selected indices are not exactly 0..n-1.
    data_characters = dict((i, {}) for i in sample_indices)
    for contig, pos, allelesets in mvf.iterentries(decode=False,
                                                   contig_ids=contig_ids):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            args.qprint("Processing contig {}".format(current_contig))
            for i in sample_indices:
                window = {
                    'contig': current_contig,
                    'position': current_position
                }
                window.update(data_characters[i])
                data[i][(current_contig, current_position)] = window
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1 else
                                     args.windowsize)
            data_characters = dict((i, {}) for i in sample_indices)
            data_in_buffer = False
        alleles = allelesets[0]
        if len(alleles) == 1:
            # A single-character entry is counted once for every sample.
            for i in sample_indices:
                data_characters[i][alleles[0]] = (
                    data_characters[i].get(alleles[0], 0) + 1)
        else:
            alleles = mvf.decode(alleles)
            for i in sample_indices:
                data_characters[i][alleles[i]] = (
                    data_characters[i].get(alleles[i], 0) + 1)
        data_in_buffer = True
    # Store the counts of the last window.
    if data_in_buffer:
        for i in sample_indices:
            window = {
                'contig': current_contig,
                'position': current_position
            }
            window.update(data_characters[i])
            data[i][(current_contig, current_position)] = window
    # WRITE OUTPUT
    all_chars = set()
    for sampleid in data:
        for window in data[sampleid]:
            all_chars.update(
                x for x in data[sampleid][window]
                if x not in ('contig', 'position'))
    headers = ['contig', 'position']
    headers.extend(sorted(all_chars))
    outfile = OutputFile(path=args.out, headers=headers)

    for sampleid in sample_indices:
        outfile.write("#{}\n".format(sample_labels[sampleid]))
        # Windows are written in the order they were recorded.
        for k in data[sampleid]:
            outfile.write_entry(data[sampleid][k], defaultvalue='0')
    return ''

Example #20

Show file

def legacy_annotate_mvf(args):
    """Re-label MVF entries by gene using a GFF, or keep only non-genic ones.

    In the default mode every entry lying strictly between the start and
    stop of a GFF feature is written under that feature's gene id, and the
    output header lists the genes as contigs.  With ``args.nongenic_mode``
    the selection is inverted: only entries that do not fall in a feature
    (nor within ``args.unmargin`` positions of one) are written, under their
    original contig id.

    Arguments:
        args: parsed options.  This function reads ``mvf``, ``out``,
            ``overwrite``, ``gff``, ``gene_pattern``, ``nongenic_mode``,
            ``unmargin`` and ``line_buffer``, and calls ``args.qprint`` for
            status messages.

    Returns:
        An empty string.
    """
    args.qprint("Running LegacyAnnotateMVF")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF header processed.")
    args.qprint("MVF flavor: {}".format(mvf.flavor))
    gff, geneids = parse_gff_legacy_annotate(
        args.gff, mvf.contig_data, gene_pattern=args.gene_pattern)
    args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite,
                              flavor=mvf.flavor)
    outmvf.copy_headers_from(mvf)
    if args.nongenic_mode is False:
        # Genes replace the original contigs in the output header.
        outmvf.contig_data = geneids.copy()
        outmvf.contig_indices = list(range(len(geneids)))
        outmvf.contig_ids = [geneids[x]['id'] for x in
                             outmvf.contig_indices]
        outmvf.contig_labels = [geneids[x]['label'] for x in
                                outmvf.contig_indices]
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    entrybuffer = []
    nentry = 0
    args.qprint("Processing MVF entries.")
    for contigid, pos, allelesets in mvf.iterentries(decode=False):
        annotated_pos = None
        if contigid in gff:
            for (xgeneid, xstart, xstop) in gff[contigid]:
                if xstart < pos < xstop:
                    annotated_pos = xgeneid
                    break
                # In non-genic mode a position within the margin of a gene
                # counts as genic.  Only the fact that a gene was hit is
                # used in that mode, so the scan can stop at the first hit.
                if (args.nongenic_mode is True and args.unmargin > 0
                        and any(xstart < xpos < xstop
                                for xpos in range(pos - args.unmargin,
                                                  pos + args.unmargin + 1))):
                    annotated_pos = xgeneid
                    break
        if args.nongenic_mode:
            keep = annotated_pos is None
            entry = (contigid, pos, allelesets)
        else:
            keep = annotated_pos is not None
            entry = (annotated_pos, pos, allelesets)
        if not keep:
            continue
        entrybuffer.append(entry)
        # Count only entries that were actually buffered, so a flush always
        # writes exactly line_buffer entries.
        nentry += 1
        if nentry == args.line_buffer:
            args.qprint("Writing block of entries.")
            outmvf.write_entries(entrybuffer)
            entrybuffer = []
            nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        args.qprint("Writing final block of entries.")
    return ''

Example #21

Show file

def legacy_translate_mvf(args):
    """Translate a DNA-flavor MVF into protein or codon entries (legacy).

    Without ``args.gff`` the entries are treated as pre-aligned in coding
    frame: they are grouped per contig and passed to ``iter_codons``, and
    codons whose amino-acid string consists only of '-' and 'X' are
    skipped.  With ``args.gff`` the indexed MVF is read one contig at a
    time and every codon listed in the parsed GFF for that contig is looked
    up by coordinate (reversed and complemented for '-' strand codons),
    translated and written.

    Arguments:
        args: parsed options.  This function reads ``mvf``, ``out``,
            ``overwrite``, ``gff``, ``parent_gene_pattern``, ``output_data``,
            ``line_buffer`` and ``verbose``, and calls ``args.qprint`` for
            status messages.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if the input MVF is not flavor 'dna'.
    """
    args.qprint("Running LegacyTranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff = parse_gff_legacy_translate(
            args.gff, args,
            parent_gene_pattern=args.parent_gene_pattern)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []

    def buffer_entry(contigid, position, amino_acids, alleles,
                     announce=False):
        """Queue one translated entry; write out every line_buffer entries."""
        if args.output_data == 'protein':
            entrybuffer.append((contigid, position, (amino_acids,)))
        else:
            entrybuffer.append((
                contigid, position, (
                    amino_acids, alleles[0], alleles[1], alleles[2])))
        if len(entrybuffer) >= args.line_buffer:
            if announce:
                args.qprint("Writing a block of {} entries.".format(
                    args.line_buffer))
            outmvf.write_entries(entrybuffer)
            del entrybuffer[:]

    def translate_contig(contigid, contig_entries):
        """Translate the buffered entries of one contig codon by codon."""
        # NOTE(review): assumes the first value yielded by iter_codons is the
        # position of the codon -- confirm against iter_codons.  The previous
        # code discarded it and stamped every codon with the position of
        # whatever entry the outer loop had read last.
        for codon_pos, amino_acids, alleles in iter_codons(
                contig_entries, mvf):
            if all(x in '-X' for x in amino_acids):
                continue
            buffer_entry(contigid, codon_pos, amino_acids, alleles)

    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid != current_contig:
                # Contig changed: translate what was collected so far.
                translate_contig(current_contig, inputbuffer)
                inputbuffer = []
                current_contig = contigid[:]
            inputbuffer.append((pos, allelesets))
        if inputbuffer:
            # Same input handle as inside the loop (was outmvf here only).
            translate_contig(current_contig, inputbuffer)
    else:
        args.qprint("Indexing GFF gene names.")
        for xcontig in outmvf.get_contig_indices():
            mvf_entries = {}
            xcontiglabel = outmvf.get_contig_labels(indices=xcontig)[0]
            xcontigid = outmvf.get_contig_ids(indices=xcontig)[0]
            if xcontiglabel not in gff:
                if args.verbose:
                    print(
                        ("No entries in GFF, "
                         "skipping contig: index:{} id:{} label:{}").format(
                             xcontig, xcontigid, xcontiglabel))
                continue
            if not xcontig % 100:
                args.qprint("Processing contig: {} {}".format(
                    xcontigid, xcontiglabel))
            # Index the first allele set of this contig by position.
            for _, entry_pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                mvf_entries[entry_pos] = allelesets[0]
            for coords in sorted(gff[xcontiglabel]):
                reverse_strand = coords[3] == '-'
                # '-' strand codons are read in reverse coordinate order;
                # positions missing from the MVF are treated as '-'.
                codon_coords = coords[2::-1] if reverse_strand else coords[0:3]
                alleles = tuple(
                    mvf_entries.get(x, '-') for x in codon_coords)
                if all(len(x) == 1 for x in alleles):
                    # Single-character alleles: translate the codon directly.
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    # Otherwise decode each position, translate the codon of
                    # every sample, then re-encode the result.
                    if reverse_strand:
                        decoded_alleles = tuple(
                            tuple(MLIB.complement_bases[y]
                                  for y in mvf.decode(x))
                            for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x) for x in alleles)
                    amino_acids = tuple(translate_single_codon(''.join(x))
                                        for x in zip(*decoded_alleles))
                    amino_acids = outmvf.encode(''.join(amino_acids))
                # Unlike the no-GFF branch, all-gap codons are written too.
                # Protein and codon entries are both keyed by contig id
                # (protein entries previously used the contig index).
                buffer_entry(xcontigid, coords[0], amino_acids, alleles,
                             announce=True)
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
    return ''

Example #22

Show file

def _translate_codon_alleles(alleles, reverse_strand, mvf, outmvf):
    """Translate the three allele strings of one codon.

    Args:
        alleles: tuple of three MVF allele strings (codon positions 1-3).
        reverse_strand: when true, every base is complemented through
            ``MLIB.complement_bases`` before translation.
        mvf: input MVF, used to decode multi-character allele strings.
        outmvf: output MVF, used to encode the results.

    Returns:
        ``(amino_acids, alleles)`` where ``alleles`` is the (possibly
        complemented and re-encoded) codon tuple.
    """
    if all(len(x) == 1 for x in alleles):
        # Invariant site at all three positions: translate a single codon.
        if reverse_strand:
            alleles = tuple(MLIB.complement_bases[x] for x in alleles)
        return translate_single_codon(''.join(alleles)), alleles
    if reverse_strand:
        decoded_alleles = tuple(
            tuple(MLIB.complement_bases[y] for y in mvf.decode(x))
            for x in alleles)
        alleles = tuple(outmvf.encode(''.join(x)) for x in decoded_alleles)
    else:
        decoded_alleles = tuple(mvf.decode(x) for x in alleles)
    # zip(*...) regroups the three per-position strings into per-sample codons
    amino_acids = outmvf.encode(''.join(
        translate_single_codon(''.join(x)) for x in zip(*decoded_alleles)))
    return amino_acids, alleles


def _iter_prealigned_entries(inputbuffer, codec_mvf, contig, pos,
                             output_data):
    """Yield output entries for one contig's buffered sites (no-GFF mode).

    Codons come from ``iter_codons(inputbuffer, codec_mvf)``; codons whose
    amino-acid string consists only of '-' and 'X' are dropped.  Each entry
    is ``(contig, pos, alleles)``: only the amino acids when ``output_data``
    is 'protein', otherwise the amino acids followed by the three codon
    allele strings.
    """
    # NOTE(review): every entry is written with the caller-supplied ``pos``
    # and the first item yielded by iter_codons is discarded, exactly as the
    # code did before.  If that item is the codon position it is probably
    # what should be written instead -- confirm against iter_codons.
    for _, amino_acids, alleles in iter_codons(inputbuffer, codec_mvf):
        if all(x in '-X' for x in amino_acids):
            continue
        if output_data == 'protein':
            yield (contig, pos, (amino_acids,))
        else:
            yield (contig, pos,
                   (amino_acids, alleles[0], alleles[1], alleles[2]))


def translate_mvf(args):
    """Translate a DNA-flavor MVF into the flavor named by args.output_data.

    Without ``args.gff`` the input is treated as pre-aligned in coding frame
    and each contig is translated through ``iter_codons``.  With ``args.gff``
    the genes returned by ``parse_gff_exome`` become the output contigs and
    each gene's CDS coordinates are translated codon by codon.

    Reads from ``args``: mvf, out, overwrite, gff, output_data,
    command_string, line_buffer, retain_contigs, retain_coords, qprint.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if the input MVF is not flavor 'dna'.
    """
    args.qprint("Running TranslateMVF")
    use_gff = bool(args.gff)
    if use_gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=use_gff)
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    gff_genes = {}
    gene_order = []
    if use_gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff_genes, gene_order = parse_gff_exome(args)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    if use_gff:
        # Genes replace the input contigs in the output header.  This used
        # to run unconditionally and raised NameError without a GFF; in that
        # mode the copied input headers are now kept.
        outmvf.contig_data = {
            i: {key: val for (key, val) in gff_genes[gene].items()
                if key not in ('cds', )}
            for (i, gene) in enumerate(gene_order)}
        outmvf.contig_indices = list(range(len(gene_order)))
        outmvf.contig_ids = [gff_genes[x]['id'] for x in gene_order]
        outmvf.contig_labels = [gff_genes[x]['label'] for x in gene_order]
    outmvf.flavor = args.output_data
    outmvf.metadata.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not use_gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
                continue
            # Contig changed: translate everything buffered for the old one
            for entry in _iter_prealigned_entries(
                    inputbuffer, mvf, current_contig, pos,
                    args.output_data):
                entrybuffer.append(entry)
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
            inputbuffer = [(pos, allelesets)]
            current_contig = contigid[:]
        if inputbuffer:
            # NOTE(review): the last contig is passed to iter_codons with
            # ``outmvf`` while the loop above passes ``mvf``; kept as found.
            for entry in _iter_prealigned_entries(
                    inputbuffer, outmvf, current_contig, pos,
                    args.output_data):
                entrybuffer.append(entry)
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        for igene, gene in enumerate(gene_order):
            gene_info = gff_genes[gene]
            xcontiglabel = gene_info['contig']
            xcontig = mvf.get_contig_indices(labels=xcontiglabel)
            if xcontig is None:
                # Skip the gene instead of falling through with no contig
                print("Warning: contig {} not found".format(xcontiglabel))
                continue
            xcontigid = mvf.get_contig_ids(indices=xcontig)[0]
            min_gene_coord = gene_info['cds'][0][0]
            max_gene_coord = gene_info['cds'][-1][1]
            if not igene % 100:
                args.qprint("Processing gene {} on {}".format(
                    gene, xcontiglabel))
            # Collect the first alleleset of every site inside the gene span
            mvf_entries = {}
            for _, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                if pos < min_gene_coord:
                    continue
                if pos > max_gene_coord:
                    break
                mvf_entries[pos] = allelesets[0]
            reverse_strand = gene_info['strand'] == '-'
            coords = []
            for elem in gene_info['cds']:
                coords.extend(range(elem[0], elem[1] + 1))
            if reverse_strand:
                coords = coords[::-1]
            # The gene's index in gene_order doubles as its output contig
            # index (matches outmvf.contig_indices set above).
            out_contig = xcontigid if args.retain_contigs else igene
            for codoncoord in range(0, len(coords), 3):
                alleles = tuple(mvf_entries.get(x, '-')
                                for x in coords[codoncoord:codoncoord + 3])
                if len(alleles) < 3:
                    # Pad a trailing partial codon with gaps
                    alleles = alleles + ('-',) * (3 - len(alleles))
                amino_acids, alleles = _translate_codon_alleles(
                    alleles, reverse_strand, mvf, outmvf)
                if args.output_data in ('protein', 'codon'):
                    out_pos = (coords[codoncoord]
                               if args.retain_coords else codoncoord)
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (out_contig, out_pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            out_contig, out_pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                elif args.output_data == 'dna':
                    # One entry per nucleotide actually present in the CDS
                    for j, elem in enumerate(
                            range(codoncoord,
                                  min(codoncoord + 3, len(coords)))):
                        entrybuffer.append((
                            out_contig,
                            coords[elem] if args.retain_coords else elem + 1,
                            (alleles[j],)))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    # Flush whatever is left; this used to happen only in the GFF branch,
    # so the tail of a no-GFF translation was never written.
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''

Example #23

Show file

def calc_group_unique_allele_window(args):
    """Count the number of and relative rate of uniquely held alleles
       spatially along chromosomes (i.e. Lineage-specific rates).

       Iterates a codon-flavor MVF, tallying per contig (or per window of
       args.windowsize positions) the codons whose allele groups are
       disjoint, split into nonsynonymous and synonymous changes.  Rows
       with at least one nonsynonymous change are written to args.out
       sorted by ns_ratio, overall totals go to args.out + '.total', and,
       when requested, a codon alignment goes to args.output_align and
       branch-site test results (via paml_branchsite) are appended to
       args.branch_lrt.

       Returns an empty string.

       Raises RuntimeError if the MVF is not codon flavor or if
       args.mincoverage is below twice the number of allele groups.
       """
    data = {}
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'codon':
        raise RuntimeError(
            "\n=====================\nERROR: MVF is not codon flavor!")
    annotations = {}
    coordinates = {}
    labels = mvf.get_sample_labels()[:]
    ncol = len(labels)
    current_contig = None
    current_position = 0
    # NOTE(review): Counter is used with .add() and .iter_sorted() below, so
    # it is presumably a project-defined class, not collections.Counter.
    counts = Counter()
    totals = Counter()
    # Default contig range used to gate the PAML branch-site runs below
    args.start_contig = (args.start_contig
                         if args.start_contig is not None else 0)
    args.end_contig = (args.end_contig
                       if args.end_contig is not None else 100000000000)
    # NOTE(review): outputalign is only created when output_align is exactly
    # True, yet later code tests `is not None` and opens args.output_align
    # as a file path; with a path value the first use of outputalign would
    # raise NameError -- confirm the intended type of this option.
    if args.output_align is True:
        outputalign = []
    if args.gff is not None:
        annotations, coordinates = (parse_gff_analysis(args.gff))
    if args.allele_groups is not None:
        args.allele_groups = procarg_allelegroups(args.allele_groups, mvf)
    if args.species_groups is None:
        args.species_groups = args.allele_groups
    else:
        args.species_groups = procarg_speciesgroups(args.species_groups, mvf)
    fieldtags = [
        'likelihood', 'bgdnds0', 'bgdnds1', 'bgdnds2a', 'bgdnds2b', 'fgdnds0',
        'fgdnds1', 'fgdnds2a', 'fgdnds2b', 'dndstree', 'errorstate'
    ]
    if args.branch_lrt is not None:
        # Start the LRT report with a header row; results are appended later
        with open(args.branch_lrt, 'w') as branchlrt:
            genealign = []
            branchlrt.write(
                "\t".join(['contig', 'ntaxa', 'alignlength', 'lrtscore'] +
                          ["null.{}".format(x) for x in fieldtags] +
                          ["test.{}".format(x)
                           for x in fieldtags] + ['tree']) + "\n")
    # NOTE(review): the next line (and speciesnames below) dereference
    # args.allele_groups / args.species_groups without a None check, so both
    # options appear to be required in practice.
    groups = args.allele_groups.values()
    if args.species_groups is not None:
        speciesgroups = args.species_groups.values()
    # allsets: sorted union of every member of every allele group
    allsets = set([])
    for group in groups:
        allsets.update(group)
    allsets = list(sorted(allsets))
    speciesnames = args.species_groups.keys()
    # speciesrev maps each group member back to its species-group name
    speciesrev = {}
    if args.species_groups is not None:
        for species in args.species_groups:
            speciesrev.update([(x, species)
                               for x in args.species_groups[species]])
    if args.mincoverage is not None:
        if args.mincoverage < len(groups) * 2:
            raise RuntimeError("""
                Error: GroupUniqueAlleleWindow:
                --mincoverage cannot be lower than the twice the number
                of specified groups in --allele-groups
                """)
    genealign = []
    for contig, pos, allelesets in mvf:
        if not current_contig:
            current_contig = contig[:]
        # On a contig change, or when pos passes the current window, store
        # the finished window's counts and start a new one.
        # NOTE(review): this is the only place data[...] is filled, so the
        # counts of the last contig/window are never stored once the loop
        # ends -- confirm whether that is intended.
        if contig != current_contig or (args.windowsize > 0 and pos >
                                        current_position + args.windowsize):
            xkey = (
                current_contig,
                current_position,
            )
            data[xkey] = counts.copy()
            # NOTE(review): the 'nonsynyonymous_changes' and
            # 'synyonymous_changes' keys below are spelled differently from
            # the 'nonsynonymous_changes'/'synonymous_changes' keys read
            # everywhere else (and from the output headers) -- likely typos.
            data[xkey].update([
                ('contig', (mvf.get_contig_labels(ids=current_contig)
                            if args.use_labels is True else current_contig)),
                ('position', current_position),
                ('nonsynyonymous_changes',
                 counts.get('nonsynonymous_changes', 0) or 0),
                ('synyonymous_changes', counts.get('synonymous_changes', 0)
                 or 0)
            ])
            # ns_ratio divides by 1.0 when no synonymous count is present
            data[xkey].update([
                ('ns_ratio',
                 (float(data[xkey].get('nonsynonymous_changes', 0)) /
                  (data[xkey].get('synonymous_changes', 1.0)))),
                ('annotation', annotations.get(data[xkey]['contig'], '.')),
                ('coordinates', coordinates.get(data[xkey]['contig'], '.'))
            ])
            if genealign:
                # Run the branch-site test only for contigs whose integer id
                # falls inside [start_contig, end_contig]
                if (args.end_contig >= int(current_contig)) and (
                        args.start_contig <= int(current_contig)):
                    (pamlnull, pamltest, tree) = paml_branchsite(
                        genealign,
                        labels[:],
                        species=speciesnames,
                        speciesrev=speciesrev,
                        codemlpath=args.codeml_path,
                        raxmlpath=args.raxml_path,
                        pamltmp=args.paml_tmp,
                        target=args.target,
                        targetspec=args.num_target_species,
                        allsampletrees=args.all_sample_trees,
                        outgroup=args.outgroup)
                    # -1 marks "no score" when either likelihood is missing
                    lrtscore = -1
                    if (pamlnull.get('likelihood', -1) != -1
                            and pamltest.get('likelihood', -1) != -1):
                        lrtscore = 2 * (pamltest['likelihood'] -
                                        pamlnull['likelihood'])
                    with open(args.branch_lrt, 'a') as branchlrt:
                        branchlrt.write("\t".join([
                            str(x) for x in [
                                data[xkey]['contig'],
                                len(genealign),
                                len(genealign[0]) * 3, lrtscore
                            ] + [pamlnull.get(y, -1) for y in fieldtags] +
                            [pamltest.get(y, -1)
                             for y in fieldtags] + [str(tree).rstrip()]
                        ]) + "\n")
            # Reset the per-gene alignment; `if not genealign` below rebuilds
            genealign = None
            totals.add('genes_total')
            if counts.get('total_codons', 0) > 0:
                totals.add('genes_tested')
            if counts.get('total_nsyn_codons', 0) > 0:
                totals.add('genes_with_nsyn')
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            elif args.windowsize > 0:
                current_position += args.windowsize
            counts = Counter()
        # First alleleset is the protein column, the next three the codon
        # positions
        proteins = allelesets[0]
        codons = allelesets[1:4]
        # Single-character strings at every column: an invariant site
        if len(proteins) == 1 and all(len(x) == 1 for x in codons):
            if proteins == '*' or ''.join(codons) in MLIB.stop_codons:
                continue
            counts.add('total_codons')
            totals.add('total_codons')
            if args.output_align is True:
                if not outputalign:
                    outputalign = [[''.join(codons)]
                                   for x in range(mvf.metadata['ncol'])]
                else:
                    for ialign, xalign in enumerate(outputalign):
                        xalign.append(''.join(codons))
            if args.branch_lrt is not None:
                if not genealign:
                    genealign = [[''.join(codons)] for x in range(ncol)]
                else:
                    for ialign in range(len(genealign)):
                        genealign[ialign].append(''.join(codons))
            continue
        # Skip encoded protein strings whose second character is '+'
        if len(proteins) > 1:
            if allelesets[0][1] == '+':
                continue
        proteins = mvf.decode(proteins)
        if args.mincoverage is not None:
            # Require at least mincoverage samples that are not 'X' or '-'
            if sum([int(x not in 'X-')
                    for x in proteins]) < (args.mincoverage):
                continue
        species_groups = [[proteins[i] for i in x if proteins[i] not in '-X']
                          for x in speciesgroups]
        # Every species group needs at least one called amino acid
        if any(len(x) == 0 for x in species_groups):
            continue
        xcodons = [mvf.decode(x) for x in codons]
        # Regroup the three per-position strings into one codon per sample
        codons = [''.join(x) for x in zip(*xcodons)]
        if any(codons[x] in MLIB.stop_codons for x in allsets):
            continue
        # Any called amino acid differing from the first one of the first
        # species group counts the codon as nonsynonymous-containing
        if any(
                any(x != species_groups[0][0] for x in y)
                for y in species_groups):
            totals.add('total_nsyn_codons')
            counts.add('total_nsyn_codons')
        totals.add('total_codons')
        totals.add('tested_codons')
        counts.add('total_codons')
        # Adds 1 when any codon position has more than one non-'X'/'-' base
        totals.add('variable_codons',
                   val=int(
                       sum([int(len(set(x) - set('X-')) > 1)
                            for x in xcodons]) > 0))
        if args.output_align is not None:
            if not outputalign:
                outputalign = [[x] for x in codons]
            else:
                for ialign in range(len(outputalign)):
                    outputalign[ialign].append(codons[ialign])
        if args.branch_lrt is not None:
            if not genealign:
                genealign = [[x] for x in codons]
            else:
                for ialign in range(len(codons)):
                    genealign[ialign].append(codons[ialign])
        nonsyn_change = False
        synon_change = False
        # Distinct codons per allele group, ignoring any with '-' or 'X'
        codon_groups = [
            set([
                codons[i] for i in x
                if '-' not in codons[i] and 'X' not in codons[i]
            ]) for x in groups
        ]
        protein_groups = None
        # Groups containing any of the codes R, Y, W, K, M, S are passed
        # through hapgroup() first
        for i in range(len(codon_groups)):
            if any(base in codon for base in 'RYWKMS'
                   for codon in codon_groups[i]):
                codon_groups[i] = hapgroup(codon_groups[i])
        # Only sites where no two groups share a codon are classified
        if all(
                grp1.isdisjoint(grp0)
                for grp0, grp1 in combinations(codon_groups, 2)):
            protein_groups = [
                set([
                    MLIB.codon_tables['full'][''.join(x)]
                    for x in codon_groups[i]
                ]) for i in range(len(codon_groups))
            ]
            # Disjoint amino-acid sets: nonsynonymous; identical sets:
            # synonymous; anything in between is left unclassified
            if all(
                    grp1.isdisjoint(grp0)
                    for grp0, grp1 in combinations(protein_groups, 2)):
                nonsyn_change = True
            elif all(grp1 == grp0
                     for grp0, grp1 in combinations(protein_groups, 2)):
                synon_change = True
        if nonsyn_change:
            if args.verbose is True:
                print('NON', contig, pos, allelesets,
                      codon_groups, protein_groups, groups,
                      mvf.get_contig_labels(ids=contig))
            counts.add('nonsynonymous_changes')
            totals.add('nonsynonymous_changes')
        elif synon_change:
            if args.verbose is True:
                print('SYN', contig, pos, allelesets,
                      codon_groups, protein_groups, groups,
                      mvf.get_contig_labels(ids=contig))
            counts.add('synonymous_changes')
            totals.add('synonymous_changes')
    args.totals = totals
    # WRITE OUTPUT
    headers = [
        "contig", "position", "nonsynonymous_changes", "synonymous_changes",
        "ns_ratio", "nonsynonymous_total", "synonymous_total", "pvalue",
        "total_codons", "annotation", "coordinates"
    ]
    if args.windowsize == -1:
        headers.remove('position')
    if args.chi_test is None:
        headers.remove('pvalue')
    outfile = OutputFile(path=args.out, headers=headers)
    # Only windows with a nonsynonymous change are reported, highest
    # ns_ratio first
    sorted_entries = sorted(
        [(data[k]['ns_ratio'], k)
         for k in data if data[k].get('nonsynonymous_changes', 0) > 0],
        reverse=True)
    for _, k in sorted_entries:
        outfile.write_entry(data[k])
    with open(args.out + '.total', 'w') as totalfile:
        for entry in args.totals.iter_sorted():
            totalfile.write(entry)
    if args.output_align is not None:
        # One ">label" record per alignment row, codons concatenated
        with open(args.output_align, 'w') as alignfile:
            alignfile.write("\n".join([
                ">{}\n{}".format(mvf.metadata['labels'][i],
                                 ''.join(outputalign[i]))
                for i in range(len(outputalign))
            ]))
    return ''

Example #24

Show file

File: mvfjoin.py Project: ddelacer/mvftools

def mvf_join(args):
    """Concatenate the MVF files in args.mvf into one MVF at args.out.

    The output metadata starts as a copy of one input's metadata
    (args.main_header_file if given, otherwise the first file).  Sample
    labels and contigs from every input are then merged into it, and a
    per-file MvfTransformer records where an input's sample index or
    contig id differs from the merged one.  Finally every input is
    streamed again and its entries are written with those remappings.

    Reads from ``args``: out, overwrite, mvf, main_header_file, quiet,
    line_buffer.  ``args.main_header_file`` is overwritten with the index
    of the chosen header file.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if args.main_header_file is not one of args.mvf.
    """
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the chosen file's metadata (the first file by default)
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    # Open each MVF file, read headers to make unified header
    transformers = []
    for mvfname in args.mvf:
        # Records samples{old:new} and contigs{old:new} for this file
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read')
        concat_labels = concatmvf.metadata['labels']
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concat_labels.append(label)
                concatmvf.metadata['samples'][
                    concat_labels.index(label)] = {'label': label}
            newindex = concat_labels.index(label)
            if newindex != i:
                transformer.set_label(i, newindex)
        concat_contigs = concatmvf.metadata['contigs']
        for contigid, contigdata in mvf.metadata['contigs'].items():
            known_labels = [concat_contigs[x]['label']
                            for x in concat_contigs]
            if contigdata['label'] not in known_labels:
                # Keep the input's id when it is free.  A conditional
                # expression (not `and/or`) so a falsy id such as 0 is
                # still kept rather than replaced.
                newid = (contigid if contigid not in concat_contigs
                         else concatmvf.get_next_contig_id())
                concat_contigs[newid] = contigdata
            else:
                # Label already present: reuse the id it is stored under
                for concatid, concatdata in concat_contigs.items():
                    if contigdata['label'] == concatdata['label']:
                        newid = concatid
                        break
            if newid != contigid:
                transformer.set_contig(contigid, newid)
        transformers.append(transformer)
    # Write output header
    concatmvf.write_data(concatmvf.get_header())
    # Now loop through each file
    entries = []
    nentries = 0
    for ifile, mvfname in enumerate(args.mvf):
        if not args.quiet:
            sys.stderr.write("Processing {} ...\n".format(mvfname))
        transformer = transformers[ifile]
        mvf = MultiVariantFile(mvfname, 'read')
        for contigid, pos, allelesets in mvf.iterentries(decode=False,
                                                         quiet=args.quiet):
            if transformer.labels:
                # Decode, reorder the sample columns, then re-encode
                allelesets = [mvf.decode(x) for x in allelesets]
                for j, alleles in enumerate(allelesets):
                    allelesets[j] = concatmvf.encode(''.join([
                        (alleles[transformer.labels[x]]
                         if x in transformer.labels else alleles[x])
                        for x in range(len(alleles))
                    ]))
            # Attribute access, matching every other use of the transformer
            # in this function (this spot used transformer['contigs']).
            if transformer.contigs and contigid in transformer.contigs:
                contigid = transformer.contigs[contigid]
            entries.append((contigid, pos, allelesets))
            nentries += 1
            if nentries == args.line_buffer:
                concatmvf.write_entries(entries)
                entries = []
                nentries = 0
        # Flush the remainder before moving on to the next file
        if entries:
            concatmvf.write_entries(entries)
            entries = []
            nentries = 0
        if not args.quiet:
            sys.stderr.write("done\n")
    return ''

Example #25

Show file

def vcf2mvf(args=None):
    """Convert the VCF at args.vcf into an MVF written to args.out.

    Steps: open the VCF, set up the output MVF, register contigs (explicit
    ``id;vcflabel;mvflabel`` triples from args.contig_ids first, then every
    remaining VCF contig), optionally check that no contig id equals
    another contig's label, register samples (reference label, VCF samples,
    args.alleles_from, with args.sample_replace renames), write the header
    and finally stream the VCF records as encoded MVF entries.

    Reads from ``args``: field_sep, vcf, no_autoindex, out, overwrite,
    command_string, versionx, contig_ids, skip_contig_label_check,
    ref_label, alleles_from, sample_replace, filter_nonref_empty,
    out_flavor, line_buffer, qprint.  Sets args.fieldsep and may replace
    args.alleles_from with a list.

    Returns:
        An empty string.

    Raises:
        RuntimeError: on a duplicate contig id/label in args.contig_ids, or
            when a contig id collides with another contig's label.
        AssertionError: when a VCF label in args.contig_ids is not a contig
            label of the VCF.
    """
    sepchars = {"TAB": "\t", "SPACE": " ", "DBLSPACE": "  ",
                "COMMA": ",", "MIXED": None}
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    args.qprint("Opening input VCF: {}".format(args.vcf))
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    args.qprint("Establishing output MVF: {}".format(args.out))
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    mvf.notes.append(args.command_string)
    mvf.metadata['mvfversion'] = args.versionx
    # PROCESS CONTIG INFO
    args.qprint("Processing VCF headers.")
    vcfcontigs = vcf.metadata['contigs'].copy()
    args.qprint("{} contigs found.".format(len(vcfcontigs)))
    # Maps a VCF contig label to [mvf contig id/index, mvf label]
    contig_translate = {}
    if args.contig_ids:
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            try:
                cid = int(cid)
            except ValueError:
                pass  # non-numeric ids are kept as strings
            assert cvcf in [vcfcontigs[x]['label'] for x in vcfcontigs], (
                "Contig label {} not found in VCF".format(cvcf))
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] == cvcf:
                    contig_translate[cvcf] = [cid, cmvf]
                    if cid in mvf.metadata['contigs']:
                        raise RuntimeError(
                            'Contig id {} is not unique'.format(cid))
                    mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                    if cmvf in mvf.get_contig_labels():
                        raise RuntimeError(
                            'Contig label {} is not unique'.format(cmvf))
                    mvf.metadata['contigs'][cid]['label'] = cmvf[:]
    mvf.reset_max_contig()
    mvf.max_contig_index -= 1
    args.qprint("Processing contigs.")
    static_contig_ids = list(mvf.get_contig_ids())
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel in static_contig_ids:
            continue
        newindex = mvf.get_next_contig_index()
        # Integer-like or very short labels double as the contig id;
        # anything else gets the new index as its id.
        if is_int(vlabel) or len(vlabel) < 3:
            newid = vlabel[:]
        else:
            newid = str(newindex)
        mvf.contig_indices.append(newindex)
        mvf.contig_ids.append(newid)
        mvf.contig_data[newindex] = vcfcontigs[vcid].copy()
        static_contig_ids.append(newid)
        contig_translate[vlabel] = [newindex, vlabel]
    mvf.reset_max_contig()
    new_contigs = [(x, mvf.contig_data[x]['label'])
                   for x in mvf.contig_indices]
    if args.skip_contig_label_check is False:
        args.qprint("Checking contigs for label/id overlap errors.")
        xids = [x[0] for x in new_contigs]
        xlabels = [x[1] for x in new_contigs]
        # The per-contig scan only runs when some id equals some label
        if set(xids).intersection(xlabels):
            for i, (newid, newlabel) in enumerate(new_contigs):
                if i % 100 == 0:
                    args.qprint("{} contigs processed".format(i))
                if newid in xlabels[:i] or newid in xlabels[i + 1:]:
                    raise RuntimeError("Error contig id {} is the same as"
                                       " the label for another contig"
                                       " ({})".format(newid,
                                                      xlabels.index(newid)))
                if newlabel in xids[:i] or newlabel in xids[i + 1:]:
                    # Spaces between the literals were missing here
                    raise RuntimeError("Error contig label {} is the same"
                                       " as the id for another contig"
                                       " ({})".format(newlabel,
                                                      xids.index(newlabel)))
    # PROCESS SAMPLE INFO
    args.qprint("Processing samples.")
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        # Each item is "old:new"; an item without ':' maps to itself
        newsample = [
            x.split(':') if ':' in tuple(x) else tuple([x, x])
            for x in args.sample_replace
        ]
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            # The first still-unmatched label containing `old` is renamed
            # and removed from the candidates.
            labelmatched = False
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    labelmatched = j
                    break
            # `is not False` because a match at position 0 is falsy
            if labelmatched is not False:
                del unmatched[labelmatched]
    mvf.sample_indices = list(range(len(samplelabels)))
    mvf.sample_ids = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.sample_ids)
    mvf.max_sample_index = len(mvf.sample_ids)
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    args.qprint("Processing VCF entries.")
    for vcfrecord in vcf.iterentries(args):
        mvfstring = ''.join(vcfrecord['genotypes'])
        if args.filter_nonref_empty is True:
            # Drop records where every non-reference sample is empty
            if all(x in 'Xx-?' for x in mvfstring[1:]):
                continue
        mvf_alleles = encode_mvfstring(mvfstring)
        if args.out_flavor in ('dnaqual', ):
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
        if mvf_alleles:
            mvfentries.append(
                (contig_translate.get(vcfrecord['contig'])[0],
                 vcfrecord['coord'],
                 ((mvf_alleles,
                   qual_alleles) if args.out_flavor in ('dnaqual', ) else
                  (mvf_alleles, ))))
            nentry += 1
            if nentry == args.line_buffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
    if mvfentries:
        # Same pre-encoded data as the in-loop flush, so pass the same flag
        # (it was omitted here).
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''

Example #26

Show file

def filter_mvf(args):
    """Filter and/or transform the entries of an MVF file.

    Builds an action set from ``args.actions`` and then either

    * applies it to the single entry given in ``args.test``, reporting
      every step on stdout and exiting, or
    * streams every entry of ``args.mvf`` through the actions and writes
      the surviving entries to ``args.out``.

    Args:
        args: parsed command-line namespace.  Attributes read here are
            ``mvf``, ``out``, ``test``, ``test_nchar``, ``actions``,
            ``labels``, ``more_help``, ``verbose``, ``overwrite``,
            ``line_buffer``, ``command_string`` and the ``qprint``
            callable.  ``args.actions`` is rewritten in place.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if neither ``--mvf`` nor a test entry is given, or
            neither ``--out`` nor a test entry is given.
    """
    args.qprint("Running FilterMVF")
    if args.more_help is True:
        modulehelp()
        sys.exit()
    if args.mvf is None and args.test is None:
        raise RuntimeError("No input file specified with --mvf")
    if args.out is None and args.test is None:
        raise RuntimeError("No output file specified with --out")
    # Establish Input MVF
    if args.test is not None:
        ncol = args.test_nchar or len(args.test.split()[1])
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    args.qprint("Input MVF read with {} columns.".format(ncol))
    collapse_actions = ('collapsepriority', 'collapsemerge')
    sample_actions = ('columns', 'allelegroup',
                      'notmultigroup', 'reqsample')
    # Create Actionset
    # With --labels, action arguments are sample ids; translate them to
    # column indices first.
    if args.labels:
        for i, action in enumerate(args.actions):
            arr = action.split(':')
            if arr[0] in collapse_actions:
                arr[1] = ','.join([
                    str(mvf.sample_id_to_index[x])
                    for x in arr[1].split(',')])
            if arr[0] in sample_actions:
                for j in range(1, len(arr)):
                    arr[j] = ','.join([
                        str(mvf.sample_id_to_index[x])
                        for x in arr[j].split(',')])
            args.actions[i] = ':'.join(arr)
    # Columns dropped by an earlier collapse shift the index of every
    # later column, so renumber the arguments of subsequent actions.
    removed_columns = set()

    def _shift(field):
        """Renumber a comma-separated index list past removed columns."""
        return ','.join([
            str(int(x) - len([y for y in removed_columns if y < int(x)]))
            for x in field.split(',')])

    for i, action in enumerate(args.actions):
        arr = action.split(':')
        if arr[0] in collapse_actions:
            original_field = arr[1]
            arr[1] = _shift(original_field)
            # every column but the first listed is removed by a collapse
            removed_columns.update(
                [int(x) for x in original_field.split(',')[1:]])
        if arr[0] in sample_actions:
            for j in range(1, len(arr)):
                arr[j] = _shift(arr[j])
        args.actions[i] = ':'.join(arr)
    actionset = build_actionset(args.actions, ncol)
    args.qprint("Actions established.")
    args.qprint(actionset)
    # TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        # Parsed [contig, position] form of ``loc``; built once on demand
        # so that ``loc`` itself stays printable as the original string.
        loc_parts = None
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, _ in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                if loc_parts is None:
                    loc_parts = loc.split(':')
                    loc_parts[1] = int(loc_parts[1])
                if actionfunc(loc_parts) is False:
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                sys.stdout.write("Location Pass\n")
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    # MAIN MODE
    # Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)

    # NOTE(review): nothing ever adds to this set, so the renumbering in
    # the collapse branch below is currently a no-op -- confirm intent.
    removed_indices = set()
    # reprocess header if actions are used that filter columns
    header_actions = ('columns',) + collapse_actions
    if any(entry[0] in header_actions for entry in actionset):
        for actionname, _, _, actionarg in actionset:
            if actionname == 'columns':
                if args.labels:
                    # NOTE(review): action arguments were already
                    # translated to indices above; confirm that this
                    # integer-keyed lookup is intended.
                    oldindices = [outmvf.sample_id_to_index[int(x)]
                                  for x in actionarg[0]]
                else:
                    oldindices = [int(x) for x in actionarg[0]]
            elif actionname in collapse_actions:
                actionarg[0] = [x - len([y for y in removed_indices if y < x])
                                for x in actionarg[0]]
                oldindices = [x for x in outmvf.sample_indices
                              if x not in actionarg[0][1:]]
            else:
                # Actions that do not change the column layout must not
                # re-apply (or reference an unbound) column mapping.
                continue
            outmvf.sample_ids = outmvf.get_sample_ids(oldindices)
            outmvf.sample_data = dict(
                (i, outmvf.sample_data[oldindex])
                for i, oldindex in enumerate(oldindices))

            if actionname in collapse_actions:
                if len(actionarg) == 2:
                    outmvf.sample_data[actionarg[0][0]]['id'] = actionarg[1][0]
                    outmvf.sample_ids[actionarg[0][0]] = actionarg[1][0]
            outmvf.sample_indices = list(range(len(oldindices)))
    outmvf.metadata['ncol'] = len(outmvf.sample_indices)
    outmvf.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    # End header editing
    linebuffer = []
    nbuffer = 0
    args.qprint("Processing Entries.")
    write_total = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose is True:
            sys.stdout.write(" {} {} ".format(alleles, linetype))
        for _, actiontype, actionfunc, _ in actionset:
            if actiontype == 'filter':
                linefail = not actionfunc(alleles, linetype)
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                linefail = linetype == 'empty'
            elif actiontype == 'location':
                linefail = not actionfunc([chrom, pos])
            if linefail:
                break
        if not linefail and transformed:
            if linetype == 'full':
                alleles = mvf.encode(alleles)
            if not alleles:
                # re-encoding left nothing to write; drop the entry
                linefail = True
        if linefail:
            if args.verbose:
                sys.stdout.write("FAIL\n")
            continue
        nbuffer += 1
        linebuffer.append((chrom, pos, (alleles,)))
        if args.verbose:
            sys.stdout.write("{}\n".format(alleles))
        if nbuffer == args.line_buffer:
            write_total += args.line_buffer
            args.qprint("{} entries written. Total written: {}.".format(
                args.line_buffer, write_total))
            outmvf.write_entries(linebuffer)
            linebuffer = []
            nbuffer = 0
    if linebuffer:
        outmvf.write_entries(linebuffer)
        write_total += len(linebuffer)
        # report the size of this final partial block, not the buffer limit
        args.qprint("{} entries written. Total written: {}.".format(
            len(linebuffer), write_total))
        linebuffer = []
    return ''

Example #27

Show file

def mvf2fastagene(args):
    """Write one FASTA alignment file per contig of an MVF file.

    For every contig index of ``args.mvf`` the decoded entries are
    collected per sample and, if any data was collected, written to
    ``<args.output_dir>/<contig label>.fa``.

    Args:
        args: parsed command-line namespace.  Attributes read here are
            ``mvf``, ``output_data``, ``output_dir``, ``sample_indices``,
            ``sample_labels``, ``choose_allele``, ``ignore_strand`` and
            the ``qprint`` callable.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``--output-data`` is missing or is incompatible
            with the flavor of the input MVF.
    """
    args.qprint("Indexing MVF")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=True)
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    if args.output_data is None:
        raise RuntimeError("--output-data required")
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Labels must be looked up for the selected columns so that each
    # sequence is written under the id of the sample it came from.
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    args.qprint("Beginning Entries.")
    if not os.path.exists(args.output_dir):
        args.qprint("Output Directory Created: {}".format(args.output_dir))
        os.mkdir(args.output_dir)
    else:
        args.qprint("Output Directory Exists Already: {}".format(
            args.output_dir))
    for targetcontig in mvf.get_contig_indices():
        contiglabel = mvf.get_contig_labels(indices=targetcontig)[0]
        args.qprint("Reading Contig {}: {}".format(targetcontig, contiglabel))
        write_buffer = dict((x, []) for x in sample_labels)
        data_in_buffer = False
        for _, _, allelesets in mvf.itercontigentries(targetcontig,
                                                      decode=True):
            for col, label in zip(sample_indices, sample_labels):
                if mvf.flavor == 'dna':
                    allele = allelesets[0][col]
                    write_buffer[label].append(
                        'N' if allele == 'X' else allele)
                    data_in_buffer = True
                elif mvf.flavor in ('codon', 'prot') and (args.output_data
                                                          == 'prot'):
                    write_buffer[label].append(allelesets[0][col])
                    data_in_buffer = True
                elif mvf.flavor == 'codon' and args.output_data == 'dna':
                    # allele sets 1-3 hold the three codon positions
                    codon = []
                    for x in (1, 2, 3):
                        base = allelesets[x][col]
                        if base == 'X':
                            base = 'N'
                        elif (args.choose_allele == 'random1'
                              and base in MLIB.validchars['dnaambig23']):
                            base = MLIB.randomnuc(base)
                        codon.append(base)
                    write_buffer[label].append(''.join(codon))
                    data_in_buffer = True
        if data_in_buffer:
            args.qprint("Writing Align")
            # Codon-flavor contigs on the minus strand are written with
            # their elements in reverse order unless strand is ignored.
            reverse_order = (
                mvf.flavor == 'codon'
                and args.output_data in ('dna', 'prot')
                and mvf.contig_data[targetcontig].get('strand', '+') == '-'
                and args.ignore_strand is False)
            with open(os.path.join(args.output_dir, contiglabel + ".fa"),
                      'w') as outfile:
                for label in write_buffer:
                    elements = write_buffer[label]
                    if reverse_order:
                        elements = elements[::-1]
                    outfile.write(">{}\n{}\n".format(
                        label, ''.join(elements)))
    return ''

Example #28

Show file

File: mvfchromoplot.py Project: luhuimeng/mvftools

def plot_chromoplot(args):
    """Draw a chromoplot for every sample trio / outgroup combination.

    Every combination of three selected samples is paired with each
    outgroup sample to form a quartet.  For each quartet all matching MVF
    entries are classified (gap, ambiguous, nonpolar, triallelic, or a
    numeric biallelic pattern code) and handed to a ``Chromoplot``, which
    then writes its image and log.

    Args:
        args: parsed command-line namespace.  Attributes read here are
            ``mvf``, ``colors``, ``quiet``, ``contig_ids``,
            ``contig_labels``, ``sample_indices``, ``sample_labels``,
            ``outgroup_indices``, ``outgroup_labels``, ``out_prefix``,
            ``windowsize``, ``majority``, ``info_track``, ``yscale``,
            ``xscale`` and ``plot_type``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if fewer than three samples or no outgroup sample
            are selected.
    """
    pallette = Pallette()
    if args.colors is not None:
        pallette.basecolors = args.colors
    # Establish MVF and parse chromosome information
    if args.quiet is False:
        print("Reading MVF...")
    mvf = MultiVariantFile(args.mvf, 'read')
    if args.quiet is False:
        print("Parsing headers...")
    if args.contig_ids is not None:
        contigids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contigids = mvf.get_contig_ids(labels=args.contig_labels[0].split(","))
    else:
        contigids = mvf.get_contig_ids()
    if args.quiet is False:
        print("Plotting chromoplot for contigs: {}".format(
            ",".join(contigids)))
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Explicit checks instead of ``assert`` so that validation still runs
    # when Python is started with -O.
    if len(sample_indices) < 3:
        raise RuntimeError("At least three samples are required")
    outgroup_indices = []
    if args.outgroup_indices is not None:
        outgroup_indices = [
            int(x) for x in args.outgroup_indices[0].split(",")
        ]
    elif args.outgroup_labels is not None:
        outgroup_indices = mvf.get_sample_indices(
            labels=args.outgroup_labels[0].split(","))
    if len(outgroup_indices) < 1:
        raise RuntimeError("At least one outgroup sample is required")
    quartets = [(x, y, z, outgroup)
                for x, y, z in combinations(sample_indices, 3)
                for outgroup in outgroup_indices]
    # Begin iterations
    for quartet_indices in quartets:
        quartet_labels = [sample_labels[x] for x in quartet_indices]
        if args.quiet is False:
            print("Beginning quartet {}".format(",".join(quartet_labels)))
        params = {
            'contigs': [[
                contigid, mvf.metadata['contigs'][contigid]['label'],
                mvf.metadata['contigs'][contigid]['length']
            ] for contigid in contigids],
            # fall back to the joined quartet labels when no prefix is set
            'outpath':
            (args.out_prefix or '_'.join(quartet_labels)) + ".png",
            'labels': quartet_labels,
            'indices': quartet_indices,
            'windowsize': args.windowsize,
            'majority': args.majority,
            'infotrack': args.info_track,
            'yscale': args.yscale,
            'xscale': args.xscale,
            'quiet': args.quiet,
            'plottype': args.plot_type
        }
        chromoplot = Chromoplot(params=params, pallette=pallette)
        current_contig = ''
        for contig, pos, allelesets in mvf.iterentries(subset=quartet_indices,
                                                       decode=True,
                                                       contigs=contigids):
            if contig != current_contig:
                if args.quiet is False:
                    print("Starting contig {}".format(contig))
                current_contig = contig
            alleles = allelesets[0]
            if '-' in alleles:
                site_code = 'gap'
            elif any(x not in 'ATGCatgc' for x in alleles):
                site_code = 'ambiguous'
            elif alleles[3] not in alleles[:3]:
                # outgroup allele is absent from all three ingroup samples
                site_code = 'nonpolar'
            elif len(set(alleles)) > 2:
                site_code = 'triallelic'
            else:
                # one weighted bit per ingroup sample that differs from
                # the outgroup allele
                site_code = sum([
                    2**(3 - j) * (alleles[j] != alleles[3]) for j in range(3)
                ])
            chromoplot.add_data(str(contig), int(pos // args.windowsize),
                                site_code)
        if not args.quiet:
            print("Writing image...")
        chromoplot.plot_chromoplot()

        if not args.quiet:
            print("Writing log...")
        chromoplot.write_total_log()
    return ''

Example #29

Show file

def mvf2phy(args):
    """Convert an MVF file into a sequential PHYLIP alignment.

    Each selected sample is streamed to its own temporary file (label
    first, then one or three characters per entry).  The temporary files
    are then concatenated into ``args.out`` below a header line holding
    the number of samples and the sequence length.  With
    ``args.partition`` a ``<args.out>.part`` file listing the coordinate
    range of every contig is written as well.

    Args:
        args: parsed command-line namespace.  Attributes read here are
            ``mvf``, ``out``, ``output_data``, ``regions``,
            ``sample_indices``, ``sample_labels``, ``label_type``,
            ``partition``, ``buffer`` and ``temp_dir``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``--output-data`` is incompatible with the
            flavor of the input MVF.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    max_region_coord = dict((x, None) for x in mvf.get_contig_ids())
    if args.regions is not None:
        _, max_region_coord, _ = parse_regions_arg(args.regions,
                                                   mvf.get_contig_ids())
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    skipcontig = ''
    # Temporary files live in args.temp_dir, the same location they are
    # later removed from.
    tmp_files = dict(
        (fn,
         open(os.path.join(
             args.temp_dir,
             "{}-{}.tmp".format(fn, randint(1000000, 9999999))),
              'w+', args.buffer))
        for fn in sample_labels)
    labelwritten = dict.fromkeys(sample_labels, False)
    current_contig_id = None
    # Half-open 1-based range [start, end) of the current contig within
    # the concatenated alignment; start == end means "nothing written".
    current_contig_start = 1
    current_contig_end = 1
    # Number of sequence characters written per sample.
    totalseqlen = 0
    if args.partition is True:
        partprefix = "PROT" if args.output_data == "prot" else "DNA"
        partitionfile = open("{}.part".format(args.out), 'w')
    for contig, _, allelesets in mvf.iterentries(
            contig_ids=(mvf.get_contig_ids()
                        if args.regions is None else list(max_region_coord)),
            decode=True):
        if contig == skipcontig:
            continue
        if contig not in max_region_coord:
            skipcontig = contig[:]
            continue
        if current_contig_id is None:
            current_contig_id = contig[:]
        elif contig != current_contig_id:
            if args.partition is True:
                if current_contig_end > current_contig_start:
                    partitionfile.write("{}, {} = {}-{}\n".format(
                        partprefix,
                        mvf.get_contig_labels(ids=current_contig_id),
                        current_contig_start, current_contig_end - 1))
            current_contig_id = contig[:]
            # The new contig starts right after the previous one.  The end
            # is advanced only by the per-site increments below; bumping
            # it here as well would stretch every later range by one.
            current_contig_start = current_contig_end
        for col, label in zip(sample_indices, sample_labels):
            if not labelwritten[label]:
                if args.label_type == 'long':
                    tmp_files[label].write("{}{}".format(
                        label[:100], " " * (100 - len(label[:100]))))
                elif args.label_type == 'short':
                    tmp_files[label].write("{}{}".format(
                        label[:20], " " * (20 - len(label[:20]))))
                labelwritten[label] = True
            if mvf.flavor == 'dna':
                allele = allelesets[0][col]
                sitechars, width = ('N' if allele == 'X' else allele), 1
            elif ((mvf.flavor == 'codon' and args.output_data == 'prot')
                  or (mvf.flavor == 'prot')):
                sitechars, width = allelesets[0][col], 1
            elif mvf.flavor == 'codon':
                # allele sets 1-3 hold the three codon positions
                sitechars = ''.join([
                    "N" if allelesets[x][col] == 'X' else allelesets[x][col]
                    for x in (1, 2, 3)
                ])
                width = 3
            else:
                continue
            tmp_files[label].write(sitechars)
            # count positions once per entry, using the first sample
            if label == sample_labels[0]:
                current_contig_end += width
                totalseqlen += width
    with open(args.out, 'w') as outfile:
        # PHYLIP header: number of sequences and alignment length
        outfile.write("{} {}\n".format(len(sample_labels), totalseqlen))
        for filehandler in tmp_files.values():
            filehandler.seek(0, 0)
            buff = filehandler.read(args.buffer)
            while buff != '':
                outfile.write(buff)
                buff = filehandler.read(args.buffer)
            outfile.write("\n")
            filehandler.close()
            os.remove(filehandler.name)
    if args.partition is True:
        if current_contig_end > current_contig_start:
            partitionfile.write("{}, {} = {}-{}\n".format(
                partprefix, mvf.get_contig_labels(ids=current_contig_id),
                current_contig_start, current_contig_end - 1))
        partitionfile.close()
    return ''

Example #30

Show file

File: mvftranslate.py Project: hj1994412/mvftools

def annotate_mvf(args):
    """Split MVF entries into annotated and non-annotated by a GFF file.

    In the default mode only entries whose position is found in the GFF
    lookup are written, with the contig id replaced by the value stored
    for that position.  In nongenic mode only entries that are *not*
    annotated (optionally also not within ``args.unmargin`` positions of
    an annotated one) are written, with their contig id unchanged.

    Args:
        args: parsed command-line namespace.  Attributes read here are
            ``mvf``, ``gff``, ``gene_prefix``, ``out``, ``overwrite``,
            ``nongenic_mode``, ``unmargin``, ``line_buffer`` and the
            ``qprint`` callable.

    Returns:
        An empty string.
    """
    args.qprint("Running AnnotateMVF")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF header processed.")
    args.qprint("MVF flavor: {}".format(mvf.metadata['flavor']))
    gff, geneids = parse_gff_annotate(args.gff, mvf.metadata['contigs'],
                                      gene_prefix=args.gene_prefix)
    args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite,
                              flavor=mvf.metadata['flavor'])
    outmvf.metadata = deepcopy(mvf.metadata)
    if args.nongenic_mode is False:
        outmvf.metadata['contigs'] = geneids
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    entrybuffer = []
    args.qprint("Processing MVF entries.")
    for contigid, pos, allelesets in mvf.iterentries(decode=False):
        annotated_pos = False
        if contigid in gff:
            if pos in gff[contigid]:
                annotated_pos = True
            elif args.nongenic_mode is True and args.unmargin > 0:
                # treat positions within the margin of an annotated
                # position as annotated too
                for xpos in range(pos - args.unmargin,
                                  pos + args.unmargin + 1):
                    if xpos in gff[contigid]:
                        annotated_pos = True
                        break
        if annotated_pos and not args.nongenic_mode:
            entrybuffer.append((gff[contigid][pos], pos, allelesets))
        elif args.nongenic_mode and not annotated_pos:
            entrybuffer.append((contigid, pos, allelesets))
        else:
            # entry is not written in this mode; it must not count
            # towards the buffer size either
            continue
        if len(entrybuffer) == args.line_buffer:
            args.qprint("Writing block of entries.")
            outmvf.write_entries(entrybuffer)
            entrybuffer = []
    if entrybuffer:
        args.qprint("Writing final block of entries.")
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
    return ''