Beispiel #1
0
def calc_sample_coverage(args):
    """Counts the total number of non-gap/ambiguous characters for
      each sample per contig.
      """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    # data_order = []
    # Set up sample indices
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    # Set up contig ids
    if args.contig_ids is not None:
        contig_indices = mvf.get_contig_indices(args.contig_ids[0].split(","))
    elif args.contig_labels is not None:
        contig_indices = mvf.get_contig_indices(
            labels=args.contig_labels[0].split(","))
    else:
        contig_indices = None
    for contig, _, allelesets in mvf.iterentries(contig_indices=contig_indices,
                                                 subset=sample_indices,
                                                 decode=True):
        if contig not in data:
            data[contig] = dict((x, 0) for x in sample_labels)
            data[contig]['contig'] = contig
        for j, elem in enumerate(sample_indices):
            data[contig][sample_labels[elem]] += int(
                allelesets[0][j] not in 'Xx-')
    outfile = OutputFile(path=args.out,
                         headers=(["contig"] +
                                  [sample_labels[x] for x in sample_indices]))
    for contig in data:
        outfile.write_entry(data[contig])
    return ''
Beispiel #2
0
def mvf2phy(args):
    """Main method"""
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--outdput-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    max_region_coord = dict((x, None) for x in mvf.get_contig_ids())
    if args.regions is not None:
        _, max_region_coord, _ = parse_regions_arg(args.regions,
                                                   mvf.get_contig_ids())
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    skipcontig = ''
    tmp_files = dict((fn,
                      open("{}-{}.tmp".format(fn, randint(1000000, 9999999)),
                           'w+', args.buffer)) for fn in sample_labels)
    labelwritten = dict.fromkeys(sample_labels, False)
    current_contig_id = None
    current_contig_start = 1
    current_contig_end = 1
    if args.partition is True:
        partprefix = "PROT" if args.output_data == "prot" else "DNA"
        partitionfile = open("{}.part".format(args.out), 'w')
    for contig, _, allelesets in mvf.iterentries(
            contig_ids=(mvf.get_contig_ids()
                        if args.regions is None else max_region_coord[:]),
            decode=True):
        if contig == skipcontig:
            continue
        if contig not in max_region_coord:
            skipcontig = contig[:]
            continue
        if current_contig_id is None:
            current_contig_id = contig[:]
        elif contig != current_contig_id:
            if args.partition is True:
                if current_contig_end > current_contig_start:
                    partitionfile.write("{}, {} = {}-{}\n".format(
                        partprefix,
                        mvf.get_contig_labels(ids=current_contig_id),
                        current_contig_start, current_contig_end - 1))
            current_contig_id = contig[:]
            # reset start as one position after end of last
            current_contig_start = current_contig_end
            current_contig_end = current_contig_end + 1
        for col, label in zip(sample_indices, sample_labels):
            if not labelwritten[label]:
                if args.label_type == 'long':
                    tmp_files[label].write("{}{}".format(
                        label[:100], " " * (100 - len(label[:100]))))
                elif args.label_type == 'short':
                    tmp_files[label].write("{}{}".format(
                        label[:20], " " * (20 - len(label[:20]))))
                labelwritten[label] = True
            if mvf.flavor == 'dna':
                tmp_files[label].write(allelesets[0][col] == 'X' and 'N'
                                       or allelesets[0][col])
                if label == sample_labels[0]:
                    current_contig_end += 1
            elif ((mvf.flavor == 'codon' and args.output_data == 'prot')
                  or (mvf.flavor == 'prot')):
                tmp_files[label].write(allelesets[0][col])
                if label == sample_labels[0]:
                    current_contig_end += 1
            elif mvf.flavor == 'codon':
                codon = [
                    "N" if allelesets[x][col] == 'X' else allelesets[x][col]
                    for x in (1, 2, 3)
                ]
                tmp_files[label].write(''.join(codon))
                if label == sample_labels[0]:
                    current_contig_end += 3
    first_file = True
    totalseqlen = 0
    with open(args.out, 'w') as outfile:
        for filehandler in tmp_files.values():
            # read first file to establish sequence length for phylip header
            if first_file is True:
                filehandler.seek(0, 0)
                buff = filehandler.read(args.buffer)
                while buff != '':
                    if " " in buff:
                        totalseqlen += len(buff.strip().split(" ")[-1])
                    else:
                        totalseqlen += len(buff.strip())
                    buff = filehandler.read(args.buffer)
                outfile.write("{} {}\n".format(len(sample_labels),
                                               totalseqlen))
                first_file = False
            filehandler.seek(0, 0)
            buff = filehandler.read(args.buffer)
            while buff != '':
                if first_file is True:
                    outfile.write("{} {}\n".format(len(sample_labels),
                                                   len(buff.split()[1])))
                    first_file = False
                outfile.write(buff)
                buff = filehandler.read(args.buffer)
            outfile.write("\n")
            filehandler.close()
            os.remove(os.path.join(args.temp_dir, filehandler.name))
    if args.partition is True:
        if current_contig_end > current_contig_start:
            partitionfile.write("{}, {} = {}-{}\n".format(
                partprefix, mvf.get_contig_labels(ids=current_contig_id),
                current_contig_start, current_contig_end - 1))
        partitionfile.close()
    return ''
Beispiel #3
0
def mvf2fastagene(args):
    """Main method"""
    args.qprint("Indexing MVF")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=True)
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    if args.output_data is None:
        raise RuntimeError("--output-data required")
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    args.qprint("Beginning Entries.")
    if not os.path.exists(args.output_dir):
        args.qprint("Output Directory Created: {}".format(args.output_dir))
        os.mkdir(args.output_dir)
    else:
        args.qprint("Output Directory Exists Already: {}".format(
            args.output_dir))
    write_buffer = {}
    for targetcontig in mvf.get_contig_indices():
        contiglabel = mvf.get_contig_labels(indices=targetcontig)[0]
        args.qprint("Reading Contig {}: {}".format(targetcontig, contiglabel))
        write_buffer = dict((x, []) for x in sample_labels)
        data_in_buffer = False
        for _, _, allelesets in mvf.itercontigentries(targetcontig,
                                                      decode=True):
            for col, label in zip(sample_indices, sample_labels):
                if mvf.flavor == 'dna':
                    write_buffer[label].append('N' if allelesets[0][col] ==
                                               'X' else allelesets[0][col])
                    data_in_buffer = True
                elif mvf.flavor in ('codon', 'prot') and (args.output_data
                                                          == 'prot'):
                    write_buffer[label].append(allelesets[0][col])
                    data_in_buffer = True
                elif mvf.flavor == 'codon' and args.output_data == 'dna':
                    if args.choose_allele == 'random1':
                        codon = [
                            'N' if allelesets[x][col] == 'X' else
                            (MLIB.randomnuc(allelesets[x][col]) if
                             (allelesets[x][col]
                              in MLIB.validchars['dnaambig23']) else
                             allelesets[x][col]) for x in (1, 2, 3)
                        ]
                    else:
                        codon = [
                            'N' if allelesets[x][col] == 'X' else
                            allelesets[x][col] for x in (1, 2, 3)
                        ]
                    write_buffer[label].append(''.join(codon))
                    data_in_buffer = True
        if data_in_buffer:
            args.qprint("Writing Align")
            with open(os.path.join(args.output_dir, contiglabel + ".fa"),
                      'w') as outfile:
                for label in write_buffer:
                    if (mvf.flavor == 'codon'
                            and args.output_data in ('dna', 'prot')):
                        if ((mvf.contig_data[targetcontig].get('strand', '+')
                             == '-') and (args.ignore_strand is False)):
                            entryseq = ''.join(write_buffer[label][::-1])
                        else:
                            entryseq = ''.join(write_buffer[label])
                    else:
                        entryseq = ''.join(write_buffer[label])
                    outfile.write(">{}\n{}\n".format(label, entryseq))
                outfile.write("\b")

    return ''
Beispiel #4
0
def mvf2fasta(args):
    """Main method"""
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    regions, max_region_coord, regionlabel = parse_regions_arg(
        args.regions, mvf.contig_data)
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    skipcontig = None
    tmp_files = dict(
        (fname, tempfile.NamedTemporaryFile(mode='w+', prefix=fname))
        for fname in sample_labels)
    labelwritten = dict.fromkeys(sample_labels, False)
    write_buffer = {}
    current_contig = None
    data_written = False
    args.qprint("Regions determined. Reading entries.")
    for contig, pos, allelesets in mvf.iterentries(
            contig_indices=mvf.get_contig_indices(
                ids=list(max_region_coord.keys())),
            decode=True):
        if current_contig is None:
            current_contig = mvf.get_contig_indices(ids=contig)
        if contig == skipcontig:
            continue
        if (contig not in max_region_coord) or (
                max_region_coord[contig] is not None
                and pos > max_region_coord[contig]):
            skipcontig = contig[:]
            continue
        inregion = False
        for rcontig, rstart, rstop, _ in regions[contig]:
            if contig == rcontig:
                if rstart is None or pos >= rstart:
                    if rstop is None or pos <= rstop:
                        inregion = True
                        break
        if inregion is False:
            continue
        for col, label in zip(sample_indices, sample_labels):
            if not labelwritten[label]:
                if args.label_type == 'long':
                    xlabel = "{} region={}".format(label, regionlabel)
                elif args.label_type == 'short':
                    xlabel = "{}".format(label)
                tmp_files[label].write(">{}\n".format(xlabel))
                labelwritten[label] = True
            if mvf.flavor == 'dna':
                tmp_files[label].write("N" if allelesets[0][col] ==
                                       'X' else allelesets[0][col])
                data_written = True
            elif mvf.flavor in ('codon', 'prot') and (args.output_data
                                                      == 'prot'):
                tmp_files[label].write(allelesets[0][col])
                data_written = True
            elif mvf.flavor == 'codon' and args.output_data == 'dna':
                codon = [
                    "N" if allelesets[x][col] == 'X' else allelesets[x][col]
                    for x in (1, 2, 3)
                ]
                if not args.gene_mode:
                    tmp_files[label].write(''.join(codon))
                    data_written = True
                else:
                    if contig != current_contig:
                        if mvf.metadata['contigs'][current_contig].get(
                                'strand', "+") == '-':
                            write_buffer[label] = write_buffer[label][::-1]
                        tmp_files[label].write(''.join(write_buffer[label]))
                        data_written = True
                    if label not in write_buffer:
                        write_buffer[label] = []
                    write_buffer[label].append(''.join(codon))
        if args.gene_mode and current_contig != contig:
            write_buffer = {}
            current_contig = contig[:]
    if write_buffer:
        for label in write_buffer:
            if mvf.metadata['contigs'][current_contig].get('strand',
                                                           "+") == '-':
                write_buffer[label] = write_buffer[label][::-1]
            tmp_files[label].write(''.join(write_buffer[label]))
            data_written = True
        write_buffer = {}
    if data_written is False:
        print("ERROR NO DATA WRITTEN")
    with open(args.out, 'w') as outfile:
        for filehandler in tmp_files.values():
            filehandler.seek(0, 0)
            buff = filehandler.read(args.buffer)
            while buff:
                outfile.write(buff)
                buff = filehandler.read(args.buffer)
            outfile.write("\n")
            filehandler.close()
    return ''
Beispiel #5
0
def filter_mvf(args):
    """Main method"""
    if args.more_help is True:
        modulehelp()
        sys.exit()
    if args.mvf is None and args.test is None:
        raise RuntimeError("No input file specified with --mvf")
    if args.out is None and args.test is None:
        raise RuntimeError("No output file specified with --out")
    # Establish Input MVF
    if args.test is not None:
        ncol = args.test_nchar or len(args.test.split()[1])
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    # Create Actionset
    if args.labels:
        labels = mvf.get_sample_labels()[:]
        for i in range(len(args.actions)):
            action = args.actions[i]
            arr = action.split(':')
            if arr[0] in ('columns', 'collapsepriority', 'collapsemerge',
                          'allelegroup', 'notmultigroup'):
                for j in range(1, len(arr)):
                    arr[j] = ','.join(
                        [str(labels.index(x)) for x in arr[j].split(',')])
            args.actions[i] = ':'.join(arr)
    actionset = build_actionset(args.actions, ncol)
    # TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                else:
                    sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                else:
                    sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                loc = loc.split(':')
                loc[1] = int(loc[1])
                if actionfunc(loc) is False:
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                else:
                    sys.stdout.write("Location Pass\n")
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    # MAIN MODE
    # Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    # reprocess header if actions are used that filter columns
    if any(x == y[0] for x in ('columns', 'collapsepriority', 'collapsemerge')
           for y in actionset):
        if args.labels:
            labels = outmvf.metadata['labels'][:]
        else:
            labels = [x for x in outmvf.metadata['samples']]
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actionname == 'columns':
                labels = [labels[x] for x in actionarg[0]]
            elif actionname in ('collapsepriority', 'collapsemerge'):
                labels = [
                    labels[x] for x in range(len(labels))
                    if x not in actionarg[0][1:]
                ]
        if args.labels:
            oldindices = mvf.get_sample_indices(labels)
        else:
            oldindices = labels[:]
        newsamples = {}
        for i, _ in enumerate(labels):
            newsamples[i] = mvf.metadata['samples'][oldindices[i]]
        outmvf.metadata['samples'] = newsamples.copy()
        outmvf.metadata['labels'] = labels[:]
    outmvf.write_data(outmvf.get_header())
    # End header editing
    linebuffer = []
    nbuffer = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose is True:
            sys.stdout.write(" {} {}".format(alleles, linetype))
        for actionname, actiontype, actionfunc, actionargs in actionset:
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
            elif actiontype == 'location':
                if actionfunc([chrom, pos]) is False:
                    linefail = True
            if linefail:
                break
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = mvf.encode(alleles)
                if not alleles:
                    linefail = True
            nbuffer += 1
            linebuffer.append((chrom, pos, (alleles, )))
            if args.verbose:
                sys.stdout.write("{}\n".format(alleles))
            if nbuffer == args.line_buffer:
                outmvf.write_entries(linebuffer)
                linebuffer = []
                nbuffer = 0
        elif args.verbose:
            sys.stdout.write("FAIL\n")
    if linebuffer:
        outmvf.write_entries(linebuffer)
        linebuffer = []
    return ''
Beispiel #6
0
def calc_all_character_count_per_sample(args):
    """Count the number of and relative rate of certain bases
       spatially along chromosomes
    """
    args.qprint("Running CalcAllCharacterCountPerSample")
    mvf = MultiVariantFile(args.mvf, 'read')
    current_contig = None
    current_position = 0
    data_in_buffer = False
    # Set up sample indices
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = None
    data = dict((i, {}) for i in sample_indices)
    data_characters = [{} for i in sample_indices]
    for contig, pos, allelesets in mvf.iterentries(decode=False,
                                                   contig_ids=contig_ids):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            args.qprint("Processing contig {}".format(current_contig))
            for i in sample_indices:
                data[i][(current_contig, current_position)] = {
                    'contig': current_contig,
                    'position': current_position
                }
                data[i][(current_contig,
                         current_position)].update(data_characters[i])
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1 else
                                     args.windowsize)
            data_characters = [{} for i in sample_indices]
            data_in_buffer = False
        alleles = allelesets[0]
        if len(alleles) == 1:
            for i in sample_indices:
                data_characters[i][alleles[0]] = (
                    data_characters[i].get(alleles[0], 0) + 1)
        else:
            alleles = mvf.decode(alleles)
            for i in sample_indices:
                data_characters[i][alleles[i]] = (
                    data_characters[i].get(alleles[i], 0) + 1)
        data_in_buffer = True
    if data_in_buffer:
        for i in sample_indices:
            data[i][(current_contig, current_position)] = {
                'contig': current_contig,
                'position': current_position
            }
            data[i][(current_contig,
                     current_position)].update(data_characters[i])
    # WRITE OUTPUT
    all_chars = set([])
    for sampleid in data:
        for window in data[sampleid]:
            all_chars.update([
                x for x in data[sampleid][window]
                if x not in ('contig', 'position')
            ])
    headers = ['contig', 'position']
    headers.extend(list(sorted(all_chars)))
    outfile = OutputFile(path=args.out, headers=headers)

    for sampleid in sample_indices:
        outfile.write("#{}\n".format(sample_labels[sampleid]))
        sorted_entries = [(data[sampleid][k]['contig'],
                           data[sampleid][k]['position'], k)
                          for k in data[sampleid]]
        for _, _, k in sorted_entries:
            outfile.write_entry(data[sampleid][k], defaultvalue='0')
    return ''
Beispiel #7
0
def calc_pairwise_distances(args):
    """Count the pairwise nucleotide distance between
       combinations of samples in a window
    """
    args.qprint("Running CalcPairwiseDistances")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF: Read")
    data = {}
    data_order = []
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    args.qprint("Calculating for sample columns: {}".format(
        list(sample_indices)))
    current_contig = None
    current_position = 0
    data_in_buffer = False
    sample_pairs = [tuple(x) for x in combinations(sample_indices, 2)]
    base_matches = dict((x, {}) for x in sample_pairs)
    all_match = {}
    if mvf.flavor == 'dna':
        allele_frames = (0, )
        args.data_type = 'dna'
    elif mvf.flavor == 'prot':
        allele_frames = (0, )
        args.data_type = 'dna'
    elif mvf.flavor == 'codon':
        if args.data_type == 'prot':
            allele_frames = (0, )
        else:
            allele_frames = (1, 2, 3)
            args.data_type = 'dna'
    args.qprint("MVF flavor is: {}".format(mvf.flavor))
    args.qprint("Data type is: {}".format(args.data_type))
    args.qprint("Ambiguous mode: {}".format(args.ambig))
    args.qprint("Processing MVF Records")
    pwdistance_function = get_pairwise_function(args.data_type, args.ambig)
    if args.emit_counts:
        outfile_emitcounts = open(args.out + ".pairwisecounts", 'w')
    for contig, pos, allelesets in mvf.iterentries(decode=None):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig, current_position)] = {
                'contig': current_contig,
                'position': current_position
            }
            data_order.append((current_contig, current_position))
            all_diff, all_total = pwdistance_function(all_match)
            for samplepair in base_matches:
                ndiff, ntotal = pwdistance_function(base_matches[samplepair])
                taxa = "{};{}".format(sample_labels[samplepair[0]],
                                      sample_labels[samplepair[1]])
                data[(current_contig, current_position)].update({
                    '{};ndiff'.format(taxa):
                    ndiff + all_diff,
                    '{};ntotal'.format(taxa):
                    ntotal + all_total,
                    '{};dist'.format(taxa):
                    zerodiv(ndiff + all_diff, ntotal + all_total)
                })
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
                if args.windowsize > 0:
                    while pos > current_position + args.windowsize - 1:
                        current_position += args.windowsize
            else:
                current_position += args.windowsize
            if args.emit_counts:
                args.qprint("Writing Full Count Table")
                for p0, p1 in base_matches:
                    outfile_emitcounts.write("#{}\t{}\t{}\t{}\n{}\n".format(
                        p0, p1, current_position, current_contig, "\n".join([
                            "{} {}".format(x,
                                           (base_matches[(p0, p1)].get(x, 0) +
                                            all_match.get(x, 0)))
                            for x in set(base_matches[(p0,
                                                       p1)]).union(all_match)
                        ])))
            base_matches = dict((x, {}) for x in sample_pairs)
            all_match = {}
            data_in_buffer = False
        for iframe in allele_frames:
            alleles = allelesets[iframe]
            if len(alleles) == 1:
                all_match["{0}{0}".format(alleles)] = (
                    all_match.get("{0}{0}".format(alleles), 0) + 1)
                data_in_buffer = True
                continue
            if alleles[1] == '+':
                if alleles[2] in 'X-':
                    continue
                samplepair = (0, int(alleles[3:]))
                if any(x not in sample_indices for x in samplepair):
                    continue
                basepair = "{0}{1}".format(alleles[0], alleles[2])
                base_matches[samplepair][basepair] = (
                    base_matches[samplepair].get(basepair, 0) + 1)
                data_in_buffer = True
                continue
            alleles = mvf.decode(alleles)
            valid_positions = [
                i for i, x in enumerate(alleles)
                if x not in 'X-' and i in sample_indices
            ]
            assert len(alleles) == 4
            assert alleles[0] not in 'X-', alleles
            assert alleles[1] not in 'X-', alleles
            for i, j in combinations(valid_positions, 2):
                samplepair = (i, j)
                basepair = "{0}{1}".format(alleles[i], alleles[j])
                base_matches[samplepair][basepair] = (
                    base_matches[samplepair].get(basepair, 0) + 1)
            data_in_buffer = True
        # print(base_matches)
    if data_in_buffer is True:
        print(sum(base_matches[samplepair].values()), base_matches[samplepair],
              samplepair)
        print(sum(all_match.values()), all_match)
        print(sum(base_matches[samplepair].values()) + sum(all_match.values()))
        # Check whether, windows, contigs, or total
        if args.windowsize == 0:
            current_contig = 'TOTAL'
            current_position = 0
        elif args.windowsize == -1:
            current_position = 0
        data[(current_contig, current_position)] = {
            'contig': current_contig,
            'position': current_position
        }
        data_order.append((current_contig, current_position))
        # print("All match")
        all_diff, all_total = pwdistance_function(all_match)
        print(all_diff, all_total)
        for samplepair in base_matches:
            ndiff, ntotal = pwdistance_function(base_matches[samplepair])
            taxa = "{};{}".format(sample_labels[samplepair[0]],
                                  sample_labels[samplepair[1]])
            data[(current_contig, current_position)].update({
                '{};ndiff'.format(taxa):
                ndiff + all_diff,
                '{};ntotal'.format(taxa):
                ntotal + all_total,
                '{};dist'.format(taxa):
                zerodiv(ndiff + all_diff, ntotal + all_total)
            })
        if args.emit_counts:
            args.qprint("Writing Full Count Table")
            for p0, p1 in base_matches:
                outfile_emitcounts.write("#{}\t{}\t{}\t{}\n{}\n".format(
                    p0, p1, current_position, current_contig, "\n".join([
                        "{} {}".format(x, (base_matches[(p0, p1)].get(x, 0) +
                                           all_match.get(x, 0)))
                        for x in set(base_matches[(p0, p1)]).union(all_match)
                    ])))
    args.qprint("Writing Output")
    headers = ['contig', 'position']
    for samplepair in sample_pairs:
        headers.extend([
            '{};{};{}'.format(sample_labels[samplepair[0]],
                              sample_labels[samplepair[1]], x)
            for x in ('ndiff', 'ntotal', 'dist')
        ])
    outfile = OutputFile(path=args.out, headers=headers)
    for okey in data_order:
        outfile.write_entry(data[okey])
    if args.emit_counts:
        outfile_emitcounts.close()
    return ''
Beispiel #8
0
def calc_pattern_count(args):
    """Count biallelic patterns spatially along
       chromosomes (e.g,, for use in DFOIL or Dstats
       http://www.github.com/jbpease/dfoil).
       The last sample specified will determine the 'A'
       versus 'B' allele.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    current_contig = None
    current_position = 0
    sitepatterns = {}
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    nsamples = len(sample_indices)
    for contig, pos, allelesets in mvf.iterentries(decode=True,
                                                   subset=sample_indices):
        alleles = allelesets[0]
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, alleles) is False:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig,
                  current_position)] = dict([('contig', current_contig),
                                             ('position', current_position)])
            data[(current_contig, current_position)].update(sitepatterns)
            sitepatterns = {}
            if contig != current_contig:
                current_position = 0
                current_contig = contig[:]
            else:
                current_position += (0 if args.windowsize == -1 else
                                     args.windowsize)
        if set(alleles) - set("ACGT"):
            continue
        if len(set(alleles)) > 2:
            continue
        pattern = ''.join(
            ['A' if x == alleles[-1] else 'B' for x in alleles[:-1]]) + 'A'
        sitepatterns[pattern] = sitepatterns.get(pattern, 0) + 1
    if sitepatterns:
        data[(current_contig,
              current_position)] = dict([('contig', current_contig),
                                         ('position', current_position)])
        data[(current_contig, current_position)].update(sitepatterns)
    # WRITE OUTPUT
    headers = ['contig', 'position']
    headers.extend(
        [MLIB.abpattern(x, nsamples) for x in range(0, 2**nsamples, 2)])
    outfile = OutputFile(path=args.out, headers=headers)
    outfile.write("#{}\n".format(",".join(mvf.get_sample_ids(sample_indices))))
    sorted_entries = sorted([(data[k]['contig'], data[k]['position'], k)
                             for k in data])
    for _, _, k in sorted_entries:
        outfile.write_entry(data[k])
    # WRITE LIST OUTPUT
    if args.output_lists is True:
        sorted_entries = sorted([(data[k]['contig'], data[k]['position'], k)
                                 for k in data])
        total_counts = {}
        for contig, pos, k in sorted_entries:
            outfilepath = "{}-{}-{}.counts.list".format(args.out, contig, pos)
            with open(outfilepath, 'w') as outfile:
                outfile.write("pattern,count\n")
                for pattern, pcount in sorted(data[k].items()):
                    if pattern in ['contig', 'position']:
                        continue
                    outfile.write("{},{}\n".format(pattern, pcount))
                    total_counts[pattern] = (total_counts.get(pattern, 0) +
                                             pcount)
        outfilepath = "{}-TOTAL.counts.list".format(args.out)
        with open(outfilepath, 'w') as outfile:
            outfile.write("pattern,count\n")
            for pattern, pcount in sorted(total_counts.items()):
                if pattern in ['contig', 'position']:
                    continue
                outfile.write("{},{}\n".format(pattern, pcount))
    return ''
Beispiel #9
0
def calc_character_count(args):
    """Count the number of and relative rate of certain bases
       spatially along chromosomes
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    current_contig = None
    current_position = 0
    all_match = 0
    all_total = 0
    data_in_buffer = False
    # Set up base matching from special words
    data_order = []

    def proc_special_word(argx):
        if argx == 'dna':
            argx = MLIB.validchars['dna']
        elif argx == 'dnaambig2':
            argx = MLIB.validchars['dna+ambig2']
        elif argx == 'dnaambig3':
            argx = MLIB.validchars['dna+ambig3']
        elif argx == 'dnaambigall':
            argx = MLIB.validchars['dna+ambigall']
        elif argx == 'prot':
            argx = MLIB.validchars['amino']
        return argx

    args.base_match = proc_special_word(args.base_match)
    args.base_total = proc_special_word(args.base_total)
    # Set up sample indices
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    # Set up contig ids
    if args.contig_ids is not None:
        contig_indices = mvf.get_contig_indices(
            ids=args.contig_ids[0].split(","))
    elif args.contig_labels is not None:
        contig_indices = mvf.get_contig_indices(
            labels=args.contig_labels[0].split(","))
    else:
        contig_indices = None
    match_counts = dict().fromkeys([sample_labels[i] for i in sample_indices],
                                   0)
    total_counts = dict().fromkeys([sample_labels[i] for i in sample_indices],
                                   0)
    for contig, pos, allelesets in mvf.iterentries(
            decode=False, contig_indices=contig_indices):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # if contig not in contig_ids:
        #   continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig, current_position)] = {
                'contig': current_contig,
                'position': current_position
            }
            data_order.append((current_contig, current_position))
            for k in match_counts:

                data[(current_contig, current_position)].update([
                    (k + '.match', match_counts[k] + all_match),
                    (k + '.total', total_counts[k] + all_total),
                    (k + '.prop', ((float(match_counts[k] + all_match) /
                                    float(total_counts[k] + all_total))
                                   if total_counts[k] + all_total > 0 else 0))
                ])
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1 else
                                     args.windowsize)
            match_counts = dict().fromkeys(
                [sample_labels[i] for i in sample_indices], 0)
            total_counts = dict().fromkeys(
                [sample_labels[i] for i in sample_indices], 0)
            all_total = 0
            all_match = 0
            data_in_buffer = False
        else:
            alleles = allelesets[0]
            if len(alleles) == 1:
                if args.base_match is None:
                    all_match += 1
                elif alleles in args.base_match:
                    all_match += 1
                if args.base_total is None:
                    all_total += 1
                elif alleles in args.base_total:
                    all_total += 1
            else:
                alleles = mvf.decode(alleles)
                for i in sample_indices:
                    if args.base_match is None:
                        match_counts[sample_labels[i]] += 1
                    elif alleles[i] in args.base_match:
                        match_counts[sample_labels[i]] += 1
                    if args.base_total is None:
                        total_counts[sample_labels[i]] += 1
                    elif alleles[i] in args.base_total:
                        total_counts[sample_labels[i]] += 1
            data_in_buffer = True
    if data_in_buffer:
        data[(current_contig, current_position)] = {
            'contig': current_contig,
            'position': current_position
        }
        data_order.append((current_contig, current_position))
        for k in match_counts:
            data[(current_contig, current_position)].update([
                (k + '.match', match_counts[k] + all_match),
                (k + '.total', total_counts[k] + all_total),
                (k + '.prop', ((float(match_counts[k] + all_match) /
                                float(total_counts[k] + all_total))
                               if total_counts[k] + all_total > 0 else 0))
            ])
    # WRITE OUTPUT
    headers = ['contig', 'position']
    for label in sample_labels:
        headers.extend([label + x for x in ('.match', '.total', '.prop')])
    outfile = OutputFile(path=args.out, headers=headers)
    for okey in data_order:
        outfile.write_entry(data[okey])
    return ''
Beispiel #10
0
def plot_chromoplot(args):
    """Main method"""
    pallette = Pallette()
    if args.colors is not None:
        pallette.basecolors = args.colors
    # Establish MVF and parse chromosome information
    if args.quiet is False:
        print("Reading MVF...")
    mvf = MultiVariantFile(args.mvf, 'read')
    if args.quiet is False:
        print("Parsing headers...")
    if args.contig_ids is not None:
        contigids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contigids = mvf.get_contig_ids(labels=args.contig_labels[0].split(","))
    else:
        contigids = mvf.get_contig_ids()
    if args.quiet is False:
        print("Plotting chromoplot for contigs: {}".format(
            ",".join(contigids)))
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    assert len(sample_indices) >= 3
    if args.outgroup_indices is not None:
        outgroup_indices = [
            int(x) for x in args.outgroup_indices[0].split(",")
        ]
    elif args.outgroup_labels is not None:
        outgroup_indices = mvf.get_sample_indices(
            labels=args.outgroup_labels[0].split(","))
    assert len(outgroup_indices) >= 1
    quartets = [(x, y, z, outgroup)
                for x, y, z in combinations(sample_indices, 3)
                for outgroup in outgroup_indices]
    # Begin iterations
    for quartet_indices in quartets:
        quartet_labels = [sample_labels[x] for x in quartet_indices]
        if args.quiet is False:
            print("Beginning quartet {}".format(",".join(quartet_labels)))
        params = {
            'contigs': [[
                contigid, mvf.metadata['contigs'][contigid]['label'],
                mvf.metadata['contigs'][contigid]['length']
            ] for contigid in contigids],
            'outpath':
            ((args.out_prefix if args.out_prefix is not None else '')
             or '_'.join(quartet_labels)) + ".png",
            'labels':
            quartet_labels,
            'indices':
            quartet_indices,
            'windowsize':
            args.windowsize,
            'majority':
            args.majority,
            'infotrack':
            args.info_track,
            'yscale':
            args.yscale,
            'xscale':
            args.xscale,
            'quiet':
            args.quiet,
            'plottype':
            args.plot_type
        }
        chromoplot = Chromoplot(params=params, pallette=pallette)
        current_contig = ''
        for contig, pos, allelesets in mvf.iterentries(subset=quartet_indices,
                                                       decode=True,
                                                       contigs=contigids):
            if contig != current_contig:
                if args.quiet is False:
                    print("Starting contig {}".format(contig))
                    current_contig = contig[:]
            alleles = allelesets[0]
            if '-' in alleles:
                site_code = 'gap'
            elif any(x not in 'ATGCatgc' for x in alleles):
                site_code = 'ambiguous'
            elif alleles[3] not in alleles[:3]:
                site_code = 'nonpolar'
            elif len(set(alleles)) > 2:
                site_code = 'triallelic'
            else:
                site_code = sum([
                    2**(3 - j) * (alleles[j] != alleles[3]) for j in range(3)
                ])
            chromoplot.add_data(str(contig), int(pos // args.windowsize),
                                site_code)
        contig = ''
        current_contig = ''
        if not args.quiet:
            print("Writing image...")
        chromoplot.plot_chromoplot()

        if not args.quiet:
            print("Writing log...")
        chromoplot.write_total_log()
    return ''
Beispiel #11
0
def calc_dstat_combinations(args):
    """Calculate genome-wide D-statstics for
       all possible trio combinations of samples
       and outgroups specified.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    sample_labels = mvf.get_sample_ids()
    if args.outgroup_indices is not None:
        outgroup_indices = [
            int(x) for x in args.outgroup_indices[0].split(",")
        ]
    elif args.outgroup_labels is not None:
        outgroup_indices = mvf.get_sample_indices(
            ids=args.outgroup_labels[0].split(","))
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = None
    if any(x in outgroup_indices for x in sample_indices):
        raise RuntimeError("Sample and Outgroup column lists cannot overlap.")
    for contig, _, allelesets in mvf:
        if contig not in contig_ids:
            continue
        alleles = mvf.decode(allelesets[0])
        for i, j, k in combinations(sample_indices, 3):
            for outgroup in outgroup_indices:
                subset = [alleles[x] for x in [i, j, k, outgroup]]
                if any(x not in 'ATGC' for x in subset):
                    continue
                if subset[-1] not in subset[:3]:
                    continue
                if len(set(subset)) != 2:
                    continue
                # [ABBA, BABA, BBAA]
                val = (0 + 1 * (subset[0] == subset[3]) + 2 *
                       (subset[1] == subset[3]) + 4 * (subset[2] == subset[3]))
                if val in (1, 2):
                    val -= 1
                elif val == 4:
                    val = 2
                else:
                    continue
                tetrad = (i, j, k, outgroup)
                if tetrad not in data:
                    data[tetrad] = {}
                if contig not in data[tetrad]:
                    data[tetrad][contig] = [0, 0, 0]
                data[tetrad][contig][val] += 1
    # WRITE OUTPUT
    headers = ['sample0', 'sample1', 'sample2', "outgroup"]
    for xcontig in contig_ids:
        headers.extend([
            '{}:abba'.format(xcontig), '{}:baba'.format(xcontig),
            '{}:bbaa'.format(xcontig), '{}:D'.format(xcontig)
        ])
    outfile = OutputFile(path=args.out, headers=headers)
    for i, j, k in combinations(sample_indices, 3):
        for outgroup in outgroup_indices:
            tetrad = tuple([i, j, k, outgroup])
            if tetrad not in data:
                continue
            entry = dict(('sample{}'.format(i), sample_labels[x])
                         for i, x in enumerate(tetrad[:3]))
            entry['outgroup'] = sample_labels[outgroup]
            for contig in contig_ids:
                if contig not in data[tetrad]:
                    entry.update(dict().fromkeys([
                        '{}:abba'.format(contig), '{}:baba'.format(contig),
                        '{}:bbaa'.format(contig), '{}:D'.format(contig)
                    ], '0'))
                else:
                    [abba, baba, bbaa] = data[tetrad][contig]
                    if abba > baba and abba > bbaa:

                        dstat = zerodiv(baba - bbaa, baba + bbaa)
                    elif baba > bbaa and baba > abba:
                        dstat = zerodiv(abba - bbaa, abba + bbaa)
                    else:
                        dstat = zerodiv(abba - baba, abba + baba)
                    entry.update([('{}:abba'.format(contig), abba),
                                  ('{}:baba'.format(contig), baba),
                                  ('{}:bbaa'.format(contig), bbaa),
                                  ('{}:D'.format(contig), dstat)])
            outfile.write_entry(entry)
    return ''
Beispiel #12
0
def infer_window_tree(args):
    """Main method"""
    # ESTABLISH FILE OBJECTS
    mvf = MultiVariantFile(args.mvf, 'read')
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    treefile = OutputFile(
        args.out,
        headers=[
            'contig',
            'windowstart',
            'windowsize',
            'tree',
            'topology',
            'topoid',
            # 'templabels', ### USED FOR DEBUGGING ###
            'alignlength',
            'aligndepth',
            'status'
        ])
    topofile = OutputFile(args.out + '.counts',
                          headers=['rank', 'topology', 'count'])
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    if not os.path.exists(args.temp_dir):
        os.mkdir(args.temp_dir)
    os.chdir(args.temp_dir)
    # SETUP PARAMS
    main_labels = mvf.get_sample_labels(sample_indices)
    if args.choose_allele in ['randomboth', 'majorminor']:
        main_labels = [label + x for x in ['a', 'b'] for label in main_labels]
    params = {
        'outgroups':
        args.raxml_outgroups or [],
        'rootwith':
        (args.root_with.split(',') if args.root_with is not None else None),
        'minsites':
        args.min_sites,
        'minseqcoverage':
        args.min_seq_coverage,
        'mindepth':
        args.min_depth,
        'raxmlpath':
        args.raxml_path,
        'raxmlopts':
        args.raxml_opts,
        'duplicateseq':
        args.duplicate_seq,
        'model':
        args.raxml_model,
        'bootstrap':
        args.bootstrap,
        'windowsize':
        args.windowsize,
        'chooseallele':
        args.choose_allele,
        'tempdir':
        args.temp_dir,
        'tempprefix':
        args.temp_prefix
    }
    # WINDOW START INTERATION
    verify_raxml(params)
    current_contig = ''
    current_position = 0
    window_data = None
    skip_contig = False
    topo_ids = {}
    topo_counts = {}
    for contig, pos, allelesets in mvf.iterentries(contigs=contig_ids,
                                                   subset=sample_indices,
                                                   quiet=args.quiet,
                                                   no_invariant=False,
                                                   no_ambig=False,
                                                   no_gap=False,
                                                   decode=True):
        if current_contig == contig:
            if skip_contig is True:
                continue
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            skip_contig = False
            if window_data is not None:
                entry = window_data.maketree_raxml(params)
                if entry['status'] != 'ok':
                    if args.output_empty:
                        treefile.write_entry(entry)
                    if args.windowsize != -1:
                        skip_contig = True
                else:
                    topo = entry["topology"]
                    topo_counts[topo] = topo_counts.get(topo, 0) + 1
                    if topo not in topo_ids:
                        topo_ids[topo] = (topo_ids
                                          and max(topo_ids.values()) + 1 or 0)
                    entry["topoid"] = topo_ids[topo]
                    treefile.write_entry(entry)
                current_position = (current_position + args.windowsize if
                                    (contig == current_contig
                                     and args.windowsize > 0) else 0)
            current_contig = contig[:]
            window_data = None
            window_data = WindowData(
                window_params={
                    'contigname': (mvf.get_contig_labels(
                        ids=current_contig) if args.output_contig_labels
                                   is not None else current_contig[:]),
                    "windowstart": (
                        '-1' if args.windowsize == -1 else current_position +
                        0),
                    "windowsize":
                    args.windowsize,
                    "labels":
                    main_labels[:]
                })
        # ADD ALLELES
        if mvf.flavor == 'dna':
            if args.choose_allele != 'none':
                allelesets[0] = hapsplit(allelesets[0], args.choose_allele)
            window_data.append_alleles(allelesets[0], mindepth=args.min_depth)
    # LAST LOOP
    if window_data:
        entry = window_data.maketree_raxml(params)
        if entry['status'] != 'ok':
            if args.output_empty:
                treefile.write_entry(entry)
        else:
            topo = entry["topology"]
            topo_counts[topo] = topo_counts.get(topo, 0) + 1
            if topo not in topo_ids:
                topo_ids[topo] = (max(topo_ids.values()) +
                                  1 if topo_ids else 0)
            entry["topoid"] = topo_ids[topo]
            treefile.write_entry(entry)
        window_data = None
    # END WINDOW ITERATION
    topo_list = sorted([(v, k) for k, v in topo_counts.items()], reverse=True)
    for rank, [value, topo] in enumerate(topo_list):
        topofile.write_entry({'rank': rank, 'count': value, 'topology': topo})
    return ''
Beispiel #13
0
def infer_window_tree(args):
    """Main method"""
    args.qprint("Running InferTree")
    # ESTABLISH FILE OBJECTS
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Read MVF File: {}".format(args.mvf))
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    treefile = OutputFile(
        args.out,
        headers=['contig', 'windowstart', 'windowsize', 'tree',
                 'topology', 'topoid',
                 # 'templabels', ### USED FOR DEBUGGING ###
                 'alignlength', 'aligndepth', 'status'])
    topofile = OutputFile(args.out + '.counts',
                          headers=['rank', 'topology', 'count'])
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in
                          args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    if not os.path.exists(args.temp_dir):
        os.mkdir(args.temp_dir)
    os.chdir(args.temp_dir)
    # SETUP PARAMS
    main_labels = mvf.get_sample_ids(sample_indices)
    if args.choose_allele in ['randomboth', 'majorminor']:
        main_labels = [label + x for x in ['a', 'b'] for label in main_labels]
    params = {
        'bootstrap': args.bootstrap,
        'chooseallele': args.choose_allele,
        'collapse_polytomies': args.collapse_polytomies,
        'duplicateseq': args.duplicate_seq,
        'engine': args.engine,
        'engine_path': args.engine_path,
        'engine_opts': args.engine_opts,
        'mindepth': args.min_depth,
        'minseqcoverage': args.min_seq_coverage,
        'minsites': args.min_sites,
        'model': args.model,
        'outgroups': (args.raxml_outgroups 
                      if args.raxml_outgroups is not None
                      else None),
        'rootwith': (args.root_with.split(',')
                     if args.root_with is not None
                    else []),
        'tempdir': args.temp_dir,
        'tempprefix': args.temp_prefix,
        'windowsize': args.windowsize,
        }
    # DEFAULT MODEL
    if params['model'] is None:
        if params['engine'] == 'raxml':
            params['model'] = 'GTRGAMMA'
        elif params['engine'] == 'raxml-ng':
            params['model'] = "GTR+G"
    # WINDOW START INTERATION
    verify_raxml(params)
    args.qprint("RAxML Found.")
    current_contig = None
    current_position = 0
    window_data = None
    # skip_contig = False
    topo_ids = {}
    topo_counts = {}
    args.qprint("Prcocessing Records")
    windowsizename = "window size={}".format(args.windowsize)
    if windowsizename == "window size=-1":
        windowsizename = "whole contig"
    elif windowsizename == "window size=0":
        windowsizename = "whole genome"
        window_data = WindowData(window_params={
            'contigname': 'all',
            "windowstart": 0,
            "windowsize": 0,
            "labels": main_labels[:]})
    for contig, pos, allelesets in mvf.iterentries(
            contig_ids=contig_ids, subset=sample_indices,
            no_invariant=False, no_ambig=False, no_gap=False, decode=True):
        # if current_contig == contig:
        #     if skip_contig is True:
        #         args.qprint("Skipping contig: {}".format(current_contig))
        #         continue
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            # skip_contig = False
            if window_data is not None:
                args.qprint(("Making tree for {} "
                             "at contig {} position {}").format(
                                 windowsizename,
                                 current_contig,
                                 current_position))
                entry = window_data.maketree_raxml(params)
                if entry['status'] != 'ok':
                    if args.output_empty:
                        treefile.write_entry(entry)
                    # if args.windowsize != -1:
                    #     skip_contig = True
                    args.qprint(
                        "TREE REJECTED with error code: {} ({})".format(
                            entry['status'], entry.get('comment', "None")))
                else:
                    args.qprint("Tree completed.")
                    topo = entry["topology"]
                    topo_counts[topo] = topo_counts.get(topo, 0) + 1
                    if topo not in topo_ids:
                        topo_ids[topo] = (max(topo_ids.values()) + 1
                                          if topo_ids else 0)
                    entry["topoid"] = topo_ids[topo]
                    treefile.write_entry(entry)
                current_position = current_position + args.windowsize if (
                    contig == current_contig and args.windowsize > 0) else 0
            current_contig = contig[:]
            window_data = None
            window_data = WindowData(window_params={
                'contigname': (mvf.get_contig_labels(ids=current_contig) if
                               args.output_contig_labels is not None else
                               current_contig[:]),
                "windowstart": ('-1' if args.windowsize == -1
                                else current_position + 0),
                "windowsize": args.windowsize,
                "labels": main_labels[:]})
        # ADD ALLELES
        if mvf.flavor == 'dna':
            if args.choose_allele != 'none':
                allelesets[0] = hapsplit(allelesets[0], args.choose_allele)
            window_data.append_alleles(allelesets[0], mindepth=args.min_depth)
        elif mvf.flavor == 'codon':
            for i in (1, 2, 3):
                if args.choose_allele != 'none':
                    allelesets[i] = hapsplit(allelesets[i], args.choose_allele)
                window_data.append_alleles(allelesets[i], mindepth=args.min_depth)
    # LAST LOOP
    if window_data:
        entry = window_data.maketree_raxml(params)
        if entry['status'] != 'ok':
            if args.output_empty:
                treefile.write_entry(entry)
        else:
            topo = entry["topology"]
            topo_counts[topo] = topo_counts.get(topo, 0) + 1
            if topo not in topo_ids:
                topo_ids[topo] = (
                    max(topo_ids.values()) + 1 if topo_ids else 0)
            entry["topoid"] = topo_ids[topo]
            treefile.write_entry(entry)
        window_data = None
    # END WINDOW ITERATION
    topo_list = sorted([(v, k) for k, v in topo_counts.items()],
                       reverse=True)
    for rank, [value, topo] in enumerate(topo_list):
        topofile.write_entry({'rank': rank, 'count': value, 'topology': topo})
    return ''
Beispiel #14
0
def mvf2fasta(args):
    """Main method"""
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    regions, max_region_coord, regionlabel = parse_regions_arg(
        args.regions, mvf.metadata['contigs'])
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    skipcontig = ''
    tmp_files = dict((fn,
                      open("{}-{}.tmp".format(fn, randint(1000000, 9999999)),
                           'w+', args.buffer)) for fn in sample_labels)
    labelwritten = dict.fromkeys(sample_labels, False)
    for contig, pos, allelesets in mvf.iterentries(contigs=[
            x for x in max_region_coord
    ],
                                                   quiet=args.quiet,
                                                   decode=True):
        if contig == skipcontig:
            continue
        if (contig not in max_region_coord) or (
                max_region_coord[contig] is not None
                and pos > max_region_coord[contig]):
            skipcontig = contig[:]
            continue
        inregion = False
        for rcontig, rstart, rstop, _ in regions:
            if contig == rcontig:
                if rstart is None or pos >= rstart:
                    if rstop is None or pos <= rstop:
                        inregion = True
                        break
        if inregion is False:
            continue
        for col, label in zip(sample_indices, sample_labels):
            if not labelwritten[label]:
                if args.label_type == 'long':
                    xlabel = "{} region={}".format(label, regionlabel)
                elif args.label_type == 'short':
                    xlabel = "{}".format(label)
                tmp_files[label].write(">{}\n".format(xlabel))
                labelwritten[label] = True
            if mvf.flavor == 'dna':
                tmp_files[label].write("N" if allelesets[0][col] ==
                                       'X' else allelesets[0][col])
            elif mvf.flavor in ('codon', 'prot') and (args.output_data
                                                      == 'prot'):
                tmp_files[label].write(allelesets[0][col])
            elif mvf.flavor == 'codon' and args.output_data == 'dna':
                codon = [
                    "N" if allelesets[x][col] == 'X' else allelesets[x][col]
                    for x in (1, 2, 3)
                ]
                tmp_files[label].write(''.join(codon))
    with open(args.out, 'w') as outfile:
        for filehandler in tmp_files.values():
            filehandler.seek(0, 0)
            buff = filehandler.read(args.buffer)
            while len(buff):
                outfile.write(buff)
                buff = filehandler.read(args.buffer)
            outfile.write("\n")
            filehandler.close()
            os.remove(os.path.join(args.temp_dir, filehandler.name))
    return ''
Beispiel #15
0
def calc_pairwise_distances(args):
    """Count the pairwise nucleotide distance between
       combinations of samples in a window
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in
                          args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    current_contig = None
    current_position = 0
    data_in_buffer = False
    sample_pairs = [tuple(x) for x in combinations(sample_indices, 2)]
    base_matches = dict([(x, {}) for x in sample_pairs])
    all_match = {}
    for contig, pos, allelesets in mvf:
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            while pos > current_position + args.windowsize - 1:
                current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig, current_position)] = {
                'contig': current_contig, 'position': current_position}
            if mvf.flavor == 'dna':
                all_diff, all_total = pairwise_distance_nuc(all_match)
            elif mvf.flavor == 'prot':
                all_diff, all_total = pairwise_distance_prot(all_match)
            for samplepair in base_matches:
                if mvf.flavor == 'dna':
                    ndiff, ntotal = pairwise_distance_nuc(
                        base_matches[samplepair])
                elif mvf.flavor == 'prot':
                    ndiff, ntotal = pairwise_distance_prot(
                        base_matches[samplepair])
                taxa = "{};{}".format(sample_labels[samplepair[0]],
                                      sample_labels[samplepair[1]])
                data[(current_contig, current_position)].update({
                    '{};ndiff'.format(taxa): ndiff + all_diff,
                    '{};ntotal'.format(taxa): ntotal + all_total,
                    '{};dist'.format(taxa): zerodiv(ndiff + all_diff,
                                                    ntotal + all_total)})
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
            else:
                current_position += args.windowsize
            base_matches = dict([(x, {}) for x in sample_pairs])
            all_match = {}
            data_in_buffer = False
        alleles = allelesets[0]
        if len(alleles) == 1:
            all_match["{}{}".format(alleles, alleles)] = (
                all_match.get("{}{}".format(alleles, alleles),
                              0) + 1)
            data_in_buffer = True
            continue
        if alleles[1] == '+':
            if 'X' in alleles or '-' in alleles:
                continue
            samplepair = (0, int(alleles[3:]))
            if any(x not in sample_indices for x in samplepair):
                continue
            basepair = "{}{}".format(alleles[0], alleles[2])
            base_matches[samplepair][basepair] = (
                base_matches[samplepair].get(basepair, 0) + 1)
            data_in_buffer = True
            continue
        alleles = mvf.decode(alleles)
        valid_positions = [i for i, x in enumerate(alleles)
                           if x not in 'X-']
        for i, j in combinations(valid_positions, 2):
            samplepair = (i, j)
            if any(x not in sample_indices for x in samplepair):
                continue
            basepair = "{}{}".format(alleles[i], alleles[j])
            base_matches[samplepair][basepair] = (
                base_matches[samplepair].get(basepair, 0) + 1)
        data_in_buffer = True
    if data_in_buffer is True:
        # Check whether, windows, contigs, or total
        if args.windowsize == 0:
            current_contig = 'TOTAL'
            current_position = 0
        elif args.windowsize == -1:
            current_position = 0
        data[(current_contig, current_position)] = {
            'contig': current_contig, 'position': current_position}
        if mvf.flavor == 'dna':
            all_diff, all_total = pairwise_distance_nuc(all_match)
        elif mvf.flavor == 'prot':
            all_diff, all_total = pairwise_distance_prot(all_match)
        for samplepair in base_matches:
            if mvf.flavor == 'dna':
                ndiff, ntotal = pairwise_distance_nuc(base_matches[samplepair])
            elif mvf.flavor == 'prot':
                ndiff, ntotal = pairwise_distance_prot(
                    base_matches[samplepair])
            taxa = "{};{}".format(sample_labels[samplepair[0]],
                                  sample_labels[samplepair[1]])
            data[(current_contig, current_position)].update({
                '{};ndiff'.format(taxa): ndiff + all_diff,
                '{};ntotal'.format(taxa): ntotal + all_total,
                '{};dist'.format(taxa): zerodiv(ndiff + all_diff,
                                                ntotal + all_total)})
    headers = ['contig', 'position']
    for samplepair in sample_pairs:
        headers.extend(['{};{};{}'.format(
            sample_labels[samplepair[0]],
            sample_labels[samplepair[1]],
            x) for x in ('ndiff', 'ntotal', 'dist')])
    outfile = OutputFile(path=args.out, headers=headers)
    sorted_entries = sorted([(
        data[k]['contig'], data[k]['position'], k)
                             for k in data])
    for _, _, k in sorted_entries:
        outfile.write_entry(data[k])
    return ''
Beispiel #16
0
def calc_character_count(args):
    """Count the number of and relative rate of certain bases
       spatially along chromosomes
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    current_contig = None
    current_position = 0
    all_match = 0
    all_total = 0
    data_in_buffer = 0
    # Set up sample indices
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in
                          args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    match_counts = dict().fromkeys(
        [sample_labels[i] for i in sample_indices], 0)
    total_counts = dict().fromkeys(
        [sample_labels[i] for i in sample_indices], 0)
    for contig, pos, allelesets in mvf:
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage,
                             allelesets[0]) is False:
            continue
        if contig not in contig_ids:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            while pos > current_position + args.windowsize - 1:
                current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig, current_position)] = {
                'contig': current_contig, 'position': current_position}
            for k in match_counts:
                data[(current_contig, current_position)].update([
                    (k + '.match', match_counts[k] + all_match),
                    (k + '.total', total_counts[k] + all_total),
                    (k + '.prop', (
                        (float(match_counts[k] + all_match) /
                         float(total_counts[k] + all_total)) if
                        total_counts[k] + all_total > 0 else 0))])
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1
                                     else args.windowsize)
            match_counts = dict().fromkeys(
                [sample_labels[i] for i in sample_indices], 0)
            total_counts = dict().fromkeys(
                [sample_labels[i] for i in sample_indices], 0)
            all_total = 0
            all_match = 0
            data_in_buffer = 0
        else:
            alleles = allelesets[0]
            if len(alleles) == 1:
                if args.base_match is None:
                    all_match += 1
                elif alleles in args.base_match:
                    all_match += 1
                if args.base_total is None:
                    all_total += 1
                elif alleles in args.base_total:
                    all_total += 1
            else:
                alleles = mvf.decode(alleles)
                for i in sample_indices:
                    if args.base_match is None:
                        match_counts[sample_labels[i]] += 1
                    elif alleles[i] in args.base_match:
                        match_counts[sample_labels[i]] += 1
                    if args.base_total is None:
                        total_counts[sample_labels[i]] += 1
                    elif alleles[i] in args.base_total:
                        total_counts[sample_labels[i]] += 1
            data_in_buffer = 1
    if data_in_buffer:
        data[(current_contig, current_position)] = {
            'contig': current_contig, 'position': current_position}
        for k in match_counts:
            data[(current_contig, current_position)].update([
                (k + '.match', match_counts[k] + all_match),
                (k + '.total', total_counts[k] + all_total),
                (k + '.prop', ((float(match_counts[k] + all_match) /
                                float(total_counts[k] + all_total)) if
                               total_counts[k] + all_total > 0 else 0))])
    # WRITE OUTPUT
    headers = ['contig', 'position']
    for label in sample_labels:
        headers.extend([label + x for x in ('.match', '.total', '.prop')])
    outfile = OutputFile(path=args.out,
                         headers=headers)
    sorted_entries = sorted([(data[k]['contig'],
                              data[k]['position'], k)
                             for k in data])
    for _, _, k in sorted_entries:
        outfile.write_entry(data[k])
    return ''