Ejemplo n.º 1
0
def fasta2mvf(args):
    """Main method"""
    sepchars = dict([("PIPE", "\\|"), ("TAB", "\\t"), ("SPACE", "\\s"),
                     ("DBLSPACE", "\\s\\s"), ("COMMA", "\\,"), ("NONE", None),
                     ("AT", "\\@"), ('UNDER', "\\_"), ("DBLUNDER", "\\_\\_")])
    if args.field_sep is None:
        args.field_sep = ''
    else:
        args.field_sep = re.compile("[{}]".format(''.join(
            [sepchars[x] for x in args.field_sep])))
    if args.manual_coord:
        assert len(args.manual_coord) == len(args.fasta)
        args.manual_coord = [(x.split(':')[0],
                              int(x.split(":")[1].split('..')[0]),
                              int(x.split(':')[1].split('..')[1]))
                             for x in args.manual_coord]
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    fasta = {}
    current_contig = 0
    fsamples = []
    fcontigs = []
    for ifasta, fastapath in enumerate(args.fasta):
        print("Processing {}".format(fastapath))
        for header, seq in fasta_iter(fastapath):
            if args.field_sep is None:
                header = header[:]
            if args.field_sep != '' and args.field_sep is not None:
                header = [str(x) for x in re.split(args.field_sep, header)]
            if args.contig_by_file is True:
                contig = os.path.basename(fastapath[:])
                if args.sample_field is None:
                    sample = header[:]
                else:
                    sample = header[args.sample_field]
            elif (len(header) < max(
                    args.contig_field if args.contig_field is not None else 0,
                    args.sample_field if args.sample_field is not None else 0)
                  or args.contig_field is None or args.sample_field is None):
                contig = "UNK{}".format(current_contig)
                sample = header[:]
            elif args.manual_coord:
                contig = args.manual_coord[ifasta][0]
            else:
                contig = header[args.contig_field]
                sample = header[args.sample_field]
            if contig not in fcontigs:
                fcontigs.append(contig)
                fasta[contig] = {}
            if sample not in fsamples:
                fsamples.append(sample)
            fasta[contig][sample] = (len(seq), seq)
    reflabel = None
    if args.ref_label:
        for i, samplename in enumerate(fsamples):
            if args.ref_label in samplename:
                reflabel = i
                break
    if reflabel:
        newref = fsamples.pop(i)
        fsamples = [newref] + fsamples
    for i, contig in enumerate(fcontigs):
        new_index = mvf.get_next_contig_index()
        mvf.contig_indices.append(new_index)
        mvf.contig_ids.append(str(new_index))
        mvf.contig_labels.append(contig)
        mvf.contig_label_to_index[contig] = new_index
        mvf.contig_id_to_index[str(new_index)] = new_index
        mvf.contig_data[new_index] = {
            'label': contig,
            'id': str(new_index),
            'length': max([fasta[contig][x][0] for x in fasta[contig]])
        }
    mvf.metadata['labels'] = fsamples[:]
    for i, label in enumerate(fsamples[:]):
        mvf.sample_indices.append(i)
        mvf.sample_id_to_index[label] = i
        mvf.sample_ids.append(label)
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = 'fasta'
    mvf.metadata.append(args.command_string)
    mvf.flavor = args.flavor
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    mvf_alleles = {}
    for cind, contig in enumerate(fcontigs):
        for pos in range(mvf.contig_data[cind + 1]['length']):
            mvf_alleles = encode_mvfstring(
                ''.join(samp not in fasta[contig] and '-'
                        or pos >= fasta[contig][samp][0] and '-'
                        or fasta[contig][samp][1][pos] for samp in fsamples))
            if mvf_alleles:
                if args.flavor == 'dna':
                    mvf_alleles = ''.join(
                        ["X" if x in 'NX' else x for x in mvf_alleles])
                mvfentries.append((cind, pos + 1, (mvf_alleles, )))
                nentry += 1
                if nentry == args.write_buffer:
                    mvf.write_entries(mvfentries, encoded=True)
                    mvfentries = []
                    nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries)
        mvfentries = []
    return ''
Ejemplo n.º 2
0
def legacy_translate_mvf(args):
    """Main method"""
    args.qprint("Running LegacyTranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff = parse_gff_legacy_translate(
            args.gff, args,
            parent_gene_pattern=args.parent_gene_pattern)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        args.qprint("Indexing GFF gene names.")
        # mvfid_to_gffname = outmvf.get_contig_reverse_dict()
        for xcontig in outmvf.get_contig_indices():
            mvf_entries = {}
            xcontiglabel = outmvf.get_contig_labels(indices=xcontig)[0]
            xcontigid = outmvf.get_contig_ids(indices=xcontig)[0]
            if xcontiglabel not in gff:
                if args.verbose:
                    print(
                        ("No entries in GFF, "
                         "skipping contig: index:{} id:{} label:{}").format(
                             xcontig, xcontigid, xcontiglabel))
                continue
            if not xcontig % 100:
                args.qprint("Processing contig: {} {}".format(
                    xcontigid, xcontiglabel))
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                mvf_entries[pos] = allelesets[0]
            for coords in sorted(gff[xcontiglabel]):
                reverse_strand = coords[3] == '-'
                alleles = (tuple(mvf_entries.get(x, '-')
                                 for x in coords[2::-1])
                           if reverse_strand is True
                           else tuple(mvf_entries.get(x, '-')
                                      for x in coords[0:3]))
                if all(len(x) == 1 for x in alleles):
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand is True:
                        decoded_alleles = tuple(tuple(MLIB.complement_bases[y]
                                                      for y in mvf.decode(x))
                                                for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x) for x in alleles)
                    amino_acids = tuple(translate_single_codon(''.join(x))
                                        for x in zip(*decoded_alleles))
                    # print("aminx", amino_acids)
                    amino_acids = outmvf.encode(''.join(amino_acids))
                # if all(x in '-X' for x in amino_acids):
                #    continue
                # print("amino", amino_acids)
                # print("translated", amino_acids, alleles)
                if args.output_data == 'protein':
                    entrybuffer.append((xcontig, coords[0], (amino_acids,)))
                else:
                    entrybuffer.append((
                        xcontigid, coords[0], (
                            amino_acids, alleles[0], alleles[1], alleles[2])))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
Ejemplo n.º 3
0
def translate_mvf(args):
    """Main method"""
    args.qprint("Running TranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff_genes, gene_order = parse_gff_exome(args)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.contig_data = dict(
         (
                i, dict((y, z)
                                       for (y, z) in gff_genes[x].items()
                                       if y not in ('cds', )))
                              for (i, x) in enumerate(gene_order))
    outmvf.contig_indices = list(range(len(gene_order)))
    outmvf.contig_ids = [gff_genes[x]['id']
                         for x in gene_order]
    outmvf.contig_labels = [gff_genes[x]['label']
                            for x in gene_order]
    outmvf.flavor = args.output_data
    outmvf.metadata.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        running_gene_index = -1
        for igene, gene in enumerate(gene_order):
            xcontiglabel = gff_genes[gene]['contig']
            xcontig = mvf.get_contig_indices(
                labels=gff_genes[gene]['contig'])
            if xcontig is None:
                print("Warning: contig {} not found".format(
                    gff_genes[gene]['contig']))
            xcontigid = mvf.get_contig_ids(indices=xcontig)[0]
            min_gene_coord = gff_genes[gene]['cds'][0][0]
            max_gene_coord = gff_genes[gene]['cds'][-1][1]
            mvf_entries = {}
            if not igene % 100:
                args.qprint("Processing gene {} on {}".format(
                    gene, xcontiglabel))
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                if pos < min_gene_coord:
                    continue
                if pos > max_gene_coord:
                    break
                mvf_entries[pos] = allelesets[0]
            reverse_strand = gff_genes[gene]['strand'] == '-'
            coords = []
            running_gene_index += 1
            for elem in gff_genes[gene]['cds']:
                coords.extend(list(range(elem[0], elem[1] + 1)))
            if reverse_strand:
                coords = coords[::-1]
            for codoncoord in range(0, len(coords), 3):
                alleles = tuple(mvf_entries.get(x, '-')
                                for x in coords[codoncoord:codoncoord + 3])
                if len(alleles) < 3:
                    alleles = tuple(list(alleles) + ['-'] * (3 - len(alleles)))
                if all(len(x) == 1 for x in alleles):
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand is True:
                        decoded_alleles = tuple(tuple(MLIB.complement_bases[y]
                                                      for y in mvf.decode(x))
                                                for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x) for x in alleles)
                    amino_acids = tuple(translate_single_codon(''.join(x))
                                        for x in zip(*decoded_alleles))
                    amino_acids = outmvf.encode(''.join(amino_acids))
                if args.output_data == 'protein':
                    entrybuffer.append((
                        (
                            xcontigid
                            if args.retain_contigs
                            else running_gene_index
                        ),
                        (
                            coords[codoncoord]
                            if args.retain_coords
                            else codoncoord
                        ),
                        (
                            amino_acids,
                        )
                    ))
                elif args.output_data == 'codon':
                    entrybuffer.append((
                        (
                            xcontigid
                            if args.retain_contigs
                            else running_gene_index
                        ),
                        (
                            coords[codoncoord]
                            if args.retain_coords
                            else codoncoord
                        ),
                        (
                            amino_acids,
                            alleles[0],
                            alleles[1],
                            alleles[2]
                        )
                    ))
                elif args.output_data == 'dna':
                    for j, elem in enumerate(
                            range(codoncoord,
                                  min(codoncoord + 3, len(coords)))):
                        entrybuffer.append((
                            (
                                xcontigid
                                if args.retain_contigs
                                else running_gene_index
                            ),
                            (
                                coords[elem]
                                if args.retain_coords
                                else elem + 1
                            ),
                            (
                                alleles[j],
                            )
                        ))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
        if entrybuffer:
            outmvf.write_entries(entrybuffer)
            entrybuffer = []
            nentry = 0
    return ''
Ejemplo n.º 4
0
def translate_mvf(args):
    """Main method"""
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        gff = parse_gff_translate(args.gff, args)
        if not args.quiet:
            print("gff_processed")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    entrybuffer = []
    nentry = 0
    if not args.gff:
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                for _, amino_acids, alleles in iter_codons(inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids, )))
                    else:
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids, alleles[0],
                                                   alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(inputbuffer, mvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append((current_contig, pos, (amino_acids, )))
                else:
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids, alleles[0],
                                               alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        mvf_entries = {}
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if contigid not in mvf_entries:
                mvf_entries[contigid] = {}
            mvf_entries[contigid][pos] = allelesets[0]
        for contigname in sorted(gff):
            contigid = mvf.get_contig_ids(labels=contigname)[0]
            for coords in sorted(gff[contigname]):
                reverse_strand = False
                if coords[3] == '-':
                    reverse_strand = True
                    alleles = [
                        mvf_entries[contigid].get(x, '-')
                        for x in coords[2::-1]
                    ]
                else:
                    alleles = [
                        mvf_entries[contigid].get(x, '-') for x in coords[0:3]
                    ]
                if all(len(x) == 1 for x in alleles):
                    if reverse_strand:
                        alleles = [MLIB.complement_bases[x] for x in alleles]
                    decoded_alleles = alleles
                    amino_acids = translate(''.join(alleles))[0]
                else:
                    if reverse_strand:
                        decoded_alleles = [[
                            MLIB.complement_bases[y] for y in mvf.decode(x)
                        ] for x in alleles]
                        alleles = [
                            mvf.encode(''.join(x)) for x in decoded_alleles
                        ]
                    else:
                        decoded_alleles = [mvf.decode(x) for x in alleles]
                    amino_acids = [
                        translate(''.join(x)) for x in zip(*decoded_alleles)
                    ]
                    amino_acids = mvf.encode(''.join(
                        [x[0] for x in amino_acids]))
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append((contigid, coords[0], (amino_acids, )))
                else:
                    entrybuffer.append(
                        (contigid, coords[0], (amino_acids, alleles[0],
                                               alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''