def get_1001pseudogenome_by_range(accs,
                                  chrom,
                                  start,
                                  end,
                                  out_dir,
                                  ref_file=data_path,
                                  num_col=0,
                                  name_col=1,
                                  delim='',
                                  header=True):
    """Write the pseudogenome sequences of ``accs`` for a chromosome range.

    The output file is ``chr<chrom>_p<start>-<end>.fasta`` inside ``out_dir``.
    ``chrom`` is prefixed with the module-level ``chrom_pref`` before being
    handed to ``get_1001pseudogenome_raw``; ``ref_file``, ``num_col``,
    ``name_col``, ``delim`` and ``header`` are passed through unchanged.
    Nothing is returned; the output path is printed on completion.
    """
    out_name = "chr{}_p{}-{}.fasta".format(chrom, start, end)
    fasta_out = os.path.join(out_dir, out_name)
    passthrough = {
        "ref_file": ref_file,
        "num_col": num_col,
        "name_col": name_col,
        "delim": delim,
        "header": header,
    }
    get_1001pseudogenome_raw(accs, chrom_pref + chrom, start, end, fasta_out,
                             **passthrough)
    ## read the file back and rewrite it in place so the line width is
    ## whatever dict_to_fasta emits
    from fasta_manip import dict_to_fasta, fasta_to_dict
    records = fasta_to_dict(fasta_out)
    dict_to_fasta(records, fasta_out)
    print("Sequences were successfully written to: {}".format(fasta_out))
def get_domain_seqs(domain_f,
                    fasta,
                    fout,
                    header=[],
                    domain='',
                    adj_dir=False):
    """Cut domain hits out of ``fasta`` and write them to ``fout``.

    Args:
        domain_f: tab-separated file whose first row is a header. The columns
            read are "contig", "accID", "hit.start", "hit.end" and, when
            ``adj_dir`` is set, "strand".
        fasta: FASTA file whose record names match the "contig" column.
        fout: path of the FASTA file to write.
        header: unused by this function (kept for interface compatibility).
        domain: label inserted into every output sequence name.
        adj_dir: if true, hits on the '-' strand are reverse-complemented and
            their names get a trailing "revcomp" field.

    Output names are '|'-joined: contig, accID, ``domain``, a 1-based row
    counter, "<hit.start>-<hit.end>", and optionally "revcomp".
    """
    ## get domain ranges
    with open(domain_f, 'r') as f:
        ## strip only the newline: slicing off the last character would
        ## truncate the final field when the file lacks a trailing newline
        domain_dat = [line.rstrip('\n').split('\t') for line in f]
    get = make_custom_get(domain_dat[0])
    domain_dat = domain_dat[1:]
    ## parse fasta file
    from fasta_manip import fasta_to_dict
    seqs = fasta_to_dict(fasta)
    ## get sequences
    output = {}
    for num, entry in enumerate(domain_dat, start=1):
        ## note: domain_dat is 1-indexed, start and end inclusive, but Python
        ## slicing is 0-indexed
        ## (assumes make_custom_get yields numeric values for these columns
        ## -- TODO confirm)
        seq = seqs[get(entry, "contig")][get(entry, "hit.start") -
                                         1:get(entry, "hit.end")]
        is_revcomp = adj_dir and get(entry, "strand") == '-'
        if is_revcomp:
            seq = seq.reverse_complement()
        name_fields = (
            get(entry, "contig", "accID") + [domain, num] +
            ['-'.join(str(x) for x in get(entry, "hit.start", "hit.end"))] +
            (["revcomp"] if is_revcomp else []))
        output['|'.join(str(x) for x in name_fields)] = seq
    from fasta_manip import dict_to_fasta
    dict_to_fasta(output, fout)
def get_ref_raw(chrom,
                start,
                end,
                fasta_out,
                encoding="utf-8",
                ref_fasta_files=ref_fasta):
    """Write the reference slice ``[start:end]`` of ``chrom`` to ``fasta_out``.

    ``ref_fasta_files`` maps a chromosome key to a FASTA path; only the first
    record of that file is used. ``start``/``end`` are applied as a 0-indexed
    Python slice, while the record name reports the range as
    ``Col-0_ref|<chrom>:<start + 1>..<end>``. ``encoding`` is not used here.
    """
    from fasta_manip import fasta_to_dict
    chrom_records = fasta_to_dict(ref_fasta_files[chrom])
    first_record = list(chrom_records.values())[0]
    region = first_record[start:end]  ## 0-indexed
    from fasta_manip import dict_to_fasta
    record_name = "Col-0_ref|{}:{}..{}".format(chrom, start + 1, end)
    dict_to_fasta({record_name: region}, fasta_out)
Exemple #4
0
def calculate_pi(
    domain,
    IGNORE_GAPS,
    GAP_CHAR='-',
    preview=False,
    cds_only=False,
    in_all_ref=False,
    prefix="nlr165_AL70",
    fout_dir='',
    make_seq_fname=lambda domain:
    f"/mnt/chaelab/rachelle/hap_tags/results/nlr165/alignment/nlr165_col0-AL70-Alyrata_{domain}_mafft.fa",
    make_pred_fname=lambda domain:
    f"/mnt/chaelab/rachelle/hap_tags/results/nlr165/predicted_identity/nlr165_AL70_{domain}_predictedIdentity.txt"
):
    """Compute a per-gene pairwise-difference statistic (pi) and write a TSV.

    For every gene, all pairs of aligned sequences assigned to it are
    compared column by column; each pair contributes
    ``(1 / n_seqs**2) * n_differences`` and the sum is doubled. Three values
    are reported per gene: the raw sum ("pi"), the sum divided by the
    alignment length ("pi/alnlen"), and the sum of per-pair contributions
    each divided by that pair's own length ("pi/pairalnlen").

    Args:
        domain: passed to ``make_seq_fname`` / ``make_pred_fname`` and used
            in the output file name.
        IGNORE_GAPS: if true, each pair is first passed through
            ``remove_incomplete_pos`` with ``GAP_CHAR``.
        GAP_CHAR: gap character handed to the alignment-trimming helpers.
        preview: if true, print the result rows before writing.
        cds_only: if true, trim the alignment to the gene's ``Col-0_ref``
            sequences (those without "complete" in their name) via
            ``trim_alignment_to_seqs``; genes without such a reference are
            skipped.
        in_all_ref: forwarded to ``trim_alignment_to_seqs``.
        prefix: leading part of the output file name.
        fout_dir: directory for the output file; '' means the current
            directory.
        make_seq_fname: maps ``domain`` to the alignment FASTA path.
        make_pred_fname: maps ``domain`` to a tab-separated file with a
            header row; columns "gene", "cluster" (both ';'-separated) and
            "contig" are read.

    Genes with no sequences, or with a zero-length alignment, are skipped
    because the statistic is undefined for them.
    """
    import itertools
    import os

    dat_seqs = fasta_to_dict(make_seq_fname(domain))
    with open(make_pred_fname(domain), 'r') as f:
        ## strip only the newline so a file without a trailing newline does
        ## not lose the last character of its final field
        id_dat = [line.rstrip('\n').split('\t') for line in f]

    get = make_custom_get(id_dat[0])
    ids = id_dat[1:]
    ## the i-th gene of a row is paired with the i-th cluster of that row
    gene_cluster_pairs = set(
        itertools.chain.from_iterable(
            zip(get(x, "gene").split(';'),
                get(x, "cluster").split(';')) for x in ids))
    gene_cluster = {pair[0]: pair for pair in gene_cluster_pairs}
    genes = set(gene_cluster.keys())

    pi_raw = {}
    pi_totalAlnLen = {}
    pi_pairAlnLen = {}

    for i, gene in enumerate(genes):
        print(i + 1, gene)

        ## get sequences assigned to the gene (exact match against the
        ## ';'-separated gene list, not a substring test)
        seq_ids = set(
            get(entry, "contig") for entry in ids
            if gene in get(entry, "gene").split(';'))
        seqs_raw = {
            seq_id: seq
            for seq_id, seq in dat_seqs.items() if seq_id in seq_ids
        }

        ## remove empty positions
        if cds_only:
            ref_names = tuple(
                seq_id for seq_id in dat_seqs.keys()
                if f"Col-0_ref|{gene}" in seq_id and "complete" not in seq_id)
            if not ref_names:
                continue
            from fasta_manip import trim_alignment_to_seqs
            ref_seqs = {
                seq_id: seq
                for seq_id, seq in dat_seqs.items() if seq_id in ref_names
            }
            trimmed = trim_alignment_to_seqs(seqs={**seqs_raw, **ref_seqs},
                                             gap_char=GAP_CHAR,
                                             write=False,
                                             ref_names=ref_names,
                                             in_all_ref=in_all_ref)
            seqs = [
                seq for seq_id, seq in trimmed.items()
                if "Col-0_ref" not in seq_id
            ]
        else:
            seqs = remove_empty_pos(list(seqs_raw.values()),
                                    empty_char=GAP_CHAR)

        ## calculate pi while accounting for every pairwise alignment length
        n_seqs = len(seqs)
        if n_seqs == 0 or len(seqs[0]) == 0:
            ## nothing to compare: would otherwise divide by zero below
            continue
        freq_constant = 1 / (n_seqs * n_seqs)
        raw_sum = 0
        pair_len_sum = 0
        for seq_a, seq_b in itertools.combinations(seqs, 2):
            if IGNORE_GAPS:
                seq_i, seq_j = remove_incomplete_pos([seq_a, seq_b],
                                                     empty_char=GAP_CHAR)
            else:
                seq_i, seq_j = seq_a, seq_b
            if len(seq_i) == 0:
                continue
            n_diff = sum(1 for pos in range(len(seq_i))
                         if seq_i[pos] != seq_j[pos])
            curr_pi = freq_constant * n_diff
            raw_sum += curr_pi
            pair_len_sum += curr_pi / len(seq_i)

        ## throw in the constants
        pi_raw[gene] = raw_sum * 2
        pi_totalAlnLen[gene] = pi_raw[gene] / len(seqs[0])
        pi_pairAlnLen[gene] = pair_len_sum * 2

    pi_header = ["gene", "cluster", "pi", "pi/alnlen", "pi/pairalnlen"]
    pi_get = make_custom_get(pi_header)
    pi_dat = [
        gene_cluster[gene] +
        (pi_raw[gene], pi_totalAlnLen[gene], pi_pairAlnLen[gene])
        for gene in pi_raw
    ]

    ## optional preview: rows grouped by cluster, then rows ordered by
    ## pi/alnlen
    if preview:
        for cluster in set(get(ids, "cluster")):
            entries = sorted(entry for entry in pi_dat
                             if pi_get(entry, "cluster") == cluster)
            for entry in entries:
                print('\t'.join(str(x) for x in entry))

        for entry in sorted(pi_dat, key=lambda x: pi_get(x, "pi/alnlen")):
            print('\t'.join(str(x) for x in entry))

    ## write
    gaps_tag = 'gapsExc' if IGNORE_GAPS else 'gapsInc'
    cds_tag = 'CDSonly' if cds_only else 'CDScomplete'
    ## os.path.join keeps the default fout_dir='' relative instead of
    ## producing a path at the filesystem root
    out_path = os.path.join(
        fout_dir, f"{prefix}_{domain}_{gaps_tag}_{cds_tag}_pi.tsv")
    rows = [pi_header] + sorted(pi_dat, key=lambda x: pi_get(x, "gene"))
    with open(out_path, "w+") as f:
        f.write('\n'.join('\t'.join(str(y) for y in x) for x in rows))
def get_ref_by_gene(gene,
                    feature,
                    out_dir,
                    bed=bed_path,
                    encoding="utf-8",
                    ref_fasta_files=ref_fasta,
                    complete=False,
                    domain="",
                    domain_f="",
                    start_inc=True,
                    end_inc=True,
                    merge=False,
                    translate=False,
                    adj_dir=False,
                    by_gene=False,
                    **for_get_domain_in_genome_coords):
    """Write the reference sequence(s) of ``gene`` to a FASTA in ``out_dir``.

    Bed rows for ``gene``/``feature`` are fetched with ``grep_bedmerge``; one
    temporary FASTA is produced per isoform and the files are concatenated
    into ``<gene>_ref_<feature>[...].fasta``. If no bed rows are found the
    function returns without writing; if rows exist but no isoform yields a
    sequence, an empty output file is created.

    Args:
        gene: gene identifier looked up in ``bed``.
        feature: bed feature to extract; forced to "CDS" when ``domain_f``
            is given.
        out_dir: output directory.
        bed, encoding, merge: forwarded to ``grep_bedmerge``.
        ref_fasta_files: forwarded to ``get_ref_raw``.
        complete: keep the whole min-start..max-end span instead of only the
            feature ranges.
        domain: domain label used in file and sequence names ("domain" if
            empty).
        domain_f: domain table; when set, only domain ranges returned by
            ``get_domain_in_genome_coords`` are extracted.
        start_inc, end_inc: forwarded to ``get_domain_in_genome_coords``.
        translate: translate the sequence (only done for CDS without
            ``complete``; otherwise a message is printed).
        adj_dir: reverse-complement '-' strand sequences and tag them
            "|revcomp".
        by_gene: collapse sequences with overlapping ranges into one record
            per merged range.
        **for_get_domain_in_genome_coords: only "qname_dname" and
            "qstart_qend" are forwarded.
    """
    if domain_f:
        feature = "CDS"
    data = grep_bedmerge(gene, bed, feature, encoding, out_dir,
                         merge=merge)["data"]
    ## extract sequences from fasta file
    if not data:
        return
    ## imported once here so the final by_gene section does not depend on
    ## imports that were previously executed inside the loops
    from fasta_manip import dict_to_fasta, extract_ranges, fasta_to_dict
    chrom = data[0][0]
    start = min(int(x[1]) for x in data)
    end = max(int(x[2]) for x in data)
    strand = data[0][3]
    if feature == "gene":
        isoforms = {gene: data}
    else:
        ## raw string: the pattern contains \. and \d
        isoforms = {
            re.search(r"=(.+\.\d+)(?=[,;]|$)", isoform).group(1):
            [x for x in data if x[-1] == isoform]
            for isoform in set(x[-1] for x in data)
        }
    domain_label = "domain" if not domain else domain
    fasta_out_l = []
    seq_ranges = {}
    ## iterate through isoforms
    for isoform, isoform_dat in isoforms.items():
        ## if extracting only domain-specific range
        if domain_f:
            domain_data = get_domain_in_genome_coords(
                gene,
                domain,
                domain_f,
                out_dir,
                bed=bed,
                encoding=encoding,
                isoform=isoform,
                start_inc=start_inc,
                end_inc=end_inc,
                **{
                    k: v
                    for k, v in for_get_domain_in_genome_coords.items()
                    if k in ["qname_dname", "qstart_qend"]
                })
            if not domain_data:
                continue
        else:
            domain_data = [(start, end)]
        ## get fasta file of sequence data
        fasta_out = os.path.join(
            out_dir, isoform + "_ref_" + feature +
            ("_complete" if complete else '') +
            (('_' + domain_label) if domain_f else '') +
            ("_protein" if (translate and (feature == "CDS")) else '') +
            ".fasta")
        get_ref_raw(chrom,
                    start,
                    end,
                    fasta_out,
                    encoding=encoding,
                    ref_fasta_files=ref_fasta_files)
        ## the raw span is the same for every domain range: read it once
        full_ref_seq = list(fasta_to_dict(fasta_out).values())[0]
        seqs_to_write = {}
        for i, domain_range in enumerate(domain_data):
            ref_seq = full_ref_seq
            d_start, d_end = domain_range
            ranges = [(d_start, d_end)]
            ## trim sequence if complete flag not raised or if domain required
            if (not complete) or domain_f:
                if complete and domain_f and d_start and d_end:
                    ranges = [(max(start, d_start) - start,
                               min(end, d_end) - start)]
                elif domain_f and d_start and d_end:
                    ranges = [(max(int(x[1]), d_start) - start,
                               min(int(x[2]), d_end) - start)
                              for x in isoform_dat
                              if has_overlap((int(x[1]), int(x[2])),
                                             (d_start, d_end))]
                else:
                    ranges = [(int(x[1]) - start, int(x[2]) - start)
                              for x in isoform_dat]
                ref_seq = extract_ranges(ref_seq, ranges)
            if (adj_dir or translate) and strand == '-':
                ref_seq = ref_seq.reverse_complement()
            ## translate sequence if translate flag raised AND feature is CDS
            if translate:
                if feature == "CDS" and not complete:
                    ref_seq = ref_seq.translate(to_stop=True)
                else:
                    print(
                        "Translation is only possible when the selected feature is 'CDS' and the flag 'complete' is not raised."
                    )
            seq_name = "Col-0_ref|{}|{}|{}".format(gene, feature, isoform) + \
                       (('|' + domain_label + f"|{i+1}") if domain_f else '') + \
                       ("|complete" if complete else '') + \
                       ("|revcomp" if adj_dir and strand == '-' else '')
            seqs_to_write[seq_name] = ref_seq
            ## log which sequence names cover which ranges; with by_gene,
            ## previously logged entries that overlap are folded into one
            overlap_seq_names = []
            if by_gene:
                overlap_ranges = []
                for logged_ranges, logged_seq_names in seq_ranges.items():
                    if has_any_overlap(ranges, logged_ranges):
                        overlap_ranges.append(logged_ranges)
                        overlap_seq_names.extend(logged_seq_names)
                if overlap_ranges:
                    for logged_ranges in overlap_ranges:
                        del seq_ranges[logged_ranges]
                    ranges = merge_ranges(*overlap_ranges)
            range_key = tuple(sorted(ranges))
            seq_ranges[range_key] = seq_ranges.get(
                range_key, []) + [seq_name] + overlap_seq_names
        dict_to_fasta(seqs_to_write, fasta_out)
        fasta_out_l.append(fasta_out)
    fasta_out_final = os.path.join(
        out_dir, gene + "_ref_" + feature +
        ("_complete" if complete else '') +
        (('_' + domain_label) if domain_f else '') +
        ("_protein" if
         (translate and (feature == "CDS") and not complete) else '') +
        ".fasta")
    if not fasta_out_l:
        ## nothing was extracted: leave an empty file behind
        with open(fasta_out_final, "w+") as f:
            f.write('')
        print("{} is an empty file".format(fasta_out_final))
        return
    ## when the single per-isoform file already is the final file there is
    ## nothing to concatenate (and it must not be deleted)
    if fasta_out_l[0] != fasta_out_final:
        from file_manip import cat_files
        cat_files(sorted(fasta_out_l), fasta_out_final)
        for fasta_out in fasta_out_l:
            os.remove(fasta_out)
    if by_gene:
        ## keep one record per logged range set, named after the first
        ## sequence that covered it
        isoform_seqs = fasta_to_dict(fasta_out_final)
        final_seqs = {}
        for i, (ranges, seq_names) in enumerate(sorted(seq_ranges.items())):
            seq_name_l = seq_names[0].split('|')
            seq_name_l[3] = ','.join(f'{r[0]}-{r[1]}' for r in ranges)
            if domain_f:
                seq_name_l[5] = str(i + 1)
            final_seqs['|'.join(seq_name_l)] = isoform_seqs[seq_names[0]]
        dict_to_fasta(final_seqs, fasta_out_final)
    print("Sequences were successfully written to: {}".format(
        fasta_out_final))
def get_1001pseudogenome_by_gene(accs,
                                 gene,
                                 feature,
                                 out_dir,
                                 bed=bed_path,
                                 encoding="utf-8",
                                 ref_file=data_path,
                                 num_col=0,
                                 name_col=1,
                                 delim='',
                                 header=True,
                                 complete=False,
                                 domain="",
                                 domain_f="",
                                 merge=False,
                                 adj_dir=False,
                                 start_inc=True,
                                 end_inc=True,
                                 **for_get_domain_in_genome_coords):
    """Write the pseudogenome sequences of ``accs`` for ``gene`` to a FASTA.

    Bed rows for ``gene``/``feature`` are fetched with ``grep_bedmerge``; one
    temporary FASTA is produced per isoform and the files are concatenated
    into ``<gene>_<feature>[...].fasta`` in ``out_dir``. Returns without
    writing if no bed rows are found.

    Args:
        accs: accessions, passed to ``get_1001pseudogenome_raw`` and
            ``raw_accs_to_id``.
        gene: gene identifier looked up in ``bed``.
        feature: bed feature to extract.
        out_dir: output directory.
        bed, encoding, merge: forwarded to ``grep_bedmerge``.
        ref_file, num_col, name_col, delim, header: forwarded to
            ``get_1001pseudogenome_raw`` and ``raw_accs_to_id``.
        complete: keep the whole min-start..max-end span instead of only the
            feature ranges; tags names with "|complete".
        domain: domain label used in file and sequence names ("domain" if
            empty).
        domain_f: domain table; when set, only the domain range returned by
            ``get_domain_in_genome_coords`` is extracted.
        adj_dir: reverse-complement '-' strand sequences and tag them
            "|revcomp".
        start_inc, end_inc: forwarded to ``get_domain_in_genome_coords``.
        **for_get_domain_in_genome_coords: only "qname_dname" and
            "qstart_qend" are forwarded.
    """
    data = grep_bedmerge(gene, bed, feature, encoding, out_dir,
                         merge=merge)["data"]
    ## extract sequences
    if not data:
        ## nothing annotated for this gene/feature: data[0] would fail below
        return
    from fasta_manip import dict_to_fasta, extract_ranges, fasta_to_dict
    chrom = data[0][0]
    start = min(int(x[1]) for x in data)
    end = max(int(x[2]) for x in data)
    strand = data[0][3]
    print(data[0], chrom, start, end)
    if feature == "gene":
        isoforms = {gene: data}
    else:
        ## raw string: the pattern contains \. and \d
        isoforms = {
            re.search(r"=(.+\.\d+)(?=[,;]|$)", isoform).group(1):
            [x for x in data if x[-1] == isoform]
            for isoform in set(x[-1] for x in data)
        }
    domain_label = "domain" if not domain else domain
    ## resolve accession ids once, under a separate name, so ``accs`` is not
    ## overwritten between isoforms
    acc_ids = raw_accs_to_id(accs,
                             ref_file=ref_file,
                             num_col=num_col,
                             name_col=name_col,
                             delim=delim,
                             header=header)
    fasta_out_l = []
    ## iterate through isoforms
    for isoform, isoform_dat in isoforms.items():
        ## if extracting only domain-specific range
        if domain_f:
            ## NOTE(review): get_ref_by_gene treats this helper's return value
            ## as a list of (start, end) pairs, whereas here it is unpacked as
            ## a single pair -- confirm which shape the helper returns.
            d_start, d_end = get_domain_in_genome_coords(
                gene,
                domain,
                domain_f,
                out_dir,
                bed=bed,
                encoding=encoding,
                isoform=isoform,
                start_inc=start_inc,
                end_inc=end_inc,
                **{
                    k: v
                    for k, v in for_get_domain_in_genome_coords.items()
                    if k in ["qname_dname", "qstart_qend"]
                })
            if (not d_start) or (not d_end):
                continue
        ## get fasta file of sequence data
        fasta_out = os.path.join(
            out_dir, isoform + '_' + feature +
            ("_complete" if complete else '') +
            (('_' + domain_label) if domain_f else '') + ".fasta")
        get_1001pseudogenome_raw(accs,
                                 chrom,
                                 start + 1,
                                 end,
                                 fasta_out,
                                 ref_file=ref_file,
                                 num_col=num_col,
                                 name_col=name_col,
                                 delim=delim,
                                 header=header)
        ## rename seqs + trim if required
        tmp_seqs = fasta_to_dict(fasta_out)
        ## both suffixes are independent; "|complete" must not be dropped
        ## when a domain is requested
        name_suffix = (('|' + domain_label) if domain_f else '') + \
                      ("|complete" if complete else '')
        seqs = {}
        for raw_name, seq in tmp_seqs.items():
            ## the 4th '|'-separated field of the raw name is used as the
            ## accession id
            acc_id = raw_name.split('|')[3]
            seq_name = "{}|{}|{}|{}|{}".format(acc_id, acc_ids[acc_id], gene,
                                               feature, isoform) + name_suffix
            seqs[seq_name] = seq
        if (not complete) or domain_f:
            if complete and domain_f and d_start and d_end:
                ranges = [(max(start, d_start) - start,
                           min(end, d_end) - start)]
            elif domain_f and d_start and d_end:
                ranges = [(max(int(x[1]), d_start) - start,
                           min(int(x[2]), d_end) - start)
                          for x in isoform_dat
                          if has_overlap((int(x[1]), int(x[2])),
                                         (d_start, d_end))]
            else:
                ranges = [(int(x[1]) - start, int(x[2]) - start)
                          for x in isoform_dat]
            seqs = {k: extract_ranges(seq, ranges) for k, seq in seqs.items()}
        if adj_dir and strand == '-':
            seqs = {
                k + "|revcomp": seq.reverse_complement()
                for k, seq in seqs.items()
            }
        dict_to_fasta(seqs, fasta_out)
        fasta_out_l.append(fasta_out)
    fasta_out_final = os.path.join(
        out_dir, gene + '_' + feature + ("_complete" if complete else '') +
        (('_' + domain_label) if domain_f else '') + ".fasta")
    ## when the single per-isoform file already is the final file there is
    ## nothing to concatenate (and it must not be deleted)
    if fasta_out_l and fasta_out_l[0] != fasta_out_final:
        from file_manip import cat_files
        cat_files(sorted(fasta_out_l), fasta_out_final)
        for fasta_out in fasta_out_l:
            os.remove(fasta_out)
    print("Sequences were successfully written to: {}".format(fasta_out_final))
Exemple #7
0
def get_cds(fout,
            fasta,
            bed,
            domain_f='',
            domain='',
            complete=False,
            adjust_dir=False,
            translate=False,
            protein_id_field="protein_id",
            domain_pid_f=lambda x: x,
            **kwargs):
    """Extract CDS sequences per isoform from ``fasta`` and write to ``fout``.

    CDS rows come from ``grep_bedmerge(bed, fasta, "CDS")`` and are grouped
    by the value of ``<protein_id_field>=...`` in their name column.

    Args:
        fout: output FASTA path (its directory is also handed to
            ``get_domain_in_genome_coords``).
        fasta: reference FASTA; record names are cut at the first space.
        bed: annotation passed to ``grep_bedmerge``.
        domain_f: tab-separated domain table with a header row; columns
            "domain" and "qseqid" are read here. Used only together with
            ``domain``.
        domain: domain name to restrict to; output names gain
            ``|<domain>|<n>``.
        complete: collapse each isoform to one min-start..max-end range and
            tag names with "|complete".
        adjust_dir: reverse-complement '-' strand sequences (tagged
            "|revcomp").
        translate: translate sequences; ignored when ``complete`` is set.
        protein_id_field: attribute name that identifies the isoform.
        domain_pid_f: maps a "qseqid" value to an isoform identifier.
        **kwargs: forwarded to ``get_domain_in_genome_coords``.
    """
    import itertools

    ## extract isoform ranges
    data = grep_bedmerge(bed, fasta, "CDS")
    data.sort(key=lambda x: x[-1])
    get = make_custom_get(["chrom", "start", "end", "strand", "phase", "name"])

    def isoform_of(entry):
        ## text between "<protein_id_field>=" and the next ';' in the name
        return split_extract(
            split_extract(get(entry, "name"), protein_id_field + '=', 1), ';',
            0)

    ## extract domain ranges
    if domain and domain_f:
        with open(domain_f) as f:
            ## strip only the newline so a file without a trailing newline
            ## does not lose the last character of its final field
            domain_data = [line.rstrip('\n').split('\t') for line in f]
        domain_get = make_custom_get(domain_data[0])
        domain_data = [
            x for x in domain_data[1:] if domain_get(x, "domain") == domain
        ]
        domain_seqs = set(
            domain_pid_f(x) for x in domain_get(domain_data, "qseqid"))
        data = [x for x in data if isoform_of(x) in domain_seqs]

    ## group consecutive rows by isoform; an empty ``data`` simply yields no
    ## isoforms instead of failing on data[0]
    isoforms_dat = {}
    for isoform, entries in itertools.groupby(data, key=isoform_of):
        isoform_dat = [
            get(entry, "chrom", "start", "end", "strand", "phase")
            for entry in entries
        ]
        ## if complete, get min, max of combined CDS in the isoform
        ## (applied to every isoform, including the last one)
        if complete:
            collapsed = isoform_dat[0]
            collapsed[1] = min(get(isoform_dat, "start"))
            collapsed[2] = max(get(isoform_dat, "end"))
            isoform_dat = [collapsed]
        isoforms_dat[isoform] = isoform_dat

    ## restrict each isoform's CDS ranges to the requested domain ranges
    if domain_f and domain:
        updated_isoforms_dat = {}
        for isoform in domain_seqs:
            if isoform not in isoforms_dat:
                continue
            isoform_dat = isoforms_dat[isoform]
            chrom, strand = get(isoform_dat[0], "chrom", "strand")
            domain_ranges = get_domain_in_genome_coords(isoform,
                                                        domain,
                                                        domain_f,
                                                        os.path.dirname(fout),
                                                        qname_dname=("qseqid",
                                                                     "domain"),
                                                        qstart_qend=("qstart",
                                                                     "qend"),
                                                        bed=bed,
                                                        fasta=fasta,
                                                        **kwargs)
            for i, domain_dat in enumerate(domain_ranges):
                d_start, d_end = domain_dat
                ## clip every overlapping CDS range to the domain range
                domain_cds = [[
                    chrom,
                    max(min(x), d_start),
                    min(max(x), d_end), strand
                ] for x in get(isoform_dat, "start", "end")
                              if has_overlap(x, (d_start, d_end))]
                updated_isoforms_dat[isoform + f"|{domain}|{i+1}"] = domain_cds
        isoforms_dat = updated_isoforms_dat

    ## extract sequences
    ref_seqs = {
        split_extract(k, ' ', 0): v
        for k, v in fasta_to_dict(fasta).items()
    }
    isoforms_seq = {}
    for isoform, isoform_dat in isoforms_dat.items():
        if not isoform_dat:
            continue
        chrom, strand = get(isoform_dat[0], "chrom", "strand")
        isoform_seq = extract_ranges(ref_seqs[str(chrom)],
                                     get(isoform_dat, "start", "end"))
        do_translate = translate and not complete
        is_revcomp = (adjust_dir or do_translate) and strand == '-'
        if is_revcomp:
            isoform_seq = isoform_seq.reverse_complement()
        if do_translate:
            isoform_seq = isoform_seq.translate()
        seq_name = (isoform + ("|revcomp" if is_revcomp else '') +
                    ("|complete" if complete else ''))
        isoforms_seq[seq_name] = isoform_seq
    dict_to_fasta(isoforms_seq, fout)