Example #1
0
def concatenate_fastas(fns, fn_out, remove_gaps):
    """Concatenate per-strain sequences from several fasta files into one.

    The headers of the first file in ``fns`` define both the set of strains
    and the order in which they are written.  Every file is then read in
    turn and each of its sequences is appended to the running sequence
    stored under its header.

    Args:
        fns: non-empty sequence of fasta file names, in concatenation order.
        fn_out: path of the fasta file to write.
        remove_gaps: accepted for interface compatibility; not used here.

    Raises:
        IndexError: if ``fns`` is empty.
        KeyError: if a later file contains a header absent from the first.
    """
    strains = read_fasta.read_fasta(fns[0])[0]
    concat_seqs = dict.fromkeys(strains, '')
    for fn in fns:
        headers, seqs = read_fasta.read_fasta(fn)
        for i, seq in enumerate(seqs):
            concat_seqs[headers[i]] += seq
    # the context manager closes the output even if a write raises
    with open(fn_out, 'w') as f:
        for strain in strains:
            f.write(strain + '\n')
            f.write(concat_seqs[strain] + '\n')
def test_read_fasta_empty(mocker):
    """Check read_fasta on a header-only file and on a zero-length file."""
    # a lone '>' line is expected to give one header and one empty sequence
    header_only = StringIO('>\n')
    open_mock = mocker.patch('misc.read_fasta.open', return_value=header_only)
    names, sequences = read_fasta('mocked')

    assert names == ['>']
    assert sequences.tolist() == [[]]
    open_mock.assert_called_with('mocked', 'r')

    # a completely empty file currently surfaces as an IndexError
    nothing = StringIO('')
    open_mock = mocker.patch('misc.read_fasta.open', return_value=nothing)
    # TODO probably handle empty files better
    with pytest.raises(IndexError):
        names, sequences = read_fasta('mocked')
Example #3
0
def test_read_fasta_empty(mocker):
    """Exercise read_fasta with degenerate inputs.

    First a file holding only a bare '>' header line, then a file with no
    content at all (which is expected to raise IndexError for now).
    """
    patch_target = 'misc.read_fasta.open'

    bare_header = StringIO('>\n')
    patched_open = mocker.patch(patch_target, return_value=bare_header)
    hdrs, arr = read_fasta('mocked')

    assert hdrs == ['>']
    assert arr.tolist() == [[]]
    patched_open.assert_called_with('mocked', 'r')

    no_content = StringIO('')
    patched_open = mocker.patch(patch_target, return_value=no_content)
    # TODO probably handle empty files better
    with pytest.raises(IndexError):
        hdrs, arr = read_fasta('mocked')
Example #4
0
def test_read_fasta_multi(mocker):
    """Check read_fasta on a file holding three multi-line records.

    The text before the first header is expected to be skipped and the
    lines of each record joined into a single row of characters.
    """
    contents = StringIO('''
not read
> headseq headfname.fa
actg
---
atcg
> headseq headfname.fa
actg
actg
---
> headseq2 headfname.fa
actg-
cataaa
''')
    open_mock = mocker.patch('misc.read_fasta.open', return_value=contents)
    headers, seqs = read_fasta('mocked')

    expected_headers = [
        '> headseq headfname.fa',
        '> headseq headfname.fa',
        '> headseq2 headfname.fa',
    ]
    assert headers == expected_headers

    print(seqs)
    expected_seqs = np.array([
        list('actg---atcg'),
        list('actgactg---'),
        list('actg-cataaa'),
    ])
    assert seqs == approx(expected_seqs)
    open_mock.assert_called_with('mocked', 'r')
Example #5
0
def get_range_seqs(strains, chrm, start, end, tag, gp_dir='../'):
    """Extract, for every strain, the sequence matching a reference range.

    For each ``(strain, directory)`` pair the strain's chromosome sequence is
    read, the reference coordinates ``start`` and ``end`` are translated to
    strain coordinates, and the corresponding slice is returned.

    The translation table comes from the strain's site summary file; when
    that file does not exist it is derived from the reference alignment.

    Returns:
        dict mapping strain -> (sequence slice, start_strain, end_strain),
        where the slice includes both end points.
    """
    # TODO this shouldn't actually be dependent on tag

    seqs_by_strain = {}
    for strain, strain_dir in strains:
        print(strain)
        chrm_fn = strain_dir + strain + '_chr' + chrm + gp.fasta_suffix
        chrm_seq = read_fasta.read_fasta(chrm_fn)[1][0]

        summary_fn = (gp.analysis_out_dir_absolute + tag + '/' +
                      'site_summaries/predictions_' + strain + '_chr' +
                      chrm + '_site_summary.txt.gz')
        try:
            coords, _ = read_table.read_table_columns(summary_fn, '\t')
        except FileNotFoundError:
            # for par reference which doesn't have site summary file
            align_fn = (gp_dir + gp.alignments_dir +
                        '_'.join(gp.alignment_ref_order) + '_chr' + chrm +
                        '_mafft' + gp.alignment_suffix)
            coords = get_inds_from_alignment(align_fn, True)

        ref_to_strain = dict(zip(coords['ps_ref'], coords['ps_strain']))

        # round inwards so the strain range stays inside the reference range
        first = int(math.ceil(float(ref_to_strain[str(start)])))
        last = int(math.floor(float(ref_to_strain[str(end)])))

        seqs_by_strain[strain] = (chrm_seq[first:last + 1], first, last)
    return seqs_by_strain
def test_filter_ambiguous_on_region_10805(filterer, mocker):
    """Run filter_ambiguous on the r10805.fa data file at two thresholds.

    The data file lives next to this test module; when it is missing the
    test only emits a warning.
    """
    fa = os.path.join(os.path.dirname(__file__), 'r10805.fa')

    if not os.path.exists(fa):
        warnings.warn('Unable to test with datafile r10805.fa')
        return

    state_names = ('S288c', 'CBS432', 'N_45', 'DBVPG6304', 'UWOPS91_917_1')

    _, seqs = read_fasta.read_fasta(fa, gz=False)
    # the last sequence in the file is not passed to the filter
    seqs = seqs[:-1]

    # threshold 0.1
    region = {'predicted_species': 'N_45'}
    passed, _ = filterer.filter_ambiguous(
        region, seqs, 0.1, list(state_names))
    assert passed is False
    assert region['alternative_states'] == (
        'CBS432,N_45,UWOPS91_917_1,DBVPG6304')
    assert region['alternative_ids'] == (
        '0.9983805668016195,0.994331983805668,'
        '0.9642857142857143,0.9618506493506493')
    assert region['alternative_P_counts'] == '145,143,128,129'

    # threshold 0.98
    region = {'predicted_species': 'N_45'}
    passed, _ = filterer.filter_ambiguous(
        region, seqs, 0.98, list(state_names))
    assert passed is False
    assert region['alternative_states'] == 'CBS432,N_45'
    assert region['alternative_ids'] == (
        '0.9983805668016195,0.994331983805668')
    assert region['alternative_P_counts'] == '145,143'
    def get_indices(self, chromosome: str, strain: str) -> Tuple:
        '''
        Load the alignment for the given chromosome and strain and build
        the per-sequence index structures.
        Returns a tuple of:
        -the sequences read from the alignment file
        -one reference-to-alignment index per sequence
        -the masked sites of each known state and, last, of the strain,
         each translated through the matching alignment index
        '''
        alignment_fn = self.alignments.format(chrom=chromosome, strain=strain)
        _, sequences = read_fasta.read_fasta(alignment_fn)

        # to go from index in reference seq to index in alignment
        alignments = []
        for seq in sequences:
            alignments.append(self.index_alignment_by_reference(seq))

        strain_masked = self.read_masked_sites(chromosome, strain)

        # known states come first, in order; the strain is the last sequence
        masked_sites = []
        for ind, state in enumerate(self.known_states):
            state_masked = self.masked_sites[chromosome][state]
            masked_sites.append(alignments[ind][state_masked])
        masked_sites.append(alignments[-1][strain_masked])  # for strain

        return sequences, alignments, masked_sites
def test_read_fasta_multi(mocker):
    """Parse a mocked fasta file with three records spanning several lines."""
    handle = StringIO('''
not read
> headseq headfname.fa
actg
---
atcg
> headseq headfname.fa
actg
actg
---
> headseq2 headfname.fa
actg-
cataaa
''')
    patched_open = mocker.patch('misc.read_fasta.open', return_value=handle)
    headers, seqs = read_fasta('mocked')

    # headers are kept verbatim, duplicates included
    assert headers == ['> headseq headfname.fa'] * 2 + [
        '> headseq2 headfname.fa']

    print(seqs)
    rows = ['actg---atcg', 'actgactg---', 'actg-cataaa']
    assert seqs == approx(np.array([list(row) for row in rows]))
    patched_open.assert_called_with('mocked', 'r')
Example #9
0
def get_orfs(fn):
    """Map ORF coordinates to ORF names using the headers of a fasta file.

    Each header must contain `` <name>_<strain>:<start>:<end>``; the result
    maps ``(start, end)`` (as ints) to ``name``.  A header that does not
    match raises ``AttributeError``.
    """
    headers, _ = read_fasta.read_fasta(fn)
    header_pattern = re.compile(
        r' (?P<name>[a-zA-Z0-9]+)_(?P<strain>[a-zA-Z0-9\.]+)'
        ':(?P<start>[0-9]+):(?P<end>[0-9]+)')
    orfs = {}
    for header in headers:
        match = header_pattern.search(header)
        name = match.group('name')
        coords = (int(match.group('start')), int(match.group('end')))
        orfs[coords] = name
    return orfs
def mask(fn, masked_fn, intervals_fn):
    """Write a copy of a fasta file with the given intervals masked out.

    Only the first sequence in ``fn`` is processed.  Every site in each
    ``(start, end)`` interval (both ends included) is replaced by
    ``gp.unsequenced_symbol`` and the result is written to ``masked_fn``
    together with the original headers.
    """
    headers, seqs = read_fasta.read_fasta(fn)
    sites = list(seqs[0])
    for first, last in read_intervals(intervals_fn):
        for site in range(first, last + 1):
            sites[site] = gp.unsequenced_symbol
    masked_seq = ''.join(sites)
    write_fasta.write_fasta(headers, [masked_seq], masked_fn)
Example #11
0
def fraction_strains_aligned(headers, seqs):
    """Compute how much of each source sequence is present in an alignment.

    For every aligned sequence the number of non-gap sites is counted and
    divided by the length of the first sequence in the fasta file named by
    the last space-separated field of its header.

    Returns:
        (fractions aligned, non-gap lengths), both ordered like ``seqs``.
    """
    nseqs = len(seqs)
    nsites = len(seqs[0])
    fracs_aligned = []
    seq_lengths = []
    for idx in range(nseqs):
        header_fields = headers[idx].split(' ')
        ungapped = nsites - seqs[idx].count(gp.gap_symbol)
        seq_lengths.append(ungapped)
        # the header's last field is the path of the unaligned source file
        source_seq = read_fasta.read_fasta(header_fields[-1])[1][0]
        fracs_aligned.append(float(ungapped) / len(source_seq))

    return fracs_aligned, seq_lengths
def test_read_fasta_single(mocker):
    """Check read_fasta on a file holding one record spread over 3 lines."""
    contents = StringIO('''
not read
> headseq headfname.fa
actg
---
atcg
''')
    open_mock = mocker.patch('misc.read_fasta.open', return_value=contents)
    headers, seqs = read_fasta('mocked')

    expected = np.asarray([list('actg---atcg')])
    assert headers == ['> headseq headfname.fa']
    assert seqs == approx(expected)
    open_mock.assert_called_with('mocked', 'r')
Example #13
0
def test_read_fasta_single(mocker):
    """Parse a mocked single-record fasta file.

    The line before the header is expected to be ignored and the three
    sequence lines joined into one row.
    """
    handle = StringIO('''
not read
> headseq headfname.fa
actg
---
atcg
''')
    patched_open = mocker.patch('misc.read_fasta.open', return_value=handle)
    hdrs, arr = read_fasta('mocked')

    assert hdrs == ['> headseq headfname.fa']
    joined_row = list('actg---atcg')
    assert arr == approx(np.asarray([joined_row]))
    patched_open.assert_called_with('mocked', 'r')
Example #14
0
def get_gene_seqs(query_fn, strains, chrm, ref_chrm_fn, start, end, strand,
                  tag, strain_ind_to_ref_ind):
    """Blast a query gene against each strain's ORFs and pick the best hit.

    For every ``(strain, directory)`` pair, ``blastn`` is run with the
    strain's ORF fasta file as database; the tab-separated hits are handed
    to ``choose_best_hit`` together with the ORF headers and sequences.

    Returns:
        dict mapping strain -> (id, sequence, orf_start, orf_end,
        orf_strand), or ``('nohit', '', -1, -1, '')`` when there is no
        usable hit.

    NOTE(review): the hard-coded ``'yjm1332'`` filter and the unconditional
    ``sys.exit()`` after the first processed strain look like debugging
    leftovers; as written the process exits before any hit is stored.  They
    are kept because removing them changes behaviour - confirm and delete.
    ``ref_chrm_fn`` is not used.
    """

    # outfmt = '"6 qseqid sseqid slen qstart qend \
    #     length mismatch gapopen gaps sseq"'
    outfmt = '"6 sseqid slen evalue bitscore"'

    strain_gene_seqs = {}
    out_fn = 'blast_chr' + chrm + '.out'
    for strain, d in strains:
        if strain != 'yjm1332':
            continue

        print('-', strain)
        sys.stdout.flush()
        fn = d + 'orfs/' + strain + '_chr' + chrm + '_orfs' + gp.fasta_suffix
        # NOTE(review): the command is assembled as a shell string; paths
        # containing spaces or shell metacharacters would break it
        cmd_string = gp.blast_install_path + 'blastn' + \
            ' -db ' + fn + \
            ' -query ' + query_fn + \
            ' -out ' + out_fn + \
            ' -outfmt ' + outfmt
        # print(cmd_string)
        os.system(cmd_string)
        # read the hits through a context manager so the handle is closed
        with open(out_fn, 'r') as f_hits:
            hits = [line[:-1].split('\t') for line in f_hits]
        if len(hits) == 0:
            strain_gene_seqs[strain] = ('nohit', '', -1, -1, '')
            continue
        # best_orf_id = hits[0][0]
        headers, seqs = read_fasta.read_fasta(fn)
        best_orf_id, x, seq, orf_start, orf_end, orf_strand = \
            choose_best_hit(hits, start, end, tag, strain,
                            chrm, headers, seqs,
                            strain_ind_to_ref_ind[strain])
        print(hits)
        print(best_orf_id)
        print(orf_strand, strand)
        sys.exit()

        # a hit on the opposite strand is treated as no hit
        if best_orf_id is None or orf_strand != strand:
            strain_gene_seqs[strain] = ('nohit', '', -1, -1, '')
            continue
        strain_gene_seqs[strain] = (x, seq, orf_start, orf_end, orf_strand)
    os.remove(out_fn)
    return strain_gene_seqs
def test_filter_ambiguous_on_region_10817(filterer, mocker):
    """Run filter_ambiguous on the r10817.fa data file at threshold 0.98.

    The data file lives next to this test module; when it is missing the
    test only emits a warning.
    """
    fa = os.path.join(os.path.dirname(__file__), 'r10817.fa')

    if not os.path.exists(fa):
        warnings.warn('Unable to test with datafile r10817.fa')
        return

    _, seqs = read_fasta.read_fasta(fa, gz=False)
    # the last sequence in the file is not passed to the filter
    seqs = seqs[:-1]
    state_names = ['S288c', 'CBS432', 'N_45', 'DBVPG6304', 'UWOPS91_917_1']
    region = {'predicted_species': 'CBS432'}
    passed, _ = filterer.filter_ambiguous(region, seqs, 0.98, state_names)
    assert passed is False
    assert region['alternative_states'] == (
        'CBS432,N_45')
    assert region['alternative_P_counts'] == '111,110'
def get_inds_from_alignment(fn, rind, sind):
    """Map every reference position to a strain position via an alignment.

    ``rind`` and ``sind`` select the reference and strain rows of the
    alignment in ``fn``.  The returned list has one entry per non-gap
    reference site: the strain index (as a string) of the base aligned to
    it, or ``None`` when the strain has a gap in that column.
    """
    headers, seqs = read_fasta.read_fasta(fn)
    ncols = len(seqs[0])
    strain_pos = -1
    ps = []
    for col in range(ncols):
        strain_has_base = seqs[sind][col] != gp.gap_symbol
        if strain_has_base:
            strain_pos += 1
        # columns where the reference has a gap yield no entry
        if seqs[rind][col] == gp.gap_symbol:
            continue
        ps.append(str(strain_pos) if strain_has_base else None)
    return ps
Example #17
0
def get_inds_from_alignment(fn, flip_ref, rind=0, sind=1):
    """Build reference/strain position columns from a pairwise alignment.

    For each column where the row treated as reference has no gap, the
    current reference index and the current strain index are recorded as
    strings.  With ``flip_ref`` set, row 1 is treated as the reference and
    row 0 as the strain (overriding ``rind``/``sind``), and the two lists
    are swapped in the result.

    Returns:
        dict with keys ``'ps_ref'`` and ``'ps_strain'``.
    """
    headers, seqs = read_fasta.read_fasta(fn)
    ncols = len(seqs[0])
    if flip_ref:
        rind, sind = 1, 0

    ref_positions = []
    strain_positions = []
    ref_pos = -1
    strain_pos = -1
    for col in range(ncols):
        if seqs[sind][col] != gp.gap_symbol:
            strain_pos += 1
        if seqs[rind][col] == gp.gap_symbol:
            continue
        ref_pos += 1
        ref_positions.append(str(ref_pos))
        strain_positions.append(str(strain_pos))

    if flip_ref:
        return {'ps_ref': strain_positions, 'ps_strain': ref_positions}
    return {'ps_ref': ref_positions, 'ps_strain': strain_positions}
def get_aligned_genes(fn, strains):
    """Read aligned gene sequences for selected strains, dropping gap-only
    columns.

    The strain name is the first whitespace-separated token of each header
    with its leading character removed.  Only strains contained in
    ``strains`` are kept; columns in which every kept sequence has
    ``gp.gap_symbol`` are then removed.

    Returns:
        dict mapping strain -> sequence; empty when no header matched.
    """
    headers, seqs = read_fasta.read_fasta(fn)
    d = {}
    for i, header in enumerate(headers):
        strain = header[1:].split()[0]
        if strain in strains:
            d[strain] = seqs[i]
    if not d:
        return d
    # dict views are not subscriptable on Python 3, so take the first
    # value through an iterator instead of d.values()[0]
    n = len(next(iter(d.values())))
    remove_columns = [
        i for i in range(n)
        if all(seq[i] == gp.gap_symbol for seq in d.values())
    ]
    # delete from the right so the remaining indices stay valid
    for i in reversed(remove_columns):
        for strain in d:
            d[strain] = d[strain][:i] + d[strain][i + 1:]
    return d
Example #19
0
def get_ref_gene_seq(gene, gene_coords_fn, seq_fn):
    """Look up a gene's coordinates and cut its sequence from a chromosome.

    Rows of the coordinate table are re-keyed by their first column unless
    that column is the literal ``""``, in which case the row key is kept.
    Start and end are decremented by one before slicing (presumably the
    table is 1-based - confirm), and the slice includes both end points.
    Genes on strand ``'-1'`` are reverse complemented.

    Returns:
        (gene_seq, gene_start, gene_end, strand)
    """

    rows, _ = read_table.read_table_rows(gene_coords_fn,
                                         '\t',
                                         header=False,
                                         key_ind=0)
    genes = {}
    for key in rows:
        row = rows[key]
        name = key if row[0] == '""' else row[0]
        genes[name] = row[1:]

    info = genes[gene]
    gene_start = int(info[2]) - 1
    gene_end = int(info[3]) - 1
    chrm_seq = read_fasta.read_fasta(seq_fn)[1][0]
    gene_seq = chrm_seq[gene_start:gene_end + 1]
    strand = info[1]
    if strand == '-1':
        gene_seq = seq_functions.reverse_complement(gene_seq)
    # sanity checks: starts with a start codon and has positive length
    assert gene_seq.startswith(('atg', 'ATG'))
    assert gene_start < gene_end
    return gene_seq, gene_start, gene_end, strand
import sys
from misc import read_fasta


def pad(s, n):
    """Strip ``s``, then truncate or right-pad it with spaces to width n."""
    trimmed = s.strip()
    return trimmed[:n].ljust(n)


# usage: <script> <input fasta> <output file>
headers, seqs = read_fasta.read_fasta(sys.argv[1])

# first line: number of sequences and length of the first sequence; then
# one line per sequence with its name (header minus the leading character)
# padded to 10 characters, immediately followed by the sequence
# (this looks like a PHYLIP-style layout - confirm with the consumer).
# The context manager closes the output even if a write raises.
with open(sys.argv[2], 'w') as fp:
    fp.write(str(len(headers)) + ' ' + str(len(seqs[0])) + '\n')
    for i, header in enumerate(headers):
        h = pad(header[1:], 10)
        fp.write(h + seqs[i] + '\n')
# ======
# write all sites, including gaps
# ======
# NOTE(review): fn_all, chrm_offsets and gp_dir are defined outside this
# excerpt, and f_all is not closed within it.

print('writing file with all sites')

f_all = open(fn_all, 'w')

# master reference (cerevisiae)
# all chromosomes are written back to back under a single header; the
# running offset records where each chromosome starts in that record
print('*', gp.master_ref)
f_all.write('>' + gp.master_ref + '\n')
chrm_offset = 0
for chrm in gp.chrms:
    seq = read_fasta.read_fasta(gp.ref_dir[gp.master_ref] +
                                gp.ref_fn_prefix[gp.master_ref] + '_chr' +
                                chrm + gp.fasta_suffix)[1][0]
    f_all.write(seq)
    chrm_offsets[chrm] = chrm_offset
    chrm_offset += len(seq)
f_all.write('\n')

# other reference (paradoxus)
# the second entry of the alignment reference order is the other reference
other_ref_strain = gp.ref_fn_prefix[gp.alignment_ref_order[1]]
print('*', other_ref_strain)
f_all.write('>' + other_ref_strain + '\n')
for chrm in gp.chrms:
    # alignment of the references for this chromosome
    align_fn = gp_dir + gp.alignments_dir + \
               '_'.join(gp.alignment_ref_order) + '_chr' + chrm + \
               '_mafft' + gp.alignment_suffix
    # row 0 is passed as the reference index and row 1 as the strain index
    ps = get_inds_from_alignment(align_fn, 0, 1)
                        open('check_paralogs_out_cer_paralog.tsv',
                             'r').readlines()]
    genes_to_analyze = list(set(genes_to_analyze))

# ip counts the genes that have a paralog entry and are actually processed
ip = 0
for gene in genes_to_analyze:
    # only genes with a known paralog are considered
    if gene not in paralogs:
        continue

    print(ip)
    ip += 1

    chrm, ref_gene_start, ref_gene_end = gene_coords[gene]

    # sequences of this gene for every strain, keyed by the header with its
    # leading character and surrounding whitespace removed
    gene_headers, gene_seqs = \
        read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' +
                              gene + '/' + gene + '_from_alignment.fa')
    gene_headers = [x[1:].strip() for x in gene_headers]
    strain_seqs = dict(zip(gene_headers, gene_seqs))

    # the two reference strains
    cer_seq = strain_seqs['S288c']
    par_seq = strain_seqs['CBS432']

    # same lookup for the gene's paralog
    paralog = paralogs[gene]
    gene_headers, gene_seqs = \
        read_fasta.read_fasta(gp.analysis_out_dir_absolute + tag + '/genes/' +
                              paralog + '/' + paralog + '_from_alignment.fa')
    gene_headers = [x[1:].strip() for x in gene_headers]
    strain_paralog_seqs = dict(zip(gene_headers, gene_seqs))

    cer_paralog_seq = strain_paralog_seqs['S288c']
    par_paralog_seq = strain_paralog_seqs['CBS432']
Example #23
0
import os
from convert_coordinates import (write_coordinates, convert)
import global_params as gp
from misc import read_fasta

# Write coordinate-conversion files, in both directions, for every
# alignment file found in the alignments directory.
gp_dir = '../'
fns = os.listdir(gp_dir + gp.alignments_dir)
# keep only files carrying the alignment suffix
fns = filter(lambda fn: fn.endswith(gp.alignment_suffix), fns)

for fn in fns:
    print(fn)

    # file names are split on '_': the second-to-last field is used as the
    # chromosome label and everything before it as the strain names
    x = fn.split('_')
    chrm = x[-2]
    strain_names = x[0:-2]
    headers, seqs = read_fasta.read_fasta(gp_dir + gp.alignments_dir + fn)

    # for each index in cer reference, get index in other strain
    # (either par reference for 2-way alignment or cer strain for
    # 3-way)
    coord_fn = (gp.analysis_out_dir_absolute + 'coordinates/' +
                strain_names[0] + '_to_' + strain_names[-1] +
                '_' + chrm + '.txt.gz')
    write_coordinates(convert(seqs[0], seqs[-1]), coord_fn)

    # for each index in other strain, get index in cer reference
    coord_fn = (gp.analysis_out_dir_absolute + 'coordinates/' +
                strain_names[-1] + '_to_' + strain_names[0] +
                '_' + chrm + '.txt.gz')
    write_coordinates(convert(seqs[-1], seqs[0]), coord_fn)
                   '_chr' + chrm + '.txt.gz'
        f_coord = gzip.open(coord_fn, 'rb')
        ref_ind_to_strain_i_ind = [
            try_int(line[:-1]) for line in f_coord.readlines()
        ]

        # current strain fasta file for current chromosome
        strain_fn = d_i + strain_i + '_chr' + chrm + gp.fasta_suffix
        print(strain_i, chrm)

        # get chromosome sequence for this strain relative to
        # reference strain (the base for this strain at each site in
        # the reference, based on original alignment);
        # gaps/unsequenced sites/etc marked as 'N'
        strain_i_seqs[chrm] = referize(
            read_fasta.read_fasta(strain_fn)[1][0].lower(),
            ref_ind_to_strain_i_ind)

        # get version of sequence where everything that doesn't fall
        # within gene is replaced by 'N'
        strain_i_seqs_coding[chrm] = mark_included(strain_i_seqs[chrm],
                                                   ref_genes[chrm])

        # also get version of above sequences where introgressed sites are
        # replaced by 'N'
        strain_i_seqs_nonint[chrm] = copy.deepcopy(strain_i_seqs[chrm])
        strain_i_seqs_coding_nonint[chrm] = copy.deepcopy(
            strain_i_seqs_coding[chrm])
        if strain_i in regions_by_chrm_and_strain[chrm]:
            strain_i_seqs_nonint[chrm] = mark_excluded(
                strain_i_seqs[chrm],
Example #25
0
# ======

# input file for ldselect is formatted so that each row is a snp and
# each column is the genotype for a strain, e.g.

# NOTE(review): out_dir, chrm, strains and gp_dir are defined outside this
# excerpt; f is opened here but not written to or closed within it.
fn = out_dir + 'ldselect_input_chr' + chrm + '.tsv'
f = open(fn, 'w')
# master-reference site index -> alleles, one appended per strain in order
snps = defaultdict(list)
# loop through all the strains
for strain in strains:
    print('-', strain)
    # read multiple alignment file for this strain with the master
    # reference (and other references which we don't care about
    # here)
    headers, seqs = read_fasta.read_fasta(gp_dir + gp.alignments_dir +
                                          '_'.join(gp.alignment_ref_order) +
                                          '_' + strain + '_chr' + chrm +
                                          '_mafft.maf')
    # look at all alignment columns, keeping track of the index in
    # the master reference
    i = 0
    for c in range(len(seqs[0])):
        # if the master reference doesn't have a gap in this
        # column, then store the allele that the current strain
        # has at this site
        # (columns where the reference is unsequenced are skipped as well,
        # and do not advance the reference index)
        if seqs[0][c] != gp.gap_symbol and seqs[0][c] != gp.unsequenced_symbol:
            snps[i].append(seqs[-1][c])
            i += 1

# get reference sequence (unaligned, without gaps)
# TODO correct alignment file location
ref_seq = read_fasta.read_fasta(gp_dir + gp.alignments_dir +
    sys.stdout.flush()

    fn_out = gp.analysis_out_dir_absolute + args['tag'] + '/site_summaries/' +\
        'predictions_' + strain + '_chr' + chrm + '_site_summary.txt.gz'
    if not os.path.exists(os.path.dirname(fn_out)):
        os.makedirs(os.path.dirname(fn_out))

    # skip this strain x chromosome if there are no introgressed
    # regions for it
    if strain not in regions or chrm not in regions[strain]:
        continue

    # read alignment blocks for this strain and chromosome
    fn_align = fn_align_prefix + \
        strain + '_chr' + chrm + '_mafft' + gp.alignment_suffix
    alignment_headers, alignment_seqs = read_fasta.read_fasta(fn_align)

    # read masked (unaligned) sequences
    seq_masked_fns = [header.split()[-1] for header in alignment_headers]
    seq_masked_fns = [
        mfn[:-len(gp.fasta_suffix)] + '_masked' + gp.fasta_suffix
        for mfn in seq_masked_fns
    ]
    seqs_masked = [read_fasta.read_fasta(mfn)[1][0] for mfn in seq_masked_fns]

    labels = ref_labels + [strain]

    # mark each site as matching each reference or not
    ref_match_by_site = gene_predictions.get_ref_match_by_site(
        alignment_seqs, labels)
    # mark each site as in a gene or not
Example #27
0
# - by chromosome
# - in windows across genome

# window size passed to seq_id_windowed
window = 100

gp_dir = '../'

nrefs = len(gp.alignment_ref_order)

# (ref1, ref2) -> chromosome -> windowed identities
pair_chrm_ids = defaultdict(lambda: defaultdict(list))
for chrm in gp.chrms:
    print(chrm)
    # alignment of all references for this chromosome
    fn = (gp_dir + gp.alignments_dir + '_'.join(gp.alignment_ref_order) +
          '_chr' + chrm + '_mafft' + gp.alignment_suffix)

    headers, seqs = read_fasta.read_fasta(fn)

    # every unordered pair of references; this assumes the alignment rows
    # follow gp.alignment_ref_order - TODO confirm
    for i in range(nrefs):
        ref1 = gp.alignment_ref_order[i]
        for j in range(i + 1, nrefs):
            print(i, j)
            ref2 = gp.alignment_ref_order[j]

            ids = seq_functions.seq_id_windowed(seqs[i], seqs[j], window)

            pair_chrm_ids[(ref1, ref2)][chrm] = ids

# summary table; only the header row is written within this excerpt
fs = open(
    gp.analysis_out_dir_absolute + 'ref_ids_summary_' +
    '_'.join(gp.alignment_ref_order) + '.txt', 'w')
fs.write('pair\tchromosome\tmean\tmedian\n')
Example #28
0
        region_seqs = {}
        for strain in strains:
            print(' ', strain)
            ref_to_strain_coords = [
                float(x[:-1])
                for x in gzip.open(gp.analysis_out_dir_absolute +
                                   'coordinates/S288c_to_' + strain + '_chr' +
                                   chrm + '.txt.gz').readlines()
            ]
            strain_start = int(
                max(0, math.ceil(ref_to_strain_coords[ref_start])))
            strain_end = int(math.floor(ref_to_strain_coords[ref_end]))

            if strain not in chrom_seqs:
                chrom_seqs[strain] = read_fasta.read_fasta(
                    strain_dirs[strain] + strain + '_chr' + chrm +
                    gp.fasta_suffix)[1][0]
            # seq = chrom_seqs[strain][strain_start:strain_end+1]
            seq = [gp.gap_symbol for i in range(ref_start, ref_end + 1)]
            for i in range(ref_start, ref_end + 1):
                c = ref_to_strain_coords[i]
                if int(c) == c:
                    seq[i - ref_start] = chrom_seqs[strain][int(c)]
            region_seqs[strain] = ''.join(seq)

        p, t = calculate_polymorphism(region_seqs)
        fp = 'NA'
        if t != 0:
            fp = float(p) / t

        nuc_div = calculate_nuc_div(region_seqs)
# Run the introgression prediction for every chromosome x strain pair that
# has an alignment file.
for chrm in gp.chrms:

    for strain, strain_dir in args['setup_args']['strain_dirs']:

        print(f'working on: {strain} {chrm}')

        # alignment file name: known states joined by '_', then the strain
        ref_prefix = '_'.join(args['known_states'])
        fn = (f'{args["setup_args"]["alignments_directory"]}{ref_prefix}_{strain}'
              f'_chr{chrm}_mafft{gp.alignment_suffix}')

        # a missing alignment is reported and skipped, not treated as fatal
        if not os.path.exists(fn):
            print(fn)
            print(f'no alignment for {strain} {chrm}')
            continue

        headers, seqs = read_fasta.read_fasta(fn)

        # the last row is the sequence to predict on; all rows before it
        # are used as references
        ref_seqs = seqs[:-1]
        predict_seq = seqs[-1]

        # predict introgressed/non-introgressed tracts

        state_seq, probs, hmm, hmm_init, ps = \
            predict.predict_introgressed(ref_seqs, predict_seq,
                                         args, train=True)

        state_seq_blocks = predict.convert_to_blocks(state_seq, args['states'])

        # output

        # the positions actually used in predictions
Example #30
0
        '_genes.txt'
    genes, _ = read_table.read_table_rows(fn, '\t', header=False, key_ind=0)
    for gene in genes:
        genes[gene] = (int(genes[gene][0]), int(genes[gene][1]))

    # read in cer ref -> par ref position file
    fn = gp.analysis_out_dir_absolute + 'coordinates/' + gp.master_ref + \
        '_to_' + other_ref + '_chr' + chrm + '.txt.gz'
    master_to_other_ref_pos = [
        float(line[:-1]) for line in gzip.open(fn, 'rb').readlines()
    ]

    # read in cer ref chromosome sequence
    fn = gp.ref_dir[gp.master_ref] + gp.ref_fn_prefix[gp.master_ref] + \
        '_chr' + chrm + gp.fasta_suffix
    master_seq = read_fasta.read_fasta(fn)[1][0]

    # read in par ref chromosome sequence
    fn = gp.ref_dir[other_ref] + gp.ref_fn_prefix[other_ref] + \
        '_chr' + chrm + gp.fasta_suffix
    other_ref_seq = read_fasta.read_fasta(fn)[1][0]

    # read in par ref ORFs
    fn = gp.ref_dir[other_ref] + 'orfs/' + other_ref + \
        '_chr' + chrm + '_orfs' + gp.fasta_suffix
    ref_orfs = annotate_positions.get_orfs(fn)

    for strain in region_ids_by_chrm_strain[chrm].keys():
        print('-', strain)

        if strain not in strain_totals:
Example #31
0
# ======

# NOTE(review): chrm, region_start, region_end, try_int and referize are
# defined outside this excerpt.
strain_dirs = align_helpers.get_strains(
    align_helpers.flatten(gp.non_ref_dirs.values()))
num_strains = len(strain_dirs)


# ======
# loop through all strains, getting appropriate sequence
# ======

# master reference and other reference seqs
# master reference: lower-cased slice of the region, both ends included
master_ref = gp.alignment_ref_order[0]
master_fn = gp.ref_dir[master_ref] + gp.ref_fn_prefix[master_ref] + '_chr' + \
            chrm + gp.fasta_suffix
master_seq = read_fasta.read_fasta(master_fn)[1][0][
    region_start:region_end+1].lower()


# other reference: its sequence is first passed through referize together
# with the master -> other coordinate list, then sliced with the same
# region bounds as the master reference
other_ref = gp.alignment_ref_order[1]
coord_fn = gp.analysis_out_dir_absolute + 'coordinates/' + \
           gp.master_ref + '_to_' + other_ref + \
           '_chr' + chrm + '.txt.gz'
# NOTE(review): f_coord is not closed within this excerpt
f_coord = gzip.open(coord_fn, 'rb')
# one entry per line of the coordinate file, trailing newline dropped
ref_ind_to_strain_ind = [try_int(line[:-1]) for line in f_coord.readlines()]
other_ref_fn = gp.ref_dir[other_ref] + gp.ref_fn_prefix[other_ref] + \
               '_chr' + chrm + gp.fasta_suffix
other_ref_seq = referize(read_fasta.read_fasta(other_ref_fn)[1][0].lower(),
                         ref_ind_to_strain_ind)[region_start:region_end+1]

# other strains
seqs = {}
def main():
    """Summarize the quality of the predicted regions for one species.

    Command line:
        sys.argv[1]  -- task index; used directly as the index into
                        args['states'] that selects the species whose
                        regions are summarized
        sys.argv[2:] -- handed to read_args.process_predict_args

    Input, under base_dir = gp.analysis_out_dir_absolute + args['tag']:
        positions_<tag>.txt.gz -- tab separated lines: strain, chromosome,
            then the integer positions used by the HMM
        blocks_<species>_<tag>_labeled.txt -- region table, read once per
            chromosome and grouped by strain
    plus one alignment file and one masked-interval file per strain and
    chromosome, and one masked-interval file per known state and chromosome.

    Output:
        blocks_<species>_<tag>_quality.txt -- the labeled table extended
            with match / site counts per known state and a count of every
            info string symbol, sorted by region id within a chromosome
        regions/<species><fasta_suffix>.gz -- for every region the aligned
            sequences of each known state and the strain, followed by the
            info string
        regions/<species>.pkl -- pickled dict from numeric region id to the
            value of tell() on the region file just before that region
            was written

    All file handles are closed even if processing fails part way through.
    An empty gp.chrms produces no quality file (nothing opens it) and an
    empty index.
    """
    # function-scope import so the block does not depend on file-level imports
    import contextlib

    args = read_args.process_predict_args(sys.argv[2:])

    task_ind = int(sys.argv[1])
    species_ind = task_ind

    species_from = args['states'][species_ind]
    known_states = args['known_states']
    len_states = len(known_states)

    base_dir = gp.analysis_out_dir_absolute + args['tag']

    regions_dir = f'{base_dir}/regions/'
    # exist_ok avoids the check-then-create race when several tasks start
    # at the same time and share this directory
    os.makedirs(regions_dir, exist_ok=True)

    # bound once, up front: the labels written below need it even when a
    # chromosome has no regions at all
    info_string_symbols = list('.-_npbcxNPBCX')

    region_index = {}

    # the stack owns every handle, including the lazily opened quality file
    with contextlib.ExitStack() as stack:
        quality_writer = None
        positions = stack.enter_context(
            gzip.open(f'{base_dir}/positions_{args["tag"]}.txt.gz', 'rt'))
        # despite the name this is a tell() offset: the start of the first
        # line that has not been fully processed yet
        line_number = 0

        region_writer = stack.enter_context(
            gzip.open(f'{regions_dir}{species_from}{gp.fasta_suffix}.gz',
                      'wt'))

        for chrm in gp.chrms:
            # region_id strain chromosome predicted_species start end
            # num_non_gap
            regions_chrm, labels = read_table.read_table_columns(
                f'{base_dir}/blocks_{species_from}_{args["tag"]}_labeled.txt',
                '\t',
                group_by='strain',
                chromosome=chrm
            )

            # add a zero-filled column for every statistic computed below
            for strain in regions_chrm:
                n = len(regions_chrm[strain]['region_id'])

                for s in known_states:
                    regions_chrm[strain]['match_nongap_' + s] = [0] * n
                    regions_chrm[strain]['num_sites_nongap_' + s] = [0] * n
                    regions_chrm[strain]['match_hmm_' + s] = [0] * n
                    regions_chrm[strain]['match_nonmask_' + s] = [0] * n
                    regions_chrm[strain]['num_sites_nonmask_' + s] = [0] * n

                for s in info_string_symbols:
                    regions_chrm[strain]['count_' + s] = [0] * n

            # get masked sites for all references, not just the current
            # species_from we're considering regions from
            masked_sites_refs = {}
            for s, state in enumerate(known_states):
                masked_sites_refs[s] = \
                    convert_intervals_to_sites(
                        read_masked_intervals(
                            f'{gp.mask_dir}{state}'
                            f'_chr{chrm}_intervals.txt'))

            # loop through chromosomes and strains, followed by species of
            # introgression so that we only have to read each alignment in
            # once
            # move to last read chromosome
            positions.seek(line_number)
            line = positions.readline()
            while line != '':
                line = line.split('\t')

                current_chrm = line[1]
                if current_chrm != chrm:
                    # line_number still points at this line, so the next
                    # chromosome re-reads it after the seek above
                    break

                strain = line[0]
                if strain not in regions_chrm:
                    # record current position in case need to re read line
                    line_number = positions.tell()
                    line = positions.readline()
                    continue

                print(strain, chrm)
                strain_regions = regions_chrm[strain]

                # indices of alignment columns used by HMM
                ps = np.array([int(x) for x in line[2:]])

                _, seqs = read_fasta.read_fasta(
                    args['setup_args']['alignments_directory'] +
                    '_'.join(known_states)
                    + f'_{strain}_chr{chrm}_mafft{gp.alignment_suffix}')

                # to go from index in reference seq to index in alignment
                ind_align = [index_alignment_by_reference(seq)
                             for seq in seqs]

                masked_sites = convert_intervals_to_sites(
                    read_masked_intervals(
                        f'{gp.mask_dir}{strain}_chr{chrm}_intervals.txt'))

                masked_sites_ind_align = []
                for s in range(len_states):
                    masked_sites_ind_align.append(
                        ind_align[s][masked_sites_refs[s]])

                # add in sequence of query strain
                masked_sites_ind_align.append(
                    ind_align[-1][masked_sites])

                # convert position indices from indices in master reference
                # to indices in alignment
                ps_ind_align = ind_align[0][ps]

                # loop through all regions for the specified chromosome and
                # the current strain
                for i in range(len(strain_regions['region_id'])):
                    r_id = strain_regions['region_id'][i]
                    start = strain_regions['start'][i]
                    end = strain_regions['end'][i]

                    # calculate:
                    # - identity with each reference
                    # - fraction of region that is gapped/masked

                    # index of start and end of region in aligned sequences
                    slice_start = ind_align[0][int(start)]
                    slice_end = ind_align[0][int(end)]
                    assert slice_start in ps_ind_align, \
                        f'{slice_start} {start} {r_id}'
                    assert slice_end in ps_ind_align, \
                        f'{slice_end} {end} {r_id}'

                    seqx = seqs[-1][slice_start:slice_end + 1]
                    len_seqx = slice_end - slice_start + 1

                    # . = all match
                    # - = gap in one or more sequences
                    # p = matches predicted reference

                    # *_any_flag: one value per alignment column of the
                    # region; the other flags: one column per known state
                    info = {
                        'gap_any_flag': np.zeros((len_seqx), bool),
                        'mask_any_flag': np.zeros((len_seqx), bool),
                        'unseq_any_flag': np.zeros((len_seqx), bool),
                        'hmm_flag': np.zeros((len_seqx), bool),
                        'gap_flag': np.zeros((len_seqx, len_states), bool),
                        'mask_flag': np.zeros((len_seqx, len_states), bool),
                        'unseq_flag': np.zeros((len_seqx, len_states), bool),
                        'match_flag': np.zeros((len_seqx, len_states), bool),
                    }

                    for sj, statej in enumerate(known_states):
                        seqj = seqs[sj][slice_start:slice_end+1]

                        # only alignment columns used by HMM (polymorphic,
                        # no gaps in any strain)
                        total_match_hmm, total_sites_hmm, infoj = \
                            seq_id_hmm(seqj, seqx, slice_start, ps_ind_align)

                        if statej == species_from \
                                or species_ind >= len_states:
                            strain_regions['num_sites_hmm'][i] = \
                                total_sites_hmm

                        # only write once, the first index
                        if sj == 0:
                            info['hmm_flag'] = infoj['hmm_flag']

                        info['gap_any_flag'] = np.logical_or(
                            info['gap_any_flag'], infoj['gap_flag'])
                        info['unseq_any_flag'] = np.logical_or(
                            info['unseq_any_flag'], infoj['unseq_flag'])
                        info['gap_flag'][:, sj] = infoj['gap_flag']
                        info['unseq_flag'][:, sj] = infoj['unseq_flag']
                        info['match_flag'][:, sj] = infoj['match']

                        strain_regions[f'match_hmm_{statej}'][i] = \
                            total_match_hmm

                        # all alignment columns, excluding ones with gaps in
                        # these two sequences
                        total_match_nongap, total_sites_nongap = \
                            seq_functions.seq_id(seqj, seqx)

                        strain_regions[f'match_nongap_{statej}'][i] = \
                            total_match_nongap
                        strain_regions[f'num_sites_nongap_{statej}'][i] = \
                            total_sites_nongap

                        # all alignment columns, excluding ones with gaps or
                        # masked bases or unsequenced in *these two
                        # sequences*
                        total_match_nonmask, total_sites_nonmask, infoj = \
                            seq_id_unmasked(seqj, seqx, slice_start,
                                            masked_sites_ind_align[sj],
                                            masked_sites_ind_align[-1])

                        info['mask_any_flag'] = np.logical_or(
                            info['mask_any_flag'], infoj['mask_flag'])
                        info['mask_flag'][:, sj] = infoj['mask_flag']

                        strain_regions[f'match_nonmask_{statej}'][i] = \
                            total_match_nonmask
                        strain_regions[f'num_sites_nonmask_{statej}'][i] = \
                            total_sites_nonmask

                    # remember where this region starts in the region file
                    # (r_id is the numeric id behind a one character prefix)
                    region_index[int(r_id[1:])] = region_writer.tell()
                    region_writer.write(f'#{r_id}\n')
                    names = known_states + [strain]
                    for sj in range(len(names)):
                        # write sequence to region alignment file, along
                        # with start and end coordinates
                        startj = bisect.bisect_left(ind_align[sj],
                                                    slice_start)
                        endj = bisect.bisect_left(ind_align[sj], slice_end)

                        region_writer.write(
                            f'> {names[sj]} {startj} {endj}\n')
                        region_writer.write(
                            ''.join(seqs[sj][slice_start:slice_end+1])
                            + '\n')

                    # also write string with info about each site
                    info_string = make_info_string(info, 0, species_ind)
                    region_writer.write('> info\n')
                    region_writer.write(info_string + '\n')

                    # TODO this can be made faster with numpy
                    # and keep track of each symbol count
                    for sym in info_string_symbols:
                        strain_regions['count_' + sym][i] = \
                            info_string.count(sym)

                # record current position in case need to re read line
                line_number = positions.tell()
                line = positions.readline()
                sys.stdout.flush()

            labels += ['match_nongap_' + x for x in known_states]
            labels += ['num_sites_nongap_' + x for x in known_states]
            labels += ['match_hmm_' + x for x in known_states]
            labels += ['match_nonmask_' + x for x in known_states]
            labels += ['num_sites_nonmask_' + x for x in known_states]
            labels += ['count_' + x for x in info_string_symbols]

            assert labels[0] == 'region_id', 'Unexpected labeled format'

            # write on first execution
            if quality_writer is None:
                quality_writer = stack.enter_context(
                    open(f'{base_dir}/blocks_{species_from}'
                         f'_{args["tag"]}_quality.txt', 'w'))

                quality_writer.write('\t'.join(labels) + '\n')

            # reorganize output as list of tuples ordered by label
            output = []
            strains = list(regions_chrm.keys())
            for strain in strains:
                # pop to limit memory usage
                d = regions_chrm.pop(strain)
                output += list(zip(*[d[l] for l in labels]))

            # sort by region id (index 0, remove r)
            for entry in sorted(output, key=lambda e: int(e[0][1:])):
                quality_writer.write(
                    '\t'.join([str(e) for e in entry]) + '\n')

    # the region file is closed at this point, so the index is only written
    # after all of its offsets are final
    with open(f'{regions_dir}{species_from}.pkl', 'wb') as index:
        pickle.dump(region_index, index)
# Script body: for the one strain selected by the integer index in
# sys.argv[1], write a '_mafft.stats' file next to each per-chromosome
# '_mafft.maf' alignment that exists.

# get all non-reference strains of cerevisiae and paradoxus
s = get_strains(flatten(gp.non_ref_dirs.values()))
# each entry of s unpacks into two values; d is presumably the directory
# the strain was found in -- TODO confirm against get_strains
strain, d = s[int(sys.argv[1])]
gp_dir = '../'

# shared prefix of every input/output file name for this strain; the
# chromosome name and a suffix are appended inside the loop
fn_start = (gp_dir + gp.alignments_dir + '_'.join(gp.alignment_ref_order) +
            '_' + strain + '_chr')

for chrm in gp.chrms:
    # progress output, flushed immediately so it shows up while running
    print(chrm)
    sys.stdout.flush()

    # chromosomes without an alignment file are skipped silently
    if not os.path.isfile(fn_start + chrm + '_mafft.maf'):
        continue

    headers, seqs = read_fasta.read_fasta(fn_start + chrm + '_mafft.maf')
    # header -> sequence lookup for this alignment
    a = dict(zip(headers, seqs))

    # NOTE(review): no matching close() for f_out is visible in this part
    # of the file
    f_out = open(fn_start + chrm + '_mafft.stats', 'w')

    # number of sites where n,...,3,2,1 genomes aligned
    num_strains_by_site = num_strains_aligned_by_site(seqs)
    f_out.write('# histogram of number of strains '
                'aligned across all alignment columns\n')
    # one 'index,value' line per entry of num_strains_by_site
    for n in range(len(num_strains_by_site)):
        f_out.write(str(n) + ',' + str(num_strains_by_site[n]) + '\n')
    f_out.write('\n')

    # fraction of genomes aligned (should all be 1)
    fracs_aligned, seq_lengths = fraction_strains_aligned(headers, seqs)
    for frac in fracs_aligned:
Example #34
0
def maf_id(fn, ref1='S288c', ref2='CBS432'):
    """Compare the third sequence of an alignment with the first two.

    fn is read with read_fasta.read_fasta; the sequence at index 2 is
    passed to seq_id together with the sequence at index 0 and then with
    the sequence at index 1.

    ref1 and ref2 are accepted but not used by this function.

    Returns a 4-tuple (id1, id2, den1, den2): the first values of the two
    seq_id results followed by their second values.
    """
    _, sequences = read_fasta.read_fasta(fn)
    query = sequences[2]
    first_id, first_den = seq_id(query, sequences[0])
    second_id, second_den = seq_id(query, sequences[1])
    return first_id, second_id, first_den, second_den
Example #35
0
def get_range_seq(start, end, seq_fn):
    """Return positions start..end (both inclusive) of a sequence file.

    Only the first sequence returned by read_fasta.read_fasta(seq_fn) is
    used; start and end are 0-based indices into it.
    """
    sequences = read_fasta.read_fasta(seq_fn)[1]
    return sequences[0][start:end + 1]