Example 1
def dnld_cds_for_ncbi_prot_acc(ss, prot_acc_user, prot_cds_ncbi_file, tax,
                               dir_cache_prj):

    # Cache the downloaded CDS records, keyed on the search strategy (ss).
    pickle_file = opj(dir_cache_prj, 'ncbi_prot_cds_cache__' + ss)
    pickled = (set(), {})
    acc_old = set()
    if ope(pickle_file):
        with open(pickle_file, 'rb') as f:
            pickled = pickle.load(f)
            acc_old = set(pickled[0])

    if acc_old == set(prot_acc_user):
        # The accession set is unchanged since the last run; reuse the
        # cached records.
        cds_rec_dict = pickled[1]
        Log.inf('The CDS for the dereplicated set of the user-provided '
                'NCBI protein accessions have already been downloaded:', ss)
    else:
        Log.inf('Downloading CDS for the dereplicated set of the '
                'user-provided NCBI protein accessions:', ss)
        cds_rec_dict = seq_records_to_dict(cds_for_prot(prot_acc_user),
                                           prepend_acc=True)
        with open(pickle_file, 'wb') as f:
            pickle.dump((prot_acc_user, cds_rec_dict), f,
                        protocol=PICKLE_PROTOCOL)

    write_fasta(cds_rec_dict, prot_cds_ncbi_file)
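
The function above follows a simple invalidate-on-change caching pattern: re-download only when the accession set differs from the cached one. Below is a minimal, self-contained sketch of that pattern; fetch_records is a hypothetical stand-in for the real download step (cds_for_prot plus seq_records_to_dict), while the cache layout (a pickled (accessions, records) tuple) matches the code above.

import os
import pickle

def cached_fetch(accessions, cache_path, fetch_records):
    # Reuse the cached records when the accession set is unchanged.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            old_accs, old_recs = pickle.load(f)
        if set(old_accs) == set(accessions):
            return old_recs
    # Otherwise download and overwrite the cache.
    recs = fetch_records(accessions)
    with open(cache_path, 'wb') as f:
        pickle.dump((accessions, recs), f,
                    protocol=pickle.HIGHEST_PROTOCOL)
    return recs

# Usage:
# recs = cached_fetch(['P12345.1'], '/tmp/cds_cache',
#                     lambda accs: {a: 'ATG...' for a in accs})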
Example 2
def dnld_pfam_uniprot_seqs(ss, uniprot_acc, aa_uniprot_file, dir_cache_prj):
    if len(uniprot_acc) != 0:
        cache_path = opj(dir_cache_prj, 'aa_uniprot_acc_cache__' + ss)
        prev_uniprot_acc = []
        if ope(cache_path):
            with open(cache_path, 'rb') as f:
                prev_uniprot_acc = pickle.load(f)

        with open(cache_path, 'wb') as f:
            pickle.dump(uniprot_acc, f, protocol=PICKLE_PROTOCOL)

        if (set(uniprot_acc) != set(prev_uniprot_acc)) or \
           (not ope(aa_uniprot_file)):

            Log.inf('Downloading Pfam protein sequences from UniProt:', ss)
            # Note: the number of sequences downloaded from UniProt may
            # be less than the total number of accessions. This is normal
            # as Pfam may return "obsolete" accessions, which will not be
            # downloaded here.
            fasta_txt = fasta_by_accession_list(uniprot_acc)
            fasta_txt = standardize_fasta_text(fasta_txt, SEQ_TYPE_AA,
                                               pfam=True)

            write_fasta(fasta_txt, aa_uniprot_file)

    else:
        if ope(aa_uniprot_file):
            osremove(aa_uniprot_file)
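
Note a subtlety in the check above: the accession list is written to the cache before the download is attempted, so if the download fails mid-run the next run will see an "unchanged" list, and only the missing-output-file test forces a retry. A minimal sketch of the staleness test, with os.path.exists standing in for kakapo's ope:

import os
import pickle

def needs_refresh(new_items, cache_path, out_path):
    # True when the item set changed since the last run, or when the
    # output file is missing. Mirrors the condition in
    # dnld_pfam_uniprot_seqs.
    prev_items = []
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            prev_items = pickle.load(f)
    with open(cache_path, 'wb') as f:
        pickle.dump(new_items, f, protocol=pickle.HIGHEST_PROTOCOL)
    return (set(new_items) != set(prev_items)
            or not os.path.exists(out_path))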
Example 3
def dnld_refseqs_for_taxid(taxid,
                           filter_term,
                           taxonomy,
                           dir_cache_refseqs,
                           query='',
                           db='nuccore'):
    if filter_term == 'plastid':
        ft = '("chloroplast"[filter] OR "plastid"[filter])'
    else:
        ft = '("' + filter_term + '"[filter])'

    tax_terms = tuple(reversed(taxonomy.lineage_for_taxid(taxid)['names']))
    for tax_term in tax_terms:
        if tax_term is None:
            tax_term = taxonomy.scientific_name_for_taxid(taxid)
        term = '"RefSeq"[Keyword] AND "{}"[Primary Organism] AND {}'.format(
            tax_term, ft)
        term = query + term
        accs = set(accs_eutil(search_eutil(db, term)))
        if len(accs) > 0:
            plural = 'sequences'
            if len(accs) == 1:
                plural = 'sequence'
            Log.msg(
                'Found {} RefSeq {} {} for'.format(len(accs), filter_term,
                                                   plural), tax_term)
            # Random sample ###################################################
            if len(accs) > 10:
                Log.wrn('Using a random sample of ten RefSeq sequences.')
                random.seed(a=len(accs), version=2)
                # random.sample() requires a sequence (not a set) as of
                # Python 3.11; sort first so the seeded sample is stable.
                accs = set(random.sample(sorted(accs), 10))
            ###################################################################
            break
        else:
            Log.wrn(
                'No RefSeq {} sequences were found for'.format(filter_term),
                tax_term)

    cache_path = opj(
        dir_cache_refseqs,
        filter_term + '__' + tax_term.replace(' ', '_') + '.fasta')
    parsed_fasta_cache = {}
    if ope(cache_path):
        parsed_fasta_cache = read_fasta(cache_path,
                                        seq_type=SEQ_TYPE_NT,
                                        def_to_first_space=True)
        parsed_fasta_cache = seq_records_to_dict(parsed_fasta_cache)
        for acc in parsed_fasta_cache:
            if acc in accs:
                accs.remove(acc)
    if len(accs) > 0:
        parsed_fasta = dnld_ncbi_seqs(db, list(accs))
        parsed_fasta = seq_records_to_dict(parsed_fasta, prepend_acc=True)
        parsed_fasta.update(parsed_fasta_cache)
        write_fasta(parsed_fasta, cache_path)

    return cache_path
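
The search loop above walks the taxonomic lineage from the most specific name upward and stops at the first rank that yields any RefSeq accessions. A small sketch of the term construction, assuming lineage_names is ordered root-to-species as taxonomy.lineage_for_taxid() returns it:

def refseq_search_terms(lineage_names, filter_term, query=''):
    # Yield Entrez search terms from the most specific name up.
    if filter_term == 'plastid':
        ft = '("chloroplast"[filter] OR "plastid"[filter])'
    else:
        ft = '("' + filter_term + '"[filter])'
    for name in reversed(lineage_names):
        yield query + ('"RefSeq"[Keyword] AND '
                       '"{}"[Primary Organism] AND {}'.format(name, ft))

# Usage:
# for term in refseq_search_terms(
#         ['Viridiplantae', 'Solanaceae', 'Solanum lycopersicum'],
#         'plastid'):
#     print(term)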
Example 4
def user_aa_fasta(ss, user_queries, aa_prot_user_file):
    fasta_txt = ''
    if len(user_queries) > 0:
        print()
        Log.inf('Reading user-provided AA sequences:', ss)
        for ap in user_queries:
            Log.msg(ap)
            with open(ap, 'r') as f:
                fasta_txt = fasta_txt + f.read()
    if fasta_txt != '':
        with open(aa_prot_user_file, 'w') as f:
            write_fasta(standardize_fasta_text(fasta_txt, SEQ_TYPE_AA), f)
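
Repeated string concatenation inside the read loop can be quadratic when many query files are given; collecting chunks in a list and joining once is the usual Python idiom. A sketch of that variant (reading logic only, the kakapo helpers are omitted):

def read_user_queries(paths):
    # Concatenate the text of all user FASTA files in one pass.
    parts = []
    for p in paths:
        with open(p, 'r') as f:
            parts.append(f.read())
    return ''.join(parts)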
Example 5
def dnld_prot_seqs(ss, prot_acc_user, aa_prot_ncbi_file, dir_cache_prj):
    if len(prot_acc_user) != 0:
        acc_old = set()
        if ope(aa_prot_ncbi_file):
            parsed_fasta = read_fasta(aa_prot_ncbi_file, SEQ_TYPE_AA)
            acc_old = set([x.definition.split('|')[0] for x in parsed_fasta])

        if acc_old == set(prot_acc_user):
            return prot_acc_user
        else:

            pickle_file = opj(dir_cache_prj, 'ncbi_prot_metadata_cache__' + ss)
            if ope(pickle_file):
                with open(pickle_file, 'rb') as f:
                    # Metadata cache; loaded here but not used further in
                    # this function.
                    pa_info = pickle.load(f)

            print()
            Log.inf('Downloading protein sequences from NCBI:', ss)
            prot_recs = dnld_ncbi_seqs('protein',
                                       prot_acc_user,
                                       rettype='gb',
                                       retmode='xml')
            prot_acc_user_new = list()
            for rec in prot_recs:
                acc_ver = rec.accession_version
                defn = rec.definition
                organism = rec.organism

                prot_acc_user_new.append(acc_ver)

                defn_new = defn.split('[' + organism + ']')[0]
                defn_new = defn_new.lower().strip()
                defn_new = defn_new.replace(' ', '_').replace('-', '_')
                defn_new = defn_new.replace(',', '')
                defn_new = defn_new[0].upper() + defn_new[1:]

                defn_new = acc_ver + '|' + defn_new + '|' + organism
                defn_new = defn_new.replace(' ', '_').replace('-', '_')

                rec.definition = defn_new

            prot_acc_user = prot_acc_user_new
            write_fasta(prot_recs, aa_prot_ncbi_file)
    else:
        if ope(aa_prot_ncbi_file):
            osremove(aa_prot_ncbi_file)

    return prot_acc_user
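
The definition-line rewriting above can be read as a pure function: keep the description up to the '[Organism]' suffix, normalize its casing and separators, and join accession, description, and organism with '|'. A self-contained restatement of those steps (the example values are illustrative):

def normalize_defline(acc_ver, defn, organism):
    # Rebuild a GenBank definition as 'ACC|Short_description|Organism'.
    short = defn.split('[' + organism + ']')[0]
    short = short.lower().strip()
    short = short.replace(' ', '_').replace('-', '_')
    short = short.replace(',', '')
    short = short[0].upper() + short[1:]
    new_defn = acc_ver + '|' + short + '|' + organism
    return new_defn.replace(' ', '_').replace('-', '_')

# normalize_defline('NP_000001.1',
#                   'putative kinase [Arabidopsis thaliana]',
#                   'Arabidopsis thaliana')
# -> 'NP_000001.1|Putative_kinase|Arabidopsis_thaliana'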
Example 6
def filter_queries(ss,
                   aa_queries_file,
                   min_query_length,
                   max_query_length,
                   max_query_identity,
                   vsearch,
                   prot_acc_user,
                   overwrite,
                   logging=True):

    if logging is True:
        print()
        Log.inf('Filtering AA query sequences:', ss)
        Log.msg('min_query_length:', str(min_query_length))
        Log.msg('max_query_length:', str(max_query_length))
        Log.msg('max_query_identity:', str(max_query_identity))

    parsed_fasta_1 = filter_fasta_by_length(aa_queries_file, SEQ_TYPE_AA,
                                            min_query_length, max_query_length)
    tmp1 = aa_queries_file + '_temp1'
    tmp2 = aa_queries_file + '_temp2'
    # vsearch clusters nucleotide sequences, so reverse-translate the AA
    # queries (standard genetic code) before clustering.
    for rec in parsed_fasta_1:
        rec.seq.gc_code = 1
        rec.seq = rec.seq.untranslate()
    write_fasta(parsed_fasta_1, tmp1)
    run_cluster_fast(vsearch, max_query_identity, tmp1, tmp2)
    parsed_fasta_2 = read_fasta(tmp2, SEQ_TYPE_DNA, parse_def=True)
    prot_acc_user_new = list()
    # Translate the clustered representatives back to AA and keep only
    # the user accessions that survived dereplication.
    for rec in parsed_fasta_2:
        rec.seq.gc_code = 1
        rec.seq = rec.seq.translate()
        acc = rec.accession_version
        if acc in prot_acc_user:
            prot_acc_user_new.append(acc)

    if overwrite is True:
        write_fasta(parsed_fasta_2, aa_queries_file, prepend_acc=True)

    osremove(tmp1)
    osremove(tmp2)

    return prot_acc_user_new
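
A hypothetical invocation, to make the parameter roles concrete (all paths, thresholds, and accessions below are assumptions, not values taken from kakapo):

kept_accs = filter_queries(
    ss='ss1',
    aa_queries_file='queries.faa',
    min_query_length=50,
    max_query_length=5000,
    max_query_identity=0.90,
    vsearch='/usr/local/bin/vsearch',
    prot_acc_user=['NP_000001.1', 'XP_000002.1'],
    overwrite=False)
# kept_accs now holds the user accessions whose sequences survived the
# length filter and the identity-based dereplication.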
Example 7
def find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk, dir_temp,
                        prepend_assmbl, min_target_orf_len, max_target_orf_len,
                        allow_non_aug, allow_no_strt_cod, allow_no_stop_cod,
                        tax, tax_group, tax_ids_user, min_overlap, organelle):

    if len(assemblies) > 0:
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            # Exit with a non-zero status, since this is an error path.
            exit(1)

    for a in assemblies:

        if ('blast_hits_aa__' + ss) not in a:
            continue

        assmbl_name = a['name']
        tax_id = a['tax_id']

        parsed_hits = a['blast_hits_aa__' + ss]

        a_path = a['path']

        gc_tt = a['gc_tt']
        if tax.is_eukaryote(tax_id) is True:
            if organelle == 'mitochondrion':
                gc_tt = a['gc_tt_mito']
            if tax.contains_plastid(tax_id) is True:
                if organelle == 'plastid':
                    gc_tt = a['gc_tt_plastid']

        transcripts_nt_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_nt__' + ss + '.fasta')

        transcripts_nt_orf_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_nt_orf__' + ss + '.fasta')

        transcripts_aa_orf_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_aa_orf__' + ss + '.fasta')

        transcripts_nt = {}
        transcripts_nt_orf = {}
        transcripts_aa_orf = {}

        transcripts_with_acceptable_orfs = set()

        ann_key = 'annotations__'

        a[ann_key + ss] = {}

        collated = collate_blast_results(parsed_hits)

        ######################################################################
        # Use seqtk to sample the assembly FASTA file for sequences with
        # BLAST hits. This increases the speed substantially when the assembly
        # file is large.
        temp_a_file = opj(dir_temp, 'temp__' + ss + '.fasta')
        temp_s_file = opj(dir_temp, 'temp__' + ss + '.txt')
        sseqids_subsample = []
        for hit in collated:
            target_name = hit['sseqid']
            sseqids_subsample.append(target_name)
        sseqids_subsample_text = '\n'.join(sseqids_subsample)
        with open(temp_s_file, 'w') as f:
            f.write(sseqids_subsample_text)
        seqtk_extract_reads(seqtk,
                            in_file=a_path,
                            out_file=temp_a_file,
                            ids_file=temp_s_file)

        with open(temp_a_file, 'r') as f:
            subsampled_fasta_txt = f.read()

        if subsampled_fasta_txt.strip() == '':
            continue

        print()
        Log.inf('Analyzing BLAST hits', '=' * 113 + '\n')
        Log.msg('Assembly:', assmbl_name, False)
        Log.msg('Search Strategy:', ss + '\n\n' + '-' * 134 + '\n', False)

        parsed_fasta = trim_desc_to_first_space_in_fasta_text(
            subsampled_fasta_txt, SEQ_TYPE_DNA)
        parsed_fasta = seq_records_to_dict(parsed_fasta)
        ######################################################################

        all_kakapo_results = {}
        json_dump_file_path = opj(dir_prj_transcripts,
                                  assmbl_name + '_ann_kakapo__' + ss + '.json')

        for hit in collated:

            target_name = hit['sseqid']
            target_seq = parsed_fasta[target_name]
            query_name = hit['qseqid']
            hit_evalue = hit['evalue']

            # Prepend assembly name to the sequence name:
            if prepend_assmbl is True:
                target_name = assmbl_name + '__' + target_name
                # Also prepend taxonomic info to the sequence name:
                if tax_id is not None:
                    fm = tax.higher_rank_for_taxid(tax_id, rank='family')
                    if fm is not None:
                        target_name = fm + '__' + target_name

            hit_start = hit['start']
            hit_end = hit['end']
            hit_frame = hit['frame']

            if allow_non_aug is True:
                start_codons = gc_tt.start_codons_ambiguous
            else:
                start_codons = ['ATG']

            stop_codons = gc_tt.stop_codons_ambiguous

            ##################################################################
            if tax_id is not None:
                tax_ids_for_orf = (tax_id, )
            else:
                tax_ids_for_orf = tax_ids_user

            # 'atg_contexts' is defined at module level (not shown here);
            # its keys look like '<taxid>_L' / '<taxid>_R'. Pick the
            # context taxon closest to this sample's taxon.
            cntx_txids_avail = tuple(
                sorted(
                    set(
                        map(lambda x: int(x.split('_')[0]),
                            atg_contexts.keys()))))

            cntx_taxid = set()
            for txid in tax_ids_for_orf:
                tax_path = partial(tax.path_between_taxids, txid)
                path_len = tuple(
                    map(len, tuple(map(tax_path, cntx_txids_avail))))
                cntx_taxid.add(cntx_txids_avail[path_len.index(min(path_len))])
            cntx_taxid = tuple(cntx_taxid)[0]

            cntx_l_key = str(cntx_taxid) + '_L'
            cntx_r_key = str(cntx_taxid) + '_R'

            cntx_l = atg_contexts[cntx_l_key]
            cntx_r = atg_contexts[cntx_r_key]
            ##################################################################

            orf_log_str = ('grade'.rjust(5) + 'ovrlp'.rjust(7) +
                           'cntx'.rjust(6) + 'length'.center(9) +
                           'cntx_l'.rjust(7) + 'cntx_r'.rjust(15) + '\n')

            orf = find_orf_for_blast_hit(seq=target_seq,
                                         frame=hit_frame,
                                         hit_start=hit_start,
                                         hit_end=hit_end,
                                         stop_codons=stop_codons,
                                         start_codons=start_codons,
                                         context_l=cntx_l,
                                         context_r=cntx_r,
                                         min_overlap=min_overlap,
                                         min_len=min_target_orf_len,
                                         max_len=max_target_orf_len,
                                         allow_no_strt_cod=allow_no_strt_cod,
                                         allow_no_stop_cod=allow_no_stop_cod)

            orf_log_str += orf[2]

            rev_comp_def_str = ''
            if hit_frame > 0:
                ann_hit_b = hit_start
                ann_hit_e = hit_end
            else:
                target_seq = reverse_complement(target_seq)
                ann_hit_b = len(target_seq) - hit_start
                ann_hit_e = len(target_seq) - hit_end
                rev_comp_def_str = '; RevComp'

            target_def = target_name + ' ' + query_name + rev_comp_def_str

            a[ann_key + ss][target_name] = {}

            good_orfs = orf[0]
            bad_orfs = orf[1]

            if len(good_orfs) > 0:
                a[ann_key + ss][target_name]['orfs_good'] = dict()
                orfs_good_dict = a[ann_key + ss][target_name]['orfs_good']
                orf_log_str += '\n' + 'VALID ' + '-' * 128 + '\n'

                for i, good_orf in enumerate(good_orfs):

                    good_orf_frame = good_orf[2]

                    if good_orf_frame > 0:
                        ann_orf_b = good_orf[0]
                        ann_orf_e = good_orf[1] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]
                    else:
                        ann_orf_b = len(target_seq) - good_orf[1]
                        ann_orf_e = len(target_seq) - good_orf[0] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]

                    orf_good_dict = dict()
                    orf_good_dict['orf_begin'] = ann_orf_b
                    orf_good_dict['orf_end'] = ann_orf_e
                    orf_good_dict['orf_frame'] = abs(good_orf_frame)
                    orf_good_dict['orf_grade'] = good_orf[3]
                    orf_good_dict['orf_tt_id'] = str(gc_tt.gc_id)
                    orf_good_dict['orf_tt_name'] = gc_tt.gc_name

                    orfs_good_dict['ORF{:03d}'.format(i + 1)] = orf_good_dict

                    target_def_orf = (target_name +
                                      '__ORF{:03d}'.format(i + 1) + ' ' +
                                      query_name + rev_comp_def_str)

                    transcripts_nt_orf[target_def_orf] = orf_seq

                    transcripts_with_acceptable_orfs.add(target_name)

                    transl_seq = translate(orf_seq, gc_tt.table_ambiguous,
                                           start_codons)

                    transcripts_aa_orf[target_def_orf] = transl_seq[:-1]

            else:
                orf_log_str += '\n' + 'NOT VALID ' + '-' * 124 + '\n'

            Log.msg('Transcript:', target_name, False)
            Log.msg('     Query:', query_name + '\n\n' + orf_log_str, False)

            if len(bad_orfs) > 0:
                a[ann_key + ss][target_name]['orfs_bad'] = dict()
                orfs_bad_dict = a[ann_key + ss][target_name]['orfs_bad']

                for i, bad_orf in enumerate(bad_orfs):

                    bad_orf_frame = bad_orf[2]

                    if bad_orf_frame > 0:
                        ann_orf_b = bad_orf[0]
                        ann_orf_e = bad_orf[1] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]
                    else:
                        ann_orf_b = len(target_seq) - bad_orf[1]
                        ann_orf_e = len(target_seq) - bad_orf[0] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]

                    orf_bad_dict = dict()
                    orf_bad_dict['orf_begin'] = ann_orf_b
                    orf_bad_dict['orf_end'] = ann_orf_e
                    orf_bad_dict['orf_frame'] = abs(bad_orf_frame)
                    orf_bad_dict['orf_grade'] = bad_orf[3]
                    orf_bad_dict['orf_tt_id'] = str(gc_tt.gc_id)
                    orf_bad_dict['orf_tt_name'] = gc_tt.gc_name

                    orfs_bad_dict['ORF{:03d}'.format(i + 1)] = orf_bad_dict

            transcripts_nt[target_def] = target_seq

            a[ann_key + ss][target_name]['blast_hit'] = dict()
            blast_hit_dict = a[ann_key + ss][target_name]['blast_hit']
            blast_hit_dict['query_name'] = query_name
            blast_hit_dict['query_id'] = ss
            blast_hit_dict['evalue'] = hit_evalue
            blast_hit_dict['frame'] = abs(hit_frame)
            blast_hit_dict['blast_hit_begin'] = ann_hit_b
            blast_hit_dict['blast_hit_end'] = ann_hit_e

            # Collect ORF and BLAST hit annotations for downstream use. ######
            kakapo_json = [{}]
            kakapo_json[0]['kakapo_annotations__' + ss] = (
                a[ann_key + ss][target_name])
            all_kakapo_results[target_name] = kakapo_json
            ##################################################################

        # --------------------------------------------------------------------

        Log.msg('Assembly:', assmbl_name, False)
        Log.msg('Search Strategy:', ss, False)
        Log.msg('Transcripts:', str(len(transcripts_nt)), False)
        Log.msg('Transcripts with acceptable ORFs:',
                str(len(transcripts_with_acceptable_orfs)) + '\n' + '=' * 134,
                False)

        if len(transcripts_nt) > 0:
            write_fasta(transcripts_nt, transcripts_nt_fasta_file)
            a['transcripts_nt_fasta_file__' + ss] = transcripts_nt_fasta_file
        else:
            a['transcripts_nt_fasta_file__' + ss] = None

        if len(transcripts_nt_orf) > 0:
            write_fasta(transcripts_nt_orf, transcripts_nt_orf_fasta_file)
            a['transcripts_nt_orf_fasta_file__' +
              ss] = transcripts_nt_orf_fasta_file
        else:
            a['transcripts_nt_orf_fasta_file__' + ss] = None

        if len(transcripts_aa_orf) > 0:
            write_fasta(transcripts_aa_orf, transcripts_aa_orf_fasta_file)
            a['transcripts_aa_orf_fasta_file__' +
              ss] = transcripts_aa_orf_fasta_file
        else:
            a['transcripts_aa_orf_fasta_file__' + ss] = None

        # Save ORF and BLAST hit annotations for downstream use. -------------
        with open(json_dump_file_path, 'w') as f:
            json.dump(all_kakapo_results, f, sort_keys=True, indent=4)
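
The JSON dump written at the end maps each transcript name to a one-element list holding its annotations. A reconstruction of the shape of one entry, assembled from the dictionary assignments above (all values are illustrative, not real output):

import json

example_entry = {
    'transcript_1': [{
        'kakapo_annotations__ss1': {
            'blast_hit': {
                'query_name': 'query_A',
                'query_id': 'ss1',
                'evalue': 1e-30,
                'frame': 2,
                'blast_hit_begin': 12,
                'blast_hit_end': 240,
            },
            'orfs_good': {
                'ORF001': {
                    'orf_begin': 10,
                    'orf_end': 250,
                    'orf_frame': 2,
                    'orf_grade': 0.85,
                    'orf_tt_id': '1',
                    'orf_tt_name': 'Standard',
                },
            },
        },
    }],
}

print(json.dumps(example_entry, sort_keys=True, indent=4))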