def load_data(self):
    """Prepare the GFF record source, RBS categories and GFF column names.

    Sets ``self.gff_gen`` from ``parse_gff(self.gff_f)``,
    ``self.rbs_cat_d`` from ``load_rbs_category(self.rbs_cat_f)`` and
    ``self.gff_mat_colnames`` from ``GFF_PARSER_COLS``.
    """
    self.gff_gen = parse_gff(self.gff_f)
    self.rbs_cat_d = load_rbs_category(self.rbs_cat_f)

    # Skip the two leading parser columns (expected to be seqname and
    # seqlen), which should leave:
    #   ('orf_index', 'start', 'end', 'strand',
    #    'partial', 'start_type', 'gc_cont', 'rbs_motif')
    n_leading_cols = 2
    self.gff_mat_colnames = GFF_PARSER_COLS[n_leading_cols:]
Example #2
0
    def load_data(self):
        """Load all inputs used by this object into attributes.

        Attributes set:
            gff_gen: result of ``parse_gff(self.gff_f)``.
            df_tax_all: taxonomy table read from ``self.tax_f`` with the
                extra columns ``seqname`` and ``orf_index`` split out of
                ``orfname``.
            rbs_cat_d: result of ``load_rbs_category(self.rbs_cat_f)``.
            d_hallmark_hmm: parsed hallmark table, or ``None`` when
                ``self.hallmark_f`` is ``None``.
            name2loc_d: maps each original sequence name to the full
                record names and ``(start, end)`` locations found in
                ``self.full_and_part_seqfile``.
            model: classifier loaded from ``self.model_f`` with ``n_jobs``
                set to ``CLASSIFY_THREADS``.
            gff_mat_colnames: names of the GFF feature columns.
        """
        self.gff_gen = parse_gff(self.gff_f)

        df_tax_all = pd.read_csv(
            self.tax_f,
            sep='\t',
            header=None,
            names=['orfname', 'tax', 'hmm', 'score'],
        )
        try:
            # orfname is "<seqname>_<orf_index>"; split on the last "_"
            seqnames, indice = zip(
                *[orfname.rsplit('_', 1) for orfname in df_tax_all['orfname']])
        except ValueError:
            # unpacking fails when zip() does not yield exactly two tuples,
            # e.g. when the table has no rows
            seqnames = []
            indice = []

        df_tax_all['seqname'] = seqnames
        # convert to int to match orf_index in df_gff
        df_tax_all['orf_index'] = [int(i) for i in indice]
        self.df_tax_all = df_tax_all

        self.rbs_cat_d = load_rbs_category(self.rbs_cat_f)

        if self.hallmark_f is not None:
            self.d_hallmark_hmm = parse_hallmark_hmm(self.hallmark_f)
        else:
            self.d_hallmark_hmm = None

        self.name2loc_d = {}
        with screed.open(self.full_and_part_seqfile) as sp:
            for rec in sp:
                # header is "<name> <key:value||key:value...>"
                name, desc = rec.name.split(None, 1)
                # drop the suffix after the last "||" to get the
                # original sequence name
                seqname_ori = name.rsplit('||', 1)[0]
                _d = dict(i.split(':') for i in desc.split('||'))
                seq_len = len(rec.sequence)

                entry = self.name2loc_d.setdefault(
                    seqname_ori, {'seqname': [], 'loc': []})
                entry['seqname'].append(name)
                # fall back to the whole sequence when start/end are absent
                entry['loc'].append(
                    (int(_d.get('start', 0)), int(_d.get('end', seq_len))))

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            warnings.filterwarnings('ignore', category=FutureWarning)
            model = joblib.load(self.model_f)
            try:
                # pipeline that has a "gs" step exposing best_estimator_
                model.named_steps.gs.best_estimator_.set_params(
                    n_jobs=CLASSIFY_THREADS)
            except AttributeError:
                # otherwise configure the "rf" step directly
                model.named_steps.rf.set_params(n_jobs=CLASSIFY_THREADS)

            self.model = model

        self.gff_mat_colnames = ('orf_index', 'start', 'end', 'strand',
                                 'partial', 'start_type', 'gc_cont',
                                 'rbs_motif')
Example #3
0
    def load_data(self):
        """Load all inputs used by this object into attributes.

        Attributes set:
            gff_gen: result of ``parse_gff(self.gff_f)``.
            rbs_cat_d: result of ``load_rbs_category(self.rbs_cat_f)``.
            d_hallmark_hmm: parsed hallmark table, or ``None`` when
                ``self.hallmark_f`` is ``None``.
            df_fullseq_clf: table read from ``self.fullseq_clf_f`` without
                its ``decoy*`` columns; only set when that path is given.
            model: classifier loaded from ``self.model_f`` with ``n_jobs``
                set to ``CLASSIFY_THREADS``.
            gff_mat_colnames: ``GFF_PARSER_COLS`` minus its first two
                entries.
            hallmark_ftr_ind, arc_ind, bac_ind, euk_ind, vir_ind, mix_ind,
            unaligned_ind: positions of those features in
                ``SELECT_FEATURE_LIST``.
        """
        self.gff_gen = parse_gff(self.gff_f)

        self.rbs_cat_d = load_rbs_category(self.rbs_cat_f)

        if self.hallmark_f is not None:
            self.d_hallmark_hmm = parse_hallmark_hmm(self.hallmark_f)
        else:
            self.d_hallmark_hmm = None

        # NOTE(review): df_fullseq_clf is left unset when fullseq_clf_f is
        # None -- confirm that callers guard on fullseq_clf_f before use.
        if self.fullseq_clf_f is not None:
            df = pd.read_csv(self.fullseq_clf_f, sep='\t', header=0)
            # force seqname col to be str dtype in case seqname are
            #   numbers only
            df = df.astype({'seqname': 'str'})
            decoy_cols = [col for col in df.columns
                          if col.startswith('decoy')]
            self.df_fullseq_clf = df.drop(decoy_cols, axis=1)

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            warnings.filterwarnings('ignore', category=FutureWarning)
            model = joblib.load(self.model_f)
            try:
                # pipe on grid search; legacy
                model.named_steps.gs.best_estimator_.set_params(
                    n_jobs=CLASSIFY_THREADS)
            except AttributeError:
                # grid search on pipe; new
                # steps = [('scaler', MinMaxScaler()), ('rf', clf_to_train)]
                model.named_steps.rf.set_params(n_jobs=CLASSIFY_THREADS)

            self.model = model

        self.gff_mat_colnames = GFF_PARSER_COLS[2:]
        self.hallmark_ftr_ind = SELECT_FEATURE_LIST.index('hallmark')
        self.arc_ind = SELECT_FEATURE_LIST.index('arc')
        self.bac_ind = SELECT_FEATURE_LIST.index('bac')
        self.euk_ind = SELECT_FEATURE_LIST.index('euk')
        self.vir_ind = SELECT_FEATURE_LIST.index('vir')
        self.mix_ind = SELECT_FEATURE_LIST.index('mix')
        self.unaligned_ind = SELECT_FEATURE_LIST.index('unaligned')
def main(seqfile, outfile, gff_list_str, tax_list_str, group_list_str,
        affi_contigs_file, pfamtax_list_str):
    '''Make gene annotation table and affi-contigs.tab file for DRAM-v

    \b
    Example:
        python make-affi-contigs-tabfile.py
                --pfamtax-list
                    "A/<all.pdg.hmm.pfamtax>,B/<all.pdg.hmm.pfamtax>"
                <viral-combined.fa>
                <viral-gene-annotation.tsv>
                <affi-contigs.tab>
                "A/<all.pdg.gff>,B/<all.pdg.gff>"
                "A/<all.pdg.hmm.tax>,B/<all.pdg.hmm.tax>"
                "A,B"


        \b
        <viral-combined.fa>: viral contigs with less than
                two genes and hallmark gene
        <viral-gene-annotation.tsv>: viral gene annotation table
        <affi-contigs.tab>: output affi-contigs.tab file for DRAM-v
        <all.pdg.gff>: gff file from prodigal
        <all.pdg.hmm.tax>: table with best hit of each gene, bit score, 
                and taxonomy using customized viral hmm db
        <all.pdg.hmm.pfamtax>: table with best hit of each gene, bit score, 
                and taxonomy using pfam viral hmm
        A: viral group name


    '''

    affi_f = affi_contigs_file
    use_pfamtax = pfamtax_list_str is not None

    # Pass 1 over the FASTA headers: collect, per group, the sequence names
    # (suffix after the last "||" removed) that should be annotated.
    d_group2name = {}
    with screed.open(seqfile) as sp:
        for rec in sp:
            name, desc = rec.name.split(None, 1)
            # remove suffix, ty in ['full', '*_partial', 'lt2gene'];
            # the suffix itself is not needed because the gene category
            # assigned below does not depend on provirus status
            name, _ = name.rsplit('||', 1)
            _d = dict(i.split(':') for i in desc.split('||'))
            d_group2name.setdefault(_d['group'], set()).add(name)

    gff_fs = [f.strip() for f in gff_list_str.split(',')]
    tax_fs = [f.strip() for f in tax_list_str.split(',')]
    if use_pfamtax:
        pfamtax_fs = [f.strip() for f in pfamtax_list_str.split(',')]
    groups = [group.strip() for group in group_list_str.split(',')]

    # Build one annotation row per gene of every selected sequence.
    gene_anno_lis = []
    orf_index_ind = GFF_PARSER_COLS.index('orf_index')
    seqname_ind = GFF_PARSER_COLS.index('seqname')
    for file_ind, (gff_f, tax_f, group) in enumerate(
            zip(gff_fs, tax_fs, groups)):
        gen_gff = parse_gff(gff_f)

        if use_pfamtax:
            pfamtax_f = pfamtax_fs[file_ind]

        name_st = d_group2name.get(group, set())
        if not name_st:
            continue

        prev_seqname = None
        for gff_row in gen_gff:
            seqname = gff_row[0]
            seqname_ori = seqname.rsplit('||', 1)[0]
            if seqname_ori not in name_st:
                continue
            # reload the per-contig tax tables only when the contig changes
            if seqname != prev_seqname:
                df_tax_sel = df_tax_per_config(tax_f, seqname, taxwhm=True)
                if use_pfamtax:
                    df_pfamtax_sel = df_tax_per_config(pfamtax_f, seqname)

            orf_index = gff_row[orf_index_ind]
            sel = (df_tax_sel['orf_index'] == orf_index)
            ser = df_tax_sel.loc[sel, :].squeeze()
            if len(ser) == 0:
                # gene has no hit in the tax table
                tax = 'unaligned'
                hmm = 'NA'
                score = np.nan
                hallmark = 0
            else:
                tax = ser.loc['tax']
                hmm = ser.loc['hmm']
                score = ser.loc['score']
                hallmark = int(ser.loc['hallmark'])

            # pfamtax; both pfam fields must always be defined because
            # they are appended to every row below
            pfamhmm = 'NA'
            pfamscore = np.nan
            if use_pfamtax:
                sel = (df_pfamtax_sel['orf_index'] == orf_index)
                ser = df_pfamtax_sel.loc[sel, :].squeeze()
                if len(ser) != 0:
                    pfamhmm = ser.loc['hmm']
                    pfamscore = ser.loc['score']

            # category: 0 = hallmark gene, 1 = viral hit, 2 = anything else
            is_hallmark = 0
            if hallmark == 1:
                cat = 0
                is_hallmark = 1
            elif tax == 'vir':
                cat = 1
            else:
                cat = 2

            anno_row = list(gff_row)
            anno_row[seqname_ind] = seqname_ori
            anno_row.extend(
                [hmm, score, pfamhmm, pfamscore, tax, is_hallmark, cat, group]
            )
            gene_anno_lis.append(anno_row)
            prev_seqname = seqname

    df_anno = pd.DataFrame(gene_anno_lis,
            columns=(GFF_PARSER_COLS + GENE_ANNO_COLS))

    # Pass 2 over the FASTA records: select the genes of each record and
    # write them to the affi-contigs file, one header line per record
    # followed by one line per gene.
    df_lis = []
    with screed.open(seqfile) as sp, open(affi_f, 'w') as fw:
        for rec in sp:
            name, desc = rec.name.split(None, 1)
            # remove full, _provirus suffix
            seqname, _ = name.rsplit('||', 1)

            d_desc = dict(i.split(':') for i in desc.split('||'))
            shape = d_desc['shape']
            start_ind = d_desc['start_ind']
            end_ind = d_desc['end_ind']
            group = d_desc['group']

            _sel = ((df_anno['group'] == group) &
                    (df_anno['seqname'] == seqname))

            _df = df_anno.loc[_sel, :]  # genes for only one seqname
            if start_ind == 'nan' or end_ind == 'nan':
                df_oneseq = _df
            else:
                # keep only the genes inside the recorded gene index range
                _sel2 = ((_df['orf_index'] >= int(start_ind)) &
                            (_df['orf_index'] <= int(end_ind)))
                df_oneseq = _df.loc[_sel2, :]

            df_oneseq = df_oneseq.copy()
            df_oneseq['seqname_final'] = name
            df_lis.append(df_oneseq)

            # within contigs-affi.tsv
            # no | allowed in name (the string after >) for DRAM-v
            # no | allowed in gene_name for DRAM-v
            name = name.replace('|', '_')
            gene_nb = len(df_oneseq)
            shape_simple = 'c' if shape == 'circular' else 'l'

            fw.write(f'>{name}|{gene_nb}|{shape_simple}\n')
            for row_ind in range(gene_nb):
                ser = df_oneseq.iloc[row_ind]
                orf_ind = ser.loc['orf_index']
                fields = [f'{name}__{orf_ind}']
                # skip gene_name
                for col in AFFI_CONTIG_COLS[1:]:
                    try:
                        val = ser.loc[col]
                    except KeyError:
                        val = np.nan

                    # pd.isna catches every NaN; a set-membership test
                    # would only match the np.nan object itself
                    if (isinstance(val, str) and val == 'NA') or pd.isna(val):
                        val = '-'

                    fields.append(str(val))

                gene_row_str = '|'.join(fields)
                fw.write(f'{gene_row_str}\n')

    df_merge = pd.concat(df_lis)
    df_merge.to_csv(outfile, sep='\t', index=False, na_rep='nan')
def main():
    '''Add extra description fields to lt2gene fasta headers

    Reads the arguments from sys.argv and writes the updated fasta
    records to stdout.

    Example:
        python add-extra-to-lt2gene-fasta-header.py
                <viral-lt2gene-w-hallmark.fa>
                "A/<all.pdg.gff>,B/<all.pdg.gff>"
                "A/<all.pdg.hmm.tax>,B/<all.pdg.hmm.tax>"
                "A,B"


        <viral-lt2gene-w-hallmark.fa>: viral contigs with less than
                two genes and hallmark gene
        <all.pdg.gff>: gff file from prodigal
        <all.pdg.hmm.tax>: table with best hit of each gene, bit score,
                and taxonomy
        A: viral group name

    Raises:
        ValueError: if a contig listed in the fasta file has no gene in
            the gff file of its group.

    '''
    if len(sys.argv) != 5:
        mes = ('python {} viral-lt2gene-w-hallmark.fa '
                '"A/<all.pdg.gff>,B/<all.pdg.gff>" '
                '"A/<all.pdg.hmm.tax>,B/<all.pdg.hmm.tax>" '
                '"A,B"\n')
        sys.stderr.write(mes.format(os.path.basename(sys.argv[0])))
        sys.exit(1)

    seqfile = sys.argv[1]
    gff_fs = [f.strip() for f in sys.argv[2].split(',')]
    tax_fs = [f.strip() for f in sys.argv[3].split(',')]
    groups = [group.strip() for group in sys.argv[4].split(',')]

    # group name -> names of the contigs assigned to that group
    d_group2name = {}
    with screed.open(seqfile) as sp:
        for rec in sp:
            name, desc = rec.name.split(None, 1)
            _d = dict(i.split(':') for i in desc.split('||'))
            d_group2name.setdefault(_d['group'], set()).add(name)

    # feature positions do not change between contigs; look them up once
    vir_ind = TAX_FEATURE_LIST.index('vir')
    arc_ind = TAX_FEATURE_LIST.index('arc')
    bac_ind = TAX_FEATURE_LIST.index('bac')
    euk_ind = TAX_FEATURE_LIST.index('euk')

    d_name2info = {}
    for gff_f, tax_f, group in zip(gff_fs, tax_fs, groups):
        name_st = d_group2name.get(group, set())

        d_name2cnt = Counter()   # contig -> number of genes in the gff
        d_ori2seqname = {}       # contig -> first gff seqname seen for it
        for gff_row in parse_gff(gff_f):
            seqname = gff_row[0]
            seqname_ori = seqname.rsplit('||', 1)[0]
            if seqname_ori not in name_st:
                continue
            d_name2cnt[seqname_ori] += 1
            d_ori2seqname.setdefault(seqname_ori, seqname)

        for seqname_ori in name_st:
            total_gene_cnt = d_name2cnt[seqname_ori]
            try:
                seqname = d_ori2seqname[seqname_ori]
            except KeyError:
                raise ValueError(
                    f'{seqname_ori} (group {group}) has no gene '
                    f'in {gff_f}') from None
            df_tax_sel = df_tax_per_config(tax_f, seqname)
            sel_index_w_hallmark = [] # donot need hallmark cnt here
            l_tax = extract_feature_tax(df_tax_sel,
                    sel_index_w_hallmark=sel_index_w_hallmark,
                    total_gene_cnt=total_gene_cnt)

            viral = l_tax[vir_ind]
            cellular = l_tax[arc_ind] + l_tax[bac_ind] + l_tax[euk_ind]
            # (start_ind, end_ind, viral, cellular); the gene index range
            # covers every gene of the contig
            d_name2info[seqname_ori] = 1, total_gene_cnt, viral, cellular

    # Rewrite every record with the extended description.
    with screed.open(seqfile) as sp:
        for rec in sp:
            name, desc = rec.name.split(None, 1)
            d_desc = dict(i.split(':') for i in desc.split('||'))
            start_ind, end_ind, viral, cellular = d_name2info[name]
            seq = rec.sequence
            d_desc['start'] = 1
            d_desc['end'] = len(seq)
            d_desc['start_ind'] = start_ind
            d_desc['end_ind'] = end_ind
            d_desc['viral'] = viral
            d_desc['cellular'] = cellular
            d_desc['score'] = np.nan
            d_desc['hallmark'] = int(d_desc['hallmark'])

            desc = FASTA_DESC_FORMAT_TEMPLATE.format(**d_desc)
            sys.stdout.write(f'>{name}  {desc}\n{seq}\n')
 def load_data(self):
     """Prepare the GFF record source, RBS categories and GFF column names.

     Sets ``self.gff_gen`` from ``parse_gff(self.gff_f)``,
     ``self.rbs_cat_d`` from ``load_rbs_category(self.rbs_cat_f)`` and
     ``self.gff_mat_colnames`` to a fixed tuple of column names.
     """
     self.gff_gen = parse_gff(self.gff_f)
     self.rbs_cat_d = load_rbs_category(self.rbs_cat_f)

     # names of the per-gene columns, in order
     gff_columns = (
         'orf_index',
         'start',
         'end',
         'strand',
         'partial',
         'start_type',
         'gc_cont',
         'rbs_motif',
     )
     self.gff_mat_colnames = gff_columns