def load_data(self):
    self.gff_gen = parse_gff(self.gff_f)
    self.rbs_cat_d = load_rbs_category(self.rbs_cat_f)
    # exclude: seqname, seqlen
    # ('orf_index', 'start', 'end', 'strand',
    #  'partial', 'start_type', 'gc_cont', 'rbs_motif')
    self.gff_mat_colnames = GFF_PARSER_COLS[2:]
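# Illustrative sketch (standalone, not part of the pipeline) of what the slice
# above yields, assuming GFF_PARSER_COLS starts with the per-contig fields
# 'seqname' and 'seqlen' followed by the per-ORF fields; the list below is a
# local copy for demonstration, mirroring the hard-coded tuple used by the
# other load_data variants elsewhere in this codebase.
_cols = ['seqname', 'seqlen', 'orf_index', 'start', 'end', 'strand',
         'partial', 'start_type', 'gc_cont', 'rbs_motif']
assert _cols[2:] == ['orf_index', 'start', 'end', 'strand',
                     'partial', 'start_type', 'gc_cont', 'rbs_motif']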
def load_data(self):
    self.gff_gen = parse_gff(self.gff_f)

    df_tax_all = pd.read_csv(self.tax_f, sep='\t', header=None,
                             names=['orfname', 'tax', 'hmm', 'score'])
    try:
        seqnames, indice = zip(
            *[orfname.rsplit('_', 1) for orfname in df_tax_all['orfname']])
    except ValueError as e:
        # empty tax table (no ORF had a hit)
        seqnames = []
        indice = []
    df_tax_all['seqname'] = seqnames
    # convert to int to match orf_index in df_gff
    indice = [int(i) for i in indice]
    df_tax_all['orf_index'] = indice
    self.df_tax_all = df_tax_all

    self.rbs_cat_d = load_rbs_category(self.rbs_cat_f)

    if self.hallmark_f is not None:
        self.d_hallmark_hmm = parse_hallmark_hmm(self.hallmark_f)
    else:
        self.d_hallmark_hmm = None

    self.name2loc_d = {}
    with screed.open(self.full_and_part_seqfile) as sp:
        for rec in sp:
            header = rec.name
            name, desc = header.split(None, 1)
            seqname_ori = name.rsplit('||', 1)[0]
            _d = dict(i.split(':') for i in desc.split('||'))
            seq_len = len(rec.sequence)
            self.name2loc_d.setdefault(seqname_ori, {})
            self.name2loc_d[seqname_ori].setdefault('seqname', [])
            self.name2loc_d[seqname_ori].setdefault('loc', [])
            self.name2loc_d[seqname_ori]['seqname'].append(name)
            self.name2loc_d[seqname_ori]['loc'].append(
                (int(_d.get('start', 0)), int(_d.get('end', seq_len))))

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        warnings.filterwarnings('ignore', category=FutureWarning)
        model = joblib.load(self.model_f)
    try:
        # pipe on grid search; legacy
        model.named_steps.gs.best_estimator_.set_params(
            n_jobs=CLASSIFY_THREADS)
    except AttributeError as e:
        # grid search on pipe; new
        model.named_steps.rf.set_params(n_jobs=CLASSIFY_THREADS)
    self.model = model

    self.gff_mat_colnames = ('orf_index', 'start', 'end', 'strand',
                             'partial', 'start_type', 'gc_cont', 'rbs_motif')
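# Illustrative helper (hypothetical, not used by the pipeline) showing the
# orfname convention that the rsplit above assumes: prodigal-style ORF names
# end in '_<orf_index>', so rsplit('_', 1) recovers the contig name plus an
# integer index that can be matched against orf_index in the gff table.
def split_orfname(orfname):
    '''Split '<seqname>_<orf_index>' into (seqname, int(orf_index)).'''
    seqname, ind = orfname.rsplit('_', 1)
    return seqname, int(ind)

assert split_orfname('contig_1_5') == ('contig_1', 5)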
def load_data(self):
    self.gff_gen = parse_gff(self.gff_f)
    self.rbs_cat_d = load_rbs_category(self.rbs_cat_f)

    if self.hallmark_f is not None:
        self.d_hallmark_hmm = parse_hallmark_hmm(self.hallmark_f)
    else:
        self.d_hallmark_hmm = None

    if self.fullseq_clf_f is not None:
        df = pd.read_csv(self.fullseq_clf_f, sep='\t', header=0)
        # force seqname col to be str dtype in case seqnames are
        # numbers only
        df = df.astype({'seqname': 'str'})
        decoy_lis = [i for i in df.columns if i.startswith('decoy')]
        df = df.drop(decoy_lis, axis=1)
        self.df_fullseq_clf = df

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        warnings.filterwarnings('ignore', category=FutureWarning)
        model = joblib.load(self.model_f)
    try:
        # pipe on grid search; legacy
        model.named_steps.gs.best_estimator_.set_params(
            n_jobs=CLASSIFY_THREADS)
    except AttributeError as e:
        # grid search on pipe; new
        # steps = [('scaler', MinMaxScaler()), ('rf', clf_to_train)]
        model.named_steps.rf.set_params(n_jobs=CLASSIFY_THREADS)
    self.model = model

    # exclude seqname, seqlen
    self.gff_mat_colnames = GFF_PARSER_COLS[2:]

    self.hallmark_ftr_ind = SELECT_FEATURE_LIST.index('hallmark')
    self.arc_ind = SELECT_FEATURE_LIST.index('arc')
    self.bac_ind = SELECT_FEATURE_LIST.index('bac')
    self.euk_ind = SELECT_FEATURE_LIST.index('euk')
    self.vir_ind = SELECT_FEATURE_LIST.index('vir')
    self.mix_ind = SELECT_FEATURE_LIST.index('mix')
    self.unaligned_ind = SELECT_FEATURE_LIST.index('unaligned')
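# Standalone sketch of the decoy-column handling above, assuming the full-seq
# classifier table has one score column per viral group plus 'decoy*' columns;
# the column names and values below are made up for illustration.
import pandas as pd

_df = pd.DataFrame({
    'seqname': [1, 2],  # numeric-looking names, hence the str cast
    'dsDNAphage': [0.9, 0.1],
    'decoy_dsDNAphage': [0.2, 0.3],
})
_df = _df.astype({'seqname': 'str'})
_decoy_lis = [i for i in _df.columns if i.startswith('decoy')]
_df = _df.drop(_decoy_lis, axis=1)
assert list(_df.columns) == ['seqname', 'dsDNAphage']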
def main(seqfile, outfile, gff_list_str, tax_list_str, group_list_str,
         affi_contigs_file, pfamtax_list_str):
    '''Make affi-contigs.tab file for DRAM-v

    \b
    Example:
        python make-affi-contigs-tabfile.py
            --pfamtax-list "A/<all.pdg.hmm.pfamtax>,B/<all.pdg.hmm.pfamtax>"
            <viral-combined.fa> <viral-gene-annotation.tsv> <affi-contigs.tab>
            "A/<all.pdg.gff>,B/<all.pdg.gff>"
            "A/<all.pdg.hmm.tax>,B/<all.pdg.hmm.tax>"
            "A,B"

    \b
    <viral-combined.fa>: viral contigs with less than two genes and
        hallmark gene
    <viral-gene-annotation.tsv>: viral gene annotation table
    <affi-contigs.tab>: output affi-contigs.tab file for DRAM-v
    <all.pdg.gff>: gff file from prodigal
    <all.pdg.hmm.tax>: table with best hit of each gene, bit score, and
        taxonomy using customized viral hmm db
    <all.pdg.hmm.pfamtax>: table with best hit of each gene, bit score, and
        taxonomy using pfam viral hmm
    A: viral group name
    '''
    gff_fs = gff_list_str
    tax_fs = tax_list_str
    groups = group_list_str
    pfamtax_fs = pfamtax_list_str
    affi_f = affi_contigs_file

    d_group2name = {}
    d_name2provirus = {}
    with screed.open(seqfile) as sp:
        for rec in sp:
            header = rec.name
            name, desc = header.split(None, 1)
            # remove suffix, ty in ['full', '*_partial', 'lt2gene']
            name, ty = name.rsplit('||', 1)
            provirus = False
            if ty.endswith('partial'):
                provirus = True
            d_name2provirus[name] = provirus
            _d = dict(i.split(':') for i in desc.split('||'))
            best_group = _d['group']
            st = d_group2name.setdefault(best_group, set())
            st.add(name)

    gff_fs = [f.strip() for f in gff_fs.split(',')]
    tax_fs = [f.strip() for f in tax_fs.split(',')]
    if pfamtax_fs is not None:
        pfamtax_fs = [f.strip() for f in pfamtax_fs.split(',')]
    groups = [group.strip() for group in groups.split(',')]

    gene_anno_lis = []
    orf_index_ind = GFF_PARSER_COLS.index('orf_index')
    seqname_ind = GFF_PARSER_COLS.index('seqname')
    for i, l in enumerate(zip(gff_fs, tax_fs, groups)):
        gff_f, tax_f, group = l
        gen_gff = parse_gff(gff_f)
        if pfamtax_list_str is not None:
            pfamtax_f = pfamtax_fs[i]
        name_st = d_group2name.get(group, set())
        if len(name_st) == 0:
            continue
        prev_seqname = None
        for l in gen_gff:
            seqname = l[0]
            seqname_ori = seqname.rsplit('||', 1)[0]
            if seqname_ori not in name_st:
                continue
            if seqname != prev_seqname:
                df_tax_sel = df_tax_per_config(tax_f, seqname, taxwhm=True)
                if pfamtax_list_str is not None:
                    df_pfamtax_sel = df_tax_per_config(pfamtax_f, seqname)

            orf_index = l[orf_index_ind]
            sel = (df_tax_sel['orf_index'] == orf_index)
            ser = df_tax_sel.loc[sel, :].squeeze()
            if len(ser) == 0:
                tax = 'unaligned'
                hmm = 'NA'
                score = np.nan
                hallmark = 0
            else:
                tax = ser.loc['tax']
                hmm = ser.loc['hmm']
                score = ser.loc['score']
                hallmark = int(ser.loc['hallmark'])

            # pfamtax
            if pfamtax_list_str is not None:
                sel = (df_pfamtax_sel['orf_index'] == orf_index)
                ser = df_pfamtax_sel.loc[sel, :].squeeze()
                if len(ser) == 0:
                    pfamtax = 'unaligned'
                    pfamhmm = 'NA'
                    pfamscore = np.nan
                else:
                    pfamtax = ser.loc['tax']
                    pfamhmm = ser.loc['hmm']
                    pfamscore = ser.loc['score']
            else:
                pfamhmm = 'NA'
                # no pfam annotation available
                pfamscore = np.nan

            provirus = d_name2provirus[seqname_ori]
            is_hallmark = 0
            # provirus and non-provirus genes currently get the same
            # category scheme
            if not provirus:
                if hallmark == 1:
                    cat = 0
                    is_hallmark = 1
                elif tax == 'vir':
                    cat = 1
                else:
                    cat = 2
            else:
                if hallmark == 1:
                    cat = 0
                    is_hallmark = 1
                elif tax == 'vir':
                    cat = 1
                else:
                    cat = 2

            _l = list(l)
            _l[seqname_ind] = seqname_ori
            bits = score
            _l.extend(
                [hmm, bits, pfamhmm, pfamscore, tax, is_hallmark, cat, group]
            )
            gene_anno_lis.append(_l)
            prev_seqname = seqname

    df_anno = pd.DataFrame(gene_anno_lis,
                           columns=(GFF_PARSER_COLS + GENE_ANNO_COLS))

    df_lis = []
    with screed.open(seqfile) as sp, open(affi_f, 'w') as fw:
        for rec in sp:
            header = rec.name
            name, desc = header.split(None, 1)
            # remove suffix ('full', '*_partial', 'lt2gene')
            seqname, ty = name.rsplit('||', 1)
            d_desc = dict(i.split(':') for i in desc.split('||'))
            shape = d_desc['shape']
            start_ind = d_desc['start_ind']
            end_ind = d_desc['end_ind']
            group = d_desc['group']
            _sel = ((df_anno['group'] == group)
                    & (df_anno['seqname'] == seqname))
            _df = df_anno.loc[_sel, :]
            # genes for only one seqname
            if start_ind == 'nan' or end_ind == 'nan':
                df_oneseq = _df
            else:
                _sel2 = ((_df['orf_index'] >= int(start_ind))
                         & (_df['orf_index'] <= int(end_ind)))
                df_oneseq = _df.loc[_sel2, :]

            df_oneseq = df_oneseq.copy()
            df_oneseq['seqname_final'] = name
            df_lis.append(df_oneseq)

            # within contigs-affi.tsv:
            # no | allowed in name (the string after >) for DRAM-v
            # no | allowed in gene_name for DRAM-v
            name = name.replace('|', '_')
            gene_nb = len(df_oneseq)
            shape_simple = 'c' if shape == 'circular' else 'l'
            fw.write(f'>{name}|{gene_nb}|{shape_simple}\n')
            for i in range(len(df_oneseq)):
                ser = df_oneseq.iloc[i]
                orf_ind = ser.loc['orf_index']
                gene_name = f'{name}__{orf_ind}'
                l = [gene_name]
                # skip gene_name
                for col in AFFI_CONTIG_COLS[1:]:
                    try:
                        j = ser.loc[col]
                    except KeyError as e:
                        j = np.nan
                    if j in set(['NA', np.nan]):
                        j = '-'
                    l.append(str(j))
                gene_row_str = '|'.join(l)
                fw.write(f'{gene_row_str}\n')

    df_merge = pd.concat(df_lis)
    df_merge.to_csv(outfile, sep='\t', index=False, na_rep='nan')
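# Minimal sketch of the affi-contigs.tab record layout written above for
# DRAM-v: a '>'-prefixed contig header carrying gene count and shape
# ('c' for circular, 'l' for linear), followed by one '|'-joined row per gene
# with '-' standing in for missing values. The real per-gene fields and their
# order come from AFFI_CONTIG_COLS (not shown here); the contig/gene names and
# field values below are placeholders.
_gene_rows = [
    ['seqA__1', 'ViralHMM1', '50.1', '-', '-', 'vir', '1', '0'],
    ['seqA__2', '-', '-', '-', '-', 'unaligned', '0', '2'],
]
_lines = ['>seqA|{}|{}'.format(len(_gene_rows), 'l')]
_lines.extend('|'.join(row) for row in _gene_rows)
print('\n'.join(_lines))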
def main():
    '''Add extra info to fasta headers of lt2gene viral contigs

    Example:
        python add-extra-to-lt2gene-fasta-header.py \
            <viral-lt2gene-w-hallmark.fa> \
            "A/<all.pdg.gff>,B/<all.pdg.gff>" \
            "A/<all.pdg.hmm.tax>,B/<all.pdg.hmm.tax>" \
            "A,B"

    <viral-lt2gene-w-hallmark.fa>: viral contigs with less than \
        two genes and hallmark gene
    <all.pdg.gff>: gff file from prodigal
    <all.pdg.hmm.tax>: table with best hit of each gene, bit score,
        and taxonomy
    A: viral group name
    '''
    if len(sys.argv) != 5:
        mes = ('python {} viral-lt2gene-w-hallmark.fa '
               '"A/<all.pdg.gff>,B/<all.pdg.gff>" '
               '"A/<all.pdg.hmm.tax>,B/<all.pdg.hmm.tax>" '
               '"A,B"\n')
        sys.stderr.write(mes.format(os.path.basename(sys.argv[0])))
        sys.exit(1)

    seqfile = sys.argv[1]
    gff_fs = sys.argv[2]
    tax_fs = sys.argv[3]
    groups = sys.argv[4]

    d_group2name = {}
    with screed.open(seqfile) as sp:
        for rec in sp:
            header = rec.name
            name, desc = header.split(None, 1)
            _d = dict(i.split(':') for i in desc.split('||'))
            group = _d['group']
            st = d_group2name.setdefault(group, set())
            st.add(name)

    gff_fs = [f.strip() for f in gff_fs.split(',')]
    tax_fs = [f.strip() for f in tax_fs.split(',')]
    groups = [group.strip() for group in groups.split(',')]

    d_name2info = {}
    for gff_f, tax_f, group in zip(gff_fs, tax_fs, groups):
        name_st = d_group2name.get(group, set())
        gen_gff = parse_gff(gff_f)
        seqname_lis = []
        seqname_ori_lis = []
        for l in gen_gff:
            seqname = l[0]
            seqname_ori = seqname.rsplit('||', 1)[0]
            if seqname_ori not in name_st:
                continue
            seqname_lis.append(seqname)
            seqname_ori_lis.append(seqname_ori)

        d_name2cnt = Counter(seqname_ori_lis)
        for seqname_ori in name_st:
            total_gene_cnt = d_name2cnt[seqname_ori]
            ind = seqname_ori_lis.index(seqname_ori)
            seqname = seqname_lis[ind]
            df_tax_sel = df_tax_per_config(tax_f, seqname)
            sel_index_w_hallmark = []  # do not need hallmark cnt here
            l_tax = extract_feature_tax(
                df_tax_sel,
                sel_index_w_hallmark=sel_index_w_hallmark,
                total_gene_cnt=total_gene_cnt)
            vir_ind = TAX_FEATURE_LIST.index('vir')
            arc_ind = TAX_FEATURE_LIST.index('arc')
            bac_ind = TAX_FEATURE_LIST.index('bac')
            euk_ind = TAX_FEATURE_LIST.index('euk')
            viral = l_tax[vir_ind]
            cellular = l_tax[arc_ind] + l_tax[bac_ind] + l_tax[euk_ind]
            d_name2info[seqname_ori] = 1, total_gene_cnt, viral, cellular

    with screed.open(seqfile) as sp:
        for rec in sp:
            header = rec.name
            name, desc = header.split(None, 1)
            d_desc = dict(i.split(':') for i in desc.split('||'))
            start_ind, end_ind, viral, cellular = d_name2info[name]
            seq = rec.sequence
            length = len(seq)
            d_desc['start'] = 1
            d_desc['end'] = length
            d_desc['start_ind'] = start_ind
            d_desc['end_ind'] = end_ind
            d_desc['viral'] = viral
            d_desc['cellular'] = cellular
            d_desc['score'] = np.nan
            d_desc['hallmark'] = int(d_desc['hallmark'])
            desc = FASTA_DESC_FORMAT_TEMPLATE.format(**d_desc)
            mes = f'>{name} {desc}\n{seq}\n'
            sys.stdout.write(mes)
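# Standalone sketch of the '||'-separated 'key:value' fasta header description
# that this script parses and rewrites; the exact key set and order live in
# FASTA_DESC_FORMAT_TEMPLATE (not shown here), so only keys touched in main()
# are used and the values are made up.
_d_desc = {
    'shape': 'linear', 'start': 1, 'end': 1500, 'start_ind': 1, 'end_ind': 2,
    'viral': 1, 'cellular': 0, 'group': 'A', 'score': float('nan'),
    'hallmark': 1,
}
_desc = '||'.join(f'{k}:{v}' for k, v in _d_desc.items())
# round-trip the way the readers in this codebase do it
_parsed = dict(i.split(':') for i in _desc.split('||'))
assert _parsed['group'] == 'A'
print(f'>contigA||lt2gene {_desc}')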
def load_data(self):
    self.gff_gen = parse_gff(self.gff_f)
    self.rbs_cat_d = load_rbs_category(self.rbs_cat_f)
    self.gff_mat_colnames = ('orf_index', 'start', 'end', 'strand',
                             'partial', 'start_type', 'gc_cont', 'rbs_motif')