def convert_axt(in_file, out_file):
    """Dump the sequences of a FASTA file into a headed plain-text file.

    The output consists of one header line containing ``out_file`` itself,
    followed by every sequence returned by ``kang.Fasta2dic(in_file)``, one
    per line, in the iteration order of that mapping.

    Args:
        in_file: Path of the FASTA file to read.
        out_file: Path of the file to write (created or overwritten).
    """
    # The context manager guarantees the handle is closed even when parsing
    # the input or writing a line raises.
    with open(out_file, 'w') as handle:
        sequences = kang.Fasta2dic(in_file)
        print(out_file, file=handle)
        for header in sequences:
            print(sequences[header], file=handle)
#!/usr/bin/python from __future__ import print_function import subprocess import numpy as np import pandas as pd import sys from tqdm import tqdm sys.path.append('/mnt/c/ubuntu.download/pipelines/') import pysam import kang import math file_bam = sys.argv[1] #'intron3000.merge.sorted.bam' file_fa = sys.argv[2] #'Creinhardtii_281_v5.0.fa' file_pk = sys.argv[3] dicHD2seq = kang.Fasta2dic(file_fa) def get_block(array, depth_cut=0): lim_len_block = 100 # size of read fragment #depth_cut = 0 # ... 10. ... .. .. .. block_list = [] #print(len(np.shape(array))) if len(np.shape(array)) == 1: rows = 1 block = [] for n, j in enumerate(array): if j > depth_cut: block.append(n) else: if len(block) > lim_len_block:
import os import time from slackclient import SlackClient import kang import pandas as pd import primer3 dicFA = kang.Fasta2dic('../References/Athaliana/Araport11_genes.201606.cdna.fasta') dicFA_func = kang.Fasta2dic_all('../References/Athaliana/Araport11_genes.201606.cdna.fasta') genelist = dicFA.keys() keys = [x.split('|')[0].replace('>','').strip() for x in dicFA_func.keys()] values = [x.split('|')[1].strip() for x in dicFA_func.keys()] dicG2F = dict(zip(keys,values)) # starterbot's ID as an environment variable BOT_ID = os.environ.get("BOT_ID") # constants AT_BOT = "<@" + BOT_ID + ">" COMMAND_list = ["seq","func","primer"] # instantiate Slack & Twilio clients slack_client = SlackClient(os.environ.get('SLACK_BOT_TOKEN')) def get_opt_cloningprimer_pair(seq): oligo_calc = primer3.thermoanalysis.ThermoAnalysis(mv_conc=20, dv_conc=1.5, dntp_conc=0.8, dna_conc=50, max_nn_length=60) # Change if you have personal condition for PCR ## mv_conc : The millimolar (mM) concentration of monovalent salt cations (usually KCl) in the PCR.
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# Command-line arguments: tmap, reference CDS FASTA, peptide FASTA, CDS FASTA.
# All four are validated up front so that any missing argument produces the
# usage message and a non-zero exit status instead of an IndexError traceback.
if len(sys.argv) < 5:
    print(''' args : tmap, refcds, pep, cds ''')
    sys.exit(1)

file_tmap = sys.argv[1]  #'./predicted/cuffcmp.my_csv.csv.addgene.gff3.sort.gff3.tmap'
file_ref_cds = sys.argv[2]  #'/ref/analysis/References/Creinhardtii/annotation/Creinhardtii_281_v5.5.cds.fa'
file_pep = sys.argv[3]  #'../gff2cds/pep.fa'
file_cds = sys.argv[4]  #'../gff2cds/cds.fa'

# FASTA files loaded through the project helper.
dicrefcds = kang.Fasta2dic(file_ref_cds)
dicpep = kang.Fasta2dic(file_pep)
diccds = kang.Fasta2dic(file_cds)

# The tmap table is tab-separated; lines starting with '#' are ignored.
df_tmap = pd.read_csv(file_tmap, sep='\t', comment='#')

# Reference transcript / gene ids grouped by the tmap 'class_code' column.
mask = (df_tmap['class_code'] == '=')
TID_ws = set(df_tmap['ref_id'][mask])  # ws : well supported
GID_ws = set(df_tmap['ref_gene_id'][mask])

mask = (df_tmap['class_code'] == 'c')
TID_cs = set(df_tmap['ref_id'][mask])  # cs : partial supported
GID_cs = set(df_tmap['ref_gene_id'][mask])

# `mask` is left holding the 'j' selection for the statements that follow.
mask = (df_tmap['class_code'] == 'j')
TID_js = set(df_tmap['ref_id'][mask])  # js : isoform supported
#!/usr/bin/python3
"""Keep pre-alignment FASTA files whose sequence lengths are homogeneous.

Every ``*.prealn.fa`` file in the current directory is copied to
``<name>.ok.fa`` unless the standard deviation of its sequence lengths
exceeds 10% of their mean.
"""
import glob
import kang
import numpy
import subprocess

file_list = glob.glob('*.prealn.fa')
for file_in in file_list:
    dic = kang.Fasta2dic(file_in)
    len_list = [len(dic[key]) for key in dic]
    fAvr = numpy.average(len_list)
    fStd = numpy.std(len_list)
    # Reject files whose lengths vary too much (std / mean above 0.1).
    if fStd / fAvr > 0.1:
        continue
    # An argument list (no shell) hands the file names to cp verbatim, so
    # names containing spaces or shell metacharacters are copied correctly.
    subprocess.call(['cp', file_in, '%s.ok.fa' % file_in])
# Append gene and transcript records of the selected transcripts to
# '<file_gff3>.merge.gff3'. A gene record is written only once, the first
# time one of its transcripts is encountered.
genelist = []
seen_genes = set()  # mirrors genelist for constant-time membership tests
with open(file_gff3 + '.merge.gff3', 'a') as f:
    for ix in set(df_tmap_sub_ix.index):
        # Gene name = first two dot-separated fields of the transcript id.
        genename = '.'.join(ix.split('.')[0:2])
        if genename not in seen_genes:
            edf = df_gff_genename_ix.loc[genename]
            edf = edf[edf[2] == 'gene']
            edf.to_csv(f, header=None, index=None, sep='\t')
            genelist.append(genename)
            seen_genes.add(genename)
        edf = df_gff_transcript_ix.loc[ix]
        edf.to_csv(f, header=None, index=None, sep='\t')

# new gene protein seq ret
file_fa = file_pep_new
dicfa = kang.Fasta2dic(file_fa)
df_new = df_tmap_sub
# Look up each sequence by its 'cuff_id'.
df_new['seq'] = df_new['cuff_id'].apply(lambda x: dicfa[x])
with open(file_pep_new + '.new_gene.fa', 'w') as f:
    for ix in set(df_new.index):
        hd = df_new.loc[ix]['cuff_id']
        seq = df_new.loc[ix]['seq']
        print('>' + hd, file=f)
        print(seq, file=f)
# new gene protein seq ret end
mcscanout_list = [ 'Vr2Cc.collinearity.kaks.recentpeakWGD', 'Vr2Gm.collinearity.kaks.recentpeakWGD', 'Vr2Wd.collinearity.kaks.recentpeakWGD', 'Vra2Vrr.collinearity.kaks.recentpeakWGD' ] file_GMAA = 'GMAA.fasta' file_VRGA = 'VRPA.fasta' file_matchGM = 'match.GM.txt' file_matchVRG = 'match.Vrg.txt' #file_group = 'groups.primary.txt' file_fa = 'all.fas' Outfile_seed = open('seed_orthologs.txt', 'w') dicHD2seq = kang.Fasta2dic(file_fa) dicGMAA2seq = kang.Fasta2dic(file_GMAA) dicVRGA2seq = kang.Fasta2dic(file_VRGA) dicGMA2B = {} for line in open(file_matchGM): cell = line.strip().split('\t') strA = cell[0] strB = cell[1] dicGMA2B[strA] = strB dicVRGA2B = {} for line in open(file_matchVRG): cell = line.strip().split('\t') strA = cell[0] strB = cell[1]
filename_align = 'out.align.txt' filename_pos = 'out.pos.txt' file_ref_fa = '/ref/analysis/juntaehwan/ref/Cannuum/Pepper.v.1.55.total.chr.fa.seeders.fa' file_gff = '/ref/analysis/juntaehwan/ref/Cannuum/Pepper_1.55.gene_models.gff3' file_ref_annotation = '/ref/analysis/juntaehwan/ref/Pepper.v.1.55.proteins.annotated.fasta' list_vcf = [ '/ref/analysis/juntaehwan/data/YCM334_samtools.raw.vcf', '/ref/analysis/juntaehwan/data/Taeahn_samtools.raw.vcf' ] #'/ref/analysis/juntaehwan/data/Perennial.SRR2751913.cv.vcf','/ref/analysis/juntaehwan/data/Dempsey.SRR2751914.cv.sambamba.vcf'] list_vcf_label = ['YCM334', 'TAEAHN'] #,'PERENN','DEMPSE'] target_genelist = [x.strip() for x in open('nbslrr.txt').readlines()] dic_annotation_fa = kang.Fasta2dic_all(file_ref_annotation) dic_ref_fa = kang.Fasta2dic(file_ref_fa) dic_annot = {} for line in dic_annotation_fa.keys(): cell = line.split() try: dic_annot[cell[0]] = ' '.join(cell[1:]) except IndexError: dic_annot[cell[0]] = 'None' print('gff parsing') df_gff = pd.read_csv(file_gff, sep='\t', header=None) mask = (df_gff[2] == 'CDS') df_gff_cds = df_gff[mask] mask = (df_gff[2] == 'gene') df_gff_gene = df_gff[mask] df_gff_cds['ID'] = df_gff_cds[8].apply(lambda x: x.split(';')[0].split(':')[1]) df_gff_gene['ID'] = df_gff_gene[8].apply(
file_cdhitfa = 'transcripts.fasta.transdecoder.cds.cdhit' Gene_list = [] Transcript_list = [] for line in open(file_cdhitpfam): if line[0] == '#': continue cell = line.split() Gene_list.append(cell[3].replace('m', 'g')) Transcript_list.append(cell[3]) Gene_list = set(Gene_list) Transcript_list = set(Transcript_list) dicFa = kang.Fasta2dic(file_cdhitfa) dicFa_new = {} for gene in dicFa: if gene in Transcript_list: dicFa_new[gene] = dicFa[gene] kang.dic2fa(dicFa_new, file_cdhitfa + '.pfamfilt.fa') file_gff = 'transcripts.fasta.transdecoder.gff3' Outfile = open(file_gff + '.cdhit.pfamfilt.gff3', 'w') for line in open(file_gff): if line.strip() == '': continue cell = line.strip().split('\t') strT = cell[2] info = cell[-1] dicinfo = dict(
#!/usr/bin/python3
"""Write the ``kang.translation`` of every FASTA record to a new FASTA file.

Usage: <script> <input.fa> <output.fa>

Records whose sequence is shorter than 5 characters are skipped.
"""
import kang, sys

dicHD2seq = kang.Fasta2dic(sys.argv[1])
# The context manager flushes and closes the output even if a record fails.
with open(sys.argv[2], 'w') as Outfile:
    for strHD in dicHD2seq:
        seq = dicHD2seq[strHD]
        if len(seq) < 5:
            continue
        print('>' + strHD, file=Outfile)
        print(kang.translation(seq), file=Outfile)
#!/usr/bin/python3 import kang, sys file_joo = sys.argv[1] #'joinmap.ml.n10.txt_ordered.txt' dicHD2Seq = kang.Fasta2dic('superscaf.fa') dicLG2Seq = {} LGIncludedSC = [] Outfile_chr = open(file_joo + '_' + 'Pseudo_chr.fa', 'w') Outfile_scaff = open(file_joo + '_' + 'non_anchored_scaffolds.fa', 'w') for line in open(file_joo): if line[0] == '#' or line.strip() == '': continue cell = line.strip().split('\t') strLG = cell[0] print(cell) strSC = cell[1].replace('*', '') if 'SS' in strSC: strSC = strSC.replace('SS', 'SuperScaf_') else: strSC = strSC.replace('s', 'scaffold_') LGIncludedSC.append(strSC) strD = cell[2] if strD == 'F': strSeq = dicHD2Seq[strSC] elif strD == 'R': strSeq = kang.rev_comp(dicHD2Seq[strSC]) else: strSeq = dicHD2Seq[strSC] try: dicLG2Seq[strLG] += 'N' * 500 + strSeq
#!/usr/bin/python3 import glob, kang fa_list = glob.glob('*.prealn.fa') orthologs = 'orthologs.txt' dicAHD2seq = kang.Fasta2dic('all.fa') dicA2B = {} for line in open(orthologs): cell = line.strip().split() strA = cell[0] strB = cell[1] try: dicA2B[strA].append(strB) except KeyError: dicA2B[strA] = [strB] try: dicA2B[strB].append(strA) except KeyError: dicA2B[strB] = [strA] for file_fa in fa_list: dicHD2seq = kang.Fasta2dic(file_fa) dicSPCS2GN = {} for strHD in dicHD2seq: spcs = strHD.split('|')[0] gn = strHD.split('|')[1] dicSPCS2GN[spcs] = strHD try: add_list = dicA2B[dicSPCS2GN['VRA']]
from __future__ import print_function import pandas as pd import numpy as np import sys sys.path.append('/ref/analysis/pipelines/') import kang from tqdm import tqdm import glob file_stringtie_fa = sys.argv[1] #'/ref/analysis/Cre/braker/braker.try5_mario/guided/transcripts.fasta' dic_stringtie_fa = kang.Fasta2dic(file_stringtie_fa) #main_dir = './' file_ag = sys.argv[2:6] #main_dir+'transcripts.fasta.augustus.ath.complete.gff3.nosharp.genome.v1.gff' file_td = sys.argv[6] #main_dir+'selected_mRNA_v4.gff' ag_predictions = [] for efile_ag in file_ag: df_ag = pd.read_csv(efile_ag,sep='\t') df_ag['ID'] = df_ag['Name'].apply(lambda x : x.replace('Name=','')) df_ag.set_index('ID',inplace=True) ag_predictions.append(df_ag) df_td = pd.read_csv(file_td,sep='\t') #df_td['ID'] = df_td['Name'].apply(lambda x : x.split('|')[0].replace('Name=','')) df_td['ID'] = df_td['Name'].apply(lambda x : x.split('::')[1]) df_td.set_index('ID',inplace=True)
#!/usr/bin/python3
import subprocess, glob, kang, os

file_orth = 'orthologs.txt'
# NOTE(review): opened at module level and not closed in this part of the
# script; presumably written to further down - confirm it is closed there.
Outfile = open(file_orth + '.shared', 'w')
dicHD2seq = kang.Fasta2dic('../cds/all.cds.fa')

# Group the tab-separated ortholog pairs. The key is the first column when
# its prefix (text before the first '|') is 'VAG', otherwise the second
# column; each list starts with the key itself followed by its partners.
dicA2B = {}
with open(file_orth) as orth_handle:
    for line in orth_handle:
        cell = line.strip().split('\t')
        strA = cell[0]
        strB = cell[1]
        if strA.split('|')[0] == 'VAG':
            anchor, partner = strA, strB
        else:
            anchor, partner = strB, strA
        dicA2B.setdefault(anchor, [anchor]).append(partner)


def convert_axt(in_file, out_file):
    """Dump the sequences of a FASTA file into a headed plain-text file.

    The output consists of one header line containing ``out_file`` itself,
    followed by every sequence returned by ``kang.Fasta2dic(in_file)``, one
    per line, in the iteration order of that mapping.

    Args:
        in_file: Path of the FASTA file to read.
        out_file: Path of the file to write (created or overwritten).
    """
    # Local names are distinct from the module-level Outfile / dicHD2seq, and
    # the context manager closes the handle even when a step raises.
    with open(out_file, 'w') as handle:
        sequences = kang.Fasta2dic(in_file)
        print(out_file, file=handle)
        for header in sequences:
            print(sequences[header], file=handle)