Beispiel #1
0
def convert_axt(in_file, out_file):
    """Dump the sequences of a FASTA file to ``out_file``, one per line.

    The first line written is the ``out_file`` path itself; each following
    line is one sequence from ``kang.Fasta2dic(in_file)``, in the mapping's
    iteration order.  Sequence headers are not written.

    Args:
        in_file: Path of the FASTA file passed to ``kang.Fasta2dic``.
        out_file: Path of the file to create (truncated if it exists).
    """
    # The context manager guarantees the handle is closed even when parsing
    # the input or writing a line raises.
    with open(out_file, 'w') as Outfile:
        dicHD2seq = kang.Fasta2dic(in_file)
        print(out_file, file=Outfile)
        for strHD in dicHD2seq:
            print(dicHD2seq[strHD], file=Outfile)
#!/usr/bin/python

from __future__ import print_function
import subprocess
import numpy as np
import pandas as pd
import sys
from tqdm import tqdm
sys.path.append('/mnt/c/ubuntu.download/pipelines/')
import pysam
import kang
import math
# Positional command-line arguments; the trailing comments show example
# values.  A missing argument raises IndexError here.
file_bam = sys.argv[1]  #'intron3000.merge.sorted.bam'
file_fa = sys.argv[2]  #'Creinhardtii_281_v5.0.fa'
file_pk = sys.argv[3]
# Parse the FASTA given as the second argument with the project helper.
# NOTE(review): presumably maps header -> sequence; confirm in kang.
dicHD2seq = kang.Fasta2dic(file_fa)


def get_block(array, depth_cut=0):
    lim_len_block = 100  # size of read fragment
    #depth_cut     = 0 # ... 10. ... .. .. ..
    block_list = []
    #print(len(np.shape(array)))
    if len(np.shape(array)) == 1:
        rows = 1
        block = []
        for n, j in enumerate(array):
            if j > depth_cut:
                block.append(n)
            else:
                if len(block) > lim_len_block:
Beispiel #3
0
import os
import time
from slackclient import SlackClient
import kang
import pandas as pd 
import primer3



# The same cDNA FASTA is parsed twice with two different kang helpers.
# NOTE(review): presumably Fasta2dic keys are short IDs and Fasta2dic_all keys
# are full header lines -- confirm against the kang module.
dicFA = kang.Fasta2dic('../References/Athaliana/Araport11_genes.201606.cdna.fasta')
dicFA_func = kang.Fasta2dic_all('../References/Athaliana/Araport11_genes.201606.cdna.fasta')
genelist = dicFA.keys()

# Split every full header on '|': field 0 (without '>') becomes the key and
# field 1 becomes the value of dicG2F.
keys = [x.split('|')[0].replace('>', '').strip() for x in dicFA_func.keys()]
values = [x.split('|')[1].strip() for x in dicFA_func.keys()]
dicG2F = dict(zip(keys, values))


# starterbot's ID as an environment variable
BOT_ID = os.environ.get("BOT_ID")
if BOT_ID is None:
    # Fail with a clear message; otherwise the concatenation below dies with
    # an opaque "can only concatenate str" TypeError.
    raise RuntimeError("BOT_ID environment variable is not set")

# constants
AT_BOT = "<@" + BOT_ID + ">"
COMMAND_list = ["seq", "func", "primer"]

# instantiate the Slack client
slack_client = SlackClient(os.environ.get('SLACK_BOT_TOKEN'))

def get_opt_cloningprimer_pair(seq):
    oligo_calc = primer3.thermoanalysis.ThermoAnalysis(mv_conc=20, dv_conc=1.5, dntp_conc=0.8, dna_conc=50, max_nn_length=60)
    # Change if you have personal condition for PCR 
    ## mv_conc   : The millimolar (mM) concentration of monovalent salt cations (usually KCl) in the PCR.
Beispiel #4
0
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# All four positional arguments are read inside one guard, so a missing
# argument at any position prints the usage line instead of a traceback.
try:
    file_tmap = sys.argv[1]  #'./predicted/cuffcmp.my_csv.csv.addgene.gff3.sort.gff3.tmap'
    file_ref_cds = sys.argv[2]  #'/ref/analysis/References/Creinhardtii/annotation/Creinhardtii_281_v5.5.cds.fa'
    file_pep = sys.argv[3]  #'../gff2cds/pep.fa'
    file_cds = sys.argv[4]  #'../gff2cds/cds.fa'
except IndexError:
    print(''' args : tmap, refcds, pep, cds ''')
    # sys.exit is always available (exit() is injected by the site module),
    # and a non-zero status tells calling pipelines the run did not happen.
    sys.exit(1)

# Load the reference CDS, predicted peptide and predicted CDS FASTA files.
dicrefcds = kang.Fasta2dic(file_ref_cds)
dicpep = kang.Fasta2dic(file_pep)
diccds = kang.Fasta2dic(file_cds)

# Tab-separated table; lines starting with '#' are skipped.
df_tmap = pd.read_csv(file_tmap, sep='\t', comment='#')

# Collect the reference transcript / gene IDs for each class_code of interest.
mask = (df_tmap['class_code'] == '=')
TID_ws = set(df_tmap['ref_id'][mask])  # ws : well supported
GID_ws = set(df_tmap['ref_gene_id'][mask])

mask = (df_tmap['class_code'] == 'c')
TID_cs = set(df_tmap['ref_id'][mask])  # cs : partial supported
GID_cs = set(df_tmap['ref_gene_id'][mask])

mask = (df_tmap['class_code'] == 'j')
TID_js = set(df_tmap['ref_id'][mask])  # js : isoform supported
Beispiel #5
0
#!/usr/bin/python3

import glob, kang, numpy, subprocess

# Copy every '*.prealn.fa' file whose sequence lengths are homogeneous
# (coefficient of variation <= 0.1) to '<name>.ok.fa'.
file_list = glob.glob('*.prealn.fa')
for file_in in file_list:
    dic = kang.Fasta2dic(file_in)
    len_list = [len(dic[key]) for key in dic]
    fAvr = numpy.average(len_list)
    fStd = numpy.std(len_list)
    # NOTE: for an empty file or all-empty sequences the ratio is NaN, the
    # comparison is False and the file is still copied (unchanged behaviour).
    if fStd / fAvr > 0.1:
        continue
    #print(numpy.average(len_list),numpy.std(len_list))
    # Argument list instead of a shell string: file names containing spaces
    # or shell metacharacters are passed to cp verbatim.
    subprocess.call(['cp', file_in, file_in + '.ok.fa'])
Beispiel #6
0
genelist = []
# Mirror of genelist for O(1) membership tests; the original rebuilt
# set(genelist) on every iteration.
seen_genes = set()

# Append to '<file_gff3>.merge.gff3': for every transcript index, write the
# 'gene' rows of its gene once, then the transcript's own rows.
with open(file_gff3 + '.merge.gff3', 'a') as f:
    for ix in set(df_tmap_sub_ix.index):
        # Gene name = first two dot-separated fields of the transcript index.
        genename = '.'.join(ix.split('.')[0:2])
        if genename not in seen_genes:
            edf = df_gff_genename_ix.loc[genename]
            edf = edf[edf[2] == 'gene']
            edf.to_csv(f, header=None, index=None, sep='\t')
            genelist.append(genename)
            seen_genes.add(genename)
        edf = df_gff_transcript_ix.loc[ix]
        edf.to_csv(f, header=None, index=None, sep='\t')

# new gene protein seq ret
file_fa = file_pep_new
dicfa = kang.Fasta2dic(file_fa)

# NOTE: df_new is the same object as df_tmap_sub, so the 'seq' column is added
# to df_tmap_sub as well.
df_new = df_tmap_sub
df_new['seq'] = df_new['cuff_id'].apply(lambda x: dicfa[x])

# Write one FASTA record (header = cuff_id) per distinct index label.
with open(file_pep_new + '.new_gene.fa', 'w') as f:
    for ix in set(df_new.index):
        hd = df_new.loc[ix]['cuff_id']
        seq = df_new.loc[ix]['seq']
        print('>' + hd, file=f)
        print(seq, file=f)
# new gene protein seq ret end
Beispiel #7
0
# Names of the collinearity/Ka-Ks result files used by this script.
mcscanout_list = [
    'Vr2Cc.collinearity.kaks.recentpeakWGD',
    'Vr2Gm.collinearity.kaks.recentpeakWGD',
    'Vr2Wd.collinearity.kaks.recentpeakWGD',
    'Vra2Vrr.collinearity.kaks.recentpeakWGD'
]

# Input file names.
# NOTE(review): file_VRGA points at 'VRPA.fasta' -- confirm the name mismatch
# is intentional.
file_GMAA = 'GMAA.fasta'
file_VRGA = 'VRPA.fasta'
file_matchGM = 'match.GM.txt'
file_matchVRG = 'match.Vrg.txt'
#file_group	= 'groups.primary.txt'
file_fa = 'all.fas'
# Output handle; it is not closed anywhere in the visible part of the script.
Outfile_seed = open('seed_orthologs.txt', 'w')

# Parse the three FASTA files with the project helper.
dicHD2seq = kang.Fasta2dic(file_fa)

dicGMAA2seq = kang.Fasta2dic(file_GMAA)
dicVRGA2seq = kang.Fasta2dic(file_VRGA)

# Map column 1 -> column 2 of the tab-separated GM match table.
dicGMA2B = {}
for line in open(file_matchGM):
    cell = line.strip().split('\t')
    strA = cell[0]
    strB = cell[1]
    dicGMA2B[strA] = strB
# Same parsing for the Vrg match table; the statement that would store the
# pair in dicVRGA2B is not visible in this excerpt.
dicVRGA2B = {}
for line in open(file_matchVRG):
    cell = line.strip().split('\t')
    strA = cell[0]
    strB = cell[1]
Beispiel #8
0
# Output names and reference inputs.
filename_align = 'out.align.txt'
filename_pos = 'out.pos.txt'
file_ref_fa = '/ref/analysis/juntaehwan/ref/Cannuum/Pepper.v.1.55.total.chr.fa.seeders.fa'
file_gff = '/ref/analysis/juntaehwan/ref/Cannuum/Pepper_1.55.gene_models.gff3'
file_ref_annotation = '/ref/analysis/juntaehwan/ref/Pepper.v.1.55.proteins.annotated.fasta'
list_vcf = [
    '/ref/analysis/juntaehwan/data/YCM334_samtools.raw.vcf',
    '/ref/analysis/juntaehwan/data/Taeahn_samtools.raw.vcf'
]
#'/ref/analysis/juntaehwan/data/Perennial.SRR2751913.cv.vcf','/ref/analysis/juntaehwan/data/Dempsey.SRR2751914.cv.sambamba.vcf']
list_vcf_label = ['YCM334', 'TAEAHN']  #,'PERENN','DEMPSE']
# Read the target gene IDs, one per line; the handle is closed on exit.
with open('nbslrr.txt') as gene_fh:
    target_genelist = [x.strip() for x in gene_fh]

dic_annotation_fa = kang.Fasta2dic_all(file_ref_annotation)
dic_ref_fa = kang.Fasta2dic(file_ref_fa)
# Map the first whitespace-separated token of each header to the remaining
# header text.
dic_annot = {}
for line in dic_annotation_fa.keys():
    cell = line.split()
    # Slicing never raises IndexError, so the former ``except IndexError``
    # fallback could not fire for headers without annotation text; test for
    # the empty description explicitly so they get the 'None' placeholder.
    dic_annot[cell[0]] = ' '.join(cell[1:]) or 'None'
print('gff parsing')
df_gff = pd.read_csv(file_gff, sep='\t', header=None)
# Take explicit copies of the filtered rows: a column is added to each subset
# afterwards, which on a plain slice triggers SettingWithCopyWarning.
mask = (df_gff[2] == 'CDS')
df_gff_cds = df_gff[mask].copy()
mask = (df_gff[2] == 'gene')
df_gff_gene = df_gff[mask].copy()
# ID = text after ':' in the first ';'-separated field of column 8.
df_gff_cds['ID'] = df_gff_cds[8].apply(lambda x: x.split(';')[0].split(':')[1])
df_gff_gene['ID'] = df_gff_gene[8].apply(
file_cdhitfa = 'transcripts.fasta.transdecoder.cds.cdhit'
Gene_list = set()
Transcript_list = set()

# Collect the 4th whitespace-separated column of every non-comment line as a
# transcript ID; the gene ID is the same token with every 'm' replaced by 'g'.
# The context manager closes the input handle, which was previously leaked.
with open(file_cdhitpfam) as pfam_fh:
    for line in pfam_fh:
        if line[0] == '#':
            continue

        cell = line.split()
        Gene_list.add(cell[3].replace('m', 'g'))
        Transcript_list.add(cell[3])

# Keep only the sequences whose header is one of the collected transcript IDs
# and write them to '<file_cdhitfa>.pfamfilt.fa'.
dicFa = kang.Fasta2dic(file_cdhitfa)
dicFa_new = {gene: dicFa[gene] for gene in dicFa if gene in Transcript_list}
kang.dic2fa(dicFa_new, file_cdhitfa + '.pfamfilt.fa')

file_gff = 'transcripts.fasta.transdecoder.gff3'
Outfile = open(file_gff + '.cdhit.pfamfilt.gff3', 'w')
for line in open(file_gff):
    if line.strip() == '':
        continue
    cell = line.strip().split('\t')
    strT = cell[2]
    info = cell[-1]
    dicinfo = dict(
Beispiel #10
0
#!/usr/bin/python3

import kang, sys

# Usage: <script> <input fasta> <output fasta>
# Translate every sequence of at least 5 characters with kang.translation and
# write it as a FASTA record under its original header.
dicHD2seq = kang.Fasta2dic(sys.argv[1])
# The context manager flushes and closes the output, which the original
# script left to interpreter shutdown.
with open(sys.argv[2], 'w') as Outfile:
    for strHD in dicHD2seq:
        seq = dicHD2seq[strHD]
        if len(seq) < 5:
            continue
        print('>' + strHD, file=Outfile)
        print(kang.translation(seq), file=Outfile)
Beispiel #11
0
#!/usr/bin/python3

import kang, sys

# First positional argument: the table iterated line by line further down.
file_joo = sys.argv[1]  #'joinmap.ml.n10.txt_ordered.txt'
# Sequences of the hard-coded 'superscaf.fa', parsed with the project helper.
dicHD2Seq = kang.Fasta2dic('superscaf.fa')
dicLG2Seq = {}  # filled per linkage-group key (cell[0]) in the loop below
LGIncludedSC = []  # scaffold names encountered in the table
# Two output FASTA files named after the input table; both are opened for
# writing immediately and truncated if they already exist.
Outfile_chr = open(file_joo + '_' + 'Pseudo_chr.fa', 'w')
Outfile_scaff = open(file_joo + '_' + 'non_anchored_scaffolds.fa', 'w')
for line in open(file_joo):
    if line[0] == '#' or line.strip() == '':
        continue
    cell = line.strip().split('\t')
    strLG = cell[0]
    print(cell)
    strSC = cell[1].replace('*', '')
    if 'SS' in strSC:
        strSC = strSC.replace('SS', 'SuperScaf_')
    else:
        strSC = strSC.replace('s', 'scaffold_')
    LGIncludedSC.append(strSC)
    strD = cell[2]
    if strD == 'F':
        strSeq = dicHD2Seq[strSC]
    elif strD == 'R':
        strSeq = kang.rev_comp(dicHD2Seq[strSC])
    else:
        strSeq = dicHD2Seq[strSC]
    try:
        dicLG2Seq[strLG] += 'N' * 500 + strSeq
Beispiel #12
0
#!/usr/bin/python3

import glob, kang

fa_list = glob.glob('*.prealn.fa')
orthologs = 'orthologs.txt'
dicAHD2seq = kang.Fasta2dic('all.fa')

# Build a symmetric map: each of the first two whitespace-separated columns
# of a line is recorded as a partner of the other.
dicA2B = {}
# The context manager closes the table, which the bare open() in the loop
# header left open.
with open(orthologs) as orth_fh:
    for line in orth_fh:
        cell = line.strip().split()
        strA = cell[0]
        strB = cell[1]
        dicA2B.setdefault(strA, []).append(strB)
        dicA2B.setdefault(strB, []).append(strA)

for file_fa in fa_list:
    dicHD2seq = kang.Fasta2dic(file_fa)
    dicSPCS2GN = {}
    for strHD in dicHD2seq:
        spcs = strHD.split('|')[0]
        gn = strHD.split('|')[1]
        dicSPCS2GN[spcs] = strHD
    try:
        add_list = dicA2B[dicSPCS2GN['VRA']]
Beispiel #13
0
from __future__ import print_function
import pandas as pd
import numpy as np
import sys
sys.path.append('/ref/analysis/pipelines/')
import kang
from tqdm import tqdm
import glob

def _strip_name_tag(name):
    """Return *name* with every 'Name=' occurrence removed."""
    return name.replace('Name=', '')


def _field_after_double_colon(name):
    """Return the second '::'-separated field of *name*."""
    return name.split('::')[1]


# argv[1]: transcript FASTA, parsed with the project helper.
file_stringtie_fa = sys.argv[1] #'/ref/analysis/Cre/braker/braker.try5_mario/guided/transcripts.fasta'
dic_stringtie_fa = kang.Fasta2dic(file_stringtie_fa)

#main_dir      = './' 
# argv[2:6]: up to four tab-separated prediction tables.
file_ag = sys.argv[2:6] #main_dir+'transcripts.fasta.augustus.ath.complete.gff3.nosharp.genome.v1.gff'

# argv[6]: one more tab-separated table, loaded after the loop.
file_td = sys.argv[6] #main_dir+'selected_mRNA_v4.gff'

# Load each table, derive an 'ID' column from 'Name' and index by it.
ag_predictions = []
for efile_ag in file_ag:
    df_ag = pd.read_csv(efile_ag, sep='\t')
    df_ag['ID'] = df_ag['Name'].apply(_strip_name_tag)
    df_ag.set_index('ID', inplace=True)
    ag_predictions.append(df_ag)

df_td = pd.read_csv(file_td, sep='\t')
#df_td['ID'] = df_td['Name'].apply(lambda x : x.split('|')[0].replace('Name=',''))
df_td['ID'] = df_td['Name'].apply(_field_after_double_colon)

df_td.set_index('ID', inplace=True)
Beispiel #14
0
#!/usr/bin/python3

import subprocess, glob, kang, os

file_orth = 'orthologs.txt'
# Output handle for '<file_orth>.shared'; nothing in the visible script writes
# to it or closes it, so it is deliberately left open here.
Outfile = open(file_orth + '.shared', 'w')
dicHD2seq = kang.Fasta2dic('../cds/all.cds.fa')

# Group partners under one key per line.  The key is column 1 when its prefix
# before '|' is 'VAG', otherwise column 2.  Each list starts with the key
# itself, followed by every partner seen for it.
dicA2B = {}
# The context manager closes the table, which the bare open() in the loop
# header left open.
with open(file_orth) as orth_fh:
    for line in orth_fh:
        cell = line.strip().split('\t')
        strA = cell[0]
        strB = cell[1]
        if strA.split('|')[0] == 'VAG':
            dicA2B.setdefault(strA, [strA]).append(strB)
        else:
            dicA2B.setdefault(strB, [strB]).append(strA)


def convert_axt(in_file, out_file):
    """Dump the sequences of a FASTA file to ``out_file``, one per line.

    The first line written is the ``out_file`` path itself; each following
    line is one sequence from ``kang.Fasta2dic(in_file)``, in the mapping's
    iteration order.  Sequence headers are not written.

    Args:
        in_file: Path of the FASTA file passed to ``kang.Fasta2dic``.
        out_file: Path of the file to create (truncated if it exists).
    """
    # The context manager guarantees the handle is closed even when parsing
    # the input or writing a line raises.
    with open(out_file, 'w') as Outfile:
        dicHD2seq = kang.Fasta2dic(in_file)
        print(out_file, file=Outfile)
        for strHD in dicHD2seq:
            print(dicHD2seq[strHD], file=Outfile)