forked from dib-lab/dib-MMETSP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
assembly_trinity_2.2.0.py
124 lines (112 loc) · 4.91 KB
/
assembly_trinity_2.2.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# /usr/bin/env python
import os
import os.path
from os.path import basename
import subprocess
from subprocess import Popen, PIPE
import shutil
import glob
# import configuration variables
# import dibMMETSP_configuration as dib_conf
# custom Lisa module
import clusterfunc
def get_data(thefile):
count = 0
url_data = {}
with open(thefile, "rU") as inputfile:
headerline = next(inputfile).split(',')
position_name = headerline.index("ScientificName")
position_reads = headerline.index("Run")
position_ftp = headerline.index("download_path")
for line in inputfile:
line_data = line.split(',')
name = "_".join(line_data[position_name].split())
read_type = line_data[position_reads]
ftp = line_data[position_ftp]
name_read_tuple = (name, read_type)
print(name_read_tuple)
# check to see if Scientific Name and run exist
if name_read_tuple in url_data.keys():
# check to see if ftp exists
if ftp in url_data[name_read_tuple]:
print("url already exists:", ftp)
else:
url_data[name_read_tuple].append(ftp)
else:
url_data[name_read_tuple] = [ftp]
return url_data
def combine_orphans(diginormdir):
diginorm_files_dir = diginormdir + "qsub_files/"
rename_orphans = """
gzip -9c {}orphans.fq.gz.keep.abundfilt > {}orphans.keep.abundfilt.fq.gz
for file in {}*.se
do
gzip -9c ${{file}} >> {}orphans.keep.abundfilt.fq.gz
done
""".format(diginorm_files_dir, diginormdir, diginorm_files_dir, diginormdir)
return rename_orphans
def rename_files(trinitydir, diginormdir, diginormfile, SRA):
# takes diginormfile in,splits reads and put into newdir
rename_orphans = combine_orphans(diginormdir)
split_paired = "split-paired-reads.py -d " + diginormdir + " " + diginormfile
rename_string1 = "cat " + diginormdir + "*.1 > " + trinitydir + SRA + ".left.fq"
rename_string2 = "cat " + diginormdir + \
"*.2 > " + trinitydir + SRA + ".right.fq"
rename_string3 = "gunzip -c " + diginormdir + \
"orphans.keep.abundfilt.fq.gz >> " + trinitydir + SRA + ".left.fq"
commands = [rename_orphans, split_paired,
rename_string1, rename_string2, rename_string3]
process_name = "rename"
module_name_list = ["GNU/4.8.3", "khmer/2.0"]
filename = SRA
# clusterfunc.qsub_file(diginormdir,process_name,module_name_list,filename,commands)
def run_trinity(trinitydir,left,right,mmetsp,output_dir, file_extension):
trinity_command = """
set -x
# stops execution if there is an error
set -e
Trinity --left {} \\
--right {} --output /tmp/{}{} --full_cleanup --seqType fq --max_memory 20G --CPU 16
cp /tmp/{}*.fasta {}
rm -rf /tmp/{}*
""".format(left, right, mmetsp, file_extension, mmetsp, output_dir, mmetsp)
commands = [trinity_command]
process_name = "trinity_2.2.0"
module_name_list = ["trinity/2.2.0"]
filename = mmetsp
clusterfunc.qsub_file(trinitydir, process_name,
module_name_list, filename, commands)
def execute(trinity_fail, count, basedir):
id_list = os.listdir(basedir)
for mmetsp in id_list:
if mmetsp != "qsub_files":
mmetspdir = basedir + mmetsp + "/"
trinitydir = basedir + mmetsp + "/" + "trinity/"
trinity_files = os.listdir(mmetspdir)
trinity_fasta=trinitydir + dib_conf.output_extension
clusterfunc.check_dir(trinitydir)
if os.path.isfile(trinity_fasta) == False:
if os.path.isfile(dib_conf.output_dir + mmetsp + dib_conf.output_extension):
print("Trinity finished.")
count +=1
else:
print(mmetspdir)
right = [s for s in trinity_files if s.endswith(".right.fq")][0]
left = [s for s in trinity_files if s.endswith(".left.fq")][0]
right = mmetspdir + right
left = mmetspdir + left
if os.path.isfile(left) and os.path.isfile(right):
right = [s for s in trinity_files if s.endswith(".right.fq")][0]
left = [s for s in trinity_files if s.endswith(".left.fq")][0]
right = mmetspdir + right
left = mmetspdir + left
run_trinity(trinitydir, left, right, mmetsp, dib_conf.output_dir, dib_conf.output_extension)
else:
print("No files:",left)
else:
print("Trinity completed successfully.", trinity_fasta)
count += 1
assemblydir = dib_conf.output_dir
print("Number of Trinity de novo transcriptome assemblies:", count)
print("Number of times Trinity failed:", len(trinity_fail), trinity_fail)
return trinity_fail, count