/
apytram.py
executable file
·1068 lines (909 loc) · 52.5 KB
/
apytram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python
# coding: utf-8
# File: apytram.py
# Created by: Carine Rey
# Created on: Nov 2015
#
#
# Copyright or © or Copr. Carine Rey
# This software is a computer program whose purpose is to assembly
# sequences from RNA-Seq data (paired-end or single-end) using one or
# more reference homologous sequences.
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import os
import re
import sys
import time
import tempfile
import logging
import argparse
import ApytramLib
def strict_positive_integer(x):
    """argparse type checker: an integer strictly greater than 0.

    Raises argparse.ArgumentTypeError for non-integer input as well
    (the original let int() raise a bare ValueError, and its
    type(x) != type(1) test was dead code since int() always returns int).
    """
    try:
        value = int(x)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError("Must be an integer superior to 0")
    if value <= 0:
        raise argparse.ArgumentTypeError("Must be an integer superior to 0")
    return value
def positive_integer(x):
    """argparse type checker: an integer >= 0.

    Raises argparse.ArgumentTypeError for non-integer input as well
    (the original let int() raise a bare ValueError, and its
    type(x) != type(1) test was dead code since int() always returns int).
    """
    try:
        value = int(x)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError("Must be a positive integer")
    if value < 0:
        raise argparse.ArgumentTypeError("Must be a positive integer")
    return value
def strict_positive_float(x):
    """argparse type checker: a float strictly greater than 0.

    Raises argparse.ArgumentTypeError for non-numeric input as well
    (the original let float() raise a bare ValueError, and its
    type(x) != type(0.1) test was dead code since float() always
    returns float). The error message grammar is also fixed.
    """
    try:
        value = float(x)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError("Must be a float superior to 0")
    if value <= 0:
        raise argparse.ArgumentTypeError("Must be a float superior to 0")
    return value
def positive_float(x):
    """argparse type checker: a float >= 0.

    Raises argparse.ArgumentTypeError for non-numeric input as well.
    The original message claimed "superior to 0" although the condition
    accepts 0; the message now matches the accepted range.
    """
    try:
        value = float(x)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError("Must be a positive float")
    if value < 0:
        raise argparse.ArgumentTypeError("Must be a positive float")
    return value
start_time = time.time()
### Option defining
parser = argparse.ArgumentParser(prog="apytram.py",
description='''
Run apytram.py on a fastq file to retrieve
homologous sequences of bait sequences.''')
parser.add_argument('--version', action='version', version='%(prog)s 1.2')
##############
requiredOptions = parser.add_argument_group('Required arguments')
requiredOptions.add_argument('-d', '--database', type=str,
help='Database prefix name. If a database with the same name already exists, the existing database will be kept and the database will NOT be rebuilt.', required=True)
requiredOptions.add_argument('-dt', '--database_type', type=str,
help="""
single: single unstranded data ______________________
paired: paired unstranded data ______________________
RF: paired stranded data (/1 = reverse ; /2 = forward)
FR: paired stranded data (/1 = forward ; /2 = reverse)
F: single stranded data (reads = forward) ____________
R: single stranded data (reads = reverse) ____________
WARNING: Paired read names must finished by 1 or 2""",
required=True)
requiredOptions.add_argument('-out', '--output_prefix', type=str,
help="Output prefix", required=True)
##############
##############
InUSOptions = parser.add_argument_group('Input Files')
InUSOptions.add_argument('-fa', '--fasta', type=str,
help="Fasta formated RNA-seq data to build the database of reads (only one file).")
InUSOptions.add_argument('-fq', '--fastq', type=str,
help="Fastq formated RNA-seq data to build the database of reads (several space delimited fastq file names are allowed). For paired data, fq must be previously concatenated. WARNING: Paired read names must finished by 1 or 2. (fastq files will be first converted to a fasta file. This process can require some time.)")
InUSOptions.add_argument('-idx', '--index', type=str,
help="BioPython index.)")
InUSOptions.add_argument('-clstr', '--clstr', type=str,
help="cluster (pseudo fasta formated).)")
InUSOptions.add_argument('-clstridx', '--clstridx', type=str,
help="cluster index)")
InUSOptions.add_argument('-clstr_rep', '--clstr_rep', type=str,
help="cluster representative (fasta).)")
##############
##############
QueryOptions = parser.add_argument_group('Query File')
QueryOptions.add_argument('-q', '--query', type=str,
help="""
Fasta file (nucl) with homologous bait sequences which will be treated together for the apytram run.
If no query is submitted, the program will just build the database.
WARNING: Sequences must not contain "- * . "
""",
)
#QueryOptions.add_argument('-pep', '--query_pep', type=str,
# default="",
# help="Fasta file containing the query in the peptide format. It will be used at the first iteration as bait sequences to fish reads. It is compulsory to include also the query in nucleotide format (-q option)")
##############
##############
IterationOptions = parser.add_argument_group('Number of iterations')
IterationOptions.add_argument('-i', '--iteration_max', type=strict_positive_integer,
help="Maximum number of iterations. (Default 5)",
default=5)
IterationOptions.add_argument('-i_start', '--iteration_start', type=positive_integer,
help="Number of the first iteration. If different of 1, the tmp option must be used. (Default: 1)",
default=1)
##############
##############
OutOptions = parser.add_argument_group('Output Files')
OutOptions.add_argument('-log', type=str, default="apytram.log",
help="a log file to report avancement (default: apytram.log)")
OutOptions.add_argument('-tmp', type=str,
help="Directory to stock all intermediary files for the apytram run. (default: a directory in /tmp which will be removed at the end)",
default="")
OutOptions.add_argument('--keep_tmp', action='store_true',
default=False,
help="By default, the temporary directory will be remove.")
#OutOptions.add_argument('--keep_iterations', action='store_true',
# help="A fasta file containing reconstructed sequences will be created at each iteration. (default: False)")
OutOptions.add_argument('--no_best_file', action='store_true',
default=False,
help="By default, a fasta file (Outprefix.best.fasta) containing only the best sequence is created. If this option is used, it will NOT be created.")
OutOptions.add_argument('--only_best_file', action='store_true',
default=False,
help="By default, a fasta file (Outprefix.fasta) containing all sequences from the last iteration is created. If this option is used, it will NOT be created.")
OutOptions.add_argument('--cds', action='store_true',
default=False,
help="Keep only CDS in output sequences using Transdecoder. (default: False)")
OutOptions.add_argument('--stats', action='store_true',
help='Create files with statistics on each iteration. (default: False)')
OutOptions.add_argument('--plot', action='store_true',
help='Create plots to represent the statistics on each iteration. (default: False)')
OutOptions.add_argument('--plot_ali', action='store_true',
help='Create file with a plot representing the alignement of all sequences from the last iteration on the query sequence. Take some seconds. (default: False)')
##############
##############
SearchOptions = parser.add_argument_group('Thresholds for EACH ITERATION')
SearchOptions.add_argument('-e', '--evalue', type=positive_float,
help="Evalue threshold of the blastn of the bait queries on the database of reads. (Default 1e-5)",
default=1e-5)
SearchOptions.add_argument('-id', '--min_id', type=positive_integer,
help="Minimum identity percentage of a sequence with a query on the length of their alignment so that the sequence is kept at the end of a iteration (Default 70)",
default=70)
SearchOptions.add_argument('-mal', '--min_ali_len', type=positive_integer,
help="Minimum alignment length of a sequence on a query to be kept at the end of a iteration (Default 180)",
default=180)
SearchOptions.add_argument('-len', '--min_len', type=positive_integer,
help="Minimum length to keep a sequence at the end of a iteration. (Default 200)",
default=200)
##############
##############
StopOptions = parser.add_argument_group('Criteria to stop iteration')
StopOptions.add_argument('-required_coverage', type=positive_integer,
help="Required coverage of a bait sequence to stop iteration (Default: No threshold)",
default=200)
StopOptions.add_argument('--finish_all_iter', action='store_true',
help="By default, iterations are stop if there is no improvment, if this option is used apytram will finish all iteration (-i).",
default=False)
StopOptions.add_argument('-time_max', type=positive_integer,
help="Do not begin a new iteration if the job duration (in seconds) has exceed this threshold. (Default 7200)",
default=7200)
##############
##############
FinalFilterOptions = parser.add_argument_group('Thresholds for Final output files')
FinalFilterOptions.add_argument('-flen', '--final_min_len', type=positive_integer,
help="Minimum PERCENTAGE of the query length to keep a sequence at the end of the run. (Default: 0)",
default=0)
FinalFilterOptions.add_argument('-fid', '--final_min_id', type=positive_integer,
help="Minimum identity PERCENTAGE of a sequence with a query on the length of their alignment so that the sequence is kept at the end of the run (Default 0)",
default=0)
FinalFilterOptions.add_argument('-fmal', '--final_min_ali_len', type=positive_integer,
help="Alignment length between a sequence and a query must be at least this PERCENTAGE of the query length to keep this sequence at the end of the run. (Default: 0)",
default=0)
##############
##############
MiscellaneousOptions = parser.add_argument_group('Miscellaneous options')
MiscellaneousOptions.add_argument('-threads', type=positive_integer,
help="Number of available threads. (Default 1)",
default=1)
MiscellaneousOptions.add_argument('-memory', type=positive_integer,
help="Memory available for the assembly in Giga. (Default 1)",
default=1)
MiscellaneousOptions.add_argument('--UseIndex', action='store_true',
help="Use index_db from BioPython to retrieve reads",
default=False)
MiscellaneousOptions.add_argument('--write_even_empty', action='store_true',
default=False,
help="Write output fasta files, even if they must be empty. (Default: False)")
MiscellaneousOptions.add_argument('--out_by_species', action='store_true',
default=False,
help="Write output fasta files for each species. (Default: False)")
MiscellaneousOptions.add_argument('--debug', action='store_true', default=False,
help="debug mode, default False")
##############
### Option parsing
args = parser.parse_args()
### Set up the log directory
if args.log:
    LogDirName = os.path.dirname(args.log)
    if not os.path.isdir(LogDirName) and LogDirName:
        os.makedirs(LogDirName)

### Set up the logger
LogFile = args.log
# Named logger for the whole run; messages go both to the log file and
# to the console.
logger = logging.getLogger('apytram')
logger.setLevel(logging.INFO)
# File handler records everything down to DEBUG
fh = logging.FileHandler(LogFile)
fh.setLevel(logging.DEBUG)
# Console handler is quieter (WARN) unless --debug is given
ch = logging.StreamHandler()
if args.debug:
    ch.setLevel(logging.DEBUG)
    fh.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)
else:
    ch.setLevel(logging.WARN)
# File lines carry a timestamp/level prefix; console lines are bare messages
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)
logger.warning("[Running in process...]\n[Warning messages may print, but they are no error. If real errors appear, the process will stop]")
logger.info(" ".join(sys.argv))
# Running error counters: `error` accumulates fatal setup problems,
# `empty_queries` counts the subset caused by empty query files
# (relevant for --write_even_empty further below).
error = 0
empty_queries = 0

### Set up the temporary directory
if args.tmp:
    # Safety net: the directory is removed wholesale at the end of the run,
    # so its name must contain "apytram" to avoid deleting arbitrary paths.
    if not "apytram" in args.tmp:
        logger.error("""ERROR: Temporary directory name (-tmp) must contain "apytram" to safety reasons because it will be completly remove.""")
        error += 1
    elif os.path.isdir(args.tmp):
        logger.info("The temporary directory %s exists", args.tmp)
    else:
        logger.info("The temporary directory %s does not exist, it will be created", args.tmp)
        os.makedirs(args.tmp)
    TmpDirName = args.tmp
else:
    # No -tmp given: use a fresh system temp directory
    TmpDirName = tempfile.mkdtemp(prefix='tmp_apytram_')
### Read the arguments
# Copy parsed options into module-level names used throughout the script.
StartIteration = args.iteration_start
MaxIteration = args.iteration_max
UseIndex = args.UseIndex

# Biopython is only required for the index / cluster features, so the
# import is attempted lazily here and the run aborts if it is missing.
if UseIndex or args.clstr:
    try:
        from Bio import SeqIO
        logger.debug("Biopython available")
    except ImportError:
        logger.error("BioPython (Bio.SeqIO) not available, don't use Index")
        ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)

if MaxIteration < 1:
    logger.error("The number of iteration (-i) must be superior to 0")
    error += 1

# Per-iteration and final filtering thresholds
Evalue = args.evalue
MinIdentityPercentage = args.min_id
MinAliLength = args.min_ali_len
MinLength = args.min_len
RequiredCoverage = args.required_coverage
FinalMinLength = args.final_min_len
FinalMinIdentityPercentage = args.final_min_id
FinalMinAliLength = args.final_min_ali_len
#KeepIterations = args.keep_iterations
FinishAllIter = args.finish_all_iter
Threads = args.threads
Memory = args.memory
MaxTime = args.time_max

# Prefer seqtk for read manipulation when it is on the PATH
SeqtkAvailable = ApytramLib.ApytramNeeds.search("seqtk")
if SeqtkAvailable:
    logger.warning("seqtk available in the PATH. We will use it")

# Plots are built from the statistics files, so --plot implies --stats
if args.plot:
    args.stats = True
# Define query files
# -q takes a comma-separated list of "fasta_path[:name]" entries; the name
# defaults to the file basename without extension.
if args.query:
    Queries = args.query.split(",")
else:
    Queries = []

logger.warning("Query:")
QueriesNamesList = []
QueriesList = []
for query in Queries:
    if re.search(":", query):
        (query, name) = query.split(":")
    else:
        name = ""
    if not os.path.isfile(query):
        logger.error("\t-%s ... ERROR (Don't exist)", query)
        error += 1
    elif not os.stat(query).st_size:
        # Empty query file: counted as an error, but the Query object is
        # still registered so that --write_even_empty can later emit empty
        # output files for it.
        logger.error("\t-%s ... ERROR (empty)", query)
        error += 1
        empty_queries += 1
        if not name:
            name = os.path.basename(os.path.splitext(query)[0])
        new_query = ApytramLib.ApytramClasses.Query(name, query, logger)
        new_query.TmpDirName = "%s/%s" %(TmpDirName, name)
        QueriesList.append(new_query)
        QueriesNamesList.append(name)
    else:
        if not name:
            name = os.path.basename(os.path.splitext(query)[0])
        new_query = ApytramLib.ApytramClasses.Query(name, query, logger)
        new_query.TmpDirName = "%s/%s/" %(TmpDirName, name)
        ApytramLib.ApytramNeeds.set_directory_from_prefix(new_query.TmpDirName, "temporary", logger)
        logger.warning("\t-%s %s ... ok (%s sequences)", name, new_query.RawQuery, new_query.SequenceNb)
        # Each query name may be associated with only one query file
        if not name in QueriesNamesList:
            new_query.initialization()
            QueriesList.append(new_query)
            QueriesNamesList.append(name)
        else:
            logger.error("""The name "%s" must have only one associated query file.You must chose between:\n\t%s\n\t%s""",
                         new_query.Name,
                         new_query.RawQuery,
                         QueriesList[QueriesNamesList.index(new_query.Name)].RawQuery
                         )
            error += 1

if not len(QueriesList):
    logger.warning("No query")
# Define species
# -d takes a comma-separated list of "db_prefix:species" entries. With a
# single unnamed database, the placeholder species name "SP" is used.
SpeciesList = []
SpeciesNamesList = []
DBs = args.database.split(",")
UniqSpecies_flag = True
if len(DBs) > 1:
    # There are several species:
    UniqSpecies_flag = False
elif len(DBs) == 1:
    if not re.search(":", DBs[0]):
        DBs[0] += ":SP"

# Map species name -> database type, parsed from -dt ("type[:species]").
DB_types_dict = {}
for item in args.database_type.split(","):
    db_type_sp = item.split(":")
    logger.debug(db_type_sp)
    if len(db_type_sp) == 2:
        # NOTE(review): despite its name, `db_type` here holds the SPECIES
        # part of "type:species" and is used as the dict key — consistent
        # with the later DB_types_dict.get(Species, ...) lookups.
        db_type = db_type_sp[1]
        DB_types_dict[db_type] = db_type_sp[0]
    elif len(db_type_sp) == 1 and len(DBs) == 1:
        # Single database: the bare type applies to its (possibly implicit)
        # species name.
        sp = DBs[0].split(":")[1]
        DB_types_dict[sp] = db_type_sp[0]
# Comma-separated per-kind input file lists from the command line; each
# option may be absent, in which case the list stays empty.
FAs = args.fasta.split(",") if args.fasta else []
FQs = args.fastq.split(",") if args.fastq else []
IDXs = args.index.split(",") if args.index else []
CLSTRs = args.clstr.split(",") if args.clstr else []
CLSTRIDXs = args.clstridx.split(",") if args.clstridx else []
CLSTR_REPs = args.clstr_rep.split(",") if args.clstr_rep else []
# Get species names and check if a database exist
# Each -d entry "db_prefix:species" becomes one RNA_species object which
# inherits the global filtering thresholds set above.
for item in DBs:
    new_species = ApytramLib.ApytramClasses.RNA_species(start_time, logger)
    new_species.Evalue = Evalue
    new_species.MinLength = MinLength
    new_species.MinIdentityPercentage = MinIdentityPercentage
    new_species.MinAliLength = MinAliLength
    new_species.FinalMinLength = FinalMinLength
    new_species.FinalMinIdentityPercentage = FinalMinIdentityPercentage
    new_species.FinalMinAliLength = FinalMinAliLength
    new_species.keep_tmp = args.keep_tmp
    s = item.split(":")
    if len(s) == 2:
        (new_species.DatabaseName, new_species.Species) = s
        new_species.DatabaseType = DB_types_dict.get(new_species.Species ,"")
        # Check db_type and derive the stranded/paired flags from it
        if new_species.DatabaseType in ["single", "paired", "FR", "RF", "F", "R"]:
            new_species.StrandedData = new_species.DatabaseType in ["RF", "FR", "R", "F"]
            new_species.PairedData = new_species.DatabaseType in ["paired", "RF", "FR"]
        else:
            logger.error("""The species "%s" must have data type in ["single", "paired", "FR", "RF", "F", "R"] and not %s""",
                         new_species.Species,
                         new_species.DatabaseType
                         )
        # BUG FIX: the duplicate-species test must look in SpeciesNamesList
        # (the list of names); the original tested membership in SpeciesList,
        # a list of RNA_species objects, so it was always False and duplicate
        # databases for the same species were silently accepted. The error
        # branch below already indexes SpeciesNamesList, confirming intent.
        if not new_species.Species in SpeciesNamesList:
            new_species.FormatedDatabase = new_species.has_a_formated_database()
            SpeciesList.append(new_species)
            SpeciesNamesList.append(new_species.Species)
        else:
            logger.error("""The species "%s" must have only one associated database.You must chose between:\n\t%s\n\t%s""",
                         new_species.Species,
                         new_species.DatabaseName,
                         SpeciesList[SpeciesNamesList.index(new_species.Species)].DatabaseName
                         )
    else:
        logger.error(""" "%s" must be formatted as "db:species" if you want to use the multispecies option" """, item)
# associate fasta files with a species
# Each -fa entry is "fa_path[:species]"; with a single unnamed database the
# implicit species "SP" is assumed.
for item in FAs:
    f = item.split(":")
    if len(f) == 1 and UniqSpecies_flag:
        f.append("SP")
    if len(f) == 2:
        (fa, species) = f
        if os.path.isfile(fa):
            # BUG FIX: guard the species lookup, consistent with the fastq/
            # idx/clstr loops below. The original called
            # SpeciesNamesList.index(species) unconditionally, so an unknown
            # species name crashed with an unhandled ValueError instead of a
            # clean error message.
            if species in SpeciesNamesList:
                SpeciesList[SpeciesNamesList.index(species)].Fasta.append(fa)
            else:
                logger.error("The species associated with %s is %s. But there is no database associated with the species %s.", fa, species, species)
                ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
        else:
            logger.error("%s (-fa) is not a file.", fa)
            ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
    else:
        logger.error(""" "%s" must be formatted as "fa_path:species" if you want to use the multispecies option" """, item)
# associate fastq files with a species
# Each -fq entry is "fq_path[:species]"; with a single unnamed database the
# implicit species "SP" is assumed. Unknown species or missing files abort
# the run.
for item in FQs:
    f = item.split(":")
    if len(f) == 1 and UniqSpecies_flag:
        f.append("SP")
    if len(f) == 2:
        (fq, species) = f
        if os.path.isfile(fq):
            if species in SpeciesNamesList:
                SpeciesList[SpeciesNamesList.index(species)].Fastq.append(fq)
            else:
                logger.error("The species associated with %s is %s. But there is no database associated with the species %s.", fq, species, species)
                ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
        else:
            logger.error("%s (-fq) is not a file.", fq)
            ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
    else:
        logger.error(""" "%s" must be formatted as "fq_path:species" if you want to use the multispecies option" """, item)
# associate index files with a species
# Only relevant with --UseIndex. A missing index file is not fatal: the
# given path is kept and the index is built later.
if UseIndex:
    logger.debug("Idx: %s" %(IDXs))
    for item in IDXs:
        item_idx = item.split(":")
        if len(item_idx) == 1 and UniqSpecies_flag:
            item_idx.append("SP")
        if len(item_idx) == 2:
            (idx, species) = item_idx
            if os.path.isfile(idx):
                if species in SpeciesNamesList:
                    SpeciesList[SpeciesNamesList.index(species)].IndexFilename = idx
                    logger.debug("add %s to %s", idx, species)
                else:
                    logger.error("The species associated with %s is %s. But there is no species %s.", idx, species, species)
                    ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
            else:
                # NOTE(review): this branch calls SpeciesNamesList.index(species)
                # without a membership check — an unknown species here would
                # raise ValueError rather than the clean error above. Confirm.
                logger.warn("%s (-idx) is not a file. An index will be build for %s", idx, species)
                SpeciesList[SpeciesNamesList.index(species)].IndexFilename = idx
                logger.debug("add %s to %s", idx, species)
        else:
            logger.error(""" "%s" must be formatted as "index_path:species" if you want to use the multispecies option" """, item)
# associate clstr files with a species
# Each -clstr entry is "clstr_path[:species]". Unlike -idx, a missing
# cluster file is fatal.
if args.clstr:
    logger.debug("Clstr: %s" %(CLSTRs))
    for item in CLSTRs:
        item_clstr = item.split(":")
        if len(item_clstr) == 1 and UniqSpecies_flag:
            item_clstr.append("SP")
        if len(item_clstr) == 2:
            (clstr, species) = item_clstr
            if os.path.isfile(clstr):
                if species in SpeciesNamesList:
                    SpeciesList[SpeciesNamesList.index(species)].ClstrFilename = clstr
                    logger.debug("add %s to %s", clstr, species)
                else:
                    logger.error("The species associated with %s is %s. But there is no species %s.", clstr, species, species)
                    ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
            else:
                logger.error("%s (-clstr) is not a file. (%s)", clstr, species)
                ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
        else:
            logger.error(""" "%s" must be formatted as "clstr_path:species" if you want to use the multispecies option" """, item)
# associate clstridx files with a species
# Each -clstridx entry is "clstridx_path[:species]". A missing index file is
# accepted if a -clstr file exists for the species (the index is built later).
if args.clstridx:
    logger.debug("Clstridx: %s" %(CLSTRIDXs))
    for item in CLSTRIDXs:
        item_clstridx = item.split(":")
        if len(item_clstridx) == 1 and UniqSpecies_flag:
            item_clstridx.append("SP")
        if len(item_clstridx) == 2:
            (clstridx, species) = item_clstridx
            if os.path.isfile(clstridx):
                if species in SpeciesNamesList:
                    SpeciesList[SpeciesNamesList.index(species)].ClstrIndexFilename = clstridx
                    logger.debug("add %s to %s", clstridx, species)
                else:
                    logger.error("The species associated with %s is %s. But there is no species %s.", clstridx, species, species)
                    ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
            elif os.path.isfile(SpeciesList[SpeciesNamesList.index(species)].ClstrFilename):
                logger.warn("%s (-clstridx) is not a file. An clstr index will be build for %s", clstridx, species)
                SpeciesList[SpeciesNamesList.index(species)].ClstrIndexFilename = clstridx
                logger.debug("add %s to %s", clstridx, species)
            else:
                # BUG FIX: the original formatted this message with `clstr`,
                # a leftover variable from the -clstr loop — a NameError when
                # -clstr was not used, and the wrong path otherwise. Use the
                # offending clstridx path instead.
                logger.error("%s (-clstridx) is not a file and no -clstr file provided for %s", clstridx, species)
                ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
        else:
            logger.error(""" "%s" must be formatted as "clstr_path:species" if you want to use the multispecies option" """, item)
# associate clstr_rep files with a species
# Each -clstr_rep entry is "clstr_rep_path[:species]" (fasta of cluster
# representatives). Missing file or unknown species aborts the run.
if args.clstr_rep:
    logger.debug("Clstr_rep: %s" %(CLSTR_REPs))
    for item in CLSTR_REPs:
        item_clstr_rep = item.split(":")
        if len(item_clstr_rep) == 1 and UniqSpecies_flag:
            item_clstr_rep.append("SP")
        if len(item_clstr_rep) == 2:
            (clstr_rep, species) = item_clstr_rep
            if os.path.isfile(clstr_rep):
                if species in SpeciesNamesList:
                    SpeciesList[SpeciesNamesList.index(species)].ClstrRepFilename = clstr_rep
                    logger.debug("add %s to %s", clstr_rep, species)
                else:
                    logger.error("The species associated with %s is %s. But there is no species %s.", clstr_rep, species, species)
                    ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
            else:
                logger.error("%s (-clstr_rep) is not a file. (%s)", clstr_rep, species)
                ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
        else:
            logger.error(""" "%s" must be formatted as "clstr_rep_path:species" if you want to use the multispecies option" """, item)
#Check all species has a formated database or input fasta/fastq files:
# A species is usable if it has a valid data type AND either an existing
# formatted database or exactly one kind of raw input (fasta OR fastq).
logger.warning("Species:")
for Species in SpeciesList:
    if not Species.DatabaseType in ["single", "paired", "FR", "RF", "F", "R"]:
        logger.warning("\t-%s ... Unknown data type (%s) -> ERROR", Species.Species, Species.DatabaseType)
        error += 1
    elif Species.FormatedDatabase:
        logger.warning("\t-%s ... Formated database (%s)", Species.Species, Species.DatabaseType)
    elif Species.Fasta and Species.Fastq:
        # Mixing fasta and fastq input for one species is ambiguous
        logger.warning("\t-%s ... NO formated database. fasta AND fastq input files -> ERROR", Species.Species)
        error += 1
    elif Species.Fasta or Species.Fastq:
        logger.warning("\t-%s ... NO formated database. (A database will be built)", Species.Species)
    else:
        logger.warning("\t-%s ... NO formated database and NO input fasta/fastq files -> ERROR", Species.Species)
        error += 1
logger.debug("Time to parse input command line: %s", time.time() - start_time)

# Abort on any accumulated setup error — except when ALL errors were empty
# query files and --write_even_empty was requested, in which case the run
# continues so the empty output files can be produced.
if error > 0:
    if args.write_even_empty and error == empty_queries:
        logger.warning("Some query files are empty, but you use --write_even_empty option -> empty files will be create")
    else:
        logger.error("Error(s) occured, see above")
        ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)

### If iteration begin not from 1, the temporary directory must be given by the user
if StartIteration != 1 and not args.tmp:
    logger.error("If you want to restart a previous job, the previous temporary directory must be given.")
    ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)

### Get the available free space of the tmp dir
FreeSpaceTmpDir = ApytramLib.ApytramNeeds.get_free_space(TmpDirName)
logger.debug("%s free space in %s", FreeSpaceTmpDir, TmpDirName)
# Announce the index-based read retrieval mode. The original had the same
# `if UseIndex:` test duplicated in two nested levels; one suffices.
if UseIndex:
    logger.warn("Use index files instead of Blastdbcmd to retrieve reads.")
    logger.warn("Require raw reads for each species.")
    logger.warn("Check species")
### Check that there is a database for each species, otherwise build it
# For each species: build the BLAST database if absent; when --UseIndex is
# set, make sure raw reads are available and build (or reuse) the Biopython
# fasta index; build (or reuse) the cluster index when cluster files are given.
for Species in SpeciesList:
    logger.warn("\t- %s ...", Species.Species)
    Species.set_TmpDir(TmpDirName + "/db/" + Species.Species)
    if UseIndex and not Species.IndexFilename:
        # No index path supplied: place the index in the species tmp dir
        Species.IndexFilename = "%s/%s.idx" %(Species.TmpDirName, Species.Species)
    if not Species.FormatedDatabase:
        logger.info("Database %s does not exist for the species: %s" % (Species.DatabaseName, Species.Species))
        Species.prepare_database(FreeSpaceTmpDir, TmpDirName)
        Species.build_database(FreeSpaceTmpDir, TmpDirName)
    ### If Use Index and it not exists, apytram needs raw reads
    if UseIndex and not os.path.isfile(Species.IndexFilename):
        if Species.InputFastaFilename:
            logger.warn("\t\tRaw reads available (%s)", Species.InputFastaFilename)
            pass
        elif Species.Fasta or Species.Fastq:
            logger.warn("\t\t ... Raw reads needed. Get it from Input Fasta or Fastq")
            Species.set_TmpDir(TmpDirName + "/db/" + Species.Species)
            Species.prepare_database(FreeSpaceTmpDir, TmpDirName)
        elif Species.FormatedDatabase:
            logger.warn("\t\t ... Raw reads needed. Get it from the database")
            Species.get_all_reads()
        if not Species.InputFastaFilename:
            logger.error("\t\tNo raw reads available for %s.", Species.Species)
            ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
    if UseIndex:
        start_index = time.time()
        if os.path.isfile(Species.IndexFilename):
            # Reuse the existing index; verify its backing fasta files exist
            Species.IndexDB = SeqIO.index_db(Species.IndexFilename)
            if args.debug:
                logger.warn("\t\t ... (fasta already indexed) %i sequences (%s seconds)", len(Species.IndexDB), time.time() - start_index)
            else:
                logger.warn("\t\t ... (fasta already indexed)")
            Species.IndexDB.close()
            for fasta_file in Species.IndexDB._filenames:
                if not os.path.isfile(fasta_file):
                    logger.error("%s is not a file. (fasta file associeted with the index %s)", fasta_file, Species.IndexFilename)
                    ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
        else:
            start_index = time.time()
            logger.warn("Build index (%s) from %s for %s", Species.IndexFilename, Species.InputFastaFilename, Species.Species)
            Species.IndexDB = SeqIO.index_db(Species.IndexFilename, Species.InputFastaFilename, "fasta")
            logger.warn("\t\t ... fasta indexed")
            if args.debug:
                logger.warn("\t\t ... %i sequences indexed in %s seconds", len(Species.IndexDB), time.time() - start_index)
            Species.IndexDB.close()
        # BUG FIX: the original passed `start = start`, but no variable
        # `start` exists in this script (only start_time / start_index),
        # which raised a NameError whenever --UseIndex was active.
        Species.add_time_statistic("Prep_fasta_index", start = start_index)
        Species.logger.info("End Prep fasta index for %s (%s seconds)", Species.Species , Species.get_time_statistic("Prep_fasta_index"))
    if Species.ClstrIndexFilename and Species.ClstrFilename:
        start_clstr_index = time.time()
        if os.path.isfile(Species.ClstrIndexFilename):
            # Reuse the existing cluster index; verify its backing files exist
            Species.ClstrIndexDB = SeqIO.index_db(Species.ClstrIndexFilename)
            if args.debug:
                logger.warn("\t\t ... (cluster already indexed) %i cluster (%s seconds)", len(Species.ClstrIndexDB), time.time() - start_clstr_index)
            else:
                logger.warn("\t\t ... (cluster already indexed)")
            Species.ClstrIndexDB.close()
            for clstr_file in Species.ClstrIndexDB._filenames:
                if not os.path.isfile(clstr_file):
                    logger.error("%s is not a file. (clstr file associated with the index %s)", clstr_file, Species.ClstrIndexFilename)
                    ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
        else:
            # NOTE(review): given the enclosing condition, ClstrIndexFilename
            # is always truthy here; this defensive default is kept as-is.
            if not Species.ClstrIndexFilename:
                Species.ClstrIndexFilename = "%s/%s.clstr.idx" %(Species.TmpDirName, Species.Species)
            logger.warn("Build index (%s) from %s for %s", Species.ClstrIndexFilename, Species.ClstrFilename, Species.Species)
            Species.ClstrIndexDB = SeqIO.index_db(Species.ClstrIndexFilename, Species.ClstrFilename, "fasta")
            logger.warn("\t\t ... cluster indexed")
            if args.debug:
                logger.warn("\t\t ... %i cluster indexed in %s seconds", len(Species.ClstrIndexDB), time.time() - start_clstr_index)
            Species.ClstrIndexDB.close()
        Species.add_time_statistic("Prep_clstr_index", start = start_clstr_index)
        Species.logger.info("End Prep clstr_index for %s (%s seconds)", Species.Species , Species.get_time_statistic("Prep_clstr_index"))
    logger.warn("\t\t ... Ok")
### If there is a query continue, else stop
if not args.query:
logger.info("There is no query (-q), apytram has finished.")
ApytramLib.ApytramNeeds.end(0, TmpDirName, keep_tmp=args.keep_tmp)
else:
ApytramLib.ApytramNeeds.set_directory_from_prefix(args.output_prefix, "output", logger)
### Set up the output directory
if args.output_prefix:
OutDirName = os.path.dirname(args.output_prefix)
OutPrefixName = args.output_prefix
if os.path.isdir(OutDirName):
logger.info("The output directory %s exists", os.path.dirname(args.output_prefix))
elif OutDirName: # if OutDirName is not a empty string we create the directory
logger.info("The output directory %s does not exist, it will be created", os.path.dirname(args.output_prefix))
os.makedirs(os.path.dirname(args.output_prefix))
else:
logger.error("The output prefix must be defined")
ApytramLib.ApytramNeeds.end(1, TmpDirName, keep_tmp=args.keep_tmp)
# =========================================================================
# Main iterative assembly loop: for each query, repeatedly (1) fish reads
# with the current bait sequences, (2) assemble them with Trinity,
# (3) keep contigs homologous to the references, until no species improves,
# MaxIteration is reached, or the time budget runs out.
# Per-species state machine: Species.Improvment / Species.CompletedIteration
# are set False at the first failing step, and every later step is guarded
# by "if Species.Improvment" so the rest of the iteration is skipped.
# =========================================================================
for Query in QueriesList:
logger.warning("NEW QUERY: %s", Query.Name)
# All output files for this query share the "<prefix>.<query name>" stem.
Query.OutPrefixName = "%s.%s" %(OutPrefixName, Query.Name)
Query.NbSpecies = len(SpeciesNamesList)
Query.StartTime = time.time()
# Reset per-species state (iteration counters, bait files) for this query.
for Species in SpeciesList:
Species.new_query(Query)
#Iterative process
# Query.SequenceNb falsy => empty query, skip the loop entirely.
while (Query.AbsIteration < MaxIteration) and (Query.continue_iter()) and Query.SequenceNb:
Query.AbsIteration += 1
# Collect, per iteration, the species that stopped improving.
Query.SpeciesWithoutImprovment[Query.AbsIteration] = []
logger.warning("\tIteration %d/%d", Query.AbsIteration, MaxIteration)
for Species in SpeciesList:
if not Species.Finished:
# Build new baitsequences file
# NOTE(review): called once per unfinished species but takes the whole
# SpeciesList -- presumably rebuilds baits from all species' last
# results; confirm it is not redundant inside this inner loop.
Query.new_species_iteration(SpeciesList)
if Species.Improvment:
Species.new_iteration()
logger.warning("\t\t Start iteration %d/%d for %s", Species.CurrentIteration, MaxIteration, Species.Species)
### Blast bait sequences on database of reads
# Write read names in ReadNamesFile if the file does not exist
# (an existing file means a resumed/previous run is reused as-is).
if not os.path.isfile(Species.ReadNamesFilename):
Species.fish_reads(Query.BaitSequences, Threads)
else:
logger.warn("%s has already been created, it will be used", Species.ReadNamesFilename)
start_time_enrich_read_list = time.time()
logger.info("%s reads", ApytramLib.ApytramNeeds.count_lines(Species.ReadNamesFilename))
if Species.ClstrIndexDB:
# Get reads names from read clusters
logger.info("Remove duplicated names")
logger.debug("%s reads in %s", ApytramLib.ApytramNeeds.count_lines(Species.ReadNamesFilename), Species.ReadNamesFilename)
# NOTE(review): source and destination are the same file here and in
# the cluster expansion below -- relies on the helpers tolerating
# in-place rewrites; confirm they read fully before writing.
ApytramLib.ApytramNeeds.remove_duplicated_read_names(Species.ReadNamesFilename, Species.ReadNamesFilename, logger)
logger.info("%s reads", ApytramLib.ApytramNeeds.count_lines(Species.ReadNamesFilename))
logger.debug("%s reads in %s", ApytramLib.ApytramNeeds.count_lines(Species.ReadNamesFilename), Species.ReadNamesFilename)
logger.info("Get reads names from read clusters")
# Expand each fished read name to every member of its cd-hit cluster.
ApytramLib.ApytramNeeds.retrieve_reads_from_cluster(Species.ClstrIndexDB, Species.ReadNamesFilename, Species.ReadNamesFilename)
logger.debug("%s reads in %s", ApytramLib.ApytramNeeds.count_lines(Species.ReadNamesFilename), Species.ReadNamesFilename)
logger.info("%s reads", ApytramLib.ApytramNeeds.count_lines(Species.ReadNamesFilename))
if Species.PairedData:
# Get paired reads names and remove duplicated names
# (Trinity needs both mates; dedup happens inside the helper.)
logger.info("Get paired reads names and remove duplicated names")
ApytramLib.ApytramNeeds.add_paired_read_names(Species.ReadNamesFilename, Species.ParsedReadNamesFilename, logger)
logger.info("%s reads", ApytramLib.ApytramNeeds.count_lines(Species.ParsedReadNamesFilename))
else:
# Remove duplicated names
logger.info("Remove duplicated names")
ApytramLib.ApytramNeeds.remove_duplicated_read_names(Species.ReadNamesFilename, Species.ParsedReadNamesFilename, logger)
logger.info("%s reads", ApytramLib.ApytramNeeds.count_lines(Species.ParsedReadNamesFilename))
logger.info("%s second to retrieve all read names", str(time.time() - start_time_enrich_read_list))
# Count the number of reads which will be used in the Trinity assembly
logger.info("Count the number of reads")
Species.ReadsNumber = ApytramLib.ApytramNeeds.count_lines(Species.ParsedReadNamesFilename)
Species.add_iter_statistic("ReadsNumber", Species.ReadsNumber)
# Zero recruited reads: nothing to assemble, stop this species' iteration.
if not Species.ReadsNumber:
logger.warning("No read recruted by Blast at the iteration %s", Species.CurrentIteration)
Species.Improvment = False
Species.CompletedIteration = False
if Species.Improvment:
# Compare the read list names with the list of the previous iteration:
# (keyword "nb_intial" [sic] is the helper's own parameter name)
NbNewReads = ApytramLib.ApytramNeeds.number_new_reads(Species.PreviousReadNamesFilename, Species.ParsedReadNamesFilename, nb_intial=Species.ReadsNumber)
OldNumberReads = ApytramLib.ApytramNeeds.count_lines(Species.PreviousReadNamesFilename)
logger.warning("Iteration: %s - Species: %s - Number of new reads: %s", Species.CurrentIteration, Species.Species, NbNewReads)
# Converged: identical read set => stop unless --finish_all_iter forces on.
if (NbNewReads == 0) and not FinishAllIter:
logger.info("Reads from the current iteration are identical to reads from the previous iteration")
Species.Improvment = False
Species.CompletedIteration = False
# Runaway recruitment guard: >20k new reads AND >5x growth => likely
# off-target baits, stop rather than feed Trinity a huge read set.
if OldNumberReads !=0 and NbNewReads > 20000 and NbNewReads > 5 * OldNumberReads:
logger.info("Number of reads has soared -> stop iteration")
Species.Improvment = False
Species.CompletedIteration = False
if Species.Improvment:
### Retrieve reads sequences
# Fastest available extraction method is picked first:
# index+seqtk > index > blastdbcmd+seqtk > seqtk > blastdbcmd.
if SeqtkAvailable and UseIndex and Species.IndexDB:
Species.get_read_sequences(Threads, Memory, meth="index_seqtk")
elif UseIndex and Species.IndexDB:
Species.get_read_sequences(Threads, Memory, meth="index")
elif SeqtkAvailable and Species.FormatedDatabase:
Species.get_read_sequences(Threads, Memory, meth="blastdbcmd_seqtk")
elif SeqtkAvailable and Species.InputFastaFilename:
Species.get_read_sequences(Threads, Memory, meth="seqtk")
else:
Species.get_read_sequences(Threads, Memory, meth="blastdbcmd")
### Launch Trinity
Species.launch_Trinity(Threads, Memory, long_read=True, cds=args.cds)
# Absent output fasta is Trinity's "no assembly" signal, not an error.
if not os.path.isfile(Species.TrinityFastaFilename): # Trinity found nothing
Species.Improvment = False
Species.CompletedIteration = False
if Species.Improvment:
### Filter Trinity contigs to keep only homologous sequences of the reference genes
logger.info("Compare Trinity results with query sequences")
Species.get_homology_between_trinity_results_and_references(Query)
if not Species.HomologyOnRefResult:
logger.info("Reconstructed sequences but no homologous with references (even with the more sensible model)")
Species.Improvment = False
Species.CompletedIteration = False
if Species.Improvment:
# Keep only sequence with a identity percentage > MinIdentitypercentage on the whole hit
# and write filtered sequences in a file
Species.filter_trinity_results_according_homology_results()
### Validated sequences (Species.FilteredTrinityFasta) become bait sequences
if not Species.FilteredTrinityFasta.Sequences:
logger.warning("No sequence has passed the iteration filter at the iteration %s for %s", Species.CurrentIteration, Species.Species)
Species.Improvment = False
Species.CompletedIteration = False
else:
### Compare sequences of the current iteration to those of the previous iteration
logger.info("Compare results with the previous iteration")
#Check if the number of contigs has changed
logger.info("Check if the number of contigs has changed")
# RelIter=-1 reads the same statistic from the previous iteration.
if Species.get_iter_statistic("NbContigs") != Species.get_iter_statistic("NbContigs", RelIter=-1):
logger.info("The number of contigs has changed")
elif Query.AbsIteration >= 2:
# Use Exonerate/Blast to compare the current iteration with the previous
# (may itself clear Species.Improvment/CompletedIteration).
Species.compare_current_and_previous_iterations()
# Check that the coverage has increased compared to the previous iteration
# RequiredCoverage > 100 is the sentinel for "never stop on coverage".
if RequiredCoverage <=100:
if Species.CompletedIteration:
logger.info("Check that the coverage has increased compared to the previous iteration")
Species.measure_coverage(Query)
# Stop iteration if both Largecoverage and Total length are not improved
##if Species.get_iter_statistic("AverageLength") != Species.get_iter_statistic("AverageLength", RelIter=-1):
## pass
##elif Species.get_iter_statistic("AverageScore") != Species.get_iter_statistic("AverageScore", RelIter=-1):
## pass
##elif Species.get_iter_statistic("TotalLength") != Species.get_iter_statistic("TotalLength", RelIter=-1):
## pass
##elif Species.get_iter_statistic("TotalScore") != Species.get_iter_statistic("TotalScore", RelIter=-1):
## pass
##elif Species.get_iter_statistic("BestScore") != Species.get_iter_statistic("BestScore", RelIter=-1):
## pass
##elif Species.get_iter_statistic("LargeCoverage") != Species.get_iter_statistic("LargeCoverage", RelIter=-1):
## logger.info("This iteration have a large coverage inferior (or equal) to the previous iteration")
## Species.Improvment = False
# Stop iteration if the RequiredCoverage is reached
if Species.get_iter_statistic("StrictCoverage") >= RequiredCoverage:
logger.info("This iteration attains the required bait sequence coverage (%d >= %d)", Species.get_iter_statistic("StrictCoverage"), RequiredCoverage)
Species.Improvment = False
### Write a fasta file for this iteration if the option --keep_iterations was selected
#if KeepIterations:
# if not args.no_best_file:
# # Best sequences of the iteration
# ExitCode = ApytramLib.ApytramNeeds.write_apytram_output(FilteredTrinityFasta, TrinityExonerateResultsDict,
# "%s.iter_%d.best.fasta" %(OutPrefixName,i),
# Header = TrinityExonerateProcess.Ryo.replace('%',"").replace("\n","").split(),
# Names = BestScoreNames.values(),
# Message = "iter_%d.best." %i)
# # All sequences of the iteration
# ExitCode = ApytramLib.ApytramNeeds.write_apytram_output(FilteredTrinityFasta,
# TrinityExonerateResultsDict,
# "%s.iter_%d.fasta" %(OutPrefixName,i),
# Header = TrinityExonerateProcess.Ryo.replace('%',"").replace("\n","").split(),
# Message = "iter_%d." %i)
# # Mafft alignment
# ApytramLib.ApytramNeeds.write_in_file(MafftResult,"%s.iter_%s.ali.fasta" %(OutPrefixName,i))
# End iteration
# FinalIteration records the last iteration whose results are usable;
# an aborted iteration is rolled back by one.
Species.FinalIteration = Species.CurrentIteration
if not Species.CompletedIteration:
logger.debug("Iteration stop before end")
Species.FinalIteration -= 1
if not Species.Improvment:
Query.SpeciesWithoutImprovment[Query.AbsIteration].append(Species.Species)
Species.end_iteration() # just stop timer
logger.info("End of the iteration %s for %s : --- %s seconds ---", Species.CurrentIteration, Species.Species, Species.get_iter_statistic("IterationTime"))
# Wall-clock budget check: flag the query so continue_iter() ends the while-loop.
if (time.time() - Query.StartTime) > MaxTime:
logger.warn("No new iteration for this query and this species will begin because the maximum duration (%s seconds) of the job is attained. (%s seconds)", MaxTime, str(time.time() - Query.StartTime))
Query.Stop = True
logger.info("End of Iterations for %s. Iterative process takes %s seconds.", Query.Name, str(time.time() - Query.StartTime))
### Final filter
# After the iterative process: apply the optional stricter final filters
# (length / identity / alignment-length) to each species' last usable
# iteration, then start writing per-species outputs (continues past this view).
start_output = time.time()
for Species in SpeciesList:
if Species.FinalIteration: #We check that there is at least one iteration with a result
if Species.FinalMinLength or Species.FinalMinIdentityPercentage or Species.FinalMinAliLength: # A final filter is required
start_iter_i = time.time()
# Open a fresh iteration context so filter statistics are recorded
# separately from the assembly iterations.
Species.new_iteration()
logger.info("Start final filter for %s", Species.Species)
# Keep only sequence with a identity percentage > FinalMinIdentitypercentage on the whole hit
# and write filtered sequences in a file
Species.filter_trinity_results_according_homology_results(final_iteration=True)
else:
# FinalIteration == 0: every iteration aborted before producing sequences.
logger.warn("No results for %s", Species.Species)
# --write_even_empty forces output files even when the final filter kept nothing.
if Species.FilteredTrinityFasta.Sequences or args.write_even_empty: # If sequences pass the last filter
Species.rename_sequences()
logger.debug("output: %s", Species.Species)
# Prepare fasta output files by species
if not args.no_best_file: