def get_fasta_file_names(self):
     fa_files = []
     pipelne_utils   = PipelneUtils()
     files = pipelne_utils.get_all_files(self.fasta_dir)
     for full_name in files.keys():    
         if (files[full_name][1] == ".unique") and ((files[full_name][0].split(".")[-1].strip() == "fa") or (files[full_name][0].split("_")[-1] == "FILTERED")):
             fa_files.append(full_name)
     return fa_files
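# A minimal sketch of the dictionary this filter iterates over, assuming
# PipelneUtils.get_all_files() maps each full path to the (base, extension)
# pair produced by os.path.splitext(), as in the get_all_files() shown further
# below. The path is hypothetical.
#
#   files = {
#       "/runs/run01/lane_1-PERFECT_reads.fa.unique":
#           ("/runs/run01/lane_1-PERFECT_reads.fa", ".unique"),
#   }
#
# For that entry files[full_name][1] == ".unique" and
# files[full_name][0].split(".")[-1].strip() == "fa", so it is collected.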
Example 2
 def get_fasta_file_names(self):
     fa_files = []
     pipelne_utils   = PipelneUtils()
     files = pipelne_utils.get_all_files(self.in_file_path)
     for full_name in files.keys():    
         if (files[full_name][1] == ".unique") and (files[full_name][0].split(".")[-1].strip() == "fa"):
             print full_name
             fa_files.append(full_name)
     return fa_files
 def __init__(self, command_line_args = None, configuration_dictionary = None):
     self.args = command_line_args
     self.general_config_dict = configuration_dictionary
     self.known_header_list  = C.csv_header_list
     self.pipeline_run_items = C.pipeline_run_items
     self.primer_suites      = self.convert_primer_suites(C.primer_suites)
     self.dna_regions        = C.dna_regions
     self.data_object = {}
     self.data_object['general'] = {}
     self.warn_msg = """\n\tThe config file seems to be okay. If the items above look correct,
     then press 'c' to continue the pipeline\n"""
     self.res_headers = []
     self.env = {}
     self.utils  = PipelneUtils()
Example 4
    def __init__(self, host="bpcweb7", db="test"):
# , read_default_file=os.path.expanduser("~/.my.cnf"), port = 3306
        
        self.utils  = PipelneUtils()        
        self.conn   = None
        self.cursor = None
        self.rows   = 0
        self.new_id = None
        self.lastrowid = None
        
        try:           
            self.utils.print_both("=" * 40)
            self.utils.print_both("host = " + str(host) + ", db = "  + str(db))
            self.utils.print_both("=" * 40)
            read_default_file = os.path.expanduser("~/.my.cnf")
            port_env = 3306
            
            if self.utils.is_local():
                host = "127.0.0.1"
                if db == "env454":
                    port_env = 3308
                    read_default_file = os.path.expanduser("~/.my.cnf_server")
                else:
                    db = "test_env454"
            self.conn   = MySQLdb.connect(host = host, db = db, read_default_file = read_default_file, port = port_env)
            self.cursor = self.conn.cursor()
            # self.escape = self.conn.escape()
                   
        except MySQLdb.Error, e:
            self.utils.print_both("Error %d: %s" % (e.args[0], e.args[1]))
            raise
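    # A hedged usage sketch (not from the original source): per the MyConnection
    # docstring further below, connection parameters come from ~/.my.cnf unless
    # host and db are passed explicitly. The values here are illustrative only.
    #
    #   my_conn = MyConnection(host = "bpcweb7", db = "test")
    #   my_conn.cursor.execute("SELECT 1")
    #   print my_conn.cursor.fetchone()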
Example 5
    def __init__(self, runobj = None):
        self.utils      = PipelneUtils()
        self.runobj     = runobj
        self.run_keys   = self.runobj.run_keys
        self.rundate    = self.runobj.run
        try:
            self.use_cluster = self.runobj.use_cluster
        except:
            self.use_cluster = True
        self.chg_suffix         = ".chg"
        self.chimeras_suffix    = ".chimeras"      
        self.ref_suffix         = ".db"      
        self.denovo_suffix      = ".txt"        
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.chimeric_suffix    = ".chimeric.fa"
        self.base_suffix        = "unique" + self.chimeras_suffix

        try:
            if self.runobj.lane_name:
                lane_name = self.runobj.lane_name
            else:
                lane_name = ''
        except:
            lane_name = ''

        if self.runobj.vamps_user_upload:
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
            self.idx_keys = convert_unicode_dictionary_to_str(json.loads(open(self.runobj.trim_status_file_name,"r").read()))["new_lane_keys"] 
            self.indir  = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
            self.indir  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        
        
#         self.usearch_cmd = C.usearch_cmd
        self.usearch_cmd = C.usearch6_cmd        
        #self.abskew      = C.chimera_checking_abskew
        self.refdb       = C.chimera_checking_refdb_6
        self.its_refdb   = C.chimera_checking_its_refdb_6
        self.input_file_names  = self.make_chimera_input_illumina_file_names()
    def get_fasta_file_names(self):
        fa_files = []
        pipelne_utils   = PipelneUtils()
        files = pipelne_utils.get_all_files(self.fasta_dir)

        for full_name in files.keys():
                
#             if (files[full_name][1] == ".unique") and ((files[full_name][0].split(".")[-1].strip() == "fa") or (files[full_name][0].split("_")[-1] == C.filtered_suffix)):
            if (full_name.endswith(self.nonchimeric_suffix)):                
                fa_files.append(full_name)
                print full_name
                self.suffix_used = self.nonchimeric_suffix
                continue
            elif (full_name.endswith(self.fa_unique_suffix)):
                fa_files.append(full_name)
                print full_name
                self.suffix_used = self.fa_unique_suffix                
        return fa_files
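    # Given the suffix values set in the related __init__ snippets below
    # (".nonchimeric.fa" and ".fa.unique", per their inline comments), a
    # hypothetical "lane_1-PERFECT_reads.fa.unique" would be collected by the
    # elif branch and self.suffix_used set to the ".fa.unique" suffix.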
Example 7
    def __init__(self, runobj = None):
        self.utils       = PipelneUtils()
        self.runobj      = runobj
        self.rundate     = self.runobj.run
        self.use_cluster = 1       
        self.unique_fasta_files = []
#        if self.runobj.vamps_user_upload:
#            site       = self.runobj.site
#            dir_prefix = self.runobj.user + '_' + self.runobj.run
#        else:
#            site = ''
#            dir_prefix = self.runobj.run         
#        dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site = site)

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix=self.runobj.user+'_'+self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''
        
        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
 
        
        self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
        self.fasta_dir    = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.gast_dir     = self.dirs.check_dir(self.dirs.gast_dir)

        host_name     = runobj.database_host
        database_name = runobj.database_name
        
        self.filenames   = []
        self.my_conn     = MyConnection(host = 'newbpcdb2.jbpc-np.mbl.edu', db="env454")
#         self.my_conn     = MyConnection()

#         self.my_conn     = MyConnection(host = 'localhost', db="test_env454")
        self.sequence_table_name = "sequence_ill" 
        self.sequence_field_name = "sequence_comp" 
        self.my_csv              = None

        self.unique_file_counts = self.dirs.unique_file_counts
        self.dirs.delete_file(self.unique_file_counts)
        self.seq_id_dict = {}
        self.tax_id_dict = {}
        self.run_id      = None
#        self.nonchimeras_suffix = ".nonchimeric.fa"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.fa_unique_suffix   = ".fa." + C.unique_suffix #.fa.unique
        self.v6_unique_suffix   = "MERGED_V6_PRIMERS_REMOVED." + C.unique_suffix
        self.suff_list = [self.nonchimeric_suffix, self.fa_unique_suffix, self.v6_unique_suffix]

#         self.merge_unique_suffix = "." + C.filtered_suffix + "." + C.unique_suffix #.MERGED-MAX-MISMATCH-3.unique
        self.suffix_used        = ""
Example 8
    def __init__(self, runobj = None):
        self.utils      = PipelneUtils()
        self.runobj     = runobj
        self.run_keys   = self.runobj.run_keys
        self.rundate    = self.runobj.run
        try:
            self.use_cluster = self.runobj.use_cluster
        except:
            self.use_cluster = True
        self.chg_suffix         = ".chg"
        self.chimeras_suffix    = ".chimeras"      
        self.ref_suffix         = ".db"      
        self.denovo_suffix      = ".txt"        
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.chimeric_suffix    = ".chimeric.fa"
        self.base_suffix        = "unique" + self.chimeras_suffix

        try:
            if self.runobj.lane_name:
                lane_name = self.runobj.lane_name
            else:
                lane_name = ''
        except:
            lane_name = ''

        if self.runobj.vamps_user_upload:
            os.environ['SGE_ROOT'] ='/opt/sge'
            os.environ['SGE_CELL'] ='grendel'
            path                   = os.environ['PATH']
            os.environ['PATH']     = '/opt/sge/bin/lx24-amd64:'+path
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
            self.idx_keys = convert_unicode_dictionary_to_str(json.loads(open(self.runobj.trim_status_file_name,"r").read()))["new_lane_keys"] 
            self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)  
            self.indir  = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)        
            self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)

        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
            self.indir  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        
        
#         self.usearch_cmd = C.usearch_cmd
        self.usearch_cmd = C.usearch6_cmd        
        #self.abskew      = C.chimera_checking_abskew
        self.refdb       = C.chimera_checking_refdb_6
        self.its_refdb   = C.chimera_checking_its_refdb_6
        self.input_file_names  = self.make_chimera_input_illumina_file_names()
    def __init__(self, runobj):
        self.utils = PipelneUtils()
        self.runobj         = runobj
        self.out_files      = {}
        self.id_dataset_idx = {}
        self.in_file_path   = self.runobj.input_dir

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix=self.runobj.user+'_'+self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''

        dirs      = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
        self.dirs = dirs
        self.out_file_path = dirs.check_dir(dirs.analysis_dir)
        self.results_path  = dirs.check_dir(dirs.reads_overlap_dir)
        self.platform = self.runobj.platform
Example 10
class MyConnection:
    """
    Connection to env454
    Takes parameters from ~/.my.cnf; default host = "bpcweb7", db = "test"
    if different use my_conn = MyConnection(host, db)
    """
    def __init__(self, host="bpcweb7", db="test"):
# , read_default_file=os.path.expanduser("~/.my.cnf"), port = 3306
        
        self.utils  = PipelneUtils()        
        self.conn   = None
        self.cursor = None
        self.rows   = 0
        self.new_id = None
        self.lastrowid = None
        
        try:           
            self.utils.print_both("=" * 40)
            self.utils.print_both("host = " + str(host) + ", db = "  + str(db))
            self.utils.print_both("=" * 40)
            read_default_file = os.path.expanduser("~/.my.cnf")
            port_env = 3306
            
            if self.utils.is_local():
                host = "127.0.0.1"
                if db == "env454":
                    port_env = 3308
                    read_default_file = os.path.expanduser("~/.my.cnf_server")
                else:
                    db = "test_env454"
            self.conn   = MySQLdb.connect(host = host, db = db, read_default_file = read_default_file, port = port_env)
            self.cursor = self.conn.cursor()
            # self.escape = self.conn.escape()
                   
        except MySQLdb.Error, e:
            self.utils.print_both("Error %d: %s" % (e.args[0], e.args[1]))
            raise
        except:                       # catch everything
            self.utils.print_both("Unexpected:")
            self.utils.print_both(sys.exc_info()[0])
            raise
    def __init__(self, runobj = None):
        self.utils      = PipelneUtils()
        self.runobj     = runobj
        self.run_keys   = self.runobj.run_keys
        self.rundate    = self.runobj.run
        
        self.chg_suffix         = ".chg"
        self.chimeras_suffix    = ".chimeras"      
        self.ref_suffix         = ".db"      
        self.denovo_suffix      = ".txt"        
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.chimeric_suffix    = ".chimeric.fa"
        self.base_suffix        = "unique" + self.chimeras_suffix

        try:
            if self.runobj.lane_name:
                lane_name = self.runobj.lane_name
            else:
                lane_name = ''
        except:
            lane_name = ''

        if self.runobj.vamps_user_upload:
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
            self.idx_keys = convert_unicode_dictionary_to_str(json.loads(open(self.runobj.trim_status_file_name,"r").read()))["new_lane_keys"] 
            self.indir  = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
            self.indir  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        
        
#         self.usearch_cmd = C.usearch_cmd
        self.usearch_cmd = C.usearch6_cmd        
        #self.abskew      = C.chimera_checking_abskew
        self.refdb       = C.chimera_checking_refdb_6
        self.its_refdb   = C.chimera_checking_its_refdb_6
        self.input_file_names  = self.make_chimera_input_illumina_file_names()
Example 12
 def __init__(self, runobj):
     self.utils = PipelneUtils()
     self.runobj         = runobj
     self.out_files      = {} 
     self.id_dataset_idx = {}
     self.in_file_path   = self.runobj.input_dir
             
     if self.runobj.vamps_user_upload:
         site = self.runobj.site
         dir_prefix=self.runobj.user+'_'+self.runobj.run
     else:
         site = ''
         dir_prefix = self.runobj.run
     if self.runobj.lane_name:
         lane_name = self.runobj.lane_name
     else:
         lane_name = ''
     
     dirs      = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
     self.dirs = dirs
     self.out_file_path = dirs.check_dir(dirs.analysis_dir)
     self.results_path  = dirs.check_dir(dirs.reads_overlap_dir)
Example 13
import sys

from pipeline.get_ini import readCSV
from pipeline.pipelinelogging import logger
from pipeline.utils import Dirs, PipelneUtils
import IlluminaUtils.lib.fastalib as fastalib

try:
    import MySQLdb
except ImportError, e:
    message = """
    MySQLdb ERROR
      To load the correct module, try running these commands before running the pipeline:
       
source /xraid/bioware/Modules/etc/profile.modules
module load bioware
    """
    PipelneUtils.print_both(message)
    PipelneUtils.print_both("Error %d: %s" % (e.args[0], e.args[1]))
    raise
except:                       # catch everything
    PipelneUtils.print_both("Unexpected:")
#     print "Unexpected:"         # handle unexpected exceptions
    PipelneUtils.print_both(sys.exc_info()[0])
#     print sys.exc_info()[0]     # info about curr exception (type,value,traceback)
    raise          

#     sys.exit("""
#     MySQLdb ERROR
#       To load the correct module, try running these commands before running the pipeline:
#       
# source /xraid/bioware/Modules/etc/profile.modules
# module load bioware
Example 14
class IlluminaFiles:
    """
    0) from the run, create files named for all dataset_lanes in the output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files
    3) process them through Meren's script
    4) result: dataset_lane-PERFECT_reads.fa.unique files with frequencies, to process with env454upload()
    
    """
    def __init__(self, runobj):
        self.utils = PipelneUtils()
        self.runobj         = runobj
        self.out_files      = {} 
        self.id_dataset_idx = {}
        self.in_file_path   = self.runobj.input_dir
                
        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix=self.runobj.user+'_'+self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''
        
        dirs      = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
        self.dirs = dirs
        self.out_file_path = dirs.check_dir(dirs.analysis_dir)
        self.results_path  = dirs.check_dir(dirs.reads_overlap_dir)
        
    def split_files(self, compressed = False):
        """
        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """   
#        print "compressed = %s" %       compressed
#        compressed = ast.literal_eval(compressed)     
        (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path)
#         correct_file_names = self.get_correct_file_names(in_files_r1)
        if (len(in_files_r1) > 0):
            self.read1(in_files_r1, compressed)
            self.read2(in_files_r2, compressed)
            self.create_inis()
        else:
#             print "ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes."
#             logger.debug("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
            self.utils.print_both("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
        self.close_dataset_files()

            

#        self.perfect_reads()
#        self.uniq_fa()

    def open_dataset_files(self):
        file_name_base = [i + "_R1" for i in self.runobj.samples.keys()] + [i + "_R2" for i in self.runobj.samples.keys()]
        for f_name in file_name_base:
            output_file = os.path.join(self.out_file_path, f_name + ".fastq")
            self.out_files[f_name] = fq.FastQOutput(output_file)
        self.out_files["unknown"] = fq.FastQOutput(os.path.join(self.out_file_path, "unknown" + ".fastq"))        

    def close_dataset_files(self):
        for o_file in self.out_files.itervalues():
            o_file.close()
        return
      
#     def perfect_reads(self):
#         self.utils.print_both("Extract perfect V6 reads:")
#         for idx_key in self.runobj.samples.keys():
#             file_name = os.path.join(self.out_file_path, idx_key + ".ini")
#             program_name = C.perfect_overlap_cmd
#             if self.utils.is_local():
#                 program_name = C.perfect_overlap_cmd_local       
#             try:
#                 if self.runobj.samples[idx_key].primer_suite.lower().startswith('archaeal'):
#                     call([program_name, file_name, "--archaea"]) 
#                 else: 
#                     call([program_name, file_name])
#             except:
#                 self.utils.print_both("Problems with program_name = %s, file_name = %s" % (program_name, file_name))
#                 raise  
#     
    def call_sh_script(self, script_name_w_path, where_to_run):
        try:
            call(['chmod', '0774', script_name_w_path])
            if self.utils.is_local():
                self.utils.print_both("call(['qsub', script_name_w_path], cwd=(where_to_run))")
                call(['bash', script_name_w_path], cwd=(where_to_run))                
            else:
                call(['qsub', script_name_w_path], cwd=(where_to_run))
#             pass
        except:
            self.utils.print_both("Problems with script_name = %s or qsub" % (script_name_w_path))
            raise     
        
#     todo: combine and DRY with partial - it's the same command, different arguments
    def merge_perfect(self):
        self.utils.print_both("merge perfect V6 reads:")
        program_name = C.perfect_overlap_cmd
        if self.utils.is_local():
            program_name = C.perfect_overlap_cmd_local
        add_arg = " --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0"
        command_line          = program_name + add_arg
        file_list             = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name      = self.create_job_array_script(command_line, self.dirs.analysis_dir, file_list)
        script_file_name_full = os.path.join(self.dirs.analysis_dir, script_file_name)
        self.call_sh_script(script_file_name_full, self.dirs.analysis_dir)  
        return script_file_name    
    
    def trim_primers_perfect(self):
        self.utils.print_both("trim primers from perfect V6 reads:")
        
        merged_file_names = self.dirs.get_all_files_by_ext(self.dirs.reads_overlap_dir, "_MERGED")
        primer_suite = self.get_config_values('primer_suite')
        add_arg = ""
        if any(s.lower().startswith("archaeal") for s in primer_suite):
            add_arg += " --archaea"
        program_name = C.trim_primers_cmd + add_arg
        script_file_name      = self.create_job_array_script(program_name, self.dirs.reads_overlap_dir, merged_file_names)
        script_file_name_full = os.path.join(self.dirs.reads_overlap_dir, script_file_name)
        self.call_sh_script(script_file_name_full, self.dirs.reads_overlap_dir)  
        return script_file_name    

    """    
    def perfect_reads_cluster(self):
        '''
        iu-merge-pairs anna.ini --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0
            Each flag is critical: marker-gene-stringent looks for complete overlaps, retain-only-overlap gets rid of adapters, max-num-mismatches retains only perfect overlaps.
            This generates the test_MERGED file with all complete overlaps without any mismatches. But it has all the primers. 
            Then we process this file with the new and shiny iu-analyze-v6-complete-overlaps script:
        iu-trim-V6-primers test_MERGED

        '''
        self.utils.print_both("Extract perfect V6 reads:")
        script_file_name      = self.merge_perfect()
        trim_script_file_name = self.trim_primers_perfect()

        return (script_file_name, trim_script_file_name)    
    """          
                              
    def partial_overlap_reads_cluster(self):
        self.utils.print_both("Extract partial_overlap V4V5 reads:")
        program_name = C.partial_overlap_cmd
        if self.utils.is_local():
            program_name = C.partial_overlap_cmd_local       
        dna_region = self.get_config_values('dna_region')
        if ("ITS1" in list(dna_region)):
            add_arg = "--marker-gene-stringent"
        else:
            add_arg = ""
#         TODO: this part is the same in perfect overlap - move into a method    
        command_line          = program_name + " --enforce-Q30-check " + add_arg 
        file_list             = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name      = self.create_job_array_script(command_line, self.dirs.analysis_dir, file_list)
        script_file_name_full = os.path.join(self.dirs.analysis_dir, script_file_name)
        self.call_sh_script(script_file_name_full, self.dirs.analysis_dir)  
        self.utils.print_both("self.dirs.chmod_all(%s)" % (self.dirs.analysis_dir))
        self.dirs.chmod_all(self.dirs.analysis_dir)        
        
        return script_file_name      
                    
    def partial_overlap_reads(self):
        self.utils.print_both("Extract partial_overlap V4V5 reads:")
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.partial_overlap_cmd
            if self.utils.is_local():
                program_name = C.partial_overlap_cmd_local        
            try:
                if (self.runobj.samples[idx_key].dna_region == "ITS1"):
                    call([program_name, "--enforce-Q30-check", "--marker-gene-stringent", ini_file_name])
                else:
                    call([program_name, "--enforce-Q30-check", ini_file_name])
                               
#                 call([program_name, ini_file_name])           
#                 call([program_name, ini_file_name, idx_key])
#                 call([program_name, "--fast-merge", ini_file_name, idx_key])
            except Exception:
#                 except Exception, err:
                message = traceback.format_exc()
                self.utils.print_both(message)
    #or
#     print sys.exc_info()[0]

                self.utils.print_both("Problems with program_name = %s" % (program_name))
                raise  
                
#             print "HERE: program_name = " % (program_name)   
#             call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key])
            
    def get_config_values(self, key):
        config_path_data = [v for k, v in self.runobj.configPath.items()]
        return set([a[key] for a in config_path_data if key in a.keys()])
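    # A minimal sketch of the runobj.configPath layout this method assumes: a
    # dict of per-sample dicts from which the distinct values of one key are
    # collected. Sample names and values are hypothetical.
    #
    #   runobj.configPath = {
    #       "sample_01": {"primer_suite": "Bacterial V4-V5 Suite", "dna_region": "v4v5"},
    #       "sample_02": {"primer_suite": "Bacterial V4-V5 Suite", "dna_region": "v4v5"},
    #   }
    #   self.get_config_values("dna_region")  ==>  set(["v4v5"])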
        
    def make_users_email(self):
        username = getpass.getuser() 
        return username + "@mbl.edu"
                
    def create_job_array_script(self, command_line, dir_to_run, files_list):
        files_string         = " ".join(files_list)
        files_list_size         = len(files_list)
        command_file_name = os.path.basename(command_line.split(" ")[0])
        script_file_name  = command_file_name + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name     = script_file_name + ".sge_script.sh.log"
        email_mbl         = self.make_users_email()
        text = (
                '''#!/bin/bash
#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
# Send mail at job end; -m eas sends on end, abort, suspend.
#$ -m eas
#$ -t 1-%s
# Now the script will iterate %s times.

  file_list=(%s)
  
  i=$(expr $SGE_TASK_ID - 1)
#   echo "i = $i"
  # . /etc/profile.d/modules.sh
  # . /xraid/bioware/bioware-loader.sh
  . /xraid/bioware/Modules/etc/profile.modules
  module load bioware
    
  echo "%s ${file_list[$i]}"  
  %s ${file_list[$i]}  
''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line, command_line)
# ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line)
                )
        self.open_write_close(script_file_name_full, text)
        return script_file_name
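    # A hedged illustration of the job-array script this method writes, for a
    # hypothetical run "20130101", lane "1" and two ini files, assuming the
    # command line is the iu-merge-pairs invocation used by merge_perfect()
    # above; every concrete value below is made up, only the layout follows the
    # template.
    #
    #   #!/bin/bash
    #   #$ -cwd
    #   #$ -S /bin/bash
    #   #$ -N iu-merge-pairs_20130101_1.sh
    #   #$ -o iu-merge-pairs_20130101_1.sh.sge_script.sh.log
    #   #$ -j y
    #   #$ -M someuser@mbl.edu
    #   #$ -m eas
    #   #$ -t 1-2
    #   file_list=(/path/a.ini /path/b.ini)
    #   i=$(expr $SGE_TASK_ID - 1)
    #   . /xraid/bioware/Modules/etc/profile.modules
    #   module load bioware
    #   echo "iu-merge-pairs --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0 ${file_list[$i]}"
    #   iu-merge-pairs --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0 ${file_list[$i]}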

    def filter_mismatches_cluster(self, max_mismatch = 3):
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        command_line = C.filter_mismatch_cmd
        if self.utils.is_local():
            command_line = C.filter_mismatch_cmd_local    
        files_dir = self.dirs.reads_overlap_dir   
                
        file_list             = self.dirs.get_all_files_by_ext(files_dir, "_MERGED")
        script_file_name      = self.create_job_array_script(command_line, files_dir, file_list)
        script_file_name_full = os.path.join(files_dir, script_file_name)
        self.call_sh_script(script_file_name_full, files_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (files_dir))
        self.dirs.chmod_all(files_dir)        
        
        return script_file_name              

    def filter_mismatches(self, max_mismatch = 3):
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        n = 0        
        files = self.dirs.get_all_files()
        for full_name in files.keys():    
            if files[full_name][0].endswith('_MERGED'):
                n +=1   
#                print "%s fasta file: %s" % (n, full_name)
                program_name = C.filter_mismatch_cmd
                if self.utils.is_local():
                    program_name = C.filter_mismatch_cmd_local
#                 output_flag = "--output " + full_name + "_FILTERED"
# TODO:    Remove!!!
#                 output_flag = "-o " + full_name + "_FILTERED"           
#                 output_flag = "-o TTAGGC_NNNNTGACT_1_MERGED_FILTERED"           

#                 print "output_flag = %s" % (output_flag)
#                 print "%s %s %s" % (program_name, full_name, output_flag)                
#                 call([program_name, full_name, output_flag])
                call([program_name, full_name])

    def uniq_fa_cluster(self):
        self.utils.print_both("Uniqueing fasta files")
        command_line = C.fastaunique_cmd
        if self.utils.is_local():
            command_line = C.fastaunique_cmd_local   
        files_dir = self.dirs.reads_overlap_dir   
                
        file_list             = self.dirs.get_all_files_by_ext(files_dir, C.filtered_suffix)
        if len(file_list) == 0:
            file_list         = self.dirs.get_all_files_by_ext(files_dir, ".fa")
        if len(file_list) == 0:
            file_list         = self.dirs.get_all_files_by_ext(files_dir, "MERGED_V6_PRIMERS_REMOVED")
        
        script_file_name      = self.create_job_array_script(command_line, files_dir, file_list)
        script_file_name_full = os.path.join(files_dir, script_file_name)
        self.call_sh_script(script_file_name_full, files_dir)  
        self.utils.print_both("self.dirs.chmod_all(%s)" % (files_dir))
        self.dirs.chmod_all(files_dir)        
        return script_file_name                           
                                       
    def uniq_fa(self):
        n = 0        
        self.utils.print_both("Uniqueing fasta files")
        files = self.dirs.get_all_files()
        for full_name in files.keys():    
#             if files[full_name][1] == ".fa" or files[full_name][0].endswith('_MERGED_FILTERED'):
            if files[full_name][1] == ".fa" or files[full_name][0].endswith(C.filtered_suffix):
                n +=1   
#                print "%s fasta file: %s" % (n, full_name)
                program_name = C.fastaunique_cmd
                if self.utils.is_local():
                    program_name = C.fastaunique_cmd_local                
                call([program_name, full_name])

    def get_primers(self):
        proximal_primer = ""
        distal_primer   = ""
        primers         = {}
        for idx_key in self.runobj.samples.keys():
            primer_suite = self.runobj.samples[idx_key].primer_suite.lower()

            if primer_suite in C.primers_dict:
                proximal_primer = C.primers_dict[primer_suite]["proximal_primer"]
                distal_primer = C.primers_dict[primer_suite]["distal_primer"]
#                 print "proximal_primer: %s. distal_primer: %s" % (proximal_primer, distal_primer)
            else:
                self.utils.print_both("ERROR! Something wrong with the primer suite name: %s. NB: For v6mod it suppose to be 'Archaeal V6mod Suite'\n" % (primer_suite))
            primers[idx_key] = (proximal_primer, distal_primer) 
            
        return primers
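    # A minimal sketch of the constants this method reads, inferred from the key
    # accesses above; the suite name and primer sequences are hypothetical.
    #
    #   C.primers_dict = {
    #       "bacterial v6 suite": {
    #           "proximal_primer": "AGGTGNNNNN",
    #           "distal_primer":   "CCGTCNNNNN",
    #       },
    #   }
    #
    # get_primers() then maps every idx_key whose lower-cased primer suite is
    # found in C.primers_dict to its (proximal_primer, distal_primer) pair.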
        
    def create_inis(self):
        for idx_key in self.runobj.samples.keys():
            run_key = idx_key.split('_')[1].replace("N", ".")
            "todo: check if works w/o NNNN when there is a proper csv"
            email = self.runobj.samples[idx_key].email
#        for dataset in self.dataset_emails.keys():
#            dataset_idx_base = dataset + "_" + self.dataset_index[dataset]
#            print "dataset = %s, self.dataset_emails[dataset] = %s" % (dataset, self.dataset_emails[dataset])
            text = """[general]
project_name = %s
researcher_email = %s
input_directory = %s
output_directory = %s

[files]
pair_1 = %s
pair_2 = %s
""" % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            "That's for parital overlap (v4v5 miseq illumina)" 
            if not self.runobj.do_perfect:
                primers = self.get_primers()    
                text += """
# following section is optional
[prefixes]
pair_1_prefix = ^""" + run_key + primers[idx_key][0] + "\npair_2_prefix = ^" + primers[idx_key][1]
                
            ini_file_name = os.path.join(self.out_file_path,  idx_key + ".ini")
            self.open_write_close(ini_file_name, text)
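        # A hedged example of one generated ini, with hypothetical key, email
        # and directories; the layout mirrors the template above, and the
        # [prefixes] section is appended only when runobj.do_perfect is false.
        #
        #   [general]
        #   project_name = ACGT_NNNNACGTA_1
        #   researcher_email = someuser@mbl.edu
        #   input_directory = /path/to/analysis
        #   output_directory = /path/to/reads_overlap
        #
        #   [files]
        #   pair_1 = ACGT_NNNNACGTA_1_R1.fastq
        #   pair_2 = ACGT_NNNNACGTA_1_R2.fastq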

    def open_write_close(self, script_file_name, text):
        ini_file = open(script_file_name, "w")
        ini_file.write(text)
        ini_file.close()
 
    def get_fastq_file_names(self, f_input_file_path):
        in_files_r1 = []
        in_files_r2 = []
        "TODO: exclude dir with new created files from the loop"
        for dirname, dirnames, filenames in os.walk(f_input_file_path):
            correct_file_names = self.get_correct_file_names(filenames)

            for filename in sorted(list(correct_file_names)):
                if filename.find('_R1_') > 0:
                    in_files_r1.append(os.path.join(dirname, filename))
                elif filename.find('_R2_') > 0:
                    in_files_r2.append(os.path.join(dirname, filename))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
        self.utils.print_both("FFF0: in_files_r1 %s\n, in_files_r2 %s" % (in_files_r1, in_files_r2))                    
        return (in_files_r1, in_files_r2)
    
    def get_correct_file_names(self, filenames):
        correct_file_names = []
        for file1 in filenames:
            index_sequence = self.get_index(file1)
#             self.runobj.run_keys
#             
            good_run_key_lane_names = [x for x in self.runobj.run_keys if x.startswith(index_sequence)]
            if len(good_run_key_lane_names) > 0:
                correct_file_names.append(file1)
        return set(correct_file_names)
        
    def read1(self, files_r1, compressed):
        """ loop through the fastq_file_names
            1) e.pair_no = 1, find run_key -> dataset name
            2) collect the relevant part of id
        """
        for file_r1 in files_r1:
            self.utils.print_both("====\nFFF1: file %s" % file_r1)
            f_input  = fq.FastQSource(file_r1, compressed)
            index_sequence = self.get_index(file_r1)
            while f_input.next(trim_to = C.trimming_length):
                e = f_input.entry
                # todo: a fork with or without NNNN, add an argument
                #                 ini_run_key  = index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number   
                has_ns = any("NNNN" in s for s in self.runobj.run_keys)           
#                 has_ns = True             
                ini_run_key  = index_sequence + "_" + self.get_run_key(e.sequence, has_ns) + "_" + e.lane_number
                if int(e.pair_no) == 1:
                    dataset_file_name_base_r1 = ini_run_key + "_R1"
                    if (dataset_file_name_base_r1 in self.out_files.keys()):
                        self.out_files[dataset_file_name_base_r1].store_entry(e)
                        "TODO: make a method:"
                        short_id1 = e.header_line.split()[0]
                        short_id2 = ":".join(e.header_line.split()[1].split(":")[1:])
                        id2 = short_id1 + " 2:" + short_id2
                        self.id_dataset_idx[id2] = ini_run_key
                else:
                    self.out_files["unknown"].store_entry(e)
                    
    # def truncate_seq(self, seq):
    #     return seq[:C.trimming_length]
    
                    
    def get_run_key(self, e_sequence, has_ns = True):
        if has_ns:
            return ("NNNN" + e_sequence[4:9])
        else:
            return e_sequence[0:5]
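    # A small illustrative example with a hypothetical read: for
    # e_sequence = "ACGTGGCCAAT...", get_run_key(e_sequence, True) returns
    # "NNNN" + "GGCCA" (characters 4:9), while get_run_key(e_sequence, False)
    # returns the first five bases, "ACGTG".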
    
    def remove_end_ns_strip(self, e_sequence):
        if e_sequence.endswith('N'):
            return e_sequence.rstrip('N')
        else:
            return e_sequence
        
    def read2(self, files_r2, compressed):
        "3) e.pair_no = 2, find id from 2), assign dataset_name"
        for file_r2 in files_r2:
            self.utils.print_both("FFF2: file %s" % file_r2)
            f_input  = fq.FastQSource(file_r2, compressed)
            while f_input.next(trim_to = C.trimming_length):
                e = f_input.entry
                
#                 start = time.time()  
#                 time_before = self.utils.get_time_now()
#                 e.sequence = self.remove_end_ns_strip(e.sequence)
#                 elapsed = (time.time() - start)
#                 print "remove_end_ns_strip with strip is done in: %s" % (elapsed)      
                
                if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx):
                    file_name = self.id_dataset_idx[e.header_line] + "_R2"
                    self.out_files[file_name].store_entry(e)        
                else:
                    self.out_files["unknown"].store_entry(e)

    def get_index(self, file_r1):
        file_name_parts = os.path.basename(file_r1).split("_")
#        if the file name starts with "IDX", the actual index is in the next part.
        index = file_name_parts[0]
        if file_name_parts[0].startswith("IDX"):
            index = file_name_parts[1]
        return index
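    # Illustrative file names (hypothetical): "ACGTAC_S1_L001_R1_001.fastq"
    # yields index "ACGTAC", and "IDX2_ACGTAC_S1_L001_R1_001.fastq" also yields
    # "ACGTAC", because names starting with "IDX" carry the index in the next
    # underscore-separated part.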
class IlluminaFiles:
    """
    0) from the run, create files named for all dataset_lanes in the output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files
    3) process them through Meren's script
    4) result: dataset_lane-PERFECT_reads.fa.unique files with frequencies, to process with env454upload()
    
    """
    def __init__(self, runobj):
        self.utils = PipelneUtils()
        self.runobj         = runobj
        self.out_files      = {} 
        self.id_dataset_idx = {}
        self.in_file_path   = self.runobj.input_dir
                
        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix=self.runobj.user+'_'+self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''
        
        dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 

        self.out_file_path = dirs.check_dir(dirs.analysis_dir)
        self.results_path  = dirs.check_dir(dirs.reads_overlap_dir)
        
    def split_files(self, compressed = False):
        """
        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """   
#        print "compressed = %s" %       compressed
#        compressed = ast.literal_eval(compressed)     
        (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path)
        self.read1(in_files_r1, compressed)
        self.read2(in_files_r2, compressed)
        self.create_inis()
        self.close_dataset_files()

#        self.perfect_reads()
#        self.uniq_fa()

    def open_dataset_files(self):
        file_name_base = [i + "_R1" for i in self.runobj.samples.keys()] + [i + "_R2" for i in self.runobj.samples.keys()]
        for f_name in file_name_base:
            output_file = os.path.join(self.out_file_path, f_name + ".fastq")
            self.out_files[f_name] = fq.FastQOutput(output_file)
        self.out_files["unknown"] = fq.FastQOutput(os.path.join(self.out_file_path, "unknown" + ".fastq"))        

    def close_dataset_files(self):
        for o_file in self.out_files.itervalues():
            o_file.close()
        return
   
    def get_all_files(self):
        files = {}
        for dirname, dirnames, filenames in os.walk(self.out_file_path):
            for file_name in filenames:
                full_name = os.path.join(dirname, file_name)
                (file_base, file_extension) = os.path.splitext(os.path.join(dirname, file_name))
                files[full_name] = (file_base, file_extension)
#        print "len(files) = %s" % len(files)
        return files
    
    def perfect_reads(self):
        print "Extract perfect V6 reads:"
        for idx_key in self.runobj.samples.keys():
            file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.perfect_overlap_cmd
            if self.utils.is_local():
                program_name = C.perfect_overlap_cmd_local                    
            try:
                if self.runobj.samples[idx_key].primer_suite.startswith('Archaeal'):
                    call([program_name, file_name, "--archaea"]) 
                else: 
                    call([program_name, file_name])
            except:
                print "Problems with program_name = %s, file_name = %s" % (program_name, file_name)
                raise  


    def partial_overlap_reads(self):
        print "Extract partial_overlap V4V5 reads:"
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.partial_overlap_cmd
            if self.utils.is_local():
                program_name = C.partial_overlap_cmd_local           
            call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key])
            
    def filter_mismatches(self, max_mismatch = 3):
        print "Filter mismatches if more then %s" % (max_mismatch)
        n = 0        
        files = self.get_all_files()
        for full_name in files.keys():    
            if files[full_name][0].endswith('_MERGED'):
                n +=1   
#                print "%s fasta file: %s" % (n, full_name)
                program_name = C.filter_mismatch_cmd
                if self.utils.is_local():
                    program_name = C.filter_mismatch_cmd_local
                output_flag = "--output " + full_name + "_FILTERED"                
                call([program_name, full_name, output_flag])
                    
    def uniq_fa(self):
        n = 0        
        print "Uniqueing fasta files"      
        files = self.get_all_files()
        for full_name in files.keys():    
            if files[full_name][1] == ".fa" or files[full_name][0].endswith('_MERGED_FILTERED'):
                n +=1   
#                print "%s fasta file: %s" % (n, full_name)
                program_name = C.fastaunique_cmd
                if self.utils.is_local():
                    program_name = C.fastaunique_cmd_local                
                call([program_name, full_name])

    def create_inis(self):
        for idx_key in self.runobj.samples.keys():
            run_key = idx_key.split('_')[1].replace("N", ".")
            email = self.runobj.samples[idx_key].email
#        for dataset in self.dataset_emails.keys():
#            dataset_idx_base = dataset + "_" + self.dataset_index[dataset]
#            print "dataset = %s, self.dataset_emails[dataset] = %s" % (dataset, self.dataset_emails[dataset])
            text = """[general]
project_name = %s
researcher_email = %s
input_directory = %s
output_directory = %s

[files]
pair_1 = %s
pair_2 = %s
""" % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            "That's for v4v5 miseq illumina" 
            if not self.runobj.do_perfect:    
                text += """
# following section is optional
[prefixes]
pair_1_prefix = ^""" + run_key + """CCAGCAGC[C,T]GCGGTAA.
pair_2_prefix = ^CCGTC[A,T]ATT[C,T].TTT[G,A]A.T
                """
                
            ini_file_name = os.path.join(self.out_file_path,  idx_key + ".ini")
            self.open_write_close(ini_file_name, text)

    def open_write_close(self, ini_file_name, text):
        ini_file = open(ini_file_name, "w")
        ini_file.write(text)
        ini_file.close()
 
    def get_fastq_file_names(self, f_input_file_path):
        in_files_r1 = []
        in_files_r2 = []
        "TODO: exclude dir with new created files from the loop"
        for dirname, dirnames, filenames in os.walk(f_input_file_path):
            for filename in filenames:
                if filename.find('_R1_') > 0:
                    in_files_r1.append(os.path.join(dirname, filename))
                elif filename.find('_R2_') > 0:
                    in_files_r2.append(os.path.join(dirname, filename))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
        return (in_files_r1, in_files_r2)
        
    def read1(self, files_r1, compressed):
        """ loop through the fastq_file_names
            1) e.pair_no = 1, find run_key -> dataset name
            2) collect the relevant part of id
        """
        for file_r1 in files_r1:
            print "FFF1: file %s" % file_r1
            index_sequence = self.get_index(file_r1)
            f_input  = fq.FastQSource(file_r1, compressed)
            while f_input.next():
                e = f_input.entry
                ini_run_key  = index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number                
#                ini_run_key  = e.index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number                
                if ini_run_key in self.runobj.samples.keys() and int(e.pair_no) == 1:
                    dataset_file_name_base_r1 = ini_run_key + "_R1"
                    if (dataset_file_name_base_r1 in self.out_files.keys()):
                        self.out_files[dataset_file_name_base_r1].store_entry(e)
                        "TODO: make a method:"
                        short_id1 = e.header_line.split()[0]
                        short_id2 = ":".join(e.header_line.split()[1].split(":")[1:])
                        id2 = short_id1 + " 2:" + short_id2
                        self.id_dataset_idx[id2] = ini_run_key
                else:
                    self.out_files["unknown"].store_entry(e)
                    
    def read2(self, files_r2, compressed):
        "3) e.pair_no = 2, find id from 2), assign dataset_name"
        for file_r2 in files_r2:
            print "FFF2: file %s" % file_r2
            f_input  = fq.FastQSource(file_r2, compressed)
            while f_input.next():
                e = f_input.entry
                
                if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx):
                    file_name = self.id_dataset_idx[e.header_line] + "_R2"
                    self.out_files[file_name].store_entry(e)        
                else:
                    self.out_files["unknown"].store_entry(e)

    def get_index(self, file_r1):
        file_name_parts = os.path.basename(file_r1).split("_")
#        if the file name starts with "IDX", the actual index is in the next part.
        index = file_name_parts[0]
        if file_name_parts[0].startswith("IDX"):
            index = file_name_parts[1]
        return index
    def __init__(self, runobj=None):
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.run_keys = self.runobj.run_keys
        self.rundate = self.runobj.run
        try:
            self.use_cluster = self.runobj.use_cluster
        except:
            self.use_cluster = True
        self.chg_suffix = ".chg"
        self.chimeras_suffix = ".chimeras"
        self.ref_suffix = ".db"
        self.denovo_suffix = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix  #".nonchimeric.fa"
        self.chimeric_suffix = ".chimeric.fa"
        self.base_suffix = "unique" + self.chimeras_suffix

        self.cluster_slots = {
            "grendel": [12, 8],
            "cricket": [40],
            "cluster5": [32]
        }

        try:
            if self.runobj.lane_name:
                lane_name = self.runobj.lane_name
            else:
                lane_name = ''
        except:
            lane_name = ''

        if self.runobj.vamps_user_upload:
            os.environ['SGE_ROOT'] = '/opt/sge'
            os.environ['SGE_CELL'] = 'grendel'
            path = os.environ['PATH']
            os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:' + path
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload,
                             dir_prefix,
                             self.runobj.platform,
                             lane_name=lane_name,
                             site=site)
            self.idx_keys = convert_unicode_dictionary_to_str(
                json.loads(
                    open(self.runobj.trim_status_file_name,
                         "r").read()))["new_lane_keys"]
            self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
            self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
            self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)

        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload,
                             dir_prefix,
                             self.runobj.platform,
                             lane_name=lane_name,
                             site=site)
            self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

#         self.usearch_cmd = C.usearch_cmd
        self.usearch_cmd = C.usearch6_cmd
        if self.utils.is_local():
            self.usearch_cmd = C.usearch6_cmd_local
        #self.abskew      = C.chimera_checking_abskew
        self.refdb = C.chimera_checking_refdb
        if self.utils.is_local():
            self.refdb_local = C.chimera_checking_refdb_local
        self.its_refdb = C.chimera_checking_its_refdb
        self.input_file_names = self.make_chimera_input_illumina_file_names()
class IlluminaFiles:
    """
    0) from the run, create files named for all dataset_lanes in the output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files
    3) process them through Meren's script
    4) result: dataset_lane-PERFECT_reads.fa.unique files with frequencies, to process with env454upload()
    
    """
    def __init__(self, runobj):
        self.utils = PipelneUtils()
        self.runobj         = runobj
        self.out_files      = {} 
        self.id_dataset_idx = {}
        self.in_file_path   = self.runobj.input_dir
                
        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix=self.runobj.user+'_'+self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''
        
        dirs      = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
        self.dirs = dirs
        self.out_file_path = dirs.check_dir(dirs.analysis_dir)
        self.results_path  = dirs.check_dir(dirs.reads_overlap_dir)
        
    def split_files(self, compressed = False):
        """
        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """   
#        print "compressed = %s" %       compressed
#        compressed = ast.literal_eval(compressed)     
        (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path)
        self.read1(in_files_r1, compressed)
        self.read2(in_files_r2, compressed)
        self.create_inis()
        self.close_dataset_files()

#        self.perfect_reads()
#        self.uniq_fa()

    def open_dataset_files(self):
        file_name_base = [i + "_R1" for i in self.runobj.samples.keys()] + [i + "_R2" for i in self.runobj.samples.keys()]
        for f_name in file_name_base:
            output_file = os.path.join(self.out_file_path, f_name + ".fastq")
            self.out_files[f_name] = fq.FastQOutput(output_file)
        self.out_files["unknown"] = fq.FastQOutput(os.path.join(self.out_file_path, "unknown" + ".fastq"))        

    def close_dataset_files(self):
        for o_file in self.out_files.itervalues():
            o_file.close()
        return
   
    def get_all_files(self):
        files = {}
        for dirname, dirnames, filenames in os.walk(self.out_file_path):
            for file_name in filenames:
                full_name = os.path.join(dirname, file_name)
                (file_base, file_extension) = os.path.splitext(os.path.join(dirname, file_name))
                files[full_name] = (file_base, file_extension)
#        print "len(files) = %s" % len(files)
        return files
    
    def perfect_reads(self):
        print "Extract perfect V6 reads:"
        for idx_key in self.runobj.samples.keys():
            file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.perfect_overlap_cmd
            if self.utils.is_local():
                program_name = C.perfect_overlap_cmd_local                    
            try:
                if self.runobj.samples[idx_key].primer_suite.startswith('Archaeal'):
                    call([program_name, file_name, "--archaea"]) 
                else: 
                    call([program_name, file_name])
            except:
                print "Problems with program_name = %s, file_name = %s" % (program_name, file_name)
                raise  
    
    def call_sh_script(self, script_name_w_path, where_to_run):
        try:
            call(['chmod', '0774', script_name_w_path])
            call(['qsub', script_name_w_path], cwd=(where_to_run))
#             pass
        except:
            print "Problems with script_name = %s" % (script_name_w_path)
            raise  
        
    def perfect_reads_cluster(self):
        print "Extract perfect V6 reads:"
        program_name = C.perfect_overlap_cmd
        if self.utils.is_local():
            program_name = C.perfect_overlap_cmd_local
        primer_suite = self.get_config_values('primer_suite')
        if any("Archaeal" in s for s in primer_suite):
            add_arg = " --archaea"
        else: 
            add_arg = ""
        command_line          = program_name + add_arg
        file_list             = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name      = self.create_job_array_script(command_line, self.dirs.analysis_dir, file_list)
        script_file_name_full = os.path.join(self.dirs.analysis_dir, script_file_name)
        self.call_sh_script(script_file_name_full, self.dirs.analysis_dir)  
        return script_file_name              
                          
    def partial_overlap_reads_cluster(self):
        print "Extract partial_overlap V4V5 reads:"
        program_name = C.partial_overlap_cmd
        if self.utils.is_local():
            program_name = C.partial_overlap_cmd_local       
        dna_region = self.get_config_values('dna_region')
        if ("ITS1" in list(dna_region)):
            add_arg = "--marker-gene-stringent"
        else:
            add_arg = ""
#         TODO: this part is the same in perfect overlap - move into a method    
        command_line          = program_name + " --enforce-Q30-check " + add_arg
        file_list             = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name      = self.create_job_array_script(command_line, self.dirs.analysis_dir, file_list)
        script_file_name_full = os.path.join(self.dirs.analysis_dir, script_file_name)
        self.call_sh_script(script_file_name_full, self.dirs.analysis_dir)  
        return script_file_name      
                    
    def partial_overlap_reads(self):
        print "Extract partial_overlap V4V5 reads:"
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.partial_overlap_cmd
            if self.utils.is_local():
                program_name = C.partial_overlap_cmd_local        
            try:
                if (self.runobj.samples[idx_key].dna_region == "ITS1"):
                    call([program_name, "--enforce-Q30-check", "--marker-gene-stringent", ini_file_name])
                else:
                    call([program_name, "--enforce-Q30-check", ini_file_name])
                               
#                 call([program_name, ini_file_name])           
#                 call([program_name, ini_file_name, idx_key])
#                 call([program_name, "--fast-merge", ini_file_name, idx_key])
            except Exception:
#                 except Exception, err:
                print traceback.format_exc()
    #or
#     print sys.exc_info()[0]

                print "Problems with program_name = %s" % (program_name)
                raise  
                
#             print "HERE: program_name = " % (program_name)   
#             call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key])
            
    def get_config_values(self, key):
        config_path_data = [v for k, v in self.runobj.configPath.items()]
        return set([a[key] for a in config_path_data if key in a.keys()])
        
    def make_users_email(self):
        username = getpass.getuser() 
        return username + "@mbl.edu"
                
    def create_job_array_script(self, command_line, dir_to_run, files_list):
        files_string         = " ".join(files_list)
        files_list_size         = len(files_list)
        command_file_name = os.path.basename(command_line.split(" ")[0])
        script_file_name  = command_file_name + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name     = script_file_name + ".sge_script.sh.log"
        email_mbl         = self.make_users_email()
        text = (
                '''#!/bin/bash
#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
# Send mail at job end; -m eas sends on end, abort, suspend.
#$ -m eas
#$ -t 1-%s
# Now the script will iterate %s times.

  file_list=(%s)
  
  i=$(expr $SGE_TASK_ID - 1)
#   echo "i = $i"
  source ~/.bashrc
  module load bioware
    
  echo "%s ${file_list[$i]}"  
  %s ${file_list[$i]}  
''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line, command_line)
# ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line)
                )
        self.open_write_close(script_file_name_full, text)
        return script_file_name

    def filter_mismatches_cluster(self, max_mismatch = 3):
        print "Filter mismatches if more then %s" % (max_mismatch)
        command_line = C.filter_mismatch_cmd
        if self.utils.is_local():
            command_line = C.filter_mismatch_cmd_local    
        files_dir = self.dirs.reads_overlap_dir   
                
        file_list             = self.dirs.get_all_files_by_ext(files_dir, "_MERGED")
        script_file_name      = self.create_job_array_script(command_line, files_dir, file_list)
        script_file_name_full = os.path.join(files_dir, script_file_name)
        self.call_sh_script(script_file_name_full, files_dir)  
        return script_file_name              

    def filter_mismatches(self, max_mismatch = 3):
        print "Filter mismatches if more then %s" % (max_mismatch)
        n = 0        
        files = self.get_all_files()
        for full_name in files.keys():    
            if files[full_name][0].endswith('_MERGED'):
                n +=1   
#                print "%s fasta file: %s" % (n, full_name)
                program_name = C.filter_mismatch_cmd
                if self.utils.is_local():
                    program_name = C.filter_mismatch_cmd_local
#                 output_flag = "--output " + full_name + "_FILTERED"
# TODO:    Remove!!!
#                 output_flag = "-o " + full_name + "_FILTERED"           
#                 output_flag = "-o TTAGGC_NNNNTGACT_1_MERGED_FILTERED"           

#                 print "output_flag = %s" % (output_flag)
#                 print "%s %s %s" % (program_name, full_name, output_flag)                
#                 call([program_name, full_name, output_flag])
                call([program_name, full_name])

    def uniq_fa_cluster(self):
        print "Uniqueing fasta files"      
        command_line = C.fastaunique_cmd
        if self.utils.is_local():
            command_line = C.fastaunique_cmd_local   
        files_dir = self.dirs.reads_overlap_dir   
                
        file_list             = self.dirs.get_all_files_by_ext(files_dir, C.filtered_suffix)
        if len(file_list) == 0:
            file_list         = self.dirs.get_all_files_by_ext(files_dir, ".fa")
        script_file_name      = self.create_job_array_script(command_line, files_dir, file_list)
        script_file_name_full = os.path.join(files_dir, script_file_name)
        self.call_sh_script(script_file_name_full, files_dir)  
        return script_file_name                           
                                       
    def uniq_fa(self):
        n = 0        
        print "Uniqueing fasta files"      
        files = self.get_all_files()
        for full_name in files.keys():    
#             if files[full_name][1] == ".fa" or files[full_name][0].endswith('_MERGED_FILTERED'):
            if files[full_name][1] == ".fa" or files[full_name][0].endswith(C.filtered_suffix):
                n +=1   
#                print "%s fasta file: %s" % (n, full_name)
                program_name = C.fastaunique_cmd
                if self.utils.is_local():
                    program_name = C.fastaunique_cmd_local                
                call([program_name, full_name])

    def get_primers(self):
        proximal_primer = ""
        distal_primer   = ""
        primers         = {}
        for idx_key in self.runobj.samples.keys():
            if self.runobj.samples[idx_key].primer_suite in C.primers_dict:
                proximal_primer = C.primers_dict[self.runobj.samples[idx_key].primer_suite]["proximal_primer"]
                distal_primer = C.primers_dict[self.runobj.samples[idx_key].primer_suite]["distal_primer"]

#            if self.runobj.samples[idx_key].primer_suite.startswith('Archaeal V4-V5'):
#                proximal_primer = "G[C,T][C,T]TAAA..[A,G][C,T][C,T][C,T]GTAGC"
#                distal_primer   = "CCGGCGTTGA.TCCAATT"
#            elif self.runobj.samples[idx_key].primer_suite.startswith('Bacterial V4-V5'):
#                proximal_primer = "CCAGCAGC[C,T]GCGGTAA."
#                distal_primer   = "CCGTC[A,T]ATT[C,T].TTT[G,A]A.T"
#            elif self.runobj.samples[idx_key].primer_suite.startswith('Archaeal V6mod'):
#                proximal_primer = "AATTGGCGGGGGAGCAC"
#                distal_primer   = "GCCATGCACC[A,T]CCTCT"
#            elif self.runobj.samples[idx_key].primer_suite.startswith('Fungal ITS1'):
#                proximal_primer = "GTAAAAGTCGTAACAAGGTTTC"
#                distal_primer   = "GTTCAAAGA[C,T]TCGATGATTCAC"
            else:
                print "ERROR! Something wrong with the primer suite name: %s. NB: For v6mod it suppose to be 'Archaeal V6mod Suite'" % (self.runobj.samples[idx_key].primer_suite)
            primers[idx_key] = (proximal_primer, distal_primer) 
            
        return primers
        
    def create_inis(self):
        primers = self.get_primers()
        for idx_key in self.runobj.samples.keys():
            run_key = idx_key.split('_')[1].replace("N", ".");
            email = self.runobj.samples[idx_key].email
#        for dataset in self.dataset_emails.keys():
#            dataset_idx_base = dataset + "_" + self.dataset_index[dataset]
#            print "dataset = %s, self.dataset_emails[dataset] = %s" % (dataset, self.dataset_emails[dataset])
            text = """[general]
project_name = %s
researcher_email = %s
input_directory = %s
output_directory = %s

[files]
pair_1 = %s
pair_2 = %s
""" % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            "That's for v4v5 miseq illumina" 
            if not self.runobj.do_perfect:    
                text += """
# following section is optional
[prefixes]
pair_1_prefix = ^""" + run_key + primers[idx_key][0] + "\npair_2_prefix = ^" + primers[idx_key][1]
                
            ini_file_name = os.path.join(self.out_file_path,  idx_key + ".ini")
            self.open_write_close(ini_file_name, text)

    def open_write_close(self, script_file_name, text):
        ini_file = open(script_file_name, "w")
        ini_file.write(text)
        ini_file.close()
 
    def get_fastq_file_names(self, f_input_file_path):
        in_files_r1 = []
        in_files_r2 = []
        "TODO: exclude dir with new created files from the loop"
        for dirname, dirnames, filenames in os.walk(f_input_file_path):
            for filename in filenames:
                if filename.find('_R1_') > 0:
                    in_files_r1.append(os.path.join(dirname, filename))
                elif filename.find('_R2_') > 0:
                    in_files_r2.append(os.path.join(dirname, filename))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
        return (in_files_r1, in_files_r2)
        
    def read1(self, files_r1, compressed):
        """ loop through the fastq_file_names
            1) e.pair_no = 1, find run_key -> dataset name
            2) collect the relevant part of id
        """
        for file_r1 in files_r1:
            print "FFF1: file %s" % file_r1
            index_sequence = self.get_index(file_r1)
            f_input  = fq.FastQSource(file_r1, compressed)
            while f_input.next():
                e = f_input.entry
                ini_run_key  = index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number                
#                ini_run_key  = e.index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number                
                if ini_run_key in self.runobj.samples.keys() and int(e.pair_no) == 1:
                    dataset_file_name_base_r1 = ini_run_key + "_R1"
                    if (dataset_file_name_base_r1 in self.out_files.keys()):
                        self.out_files[dataset_file_name_base_r1].store_entry(e)
                        "TODO: make a method:"
                        short_id1 = e.header_line.split()[0]
                        short_id2 = ":".join(e.header_line.split()[1].split(":")[1:])
                        id2 = short_id1 + " 2:" + short_id2
                        self.id_dataset_idx[id2] = ini_run_key
                else:
                    self.out_files["unknown"].store_entry(e)
                    
    def read2(self, files_r2, compressed):
        "3) e.pair_no = 2, find id from 2), assign dataset_name"
        for file_r2 in files_r2:
            print "FFF2: file %s" % file_r2
            f_input  = fq.FastQSource(file_r2, compressed)
            while f_input.next():
                e = f_input.entry
                
                if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx):
                    file_name = self.id_dataset_idx[e.header_line] + "_R2"
                    self.out_files[file_name].store_entry(e)        
                else:
                    self.out_files["unknown"].store_entry(e)

    def get_index(self, file_r1):
        file_name_parts = os.path.basename(file_r1).split("_")
#        if the file name starts with "IDX", then the actual idx will be next.
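#        e.g. (hypothetical names) "ATCACG_L001_R1_001.fastq" -> "ATCACG",
#        while "IDX2_ATCACG_L001_R1_001.fastq" -> "ATCACG".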
        index = file_name_parts[0]
        if file_name_parts[0].startswith("IDX"):
            index = file_name_parts[1]
        return index
class IlluminaFiles:
    """
    0) from run create all dataset_lines names files in output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files
    3) process them through Meren's script
    4) result - files dataset_lane-PERFECT_reads.fa.unique with frequencies - to process with env454upload()
    """
    def __init__(self, runobj):
        self.utils = PipelneUtils()
        self.runobj         = runobj
        self.out_files      = {}
        self.id_dataset_idx = {}
        self.in_file_path   = self.runobj.input_dir

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix=self.runobj.user+'_'+self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''

        dirs      = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
        self.dirs = dirs
        self.out_file_path = dirs.check_dir(dirs.analysis_dir)
        self.results_path  = dirs.check_dir(dirs.reads_overlap_dir)
        self.platform = self.runobj.platform

    def split_files(self, compressed = False):
        """
        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """
#        logger.debug("compressed = %s" %       compressed)
#        compressed = ast.literal_eval(compressed)
        (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path)
#         correct_file_names = self.get_correct_file_names(in_files_r1)
        if (len(in_files_r1) > 0):
            self.read1(in_files_r1, compressed)
            self.read2(in_files_r2, compressed)
            self.create_inis()
        else:
#             logger.debug("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
#             logger.debug("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
            self.utils.print_both("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
        self.close_dataset_files()



#        self.perfect_reads()
#        self.uniq_fa()

    def open_dataset_files(self):
        file_name_base = [i + "_R1" for i in self.runobj.samples.keys()] + [i + "_R2" for i in self.runobj.samples.keys()]
        for f_name in file_name_base:
            output_file = os.path.join(self.out_file_path, f_name + ".fastq")
            self.out_files[f_name] = fq.FastQOutput(output_file)
        self.out_files["unknown"] = fq.FastQOutput(os.path.join(self.out_file_path, "unknown" + ".fastq"))

    def close_dataset_files(self):
        [o_file[1].close() for o_file in self.out_files.items()]
        return

#     def perfect_reads(self):
#         self.utils.print_both("Extract perfect V6 reads:")
#         for idx_key in self.runobj.samples.keys():
#             file_name = os.path.join(self.out_file_path, idx_key + ".ini")
#             program_name = C.perfect_overlap_cmd
#             if self.utils.is_local():
#                 program_name = C.perfect_overlap_cmd_local
#             try:
#                 if self.runobj.samples[idx_key].primer_suite.lower().startswith('archaeal'):
#                     call([program_name, file_name, "--archaea"])
#                 else:
#                     call([program_name, file_name])
#             except:
#                 self.utils.print_both("Problems with program_name = %s, file_name = %s" % (program_name, file_name))
#                 raise
#
#     TODO: use from util
    def call_sh_script(self, script_name_w_path, where_to_run):
        try:
            call(['chmod', '0774', script_name_w_path])
            if self.utils.is_local():
                self.utils.print_both("call(['qsub', script_name_w_path], cwd=(where_to_run))")
                call(['bash', script_name_w_path], cwd=(where_to_run))
            else:
                call(['qsub', script_name_w_path], cwd=(where_to_run))
#             pass
        except:
            self.utils.print_both("Problems with script_name = %s or qsub" % (script_name_w_path))
            raise

#     todo: combine and DRY with partial - it's the same command, different arguments
    def merge_perfect(self):
        self.utils.print_both("merge perfect V6 reads:")
        program_name = C.perfect_overlap_cmd
        if self.utils.is_local():
            program_name = C.perfect_overlap_cmd_local
        add_arg = " --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0"
        command_line          = program_name + add_arg
        file_list             = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name      = self.create_job_array_script(command_line, self.dirs.analysis_dir, file_list)
        script_file_name_full = os.path.join(self.dirs.analysis_dir, script_file_name)
        self.call_sh_script(script_file_name_full, self.dirs.analysis_dir)
        return script_file_name

    def trim_primers_perfect(self):
        self.utils.print_both("trim primers from perfect V6 reads:")

        merged_file_names = self.dirs.get_all_files_by_ext(self.dirs.reads_overlap_dir, "_MERGED")
        primer_suite = self.get_config_values('primer_suite')
        add_arg = ""
        if any([s.lower().startswith("archaeal") for s in primer_suite]):
            add_arg += " --archaea"
        program_name = C.trim_primers_cmd + add_arg
        script_file_name      = self.create_job_array_script(program_name, self.dirs.reads_overlap_dir, merged_file_names)
        script_file_name_full = os.path.join(self.dirs.reads_overlap_dir, script_file_name)
        self.call_sh_script(script_file_name_full, self.dirs.reads_overlap_dir)
        return script_file_name

    """
    def perfect_reads_cluster(self):
        '''
        iu-merge-pairs anna.ini --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0
            Each flag is critical. marker-gene-stringent looks for complete overlaps, retain-only-overlap gets rid of adapters, max-num-mismatches retains only perfect overlaps.
            This generates the test_MERGED file with all complete overlaps without any mismatches. But it has all the primers.
            Then we process this file with the new and shiny iu-analyze-v6-complete-overlaps script:
        iu-trim-V6-primers test_MERGED

        '''
        self.utils.print_both("Extract perfect V6 reads:")
        script_file_name      = self.merge_perfect()
        trim_script_file_name = self.trim_primers_perfect()

        return (script_file_name, trim_script_file_name)
    """

    def partial_overlap_reads_cluster(self):
        self.utils.print_both("Extract partial_overlap reads (from partial_overlap_reads_cluster):")
        program_name = C.partial_overlap_cmd
        if self.utils.is_local():
            program_name = C.partial_overlap_cmd_local
        dna_region = self.get_config_values('dna_region')
        if set(C.marker_gene_stringent_regions) & set(list(dna_region)):
            add_arg = "--marker-gene-stringent"
        else:
            add_arg = ""
#         TODO: this part is the same in perfect overlap - move into a method
        command_line          = program_name + " --enforce-Q30-check " + add_arg
        file_list             = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name      = self.create_job_array_script(command_line, self.dirs.analysis_dir, file_list)
        script_file_name_full = os.path.join(self.dirs.analysis_dir, script_file_name)
        self.call_sh_script(script_file_name_full, self.dirs.analysis_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (self.dirs.analysis_dir))
        self.dirs.chmod_all(self.dirs.analysis_dir)

        return script_file_name

    def partial_overlap_reads(self):
        self.utils.print_both("Extract partial_overlap reads (from partial_overlap_reads):")
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.partial_overlap_cmd
            if self.utils.is_local():
                program_name = C.partial_overlap_cmd_local
            try:
                if set(C.marker_gene_stringent_regions) & set(list(self.runobj.samples[idx_key].dna_region)):
                # if (self.runobj.samples[idx_key].dna_region == "ITS1"):
                    call([program_name, "--enforce-Q30-check", "--marker-gene-stringent", ini_file_name])
                else:
                    call([program_name, "--enforce-Q30-check", ini_file_name])

#                 call([program_name, ini_file_name])
#                 call([program_name, ini_file_name, idx_key])
#                 call([program_name, "--fast-merge", ini_file_name, idx_key])
            except Exception:
#                 except Exception, err:
                message = traceback.format_exc()
                self.utils.print_both(message)
    #or
#     logger.debug(sys.exc_info()[0])

                self.utils.print_both("Problems with program_name = %s" % (program_name))
                raise

#             logger.debug("HERE: program_name = " % (program_name))
#             call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key])

    def get_config_values(self, key):
        config_path_data = [v for k, v in self.runobj.configPath.items()]
        return set([a[key] for a in config_path_data if key in a.keys()])

#     TODO: use from util
    def make_users_email(self):
        username = getpass.getuser()
        return username + "@mbl.edu"

#     TODO: use from util
#     Removed by Hilary's request:
#     # Send mail at job end (e); -m as sends abort, suspend.
#     #$ -m as
    def create_job_array_script(self, command_line, dir_to_run, files_list):
        files_string         = " ".join(files_list)
        files_list_size         = len(files_list)
        command_file_name = os.path.basename(command_line.split(" ")[0])
        script_file_name  = command_file_name + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name     = script_file_name + ".sge_script.sh.log"
        # email_mbl         = self.make_users_email()
        email_mbl = C.email_mbl
        text = (
                '''#!/bin/bash
#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
#$ -t 1-%s
# Now the script will iterate %s times.

  file_list=(%s)

  i=$(expr $SGE_TASK_ID - 1)
  # echo "i = $i"
  # . /etc/profile.d/modules.sh
  # . /xraid/bioware/bioware-loader.sh

  shopt -s expand_aliases # It will expand aliases that are loaded via modules
  . /xraid/bioware/Modules/etc/profile.modules
  module load bioware

  echo "%s ${file_list[$i]}"
  %s ${file_list[$i]}
''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line, command_line)
# ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line)
                )
        self.open_write_close(script_file_name_full, text)
        return script_file_name
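
    # For illustration only (hypothetical values, and assuming C.partial_overlap_cmd resolves to
    # something like "iu-merge-pairs"): with run "20160301", lane "1" and
    # command_line "iu-merge-pairs --enforce-Q30-check", the generated file is
    # "iu-merge-pairs_20160301_1.sh", an SGE array job ("#$ -t 1-N") in which each task computes
    # i = SGE_TASK_ID - 1 and runs
    #   iu-merge-pairs --enforce-Q30-check ${file_list[$i]}
    # over the files passed in files_list.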

    def filter_mismatches_cluster(self, max_mismatch = 3):
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        command_line = C.filter_mismatch_cmd
        if self.utils.is_local():
            command_line = C.filter_mismatch_cmd_local
        files_dir = self.dirs.reads_overlap_dir

        file_list             = self.dirs.get_all_files_by_ext(files_dir, "_MERGED")
        script_file_name      = self.create_job_array_script(command_line, files_dir, file_list)
        script_file_name_full = os.path.join(files_dir, script_file_name)
        self.utils.call_sh_script(script_file_name_full, files_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (files_dir))
        self.dirs.chmod_all(files_dir)

        return script_file_name

    def filter_mismatches(self, max_mismatch = 3):
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        n = 0
        files = self.dirs.get_all_files()
        for full_name in files.keys():
            if files[full_name][0].endswith('_MERGED'):
                n +=1
#                logger.debug("%s fasta file: %s" % (n, full_name))
                program_name = C.filter_mismatch_cmd
                if self.utils.is_local():
                    program_name = C.filter_mismatch_cmd_local
                call([program_name, full_name])

    def uniq_fa_cluster(self):
        self.utils.print_both("Uniqueing fasta files")
        command_line = C.fastaunique_cmd
        if self.utils.is_local():
            command_line = C.fastaunique_cmd_local
        files_dir = self.dirs.reads_overlap_dir

        file_list             = self.dirs.get_all_files_by_ext(files_dir, C.filtered_suffix)
        if len(file_list) == 0:
            file_list         = self.dirs.get_all_files_by_ext(files_dir, ".fa")
        if len(file_list) == 0:
            file_list         = self.dirs.get_all_files_by_ext(files_dir, "MERGED_V6_PRIMERS_REMOVED")

        script_file_name      = self.create_job_array_script(command_line, files_dir, file_list)
        script_file_name_full = os.path.join(files_dir, script_file_name)
        self.call_sh_script(script_file_name_full, files_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (files_dir))
        self.dirs.chmod_all(files_dir)
        return script_file_name

    def uniq_fa(self):
        n = 0
        self.utils.print_both("Uniqueing fasta files")
        files = self.dirs.get_all_files()
        for full_name in files.keys():
#             if files[full_name][1] == ".fa" or files[full_name][0].endswith('_MERGED_FILTERED'):
            if files[full_name][1] == ".fa" or files[full_name][0].endswith(C.filtered_suffix):
                n +=1
                program_name = C.fastaunique_cmd
                if self.utils.is_local():
                    program_name = C.fastaunique_cmd_local
                call([program_name, full_name])

    def get_primers(self):
        proximal_primer = ""
        distal_primer   = ""
        primers         = {}
        for idx_key in self.runobj.samples.keys():
            primer_suite = self.runobj.samples[idx_key].primer_suite.lower()

            if primer_suite in C.primers_dict:
                proximal_primer = C.primers_dict[primer_suite]["proximal_primer"]
                distal_primer = C.primers_dict[primer_suite]["distal_primer"]
            else:
                self.utils.print_both("ERROR! Something wrong with the primer suite name: %s. NB: For v6mod it suppose to be 'Archaeal V6mod Suite'\n" % (primer_suite))
            primers[idx_key] = (proximal_primer, distal_primer)

        return primers

    def create_inis(self):
        for idx_key in self.runobj.samples.keys():
            run_key = idx_key.split('_')[1].replace("N", ".");
            "todo: check if works w/o NNNN when there is a proper csv"
            email = self.runobj.samples[idx_key].email
            text = """[general]
project_name = %s
researcher_email = %s
input_directory = %s
output_directory = %s

[files]
pair_1 = %s
pair_2 = %s
""" % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            "That's for parital overlap (v4v5 and hapto miseq illumina)"
            if not self.runobj.do_perfect:
                primers = self.get_primers()
                # logger.debug("run_key = %s, idx_key = %s, primers[idx_key][0], primers[idx_key][1] = %s" (run_key, idx_key, primers[idx_key][0], primers[idx_key][1]))
                text += """
# following section is optional
[prefixes]
pair_1_prefix = ^""" + run_key + primers[idx_key][0] + "\npair_2_prefix = ^" + primers[idx_key][1]

            ini_file_name = os.path.join(self.out_file_path,  idx_key + ".ini")
            self.open_write_close(ini_file_name, text)

#     TODO: use from utils
    def open_write_close(self, script_file_name, text):
        ini_file = open(script_file_name, "w")
        ini_file.write(text)
        ini_file.close()

    def get_fastq_file_names(self, f_input_file_path):
        in_files_r1 = []
        in_files_r2 = []
        "TODO: exclude dir with new created files from the loop"
        for dirname, dirnames, filenames in os.walk(f_input_file_path):
            correct_file_names = self.get_correct_file_names(filenames)

            for filename in sorted(list(correct_file_names)):
                if filename.find('_R1_') > 0:
                    in_files_r1.append(os.path.join(dirname, filename))
                elif filename.find('_R2_') > 0:
                    in_files_r2.append(os.path.join(dirname, filename))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
        self.utils.print_both("FFF0: in_files_r1 %s\n, in_files_r2 %s" % (in_files_r1, in_files_r2))
        return (in_files_r1, in_files_r2)

    def get_correct_file_names(self, filenames):
        correct_file_names = [];
        for file1 in filenames:
            index_sequence = self.get_index(file1)
#             self.runobj.run_keys
#
            good_run_key_lane_names = [x for x in self.runobj.run_keys if x.startswith(index_sequence)]
            if len(good_run_key_lane_names) > 0:
                correct_file_names.append(file1)
        return set(correct_file_names)


    def get_run_key(self, e_sequence, has_ns = "True"):
        if has_ns:
            return ("NNNN" + e_sequence[4:9])
        else:
            return e_sequence[0:5]

    def get_ini_run_key(self, index_sequence, e):
        has_ns = any("NNNN" in s for s in self.runobj.run_keys)

        lane_number = e.lane_number
        if self.platform == "nextseq":
            lane_number = "1"
        return index_sequence + "_" + self.get_run_key(e.sequence, has_ns) + "_" + lane_number

    def read1(self, files_r1, compressed):
        """ loop through the fastq_file_names
            1) e.pair_no = 1, find run_key -> dataset name
            2) collect the relevant part of id
        """
        for file_r1 in files_r1:
            self.utils.print_both("====\nFFF1: file %s" % file_r1)
            f_input  = fq.FastQSource(file_r1, compressed)
            index_sequence = self.get_index(file_r1)
            while f_input.next(trim_to = C.trimming_length):
            # while f_input.next(trim_to = C.trimming_length[self.platform]):
                e = f_input.entry
                # todo: a fork with or without NNNN, add an argument
                #                 ini_run_key  = index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number
                # lane_number = e.lane_number
                # if self.platform == "nextseq":
                #     lane_number = "1"
                # ini_run_key  = index_sequence + "_" + self.get_run_key(e.sequence, has_ns) + "_" + lane_number
                ini_run_key = self.get_ini_run_key(index_sequence, e)
                if int(e.pair_no) == 1:
                    dataset_file_name_base_r1 = ini_run_key + "_R1"
                    if (dataset_file_name_base_r1 in self.out_files.keys()):
                        self.out_files[dataset_file_name_base_r1].store_entry(e)
                        "TODO: make a method:"
                        short_id1 = e.header_line.split()[0]
                        short_id2 = ":".join(e.header_line.split()[1].split(":")[1:])
                        id2 = short_id1 + " 2:" + short_id2
                        self.id_dataset_idx[id2] = ini_run_key
                    else:
                        self.out_files["unknown"].store_entry(e)

    # def truncate_seq(self, seq):
    #     return seq[:C.trimming_length]

    def remove_end_ns_strip(self, e_sequence):
        if e_sequence.endswith('N'):
            return e_sequence.rstrip('N')
        else:
            return e_sequence

    def read2(self, files_r2, compressed):
        "3) e.pair_no = 2, find id from 2), assign dataset_name"
        for file_r2 in files_r2:
            self.utils.print_both("FFF2: file %s" % file_r2)
            f_input  = fq.FastQSource(file_r2, compressed)
            while f_input.next(trim_to = C.trimming_length):
                e = f_input.entry

                if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx):
                    file_name = self.id_dataset_idx[e.header_line] + "_R2"
                    self.out_files[file_name].store_entry(e)
                else:
                    self.out_files["unknown"].store_entry(e)

    def get_index(self, file_r1):
        file_name_parts = os.path.basename(file_r1).split("_")
#        if the file name starts with "IDX", then the actual idx will be next.
        index = file_name_parts[0]
        if file_name_parts[0].startswith("IDX"):
            index = file_name_parts[1]
        return index
Esempio n. 19
0
class Chimera:
    """ Define here """
    def __init__(self, runobj = None):
        self.utils      = PipelneUtils()
        self.runobj     = runobj
        self.run_keys   = self.runobj.run_keys
        self.rundate    = self.runobj.run
        try:
            self.use_cluster = self.runobj.use_cluster
        except:
            self.use_cluster = True
        self.chg_suffix         = ".chg"
        self.chimeras_suffix    = ".chimeras"      
        self.ref_suffix         = ".db"      
        self.denovo_suffix      = ".txt"        
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.chimeric_suffix    = ".chimeric.fa"
        self.base_suffix        = "unique" + self.chimeras_suffix

        try:
            if self.runobj.lane_name:
                lane_name = self.runobj.lane_name
            else:
                lane_name = ''
        except:
            lane_name = ''

        if self.runobj.vamps_user_upload:
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
            self.idx_keys = convert_unicode_dictionary_to_str(json.loads(open(self.runobj.trim_status_file_name,"r").read()))["new_lane_keys"] 
            self.indir  = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
            self.indir  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        
        
#         self.usearch_cmd = C.usearch_cmd
        self.usearch_cmd = C.usearch6_cmd        
        #self.abskew      = C.chimera_checking_abskew
        self.refdb       = C.chimera_checking_refdb_6
        self.its_refdb   = C.chimera_checking_its_refdb_6
        self.input_file_names  = self.make_chimera_input_illumina_file_names()
#         pprint(self.run_keys)
#         self.output_file_names = self.make_chimera_output_illumina_file_names(self.input_file_names)
        
    def make_chimera_input_illumina_file_names(self):
        input_file_names = {} 
        
        for idx_key in self.run_keys:
            file_name = idx_key + "_" + C.filtered_suffix + ".unique" 
           
            if os.path.exists(os.path.join(self.indir, file_name)):
                input_file_names[idx_key] = file_name
        
        return input_file_names
            
#     def make_chimera_output_illumina_file_names(self, input_file_names):
#         output_file_names = {} 
#         for idx_key, input_file_name in input_file_names.iteritems():
#             output_file_names[idx_key] = input_file_name
#         return output_file_names

    def get_current_dirname(self, in_or_out = ""):
        if in_or_out == "":
            cur_dirname    = self.indir 
        else:
            cur_dirname    = self.outdir
        return cur_dirname

    def is_chimera_check_file(self, filename):
        return filename.endswith((self.chimeras_suffix + self.denovo_suffix, self.chimeras_suffix + self.ref_suffix, self.chimeric_suffix, self.nonchimeric_suffix))

    def get_current_filenames(self, cur_dirname):
        cur_file_names = []
        if cur_dirname == self.indir:
            cur_file_names = self.input_file_names.values()
        elif cur_dirname == self.outdir:
            cur_file_names = self.get_chimera_file_names(self.outdir)
        return cur_file_names

    def get_chimera_file_names(self, cur_dirname):
        cur_file_names = []        
        for dirname, dirnames, filenames in os.walk(cur_dirname):
            cur_file_names = [filename for filename in filenames if (self.is_chimera_check_file(filename))]
        return cur_file_names

#     def illumina_frequency_size(self, in_or_out = "", find = "frequency:", replace = ";size="):
#         cur_dirname    = self.get_current_dirname(in_or_out)
#         cur_file_names = self.get_current_filenames(cur_dirname)
# #         print "cur_file_names: "
# #         pprint(cur_file_names)
#         change_from_suffix = ""
#         change_to_suffix   = self.chg_suffix
# #         print "find = %s, replace = %s" % (find, replace)
#         regex              = re.compile(r"%s" % find)
# 
#         for cur_file_name in cur_file_names:
#             file_name = os.path.join(cur_dirname, cur_file_name)
#             with open(file_name + change_from_suffix, "r") as sources:
#                 lines = sources.readlines()
#             with open(file_name + change_to_suffix, "w") as target:
#                 for line in lines:
#                         target.write(regex.sub(replace, line))

    def read_file(self, source_name):
        with open(source_name, "r") as sources:
            return sources.readlines()

    def illumina_sed(self, lines, target_name, regex, replace, uppercase):
        with open(target_name, "w") as target:
            for line in lines:
                if line.startswith(">"):
                    line1 = regex.sub(replace, line)
                else:
                    if (uppercase):
                        line1 = line.upper()
                    else:
                        line1 = line
                target.write(line1)  


    def call_illumina_sed(self, from_to):
        """
            from_to = from_frequency_to_size or from_size_to_frequency
        """
        sed_from_to = namedtuple('sed_from_to', 'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase')

        from_frequency_to_size = sed_from_to(
        find               = "frequency:",
        replace            = ";size=",
        cur_dirname        = self.indir,
        cur_file_names     = self.get_current_filenames(self.indir),
        change_from_suffix = "",
        change_to_suffix   = self.chg_suffix,
        uppercase          = True
        )

        from_size_to_frequency = sed_from_to(
        find               = ";size=",
        replace            = "frequency:",
        cur_dirname        = self.outdir,
        cur_file_names     = self.get_chimera_file_names(self.outdir),
        change_from_suffix = "",
        change_to_suffix   = "",
        uppercase          = False        
        )
        
        if (from_to == "from_frequency_to_size"):
            tuple_name = from_frequency_to_size
        elif (from_to == "from_size_to_frequency"):
            tuple_name = from_size_to_frequency
        
        regex          = re.compile(r"%s" % tuple_name.find)                                
#         print "find = %s, replace = %s" % (find, replace)
        if (not tuple_name.cur_file_names) and (tuple_name == from_frequency_to_size):
            self.utils.print_both('ERROR: Did not find uniqued files (".unique") in %s, please check if the previous step has finished. Exiting.\n' % self.indir)
            sys.exit()
        for cur_file_name in tuple_name.cur_file_names:
            file_name = os.path.join(tuple_name.cur_dirname, cur_file_name)           
            source_name = file_name + tuple_name.change_from_suffix
            target_name = file_name + tuple_name.change_to_suffix 
            lines = self.read_file(source_name)
            self.illumina_sed(lines, target_name, regex, tuple_name.replace, tuple_name.uppercase)

    def illumina_freq_to_size_in_chg(self):
#         TODO: not used?
        find1    = "frequency:"
        replace1 = ";size="
        regex1   = re.compile(r"%s" % find1)        
        
#         print "cur_file_names: "
#         pprint(cur_file_names)
        cur_dirname        = self.get_current_dirname()
        cur_file_names     = self.get_current_filenames(cur_dirname)
        change_from_suffix = ""
        change_to_suffix   = self.chg_suffix
#         print "find = %s, replace = %s" % (find, replace)
 
        for cur_file_name in cur_file_names:
            file_name = os.path.join(cur_dirname, cur_file_name)
            with open(file_name + change_from_suffix, "r") as sources:
                lines = sources.readlines()
            with open(file_name + change_to_suffix, "w") as target:
#                 line2 = [regex1.sub(replace1, line) if line.startswith(">") else line.upper() for line in lines]
                for line in lines:
                    if line.startswith(">"):
                        line1 = regex1.sub(replace1, line)
                    else:
                        line1 = line.upper()
#                     print line1
                    target.write(line1)  


    def illumina_size_to_freq_in_chimer(self):
        find1           = ";size="
        replace1        = "frequency:"
        regex1          = re.compile(r"%s" % find1)        
 
        cur_file_names = self.get_chimera_file_names(self.outdir)
                    
        for file_chim in cur_file_names:
            file_chim_path = os.path.join(self.outdir, file_chim)
            with open(file_chim_path, "r") as sources:
                lines = sources.readlines()
            with open(file_chim_path, "w") as target:
                for line in lines:
                    line1 = regex1.sub(replace1, line)
                    target.write(line1)                    
              
    def illumina_rm_size_files(self):
        for idx_key in self.input_file_names:
            file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            if os.path.exists(file_name):
                os.remove(file_name)
    
#     def illumina_chimera_size_files(self):
#     
#     import os
# [os.rename(f, f.replace('_', '-')) for f in os.listdir('.') if not f.startswith('.')]

        
          
    def check_if_cluster_is_done(self, time_before):
        cluster_done = False
        check_qstat_cmd_line = "qstat | grep \"%s\" | grep usearch | wc -l" % time_before
#         check_qstat_cmd_line = "qstat | grep usearch"

        self.utils.print_both("check_qstat_cmd_line = %s" % check_qstat_cmd_line)
        
        try:
            p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True)
            (output, err) = p.communicate()
            num_proc = int(output)
            self.utils.print_both("qstat is running %s 'usearch' processes" % num_proc)
    #         pprint(p)
            
            if (num_proc == 0):
                cluster_done = True
    #         print "cluster_done from check_if_cluster_is_done = %s" % cluster_done
        except:
            self.utils.print_both("Chimera checking can be done only on a cluster.")
            raise

        return cluster_done
        
          
    def create_chimera_cmd(self, input_file_name, output_file_name, ref_or_novo, ref_db = ""):
        """
        http://www.drive5.com/usearch/manual/uchime_denovo.html
        from usearch -help
        Chimera detection (UCHIME ref. db. mode):
          usearch -uchime q.fasta [-db db.fasta] [-chimeras ch.fasta]
            [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns]
         
        Chimera detection (UCHIME de novo mode):
          usearch -uchime amplicons.fasta [-chimeras ch.fasta] [-nonchimeras good.fasta]
             [-uchimeout results.uch] [-uchimealns results.alns]
          Input is estimated amplicons with integer abundances specified using ";size=N".
        usearch -uchime_denovo amplicons.fasta -uchimeout results.uchime
        """        

        uchime_cmd_append = ""
        db_cmd_append     = ""
        dir_cmd_append    = ""

        if (ref_or_novo == "denovo"):
            uchime_cmd_append = " -uchime_denovo "           
            output_file_name  = output_file_name + self.chimeras_suffix + self.denovo_suffix 
        elif (ref_or_novo == "ref"):
            uchime_cmd_append = " -uchime_ref "
            output_file_name  = output_file_name + self.chimeras_suffix + self.ref_suffix           
            db_cmd_append     = " -db " + ref_db   
            dir_cmd_append    = " -strand plus"
        else:
            self.utils.print_both("Incorrect method, should be \"denovo\" or \"ref\"") 
        self.utils.print_both("output_file_name = %s" % output_file_name) 


        uchime_cmd = C.clusterize_cmd
        uchime_cmd += " "
        uchime_cmd += self.usearch_cmd
        uchime_cmd += uchime_cmd_append + input_file_name
        uchime_cmd += db_cmd_append
        uchime_cmd += " -uchimeout " + output_file_name
        """if we need nonchimeric for denovo and db separate we might create them here
#         uchime_cmd += " -nonchimeras "
#         uchime_cmd += (output_file_name + self.nonchimeric_suffix)
"""
        uchime_cmd += " -chimeras " + (output_file_name + self.chimeric_suffix)         
        uchime_cmd += dir_cmd_append
        uchime_cmd += " -notrunclabels"
        
        
#         print "uchime_cmd FROM create_chimera_cmd = %s" % (uchime_cmd)
        return uchime_cmd
        
    def get_ref_db(self, dna_region):
        ref_db = ''
        if dna_region.upper() == 'ITS':
            logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
            ref_db = self.its_refdb
        else:
            logger.debug("using standard refdb: " + self.refdb)
            ref_db = self.refdb
        return ref_db       
    
    def chimera_checking(self, ref_or_novo):
        chimera_region_found = False
        output = {}
        
        for idx_key in self.input_file_names:
#             print "idx_key, self.input_file_names[idx_key] = %s, %s" % (idx_key, self.input_file_names)
            input_file_name  = os.path.join(self.indir,  self.input_file_names[idx_key] + self.chg_suffix)        
            output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])        
            dna_region       = self.runobj.samples[idx_key].dna_region
#             print "dna_region = %s" % dna_region
            if dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' +  dna_region)
                continue
            
#             print "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name)
            ref_db     = self.get_ref_db(dna_region)
#             print "dna_region = %s; ref_db = %s; ref_or_novo = %s" % (dna_region, ref_db, ref_or_novo)
            
            uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
            self.utils.print_both("\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd))
            
            try:
                logger.info("chimera checking command: " + str(uchime_cmd))
                output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            except OSError, e:
                self.utils.print_both("Problems with this command: %s" % (uchime_cmd))
                if self.utils.is_local():
                    print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                else:
                    print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                    self.utils.print_both("Execution of %s failed: %s" % (uchime_cmd, e))
                    raise                  
                               
# ???
        if not chimera_region_found:            
            return ('NOREGION', 'No regions found that need checking', '')
        else:
            return ("The usearch commands were created")
Esempio n. 20
0
class dbUpload:
    """db upload methods"""
    Name = "dbUpload"
    """
    TODO: add tests and test case
    TODO: change hardcoded values to args: 
        self.sequence_table_name = "sequence_ill", 
        self.sequence_field_name = "sequence_comp"  
    TODO: generalize all bulk uploads and all inserts? to not copy and paste
    TODO: add refssu_id
    TODO: change csv validation for new fields
    Order:
        # put_run_info
        # insert_seq()
        # insert_pdr_info()
        # gast
        # insert_taxonomy()
        # insert_sequence_uniq_info_ill()

    """
    def __init__(self, runobj = None):
        self.utils       = PipelneUtils()
        self.runobj      = runobj
        self.rundate     = self.runobj.run
        self.use_cluster = 1       
        self.unique_fasta_files = []
#        if self.runobj.vamps_user_upload:
#            site       = self.runobj.site
#            dir_prefix = self.runobj.user + '_' + self.runobj.run
#        else:
#            site = ''
#            dir_prefix = self.runobj.run         
#        dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site = site)

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix=self.runobj.user+'_'+self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''
        
        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
 
        
        self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
        self.fasta_dir    = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.gast_dir     = self.dirs.check_dir(self.dirs.gast_dir)

        host_name     = runobj.database_host
        database_name = runobj.database_name
        
        self.filenames   = []
        self.my_conn     = MyConnection(host = 'newbpcdb2.jbpc-np.mbl.edu', db="env454")
#         self.my_conn     = MyConnection()

#         self.my_conn     = MyConnection(host = 'localhost', db="test_env454")
        self.sequence_table_name = "sequence_ill" 
        self.sequence_field_name = "sequence_comp" 
        self.my_csv              = None

        self.unique_file_counts = self.dirs.unique_file_counts
        self.dirs.delete_file(self.unique_file_counts)
        self.seq_id_dict = {}
        self.tax_id_dict = {}
        self.run_id      = None
#        self.nonchimeras_suffix = ".nonchimeric.fa"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.fa_unique_suffix   = ".fa." + C.unique_suffix #.fa.unique
        self.v6_unique_suffix   = "MERGED_V6_PRIMERS_REMOVED." + C.unique_suffix
        self.suff_list = [self.nonchimeric_suffix, self.fa_unique_suffix, self.v6_unique_suffix]

#         self.merge_unique_suffix = "." + C.filtered_suffix + "." + C.unique_suffix #.MERGED-MAX-MISMATCH-3.unique
        self.suffix_used        = ""
        
#        self.refdb_dir = '/xraid2-2/vampsweb/blastdbs/'
   
   
    def get_fasta_file_names(self):
        files_names = self.dirs.get_all_files(self.fasta_dir)
        self.unique_fasta_files = [f for f in files_names.keys() if f.endswith(tuple(self.suff_list))]
# needs a return because of how it's called from pipelineprocessor
        return self.unique_fasta_files
        

    def get_run_info_ill_id(self, filename_base):
        
        my_sql = """SELECT run_info_ill_id FROM run_info_ill 
                    JOIN run using(run_id)
                    WHERE file_prefix = '%s'
                    and run = '%s'
        """ % (filename_base, self.rundate)
        res    = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])
        
    def make_seq_upper(self, filename):
        read_fasta = fastalib.ReadFasta(filename)
        sequences  = [seq.upper() for seq in read_fasta.sequences] #here we make uppercase for VAMPS compatibility
        read_fasta.close()
        return sequences 
        
    def insert_seq(self, sequences):
      query_tmpl = "INSERT IGNORE INTO %s (%s) VALUES (COMPRESS(%s))"
      val_tmpl   = "'%s'"
      my_sql     = query_tmpl % (self.sequence_table_name, self.sequence_field_name, ')), (COMPRESS('.join([val_tmpl % key for key in sequences]))
      seq_id     = self.my_conn.execute_no_fetch(my_sql)
      self.utils.print_both("sequences in file: %s\n" % (len(sequences)))
      return seq_id
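        # For two hypothetical sequences ["AAA", "CCC"] the statement built above is:
        #   INSERT IGNORE INTO sequence_ill (sequence_comp) VALUES (COMPRESS('AAA')), (COMPRESS('CCC'))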
    #     try:
    #         query_tmpl = "INSERT IGNORE INTO %s (%s) VALUES (COMPRESS(%s))"
    #         val_tmpl   = "'%s'"
    #         my_sql     = query_tmpl % (self.sequence_table_name, self.sequence_field_name, ')), (COMPRESS('.join([val_tmpl % key for key in sequences]))
    #         seq_id     = self.my_conn.execute_no_fetch(my_sql)
    # #         print "sequences in file: %s" % (len(sequences))
    #         self.utils.print_both("sequences in file: %s\n" % (len(sequences)))
    #         return seq_id
    #     except self.my_conn.conn.cursor._mysql_exceptions.Error as err:
    #         if err.errno == 1582:
    #             self.utils.print_both(("ERROR: _mysql_exceptions.OperationalError: (1582, \"Incorrect parameter count in the call to native function 'COMPRESS'\"), there is an empty fasta in %s") % self.fasta_dir)
    #         else:
    #             raise
    #     except:
    #         if len(sequences) == 0:
    #             self.utils.print_both(("ERROR: There are no sequences, please check if there are correct fasta files in the directory %s") % self.fasta_dir)
    #         raise
        
    def get_seq_id_dict(self, sequences):
        id_name    = self.sequence_table_name + "_id" 
        query_tmpl = """SELECT %s, uncompress(%s) FROM %s WHERE %s in (COMPRESS(%s))"""
        val_tmpl   = "'%s'"
        try:
            my_sql     = query_tmpl % (id_name, self.sequence_field_name, self.sequence_table_name, self.sequence_field_name, '), COMPRESS('.join([val_tmpl % key for key in sequences]))
            res        = self.my_conn.execute_fetch_select(my_sql)
            one_seq_id_dict = dict((y, int(x)) for x, y in res)
            self.seq_id_dict.update(one_seq_id_dict)
        except:
            if len(sequences) == 0:
                self.utils.print_both(("ERROR: There are no sequences, please check if there are correct fasta files in the directory %s") % self.fasta_dir)
            raise


    def get_id(self, table_name, value):
        id_name = table_name + '_id'
        my_sql  = """SELECT %s FROM %s WHERE %s = '%s'""" % (id_name, table_name, table_name, value)
        res     = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])         
            
    def get_sequence_id(self, seq):
        my_sql = """SELECT sequence_ill_id FROM sequence_ill WHERE COMPRESS('%s') = sequence_comp""" % (seq)
        res    = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])     
    
    def insert_pdr_info(self, fasta, run_info_ill_id):
        res_id = ""
        if (not run_info_ill_id):
            self.utils.print_both("ERROR: There is no run info yet, please check if it's uploaded to env454")
            
        # ------- insert sequence info per run/project/dataset --------
        seq_upper = fasta.seq.upper()
        sequence_ill_id = self.seq_id_dict[seq_upper]

        seq_count       = int(fasta.id.split('|')[-1].split(':')[-1])
#        print run_info_ill_id, sequence_ill_id, seq_count
        my_sql          = """INSERT IGNORE INTO sequence_pdr_info_ill (run_info_ill_id, sequence_ill_id, seq_count) 
                             VALUES (%s, %s, %s)""" % (run_info_ill_id, sequence_ill_id, seq_count)

        try:
            res_id = self.my_conn.execute_no_fetch(my_sql)
            return res_id
        except:
            self.utils.print_both("Offensive query: %s" % my_sql)
            raise
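
    # Sketch of the seq_count parsing in insert_pdr_info above, assuming a record id whose
    # last '|'-separated field ends in ':<count>' (matching the "frequency:N" form used
    # elsewhere in this pipeline); the read id here is hypothetical:
    #   fasta.id == 'HWI-ST1234:42|frequency:57'  ->  seq_count == 57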
        
    def make_gast_files_dict(self):
        return self.dirs.get_all_files(self.gast_dir, "gast")
        
        
    def gast_filename(self, filename):
#         TODO: if filename is in make_gast_files_dict, use its full path
        gast_file_names = self.make_gast_files_dict()
        gast_file_name_path = ""
        for gast_file_name_path, tpls in gast_file_names.iteritems():
            if any(t.endswith(filename) for t in tpls):
                return gast_file_name_path 
    
    def get_gast_result(self, filename):
        gast_file_name = self.gast_filename(filename)
        self.utils.print_both("current gast_file_name = %s." % gast_file_name)
        
        try:
            with open(gast_file_name) as fd:
                gast_dict = dict([(l.split("\t")[0], l.split("\t")[1:]) for l in fd])    
            return gast_dict
        except IOError, e:
#            print "errno = %s" % e.errno
            logger.debug("errno = %s" % e.errno)
            if e.errno == 2:
                # suppress "No such file or directory" error
                pass            
#         except OSError, e:
        except TypeError, e:
            self.utils.print_both("Check if there is a gast file under %s for %s." % (self.gast_dir, filename))
            pass            
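    # get_gast_result keys the returned dict by the first tab-separated column of each GAST
    # output line (the read id), with the remaining columns kept as a list of strings
    # (the last one retains its trailing newline); the exact column layout is not shown here.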
Esempio n. 21
0
    def put_seq_statistics_in_file(self, filename, seq_in_file):
        pipelne_utils   = PipelneUtils()
#        if os.path.exists(file_full):
#            os.remove(file_full)
        pipelne_utils.write_seq_frequencies_in_file(self.unique_file_counts, filename, seq_in_file)       
class Chimera:
    """ Define here """
    def __init__(self, runobj = None):
        self.utils      = PipelneUtils()
        self.runobj     = runobj
        self.run_keys   = self.runobj.run_keys
        self.rundate    = self.runobj.run
        
        self.chg_suffix         = ".chg"
        self.chimeras_suffix    = ".chimeras"      
        self.ref_suffix         = ".db"      
        self.denovo_suffix      = ".txt"        
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.chimeric_suffix    = ".chimeric.fa"
        self.base_suffix        = "unique" + self.chimeras_suffix

        try:
            if self.runobj.lane_name:
                lane_name = self.runobj.lane_name
            else:
                lane_name = ''
        except:
            lane_name = ''

        if self.runobj.vamps_user_upload:
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
            self.idx_keys = convert_unicode_dictionary_to_str(json.loads(open(self.runobj.trim_status_file_name,"r").read()))["new_lane_keys"] 
            self.indir  = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
            self.indir  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        
        
#         self.usearch_cmd = C.usearch_cmd
        self.usearch_cmd = C.usearch6_cmd        
        #self.abskew      = C.chimera_checking_abskew
        self.refdb       = C.chimera_checking_refdb_6
        self.its_refdb   = C.chimera_checking_its_refdb_6
        self.input_file_names  = self.make_chimera_input_illumina_file_names()
#         pprint(self.run_keys)
#         self.output_file_names = self.make_chimera_output_illumina_file_names(self.input_file_names)
        
    def make_chimera_input_illumina_file_names(self):
        input_file_names = {} 
        
        for idx_key in self.run_keys:
            file_name = idx_key + "_" + C.filtered_suffix + ".unique" 
           
            if os.path.exists(os.path.join(self.indir, file_name)):
                input_file_names[idx_key] = file_name
        
        return input_file_names
            
#     def make_chimera_output_illumina_file_names(self, input_file_names):
#         output_file_names = {} 
#         for idx_key, input_file_name in input_file_names.iteritems():
#             output_file_names[idx_key] = input_file_name
#         return output_file_names

    def get_current_dirname(self, in_or_out = ""):
        if in_or_out == "":
            cur_dirname    = self.indir 
        else:
            cur_dirname    = self.outdir
        return cur_dirname

    def is_chimera_check_file(self, filename):
        return filename.endswith((self.chimeras_suffix + self.denovo_suffix, self.chimeras_suffix + self.ref_suffix, self.chimeric_suffix, self.nonchimeric_suffix))

    def get_current_filenames(self, cur_dirname):
        cur_file_names = []
        if cur_dirname == self.indir:
            cur_file_names = self.input_file_names.values()
        elif cur_dirname == self.outdir:
            cur_file_names = self.get_chimera_file_names(self.outdir)
        return cur_file_names

    def get_chimera_file_names(self, cur_dirname):
        cur_file_names = []        
        for dirname, dirnames, filenames in os.walk(cur_dirname):
            cur_file_names = [filename for filename in filenames if (self.is_chimera_check_file(filename))]
        return cur_file_names

#     def illumina_frequency_size(self, in_or_out = "", find = "frequency:", replace = ";size="):
#         cur_dirname    = self.get_current_dirname(in_or_out)
#         cur_file_names = self.get_current_filenames(cur_dirname)
# #         print "cur_file_names: "
# #         pprint(cur_file_names)
#         change_from_suffix = ""
#         change_to_suffix   = self.chg_suffix
# #         print "find = %s, replace = %s" % (find, replace)
#         regex              = re.compile(r"%s" % find)
# 
#         for cur_file_name in cur_file_names:
#             file_name = os.path.join(cur_dirname, cur_file_name)
#             with open(file_name + change_from_suffix, "r") as sources:
#                 lines = sources.readlines()
#             with open(file_name + change_to_suffix, "w") as target:
#                 for line in lines:
#                         target.write(regex.sub(replace, line))

    def read_file(self, source_name):
        with open(source_name, "r") as sources:
            return sources.readlines()

    def illumina_sed(self, lines, target_name, regex, replace, uppercase):
        with open(target_name, "w") as target:
            for line in lines:
                if line.startswith(">"):
                    line1 = regex.sub(replace, line)
                else:
                    if (uppercase):
                        line1 = line.upper()
                    else:
                        line1 = line
                target.write(line1)  


    def call_illumina_sed(self, from_to):
        """
            from_to = from_frequency_to_size or from_size_to_frequency
        """
        sed_from_to = namedtuple('sed_from_to', 'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase')

        from_frequency_to_size = sed_from_to(
        find               = "frequency:",
        replace            = ";size=",
        cur_dirname        = self.indir,
        cur_file_names     = self.get_current_filenames(self.indir),
        change_from_suffix = "",
        change_to_suffix   = self.chg_suffix,
        uppercase          = True
        )

        from_size_to_frequency = sed_from_to(
        find               = ";size=",
        replace            = "frequency:",
        cur_dirname        = self.outdir,
        cur_file_names     = self.get_chimera_file_names(self.outdir),
        change_from_suffix = "",
        change_to_suffix   = "",
        uppercase          = False        
        )
        
        if (from_to == "from_frequency_to_size"):
            tuple_name = from_frequency_to_size
        elif (from_to == "from_size_to_frequency"):
            tuple_name = from_size_to_frequency
        
        regex          = re.compile(r"%s" % tuple_name.find)                                
#         print "find = %s, replace = %s" % (find, replace)
 
        for cur_file_name in tuple_name.cur_file_names:
            file_name = os.path.join(tuple_name.cur_dirname, cur_file_name)           
            source_name = file_name + tuple_name.change_from_suffix
            target_name = file_name + tuple_name.change_to_suffix 
            lines = self.read_file(source_name)
            self.illumina_sed(lines, target_name, regex, tuple_name.replace, tuple_name.uppercase)
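    # Example of the defline rewrite illumina_sed performs for the "from_frequency_to_size"
    # direction (uchime expects ";size=N" abundance annotations); the read name is hypothetical:
    #   >read1|frequency:123  ->  >read1|;size=123
    # Sequence lines are uppercased in this direction; "from_size_to_frequency" swaps the
    # token back and leaves case untouched.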

    def illumina_freq_to_size_in_chg(self):
#         TODO: not used?
        find1    = "frequency:"
        replace1 = ";size="
        regex1   = re.compile(r"%s" % find1)        
        
#         print "cur_file_names: "
#         pprint(cur_file_names)
        cur_dirname        = self.get_current_dirname()
        cur_file_names     = self.get_current_filenames(cur_dirname)
        change_from_suffix = ""
        change_to_suffix   = self.chg_suffix
#         print "find = %s, replace = %s" % (find, replace)
 
        for cur_file_name in cur_file_names:
            file_name = os.path.join(cur_dirname, cur_file_name)
            with open(file_name + change_from_suffix, "r") as sources:
                lines = sources.readlines()
            with open(file_name + change_to_suffix, "w") as target:
#                 line2 = [regex1.sub(replace1, line) if line.startswith(">") else line.upper() for line in lines]
                for line in lines:
                    if line.startswith(">"):
                        line1 = regex1.sub(replace1, line)
                    else:
                        line1 = line.upper()
#                     print line1
                    target.write(line1)  


    def illumina_size_to_freq_in_chimer(self):
        find1           = ";size="
        replace1        = "frequency:"
        regex1          = re.compile(r"%s" % find1)        
 
        cur_file_names = self.get_chimera_file_names(self.outdir)
                    
        for file_chim in cur_file_names:
            file_chim_path = os.path.join(self.outdir, file_chim)
            with open(file_chim_path, "r") as sources:
                lines = sources.readlines()
            with open(file_chim_path, "w") as target:
                for line in lines:
                    line1 = regex1.sub(replace1, line)
                    target.write(line1)                    
              
    def illumina_rm_size_files(self):
        for idx_key in self.input_file_names:
            file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            if os.path.exists(file_name):
                os.remove(file_name)
    
#     def illumina_chimera_size_files(self):
#     
#     import os
# [os.rename(f, f.replace('_', '-')) for f in os.listdir('.') if not f.startswith('.')]

        
          
    def check_if_cluster_is_done(self, time_before):
        cluster_done = False
        check_qstat_cmd_line = "qstat | grep \"%s\" | grep usearch | wc -l" % time_before
#         check_qstat_cmd_line = "qstat | grep usearch"

        print "check_qstat_cmd_line = %s" % check_qstat_cmd_line
        
        try:
            p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True)
            (output, err) = p.communicate()
            num_proc = int(output)
            print "qstat is running %s 'usearch' processes" % num_proc
    #         pprint(p)
            
            if (num_proc == 0):
                cluster_done = True
    #         print "cluster_done from check_if_cluster_is_done = %s" % cluster_done
        except:
            print "Chimera checking can be done only on a cluster."
            raise

        return cluster_done
        
          
    def create_chimera_cmd(self, input_file_name, output_file_name, ref_or_novo, ref_db = ""):
        """
        http://www.drive5.com/usearch/manual/uchime_denovo.html
        from usearch -help
        Chimera detection (UCHIME ref. db. mode):
          usearch -uchime q.fasta [-db db.fasta] [-chimeras ch.fasta]
            [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns]
         
        Chimera detection (UCHIME de novo mode):
          usearch -uchime amplicons.fasta [-chimeras ch.fasta] [-nonchimeras good.fasta]
             [-uchimeout results.uch] [-uchimealns results.alns]
          Input is estimated amplicons with integer abundances specified using ";size=N".
        usearch -uchime_denovo amplicons.fasta -uchimeout results.uchime
        """        

        uchime_cmd_append = ""
        db_cmd_append     = ""
        dir_cmd_append    = ""

        if (ref_or_novo == "denovo"):
            uchime_cmd_append = " -uchime_denovo "           
            output_file_name  = output_file_name + self.chimeras_suffix + self.denovo_suffix 
        elif (ref_or_novo == "ref"):
            uchime_cmd_append = " -uchime_ref "
            output_file_name  = output_file_name + self.chimeras_suffix + self.ref_suffix           
            db_cmd_append     = " -db " + ref_db   
            dir_cmd_append    = " -strand plus"
        else:
            print "Incorrect method, should be \"denovo\" or \"ref\"" 
        print "output_file_name = %s" % output_file_name 


        uchime_cmd = C.clusterize_cmd
        uchime_cmd += " "
        uchime_cmd += self.usearch_cmd
        uchime_cmd += uchime_cmd_append + input_file_name
        uchime_cmd += db_cmd_append
        uchime_cmd += " -uchimeout " + output_file_name
        """if we need nonchimeric for denovo and db separate we might create them here
#         uchime_cmd += " -nonchimeras "
#         uchime_cmd += (output_file_name + self.nonchimeric_suffix)
"""
        uchime_cmd += " -chimeras " + (output_file_name + self.chimeric_suffix)         
        uchime_cmd += dir_cmd_append
        uchime_cmd += " -notrunclabels"
        
        
#         print "uchime_cmd FROM create_chimera_cmd = %s" % (uchime_cmd)
        return uchime_cmd
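    # A hedged sketch of the command string assembled above in "ref" mode (the clusterize
    # wrapper and usearch paths come from constants in C, so the exact executables differ):
    #   <clusterize> <usearch6> -uchime_ref <input_file> -db <ref_db> \
    #     -uchimeout <output_file>.chimeras.db -chimeras <output_file>.chimeras.db.chimeric.fa \
    #     -strand plus -notrunclabels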
        
    def get_ref_db(self, dna_region):
        ref_db = ''
        if dna_region.upper() == 'ITS':
            logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
            ref_db = self.its_refdb
        else:
            logger.debug("using standard refdb: " + self.refdb)
            ref_db = self.refdb
        return ref_db       
    
    def chimera_checking(self, ref_or_novo):
        chimera_region_found = False
        output = {}
        
        for idx_key in self.input_file_names:
#             print "idx_key, self.input_file_names[idx_key] = %s, %s" % (idx_key, self.input_file_names)
            input_file_name  = os.path.join(self.indir,  self.input_file_names[idx_key] + self.chg_suffix)        
            output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])        
            dna_region       = self.runobj.samples[idx_key].dna_region
#             print "dna_region = %s" % dna_region
            if dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' +  dna_region)
                continue
            
#             print "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name)
            ref_db     = self.get_ref_db(dna_region)
#             print "dna_region = %s; ref_db = %s; ref_or_novo = %s" % (dna_region, ref_db, ref_or_novo)
            
            uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
            print "\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd)
            
            try:
                logger.info("chimera checking command: " + str(uchime_cmd))
                output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            except OSError, e:
                print "Problems with this command: %s" % (uchime_cmd)
                if self.utils.is_local():
                    print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                else:
                    print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                    raise                  
                               
# ???
        if not chimera_region_found:            
            return ('NOREGION', 'No regions found that need checking', '')
        else:
            return ("The usearch commands were created")
class MetadataUtils:
    """
    Class to read metadata files (csv and ini style)
    validate and create a dictionary from them
    Two parts:
    1) From pipeline-ui.py to validate the input args
    2) From runconfig.py to write the final ini file and create the dictionary
    that is used to create the run object
    """
    Name = "MetadataUtils"
    def __init__(self, command_line_args = None, configuration_dictionary = None):
        self.args = command_line_args
        self.general_config_dict = configuration_dictionary
        self.known_header_list  = C.csv_header_list
        self.pipeline_run_items = C.pipeline_run_items
        self.primer_suites      = self.convert_primer_suites(C.primer_suites)
        self.dna_regions        = C.dna_regions
        self.data_object = {}
        self.data_object['general'] = {}
        self.warn_msg = """\n\tThe config File seems to be okay. If the items above look correct
        then press 'c' to continue the pipeline\n"""
        self.res_headers = []
        self.env = {}
        self.utils  = PipelneUtils()


    def convert_and_save_ini(self, analysis_dir):

        new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
        #new_ini_file = os.path.join(self.general_config_dict['output_dir'],self.general_config_dict['run'],self.general_config_dict['run'] + '.ini')
        # converts csv to ini and saves to output_dir
        if self.general_config_dict['platform'] == 'vamps':
            self.save_ini_file(new_ini_file)
        else:
            self.convert_csv_to_ini(new_ini_file)
        self.general_config_dict['configPath'] = new_ini_file

        # change path and type to new ini
        # regardless of what they were before



    def validate(self, analysis_dir):

        if self.general_config_dict['platform'] in C.illumina_list:
            self.warn_msg = self.validate_illumina_ini(analysis_dir)
        elif self.general_config_dict['platform'] == '454':
            data = self.validate_454_ini(analysis_dir)
        elif self.general_config_dict['platform'] == 'ion_torrent':
            pass
        elif self.general_config_dict['platform'] == 'vamps':
            data = self.validate_vamps_ini(analysis_dir)
        else:
            sys.exit("Unknown platform and configFile type for validation")


        return self.data_object

    def get_general_data(self):
        """
        """
        return self.data_object['general']

    def validate_vamps_ini(self, analysis_dir):
        # configPath is the new configPath
        # TODO (Andy): what should be here, just the directory name or directory + number.ini?
        self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])
        if 'fasta_file' in self.data_object and not os.path.exists(self.data_object['fasta_file']):
            sys.exit("Fasta file path doesn't exist: "+self.data_object['fasta_file'] )
        elif 'fasta_file' in self.data_object['general'] and not os.path.exists(self.data_object['general']['fasta_file']):
            sys.exit("Fasta file path doesn't exist: "+self.data_object['general']['fasta_file'] )

    def validate_454_ini(self, analysis_dir):
        print("TODO - write validation def for 454/ini")
        #self.data_object = self.create_dictionary_from_ini()
        # 454 ini file requirements:



    def validate_illumina_ini(self, analysis_dir):
        """
        The csv headers are checked earlier
        """

        print("Validating ini type Config File (may have been converted from csv)")
        new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
        print("New ini file location: "+new_ini_file)
        return_code = False
        error_code  = False
        warn_code   = False
        msg = ''
        error=False
        warn=False
        #print('configpath',self.general_config_dict['configPath'])
        # configPath here is the new configPath
        self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])


        (error_code,warn_code) = self.check_for_missing_values(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_for_datasets(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_domain_suite_region(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_project_name(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_dataset_name(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_projects_and_datasets(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        #print(self.data_object['input_dir'])
        #print(self.data_object['input_files'])


        if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']:
            logger.warning("No input directory and no input files")
            warn=True
        elif not os.path.isdir(self.data_object['general']['input_dir']):
            logger.error("That is not a directory: "+self.data_object['general']['input_dir'])
            error=True
        elif self.data_object['general']['input_file_format'] == 'fastq' and self.data_object['general']['platform'] in C.illumina_list:
                file_exists = False
    #            if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']:
                for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']):
    #                if not filenames:
                    for file_name in filenames:
                        if os.path.isfile(os.path.join(dirname, file_name)):
                            file_exists = True
                            break
                if not file_exists:
                    logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir'])
                    error=True
        elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']):
            logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir'])
            error=True

        if error:
            sys.exit( """\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING
            PLEASE CORRECT THEM AND START OVER.\033[0m\n
            To view the errors add ' --loglevel info' to the command line.\n""")
        elif warn:
            msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n
                To view the warnings add ' --loglevel warning' to the command line.\n"""
            print("\033[92mCSV File Passed Vaidation! (with warnings)\033[0m")
        else:
            print("\033[92mCSV File Passed Vaidation!\033[0m")
        return msg

    def validate_dictionary(self, config_info):
        """
        This is only used for data that comes in as a dictionary rather than a file
        such as with vamps user uploads
        """
        print("TODO - Validating input dictionary")
        # must be a general section
        # should I create a dict here??? -That would render much code in
        #    runconfig useless.
        # are we going to continue developing ini style config files if
        #   no one uses them?
        configDict = config_info

        return configDict




    def populate_data_object_454(self, args):
        data = {}
        data['general'] = {}
        test_datasets = {}
        dataset_counter = {}
        headers = ''
        if self.runobj:
            infile = self.runobj.configPath
        else:
            infile = args.configPath
            data['general']['input_dir'] = args.input_dir
            #data['general']['output_dir'] = os.path.join(args.output_dir,args.run)
            data['general']['output_dir'] = args.output_dir
            data['general']['platform'] = args.platform
            data['general']['run'] = args.run
            #data['general']['run_date'] = args.run
            data['general']["input_file_format"] = args.input_file_format
            data['general']["input_file_suffix"] = args.input_file_suffix

        return data['general']




    def get_input_files(self):

        files_list = []

        if os.path.isdir(self.general_config_dict['input_dir']):

            for infile in glob.glob( os.path.join(self.general_config_dict['input_dir'], '*') ):
                if os.path.isdir(infile) == True:

                    for infile2 in glob.glob( os.path.join( infile,'*') ):
                        if os.path.isdir(infile2) == True:
                            pass
                        else:
                            sub_dir = os.path.basename(infile)

                            files_list.append(os.path.join(sub_dir,os.path.basename(infile2)))
                else:
                    files_list.append(os.path.basename(infile))
#        else:
#            if fasta_file:
#                pass
#            logger.warning("No input directory or directory permissions problem: "+self.general_config_dict['input_dir'])

        return files_list

    def check_for_input_files(self, data_object):

        file_count = 0
        files_list = []
        imports_list = []
        lanes_list = []


        #input_dir = os.path.join(data_object['general']['input_dir'],"fasta")
        input_dir = data_object['general']['input_dir']
        if os.path.isdir(input_dir):
            p = data_object['general']['input_dir'], '*'+data_object['general']['input_file_suffix']


            for infile in glob.glob( os.path.join(input_dir, '*'+data_object['general']['input_file_suffix']) ):
                files_list.append(os.path.basename(infile))
                for x in data_object:
                    if 'file_prefix' in data_object[x]:
                        pass
                        #print(data_object[x]['file_prefix'])

                        #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']:
                            #lanes_list.append(data_object[x]['lane'])

                file_count += 1
        else:

            logger.info("No input directory or directory permissions problem: "+input_dir)
            print("No input directory or directory permissions problem: "+input_dir)
        if not file_count:
            #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'")
            logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'")

        data_object['general']['files_list'] = files_list
        data_object['general']['file_count'] = file_count
        # all the files in an illumina directory should be the same type
        #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count
        #data_object['general']['lanes_list'] = lanes_list
        #print("Files LIST",data_object['general']['files_list'])


        return data_object


    def check_for_missing_values(self, data):
        missing_key   = ''
        error = False
        warn = False
        for item in data:
            if item == 'general':
                for k,v in data[item].items():
                    if not k:
                        #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting")
                        logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing")
                        warn=True
                    if v == '':
                        logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                        warn=True

        for item in data:
            if item != 'general':
                for k,v in data[item].items():
                    if not k:
                        #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting")
                        logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing")
                        warn=True
                    if not v:
                        if (k == 'barcode' or k == 'adaptor'): #these could be empty
                            logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                        else:
                            logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                            error=True
        return (error,warn)

    def check_for_datasets(self,data):
        error = False
        warn=False
        for item in data:
            if item != 'general':
                #print('ds',data[item]['dataset'])
                if not data[item]['dataset']:
                #if 'dataset' not in data[item]:
                    logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")")
                    error=True
        return (error,warn)

    def check_domain_suite_region(self,data):
        error = False
        warn=False

        for item in data:

            if item != 'general':
                primer_suite = self.convert_primer_suites(data[item]['primer_suite'])
                dna_region   = self.convert_primer_suites(data[item]['dna_region'])

                # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region"
                if primer_suite not in self.primer_suites:
                    logger.error("Primer Suite not found: "+primer_suite+" - Exiting (key: "+item+")")
                    error=True
                if dna_region not in self.dna_regions:
                    logger.error("DNA Region not found: "+dna_region+" - Exiting (key: "+item+")")
                    error=True
                if dna_region not in primer_suite:
                    logger.error("DNA Region ("+dna_region+") not found in Primer Suite ("+primer_suite+") - Exiting (key: "+item+")")
                    error=True
        return (error, warn)

    def convert_primer_suites(self, suite):
        import re
        if type(suite) is list:
            conv_suite = [re.sub(r'[_ -]', '', item.lower()) for item in suite]
        if type(suite) is str:
            conv_suite = re.sub(r'[_ -]', '', suite.lower())
                # suite.lower().translate(None, '_- ')
        return conv_suite
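    # e.g. convert_primer_suites("Bacterial V6-V4 Suite") -> "bacterialv6v4suite"
    # (lowercased, with spaces, hyphens and underscores stripped); a list input is
    # normalized element-wise the same way. The suite name above is only illustrative.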

    def check_project_name(self, data):
        """
        # CHECK: project name format: 3 parts; end with Bv6,Ev9,Av6 or something similar
        """
        error   =False
        warn    =False
        for item in data:
            if item != 'general':
                try:
                    (a,b,c) = data[item]['project'].split('_')
                except:
                    logger.error("project not in correct format: ")
                    logger.error(data[item]['project'])
                    logger.error(" - Exiting (key: ")
                    logger.error(data[item])
                    error=True
                (a,b,c) = data[item]['project'].split('_')
                #if c[0] not in [i[0].upper() for i in domains]:
                #    sys.exit("ERROR : Project suffix has incorrect/non-existant domain: "+c)
                # logger.error("c[1:] = ")
                # logger.error(c[1:])
                # logger.error("c.lower() =")
                # logger.error(c.lower())
                # logger.error("self.dna_regions")
                # logger.error(self.dna_regions )

                if (c[1:].lower() not in self.dna_regions) and (c.lower() not in self.dna_regions):
                    logger.error("Project suffix has incorrect DNA region: ")
                    logger.error(c)
                    logger.error(" - Exiting (key: ")
                    logger.error(data[item])
                    error = True
        return (error, warn)
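    # Illustrative (hypothetical) project name accepted by the check above: "JAS_SMS_Bv6"
    # splits into three parts, and its suffix passes if "v6" (or "bv6") is in self.dna_regions.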

    def check_dataset_name(self,data):
        """
        # CHECK: dataset name can be ONLY alphanumeric and underscore
                    and cannot start with a number!
        """
        error   =False
        warn    =False
        for item in data:
            if item != 'general':
                dataset_name = data[item]['dataset']
                if not re.match("^[A-Za-z0-9_]*$", dataset_name):
                    logger.error("Dataset name has illeagal character(s): "+dataset_name+" (must be alphanumeric and underscore only)")
                    error = True
                #if  re.match("^[0-9]", dataset_name):
                 #   logger.error("Dataset name cannot begin with a digit: "+dataset_name)
                  #  error = True

        return (error, warn)

    def get_my_conn(self):
        try:
            host = self.general_config_dict['database_host']
        except:
            raise
        try:
            db = self.general_config_dict['database_name']
        except:
            raise
        if self.utils.is_local():
            host = 'localhost'
            db   = "test_env454"

        self.my_conn = MyConnection(host = host, db = db)

    def check_projects_and_datasets(self, data):
        self.get_my_conn()
        project_dataset = {}
        projects = {}
        datasets = {}
        error   =False
        warn    =False
        for item in data:
            if item != 'general':
                #project_dataset[data[item]['project']+'--'+data[item]['dataset']] = 1
                datasets[data[item]['dataset']] = data[item]['project']
                projects[data[item]['project']] = 1
        for p in projects:
            #print(p)
            my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p)
            res    = self.my_conn.execute_fetch_select(my_sql)
            if res:
                logger.warning("project '"+p+"' already exists in the database - is this okay?")
                warn = True
            else:
                logger.debug("project '"+p+"' is new")

            ds_found_count = 0
            for d in datasets:
                if datasets[d] == p:

                    #print("\t%s" % (d))
                    my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d)
                    res    = self.my_conn.execute_fetch_select(my_sql)
                    if res:
                        ds_found_count += 1
                        if ds_found_count >3:
                            logger.warning("\t\tPossibly more .... - Exiting after just three")
                            break
                        logger.warning("\tdataset '"+d+"' already exists in the database - is this okay?")
                        warn=True
                    else:
                        logger.debug("\tdataset '"+d+"' is new")
            logger.debug("\tDataset Count: "+str(len(datasets)))
        return (error,warn)


    def get_confirmation(self, steps, general_data):
        print("\n")
        for item,value in general_data.items():
            #print(len(value))
            if type(value) != bool and len(value) > 80:
                tmp = value.split(',')
                print("%-20s = %s .. %s" % (item,tmp[0],tmp[-1]))
            else:
                print("%-20s = %-20s" % (item,value))
        print("\nStep(s) to be performed: \033[1;36m",steps,'\033[0m')
        print("\n"+self.warn_msg+"\n")
        if 'validate' in steps.split(','):
            # print(we are done)
            sys.exit()
        if self.utils.is_local():
            return 'c'
        else:
            return 'c'

            # return raw_input("\nDoes this look okay? (q to quit, v to view configFile, c to continue) ")

    def convert_csv_to_ini(self, new_ini_file):
        #print(self.args)
        from pipeline.get_ini import readCSV

        print('CSV path', self.general_config_dict['csvPath'])
        my_csv = readCSV(file_path = self.general_config_dict['csvPath'])

        content     = my_csv.read_csv()
        headers     = content[1].keys()
        headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers]
        projects = {}
        #print
        #print(content[1])
        #print
        # get list of keys
        keys_list = []
        if self.check_headers(headers_clean):
            logger.info("CSV headers okay")
            for k,values in content.items():
                keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane'])

        fh = open(new_ini_file,'w')
        # general section
        fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n")
        fh.write("[general]\n")
        fh.write("run = "+self.general_config_dict['run']+"\n")
        fh.write("configPath = "+new_ini_file+"\n")

        fh.write("configPath_orig = " + self.general_config_dict['configPath']+"\n")
        fh.write("platform = " + self.general_config_dict['platform']+"\n")
        fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n")
        #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n")
        if self.general_config_dict['platform'] in C.illumina_list:
            #fh.write("input_file_suffix = "  + self.general_config_dict['input_file_suffix']+"\n")
            fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n")
            fh.write("anchor_file = "        + self.general_config_dict['anchor_file']+"\n")
            fh.write("primer_file = "        + self.general_config_dict['primer_file']+"\n")
            fh.write("compressed = "          + str(self.general_config_dict['compressed'])+"\n")
            fh.write("do_perfect = "          + str(self.general_config_dict['do_perfect'])+"\n")
            fh.write("lane_name = "          + str(self.general_config_dict['lane_name'])+"\n")
            fh.write("database_host = "          + self.general_config_dict['database_host']+"\n")
            fh.write("database_name = "          + self.general_config_dict['database_name']+"\n")

        fh.write("input_dir = "          + self.general_config_dict['input_dir']+"\n")
        fh.write("require_distal = "     + str(self.general_config_dict['require_distal'])+"\n")
        fh.write("use_cluster = "              + str(self.general_config_dict['use_cluster'])+"\n")
        fh.write("date = "              + str(datetime.date.today())+"\n")
        fh.write("site = "              + self.general_config_dict['site']+"\n")
        fh.write("load_vamps_database = " + str(self.general_config_dict['load_vamps_database'])+"\n")
        fh.write("idx_keys = "           +','.join(keys_list)+"\n")
        if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '':
            file_list = self.get_input_files()
            fh.write("input_files = "     + ','.join(file_list)+"\n")
        else:
            fh.write("input_files = \n")
        #fh.write(getattr(args,'force_runkey', ""))

        for k, values in content.items():
            fh.write("\n")
            if self.general_config_dict['platform'] in C.illumina_list:
                fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n")
            elif self.general_config_dict['platform'] == '454':
                fh.write("["+values['lane']+"_"+values['run_key']+"]\n")

            for v in values:
                if v == "env_sample_source":
                    try:
                        new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0]
                    except:
                        text = """There was an error in env_sample_source. Please check your metadata.
Possible values:
-----------
air
extreme habitat
host associated
human associated
human-amniotic-fluid
human-blood
human-gut
human-oral
human-skin
human-urine
human-vaginal
indoor
microbial mat/biofilm
miscellaneous_natural_or_artificial_environment
plant associated
sediment
soil/sand
unknown
wastewater/sludge
water-freshwater
water-marine
-----------
"""
                        print(text)
                        raise
                    fh.write("env_sample_source_id = "+new_val+"\n")
                else:
                    fh.write(v+" = "+values[v]+"\n")

        fh.close()

        return new_ini_file

    def save_ini_file(self,new_ini_file):
        # give it a new name
        out_fh = open(new_ini_file,'w')
        #for line in open(os.path.abspath(self.general_config_dict['configPath']),"r"):
        #    out_fh.write(line)
        self.general_config_dict['configPath_original'] = self.general_config_dict['configPath']
        self.general_config_dict['configPath'] = new_ini_file

        out_fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n")
        out_fh.write("[general]\n")
        for item in self.general_config_dict:

            out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n")
        #out_fh.write("\n["+self.general_config_dict['platform']+"]\n")
        #for item in self.general_config_dict:
        #    if item not in C.general_run_items:
        #        out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n")



        if 'fasta_file' in self.general_config_dict and self.general_config_dict['fasta_file'] != '':
            (path,fasta) = os.path.split(self.general_config_dict['fasta_file'])
            if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != path:
                sys.exit("Your input_dir and fasta_file directory don't agree - Exiting\n\t"+self.general_config_dict['input_dir']+" != "+self.general_config_dict['fasta_file'])

            out_fh.write("input_dir = "+path+"\n")
            out_fh.write("input_files = "+fasta+"\n")
            #out_fh.write("input_file_suffix = fasta\n")
        elif 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '':
            file_list = self.get_input_files()
            out_fh.write("input_files = "     + ','.join(file_list)+"\n")
        else:
            out_fh.write("input_files = \n")
        out_fh.close()

    def check_headers(self, headers):
        if self.general_config_dict['platform'] in C.illumina_list:
            pl = self.general_config_dict['platform']
            known_header_list = self.known_header_list[pl]
        elif self.general_config_dict['platform'] == '454':
            known_header_list = self.known_header_list['454']
        else:
            logger.error("in utils: check_headers - unknown platform")
        #print(  sorted(known_header_list))
        #print(sorted(headers))
        self.res_headers = headers
        if "env_sample_source" in headers:
            self.env_source_to_id(headers)

        if sorted(known_header_list) != sorted(self.res_headers):
            print("=" * 40)
            print("csv file header problem")
            print("%-20s %-20s" % ("REQUIRED", "YOUR CSV"))
            for i in sorted(known_header_list):
                if i in headers:
                    print("%-20s%-20s" % (i,i))
                else:
                    print("%-20s%-20s" % (i,"----------- <--- missing"))
            for i in headers:

                if i not in known_header_list:
                    print("%-20s%-20s" % (" ",i+" <--- extra"))
            print("=" * 40)
            sys.exit("ERROR : unknown or missing headers\n")
        else:
            return True

    def env_source_to_id(self, headers):
        logger.error("self.utils.is_local() LLL2 metadata")
        logger.error(self.utils.is_local())
        if self.utils.is_local():
            self.my_conn     = MyConnection(host = 'localhost', db="test_env454")
        else:
            self.my_conn = MyConnection(host='bpcdb1', db="env454")
        # self.my_conn     = MyConnection()
        my_sql       = """SELECT * FROM env_sample_source"""
        self.env     = self.my_conn.execute_fetch_select(my_sql)
        self.res_headers = ["env_sample_source_id" if x=="env_sample_source" else x for x in headers]

    def configDictionaryFromFile_ini(self, config_file_path):
        import configparser

        configDict = {}
        user_config = configparser.ConfigParser()
        user_config.read(config_file_path)

        for section in user_config.sections():
            section_dict = configDict[section] = {}
            for option in user_config.options(section):
                section_dict[option] = user_config.get(section,option)
                if section_dict[option] == 'True' or section_dict[option] == 'true':
                    section_dict[option] = True
                elif section_dict[option] == 'False' or section_dict[option] == 'false':
                    section_dict[option] = False

        return configDict
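    # Minimal sketch of the mapping produced above, assuming an ini such as:
    #   [general]
    #   run = 20240101
    #   use_cluster = True
    # which yields {'general': {'run': '20240101', 'use_cluster': True}}; string
    # 'True'/'False' values are converted to Python booleans.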

    def get_values(self, args, general_config_dict = {} ):
        collector={}

        for item in self.pipeline_run_items[args.platform]:

            # set collector[item] to the default first
            collector[item] = self.pipeline_run_items[args.platform][item]

            # now look for args (then ini) values to replace
            if item in args and getattr( args, item ) != None:
                collector[item]  = getattr( args, item )
            elif general_config_dict and item in general_config_dict[args.platform] and general_config_dict[args.platform][item] != '':
                collector[item]  = general_config_dict[args.platform][item]

        # get all the items from general_config_dict['general']
        if 'general' in general_config_dict:
            for item in general_config_dict['general']:
                collector[item]  = general_config_dict['general'][item]


        return collector

    def validate_args(self):
        """
        # THOUGHTS
        # vamps users
        # single project and dataset
        # Supply an ini file OR commandline (for web interface), but no csv file
        #
        # MBL pipeline
        # REQUIRE a csv file and a ini file
        """
        collector={}

        if self.args.configPath:
            general_config_dict = self.configDictionaryFromFile_ini(self.args.configPath)
            if self.args.platform in general_config_dict and 'general' in general_config_dict:
                collector= self.get_values( self.args, general_config_dict)
            else:
                sys.exit("The ini file needs both a [general] and ["+ self.args.platform +"] section - Exiting.")
        else:
            # no configPath
            collector= self.get_values( self.args )

        collector['current_db_host_name'] = self.utils.find_in_nested_dict(C.db_cnf, {'host': collector['database_host'], 'db': collector['database_name']})
        if not collector['current_db_host_name']:
            sys.exit("""Please check -db_host and -db_name parameters, 
            the current combination does not exist: 'db_host' = %s, 'db_name' = %s """ % (collector['database_host'], collector['database_name']))

        if self.args.platform in C.illumina_list:
            print("Starting Illumina Pipeline")
            if not self.args.csvPath:
                sys.exit("illumina requires a csv file - Exiting")

        elif self.args.platform == 'vamps':
            print("Starting VAMPS Pipeline:")

            if 'project' in collector and collector['project'] != '':
                collector['project'] = collector['project'][:1].capitalize() + collector['project'][1:]
            else:
                logger.debug("No project found in vamps pipeline")
            if self.args.fasta_file:
                collector['project'] = self.args.fasta_file
                collector['from_fasta'] = True
        elif self.args.platform == '454':
            print("Starting 454 Pipeline")

        elif self.args.platform == 'ion_torrent':
            print("Starting Ion Torrent Pipeline")

        else:
            sys.exit("Validate args: Unknown Platform")

        if  self.args.configPath:
            collector['configPath'] = self.args.configPath
        else:
            collector['configPath'] = ""
        # these are all the bool items in the collector
        # they need to be converted from str to bool here
        for i in collector:
            if collector[i] == 'True' or collector[i] == 'true':
                collector[i] = True
            elif collector[i] == 'False' or collector[i] == 'false':
                collector[i] = False

        #collector['runcode'] = self.args.run
        collector['run'] = self.args.run
        #collector['run_date'] = self.args.run
        #collector['steps'] = self.args.steps
        collector['platform'] = self.args.platform
        if self.args.input_dir:
            collector['input_dir'] = self.args.input_dir

        collector['date'] = str(datetime.date.today())
        #print(collector)
        return collector
class Chimera:
    """ Define here """
    def __init__(self, runobj=None):
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.run_keys = self.runobj.run_keys
        self.rundate = self.runobj.run
        try:
            self.use_cluster = self.runobj.use_cluster
        except:
            self.use_cluster = True
        self.chg_suffix = ".chg"
        self.chimeras_suffix = ".chimeras"
        self.ref_suffix = ".db"
        self.denovo_suffix = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix  #".nonchimeric.fa"
        self.chimeric_suffix = ".chimeric.fa"
        self.base_suffix = "unique" + self.chimeras_suffix

        self.cluster_slots = {
            "grendel": [12, 8],
            "cricket": [40],
            "cluster5": [32]
        }

        try:
            if self.runobj.lane_name:
                lane_name = self.runobj.lane_name
            else:
                lane_name = ''
        except:
            lane_name = ''

        if self.runobj.vamps_user_upload:
            os.environ['SGE_ROOT'] = '/opt/sge'
            os.environ['SGE_CELL'] = 'grendel'
            path = os.environ['PATH']
            os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:' + path
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload,
                             dir_prefix,
                             self.runobj.platform,
                             lane_name=lane_name,
                             site=site)
            self.idx_keys = convert_unicode_dictionary_to_str(
                json.loads(
                    open(self.runobj.trim_status_file_name,
                         "r").read()))["new_lane_keys"]
            self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
            self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
            self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)

        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload,
                             dir_prefix,
                             self.runobj.platform,
                             lane_name=lane_name,
                             site=site)
            self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

#         self.usearch_cmd = C.usearch_cmd
        self.usearch_cmd = C.usearch6_cmd
        if self.utils.is_local():
            self.usearch_cmd = C.usearch6_cmd_local
        #self.abskew      = C.chimera_checking_abskew
        self.refdb = C.chimera_checking_refdb
        if self.utils.is_local():
            self.refdb_local = C.chimera_checking_refdb_local
        self.its_refdb = C.chimera_checking_its_refdb
        self.input_file_names = self.make_chimera_input_illumina_file_names()
#         self.output_file_names = self.make_chimera_output_illumina_file_names(self.input_file_names)

    def get_ref_db(self, dna_region):
        ref_db = ''
        if dna_region.upper() == 'ITS':
            ref_db = C.chimera_checking_its_refdb
            logger.debug("got an ITS dna region so using refdb: " + ref_db)
        else:
            ref_db = C.chimera_checking_refdb
            if self.utils.is_local():
                ref_db = C.chimera_checking_refdb_local
            logger.debug("using standard refdb: " + ref_db)
        return ref_db

    def make_chimera_input_illumina_file_names(self):
        input_file_names = {}

        for idx_key in self.run_keys:
            file_name = idx_key + "_" + C.filtered_suffix + ".unique"

            if os.path.exists(os.path.join(self.indir, file_name)):
                input_file_names[idx_key] = file_name

        return input_file_names

    def get_current_dirname(self, in_or_out=""):
        if in_or_out == "":
            cur_dirname = self.indir
        else:
            cur_dirname = self.outdir
        return cur_dirname

    def is_chimera_check_file(self, filename):
        return filename.endswith(
            (self.chimeras_suffix + self.denovo_suffix,
             self.chimeras_suffix + self.ref_suffix, self.chimeric_suffix,
             self.nonchimeric_suffix))

    def get_current_filenames(self, cur_dirname):
        cur_file_names = []
        if cur_dirname == self.indir:
            cur_file_names = self.input_file_names.values()
        elif cur_dirname == self.outdir:
            cur_file_names = self.get_chimera_file_names(self.outdir)
        return cur_file_names

    def get_chimera_file_names(self, cur_dirname):
        # NB: cur_file_names is reassigned on every os.walk() iteration, so only files
        # from the last directory visited are kept; the chimera output directory is
        # expected to be flat, so in practice that is cur_dirname itself.
        cur_file_names = []
        for dirname, dirnames, filenames in os.walk(cur_dirname):
            cur_file_names = [
                filename for filename in filenames
                if (self.is_chimera_check_file(filename))
            ]
        return cur_file_names

    def read_file(self, source_name):
        with open(source_name, "r") as sources:
            return sources.readlines()

    def illumina_sed(self, lines, target_name, regex, replace, uppercase):
        with open(target_name, "w") as target:
            for line in lines:
                if line.startswith(">"):
                    line1 = regex.sub(replace, line)
                else:
                    if (uppercase):
                        line1 = line.upper()
                    else:
                        line1 = line
                target.write(line1)

    def call_illumina_sed(self, from_to):
        """
            from_to = from_frequency_to_size or from_size_to_frequency
        """
        sed_from_to = namedtuple(
            'sed_from_to',
            'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase'
        )

        from_frequency_to_size = sed_from_to(
            find="frequency:",
            replace=";size=",
            cur_dirname=self.indir,
            cur_file_names=self.get_current_filenames(self.indir),
            change_from_suffix="",
            change_to_suffix=self.chg_suffix,
            uppercase=True)

        from_size_to_frequency = sed_from_to(
            find=";size=",
            replace="frequency:",
            cur_dirname=self.outdir,
            cur_file_names=self.get_chimera_file_names(self.outdir),
            change_from_suffix="",
            change_to_suffix="",
            uppercase=False)

        if (from_to == "from_frequency_to_size"):
            tuple_name = from_frequency_to_size
        elif (from_to == "from_size_to_frequency"):
            tuple_name = from_size_to_frequency

        regex = re.compile(r"%s" % tuple_name.find)
        #         logger.debug("find = %s, replace = %s" % (find, replace))
        if (not tuple_name.cur_file_names) and (tuple_name
                                                == from_frequency_to_size):
            self.utils.print_both(
                'ERROR: Did not find uniqued files ("%s") in %s, please check if the previous step has finished. Exiting.\n'
                % (C.filtered_suffix + ".unique", self.indir))
            sys.exit()
        for cur_file_name in tuple_name.cur_file_names:
            file_name = os.path.join(tuple_name.cur_dirname, cur_file_name)
            source_name = file_name + tuple_name.change_from_suffix
            target_name = file_name + tuple_name.change_to_suffix
            lines = self.read_file(source_name)
            self.illumina_sed(lines, target_name, regex, tuple_name.replace,
                              tuple_name.uppercase)
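
    # Illustrative sketch (the defline layout is hypothetical): the
    # "from_frequency_to_size" pass rewrites only FASTA deflines and uppercases sequence
    # lines, roughly
    #   >seq_1 frequency:123    ->  >seq_1 ;size=123
    #   acgtacgt                ->  ACGTACGT
    # i.e. re.compile("frequency:").sub(";size=", ">seq_1 frequency:123") gives
    # '>seq_1 ;size=123'. The "from_size_to_frequency" pass applies the reverse
    # substitution to the chimera-checker output and leaves sequence case untouched.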

    def illumina_freq_to_size_in_chg(self):
        find1 = "frequency:"
        replace1 = ";size="
        regex1 = re.compile(r"%s" % find1)

        #         logger.debug("cur_file_names: ")
        #         pprint(cur_file_names)
        cur_dirname = self.get_current_dirname()
        cur_file_names = self.get_current_filenames(cur_dirname)
        change_from_suffix = ""
        change_to_suffix = self.chg_suffix
        #         logger.debug("find = %s, replace = %s" % (find, replace))

        for cur_file_name in cur_file_names:
            file_name = os.path.join(cur_dirname, cur_file_name)
            lines = self.utils.read_file(file_name + change_from_suffix)
            with open(file_name + change_to_suffix, "w") as target:
                for line in lines:
                    if line.startswith(">"):
                        line1 = regex1.sub(replace1, line)
                    else:
                        line1 = line.upper()
#                     logger.debug(line1)
                    target.write(line1)

    def illumina_size_to_freq_in_chimer(self):
        find1 = ";size="
        replace1 = "frequency:"
        regex1 = re.compile(r"%s" % find1)

        cur_file_names = self.get_chimera_file_names(self.outdir)

        for file_chim in cur_file_names:
            file_chim_path = os.path.join(self.outdir, file_chim)
            lines = self.utils.read_file(file_chim_path)
            with open(file_chim_path, "w") as target:
                for line in lines:
                    line1 = regex1.sub(replace1, line)
                    target.write(line1)

    def illumina_rm_size_files(self):
        for idx_key in self.input_file_names:
            file_name = os.path.join(
                self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            if os.path.exists(file_name):
                # removal of the intermediate ".chg" files is currently disabled
                pass
                # os.remove(file_name)

    def check_if_chimera_dir_empty(self):
        if not os.listdir(self.outdir):
            self.utils.print_both(
                'ERROR: Did not find files in %s, something is wrong. First check if you ran the command on a cluster. Exiting.\n'
                % self.outdir)
            sys.exit()

    def check_if_cluster_is_done(self, time_before):
        cluster_done = False
        check_qstat_cmd_line = "qstat | grep \"%s\" | grep chimera_ch | wc -l" % time_before
        #         check_qstat_cmd_line = "qstat | grep vsearch"

        self.utils.print_both("check_qstat_cmd_line = %s" %
                              check_qstat_cmd_line)

        try:
            p = subprocess.Popen(check_qstat_cmd_line,
                                 stdout=subprocess.PIPE,
                                 shell=True)
            (output, err) = p.communicate()
            num_proc = int(output)
            self.utils.print_both("qstat is running %s 'vsearch' processes" %
                                  num_proc)
            #         pprint(p)

            if (num_proc == 0):
                cluster_done = True

    #         logger.debug("cluster_done from check_if_cluster_is_done = %s" % cluster_done)
        except:
            self.utils.print_both(
                "Chimera checking can be done only on a cluster.")
            raise

        return cluster_done
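
    # Illustrative sketch (the time_before format is whatever the caller passes in): the
    # composed command might expand to
    #   qstat | grep "02/25/2019 14:05:11" | grep chimera_ch | wc -l
    # i.e. count the chimera_checking array tasks submitted at that time that are still
    # queued or running; a count of 0 is taken to mean the cluster jobs have finished.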

    def create_chimera_cmd(self, ref_db):
        """
        /usr/local/bin/vsearch
        -uchime_denovo
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg
        -uchimeout
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt
        -chimeras
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt.chimeric.fa
        -notrunclabels
        ---
        /usr/local/bin/vsearch
        -uchime_ref
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg
        -uchimeout
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db
        -chimeras
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db.chimeric.fa
        -notrunclabels
        -strand
        plus
        -db
        /groups/g454/blastdbs/rRNA16S.gold.fasta

        """
        command_line = []

        ref_or_novo_options = {
            self.denovo_suffix: "-uchime_denovo",
            self.ref_suffix: "-uchime_ref"
        }
        for suff, opt in ref_or_novo_options.items():
            input_file_name = self.indir + "/$filename_base" + self.chg_suffix
            output_file_name = self.outdir + "/$filename_base" + self.chimeras_suffix + suff

            ref_add = ""
            if (opt == "-uchime_ref"):
                ref_add = "-strand plus -db %s" % ref_db

            uchime_cmd = """%s %s %s -uchimeout %s -chimeras %s%s -notrunclabels %s
            """ % (self.usearch_cmd, opt, input_file_name, output_file_name,
                   output_file_name, self.chimeric_suffix, ref_add)
            logger.debug("UUU = uchime_cmd = %s" % uchime_cmd)
            logger.debug("+++")
            command_line.append(uchime_cmd)

        return command_line

    def create_chimera_cmd_old(self,
                               input_file_name,
                               output_file_name,
                               ref_or_novo,
                               ref_db=""):
        """
        http://www.drive5.com/usearch/manual/uchime_denovo.html
        from usearch -help
        Chimera detection (UCHIME ref. db. mode):
          usearch -uchime q.fasta [-db db.fasta] [-chimeras ch.fasta]
            [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns]

        Chimera detection (UCHIME de novo mode):
          usearch -uchime amplicons.fasta [-chimeras ch.fasta] [-nonchimeras good.fasta]
             [-uchimeout results.uch] [-uchimealns results.alns]
          Input is estimated amplicons with integer abundances specified using ";size=N".
        usearch -uchime_denovo amplicons.fasta -uchimeout results.uchime
        """

        uchime_cmd_append = ""
        db_cmd_append = ""
        dir_cmd_append = ""

        if (ref_or_novo == "denovo"):
            uchime_cmd_append = " -uchime_denovo "
            output_file_name = output_file_name + self.chimeras_suffix + self.denovo_suffix
        elif (ref_or_novo == "ref"):
            uchime_cmd_append = " -uchime_ref "
            output_file_name = output_file_name + self.chimeras_suffix + self.ref_suffix
            db_cmd_append = " -db " + ref_db
            dir_cmd_append = " -strand plus"
        else:
            self.utils.print_both(
                "Error: Incorrect method, should be \"denovo\" or \"ref\"")
        self.utils.print_both("output_file_name = %s" % output_file_name)

        uchime_cmd = C.clusterize_cmd
        if self.utils.is_local():
            uchime_cmd = ""
        uchime_cmd += " "
        uchime_cmd += self.usearch_cmd
        logger.debug("self.usearch_cmd FROM create_chimera_cmd = %s" %
                     (uchime_cmd))

        uchime_cmd += uchime_cmd_append + input_file_name
        logger.debug("uchime_cmd_append FROM create_chimera_cmd = %s" %
                     (uchime_cmd_append))

        uchime_cmd += db_cmd_append

        logger.debug("db_cmd_append FROM create_chimera_cmd = %s" %
                     (db_cmd_append))

        uchime_cmd += " -uchimeout " + output_file_name
        """if we need nonchimeric for denovo and db separate we might create them here"""
        uchime_cmd += " -nonchimeras "
        uchime_cmd += (output_file_name + self.nonchimeric_suffix)

        uchime_cmd += " -chimeras " + (output_file_name + self.chimeric_suffix)
        uchime_cmd += dir_cmd_append

        uchime_cmd += " -notrunclabels"

        logger.debug("uchime_cmd FROM create_chimera_cmd = %s" % (uchime_cmd))
        return uchime_cmd

    def get_sge_cluster_name(self):
        # import subprocess
        result = subprocess.run(['qstat', '-F'], stdout=subprocess.PIPE)
        a1 = result.stdout.decode('utf-8').split()
        for line in a1:
            if (line.find("hostname") !=
                    -1):  #qf:hostname=grendel-01.bpcservers.private
                return line.split("=")[1].split("-")[0]

    def get_sge_slot_number(
            self
    ):  # doesn't work on cricket because: 	qc:slots=12 and qc:slots=8
        result = subprocess.run(['qstat', '-F', 'slots'],
                                stdout=subprocess.PIPE)
        a1 = result.stdout.decode('utf-8').split()
        slots = []
        for line in a1:
            if line.startswith('qc:slots'):
                slots.append(int(line.split("=")[-1]))
        slots_uniq = set(slots)
        return max(slots_uniq)
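
    # Illustrative sketch (lines are hypothetical): `qstat -F slots` prints one
    # "qc:slots=N" line per queue instance, e.g.
    #   qc:slots=8
    #   qc:slots=12
    # The method collects the N values and returns the maximum (12 here), which is why it
    # misbehaves on hosts like cricket that mix different slot counts (see the note above).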

    # TODO: temp! take from util. change illumina-files to use util, too
    #   create_job_array_script(self, command_line, dir_to_run, files_list, runobj)
    # feb 25 2019 removed, because didn't work on grendel:
    #  Use the allslots pe and all available slots on that cluster
    # #$ -pe allslots %s
    def create_job_array_script(self, script_file_name_base, command_line,
                                dir_to_run, files_list):
        # sge_slot_number = self.get_sge_slot_number()
        sge_cluster_name = self.get_sge_cluster_name()
        sge_slot_number = self.cluster_slots[sge_cluster_name][0]
        logger.debug("sge_slot_number FROM create_job_array_script = %s" %
                     (sge_slot_number))

        files_string = " ".join(files_list)
        files_list_size = len(files_list)
        #         command_file_name = os.path.basename(command_line.split(" ")[0])
        script_file_name = script_file_name_base + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name = script_file_name + ".sge_script.sh.log"
        email_mbl = C.email_mbl
        # self.utils.make_users_email()
        text = (
            '''#!/bin/bash
#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
# Send mail at job end (e); -m as sends abort, suspend.
#$ -m as
# max_running_tasks
#$ -tc 15
# Use the allslots pe and all available slots on that cluster
#$ -pe allslots %s
#$ -t 1-%s
# Now the script will iterate %s times.

  file_list=(%s)

  i=$(expr $SGE_TASK_ID - 1)
  echo "i = $i"
  . /bioware/root/Modules/etc/profile.modules
  module load bioware
  module load vsearch

  INFILE=${file_list[$i]}

  filename=$(basename $INFILE)
  echo "INFILE = $INFILE"
  filename_base="${filename%%.*}"
  echo "filename_base = $filename_base"
  echo "%s"
  echo "%s"
  %s
  %s
''' % (script_file_name, log_file_name, email_mbl, sge_slot_number,
        files_list_size, files_list_size, files_string, command_line[0],
        command_line[1], command_line[0], command_line[1])
            # ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line)
        )
        self.utils.open_write_close(script_file_name_full, text)
        return script_file_name
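
    # Illustrative note (run and lane names are hypothetical): for run "20150223" and lane
    # "lane_1_B" this writes "chimera_checking_20150223_lane_1_B.sh"; each SGE array task
    # then selects its input with
    #   i=$(expr $SGE_TASK_ID - 1)
    #   INFILE=${file_list[$i]}
    # and runs both the de novo and the reference vsearch command on that file.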

    def create_not_SGE_script(self, script_file_name_base, command_line,
                              dir_to_run, files_list):

        files_string = " ".join(files_list)
        script_file_name = script_file_name_base + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        text = (
            '''#!/bin/bash

    file_list=(%s)

    . /bioware/root/Modules/etc/profile.modules
    module load bioware
    module load vsearch

    n=0
    for INFILE in "${file_list[@]}"
    do      
    n=$[n + 1]
    echo $n
    echo "INFILE = $INFILE"
    filename=$(basename $INFILE)
    filename_base="${filename%.*}"
    echo "filename_base = $filename_base"

    echo "%s"
    echo "%s"
    %s
    %s
    done
    ''' % (files_string, command_line[0], command_line[1], command_line[0],
           command_line[1])
            # ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line)
        )
        self.utils.open_write_close(script_file_name_full, text)
        return script_file_name

    def chimera_checking(self):
        chimera_region_found = False

        file_list = self.dirs.get_all_files_by_ext(self.indir, self.chg_suffix)
        logger.debug("FFF = file_list = %s" % (file_list))

        #         TODO: method
        dna_region = list(
            set([
                self.runobj.samples[idx_key].dna_region
                for idx_key in self.input_file_names
            ]))[0]
        if dna_region in C.regions_to_chimera_check:
            chimera_region_found = True
        else:
            logger.debug('region not checked: ' + dna_region)
        ref_db = self.get_ref_db(dna_region)
        command_line = self.create_chimera_cmd(ref_db)
        sh_script_file_name = self.create_job_array_script(
            "chimera_checking", command_line, self.indir, file_list)
        script_file_name_full = os.path.join(self.indir, sh_script_file_name)
        self.utils.call_sh_script(script_file_name_full, self.indir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (self.indir))
        self.dirs.chmod_all(self.indir)
        logger.debug('sh_script_file_name: ' + sh_script_file_name)
        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')
        else:
            return ("The vsearch commands were created")

    def get_chimeric_ids(self):
        ids = set()
        chimera_file_names = self.get_chimera_file_names(self.outdir)
        file_ratio = self.check_chimeric_stats()

        for file_name in chimera_file_names:
            #             logger.debug("from get_chimeric_ids: file_name = %s" % file_name)
            if file_name.endswith(self.chimeric_suffix):
                both_or_denovo = self.get_chimeras_suffix(
                    file_ratio, file_name)
                #                 TODO: run ones for each file_base = ".".join(file_name.split(".")[0:3]) (for txt and db)
                if file_name.endswith(both_or_denovo):
                    file_name_path = os.path.join(self.outdir, file_name)
                    self.utils.print_both("Get ids from %s" % file_name_path)
                    read_fasta = fa.ReadFasta(file_name_path)
                    ids.update(set(read_fasta.ids))
        return ids

    def get_chimeras_suffix(self, file_ratio, file_name):
        """ use only de-novo (.txt) chimeric if
            check_chimeric_stats shows
            ratio ref to de-novo > 3
            e.g.
            if denovo_only:
                chimeric_suffix = self.chimeras_suffix + self.denovo_suffix + self.chimeric_suffix
            if no:
                chimeras_suffix = self.chimeric_suffix

            if file_name.endswith(chimeric_suffix):
            ...
                #     first_name, last_name = get_name()

        """
        #         for file_basename in file_ratio:
        (percent_ref, ratio) = file_ratio[".".join(file_name.split(".")[0:3])]

        chimeric_fa_suffix = ""
        #         logger.debug("percent_ref = %s, ratio = %s" % (percent_ref, ratio))
        #         if (percent_ref > 15) and (ratio > 2):
        if ratio > 3:
            chimeric_fa_suffix = self.chimeras_suffix + self.denovo_suffix + self.chimeric_suffix
        else:
            chimeric_fa_suffix = self.chimeric_suffix
        return chimeric_fa_suffix
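
    # Illustrative sketch (numbers are hypothetical): if check_chimeric_stats() returned
    # (percent_ref=20.0, ratio=4.0) for a sample, ratio > 3 so only ids from the de novo
    # file "<base>.chimeras.txt.chimeric.fa" are collected; with (5.0, 1.2) any
    # "*.chimeric.fa" file is accepted.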

    def move_out_chimeric(self):
        chimeric_ids = self.get_chimeric_ids()
        for idx_key in self.input_file_names:
            fasta_file_path = os.path.join(self.indir,
                                           self.input_file_names[idx_key])
            read_fasta = fa.ReadFasta(fasta_file_path)
            read_fasta.close()

            non_chimeric_file = fasta_file_path + self.nonchimeric_suffix
            non_chimeric_fasta = fa.FastaOutput(non_chimeric_file)

            fasta = fa.SequenceSource(fasta_file_path, lazy_init=False)
            while fasta.next():
                if fasta.id not in chimeric_ids:
                    non_chimeric_fasta.store(fasta, store_frequencies=False)
            non_chimeric_fasta.close()

    def check_chimeric_stats(self):
        all_lines_suffix = self.denovo_suffix  # ".txt" or ".db", it doesn't matter which
        chimera_ref_suffix = self.ref_suffix + self.chimeric_suffix  # ".db.chimeric.fa"
        chimera_denovo_suffix = self.denovo_suffix + self.chimeric_suffix  # ".txt.chimeric.fa"
        filenames = self.get_basenames(self.get_current_filenames(self.outdir))
        file_ratio = {}
        for file_basename in filenames:
            # logger.debug(file_basename)
            all_lines = 0
            ref_lines = 0
            denovo_lines = 0
            ratio = 0
            percent_ref = 0
            percent_denovo = 0

            all_lines_file_name = os.path.join(
                self.outdir, file_basename + all_lines_suffix)
            ref_lines_file_name = os.path.join(
                self.outdir, file_basename + chimera_ref_suffix)
            denovo_lines_file_name = os.path.join(
                self.outdir, file_basename + chimera_denovo_suffix)

            all_lines = int(self.wccount(all_lines_file_name) or 0)
            ref_lines = int(self.get_fa_lines_count(ref_lines_file_name) or 0)
            denovo_lines = int(
                self.get_fa_lines_count(denovo_lines_file_name) or 0)

            # denovo_lines = int(denovo_lines or 0)
            if (ref_lines == 0) or (all_lines == 0):
                file_ratio[file_basename] = (0, 0)
                continue
            else:
                percent_ref = self.percent_count(all_lines, ref_lines)

            if (denovo_lines == 0):
                # Use percent_ref in place of the ratio: what matters here is a large gap
                # between ref and de novo (e.g. ref > 15% while de novo is 0), and a real
                # ratio cannot be computed with zero de novo chimeras.
                file_ratio[file_basename] = (percent_ref, percent_ref)
                continue

            if (denovo_lines > 0):
                ratio = self.count_ratio(ref_lines, denovo_lines)
                percent_denovo = self.percent_count(all_lines, denovo_lines)
            file_ratio[file_basename] = (percent_ref, ratio)
            # percent_ref = int(percent_ref or 0)
            if (percent_ref > 15):
                self.utils.print_both("=" * 50)

                self.utils.print_both(file_basename)
                # logger.debug("all_lines_file_name = %s, ref_lines_file_name = %s, denovo_lines_file_name = %s" % (all_lines_file_name, ref_lines_file_name, denovo_lines_file_name))
                self.utils.print_both(
                    "all_lines = %s, ref_lines = %s, denovo_lines = %s" %
                    (all_lines, ref_lines, denovo_lines))
                self.utils.print_both("ratio = %s" % ratio)
                self.utils.print_both("percent_ref = %s, percent_denovo = %s" %
                                      (percent_ref, percent_denovo))
        return file_ratio
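
    # Illustrative worked example (counts are hypothetical): with all_lines=1000 lines in
    # the de novo report, ref_lines=200 reference chimeras and denovo_lines=50 de novo
    # chimeras,
    #   percent_ref    = 200 * 100 / 1000 = 20.0   (> 15, so the summary above is printed)
    #   ratio          = 200 / 50         = 4.0
    #   percent_denovo = 50  * 100 / 1000 = 5.0
    # and file_ratio[file_basename] = (20.0, 4.0), which later tells get_chimeras_suffix()
    # to keep only the de novo chimera calls.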

    def get_basenames(self, filenames):
        file_basenames = set()
        for f in filenames:
            file_basename = ".".join(f.split(".")[0:3])
            if file_basename.endswith(self.base_suffix):
                file_basenames.add(file_basename)

        return file_basenames
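
    # Illustrative sketch (the file name is hypothetical): for
    #   "TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db.chimeric.fa"
    # the first three dot-separated fields give
    #   "TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras"
    # which ends with self.base_suffix ("unique.chimeras") and is therefore kept.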

    def wccount(self, filename):
        return subprocess.check_output(['wc', '-l', filename]).split()[0]

    def count_ratio(self, ref_num, denovo_num):
        try:
            return float(ref_num or 0) / float(denovo_num or 0)
        except ZeroDivisionError:
            # logger.debug("There is no denovo chimeras to count ratio.")
            pass

    def get_fa_lines_count(self, file_name):
        # todo: use fastalib to get cnt?
        # return fa.SequenceSource(file_name, lazy_init = False).total_seq
        try:
            with open(file_name) as file_open:
                return len([l for l in file_open if l.startswith('>')])
        except IOError:
            e = sys.exc_info()[1]
            self.utils.print_both(e)
            return 0
            # logger.error("%s\nThere is no such file: %s" % (e, file_name))

    def percent_count(self, all_lines, chimeric_count):
        try:
            return float(chimeric_count or 0) * 100 / float(all_lines or 0)
        except ZeroDivisionError:
            # logger.error("There is no denovo chimeras to count ratio.")
            pass

    """
    -----------------------------------------------------------------------------
        For 454.
        not tested
    """

    def chimera_denovo(self):
        chimera_region_found = False
        output = {}
        cluster_id_list = []

        for idx_key in self.idx_keys:
            input_file_name = os.path.join(self.indir, idx_key + '.abund.fa')
            if os.path.isfile(input_file_name):
                output_file_name = os.path.join(self.outdir,
                                                idx_key + '.chimera.denovo')
                #open(output_file_name, 'a').close()  # make sure file exists
                log_file = os.path.join(self.outdir, idx_key + ".denovo.log")

                dna_region = self.runobj.samples[idx_key].dna_region
                logger.debug("dna_region = %s" % dna_region)
                if self.runobj.vamps_user_upload:
                    # VAMPS users can chimera check regardless of region chosen
                    chimera_region_found = True
                else:
                    if dna_region in C.regions_to_chimera_check:
                        chimera_region_found = True
                    else:
                        logger.debug('region not checked: ' + dna_region)
                        continue

                self.utils.print_both(
                    "input_file_name = %s \noutput_file_name = %s" %
                    (input_file_name, output_file_name))

                #             uchime_cmd = C.clusterize_cmd
                #             uchime_cmd += " "
                #             uchime_cmd += self.usearch_cmd
                #             uchime_cmd += " --uchime "
                #             uchime_cmd += input_file_name
                #             uchime_cmd += " --uchimeout "
                #             uchime_cmd += output_file_name
                #             uchime_cmd += " --abskew "
                #             uchime_cmd += self.abskew
                uchime_cmd = ''
                if self.use_cluster:
                    uchime_cmd += C.clusterize_cmd
                    uchime_cmd += " "
                    uchime_cmd += " -log "
                    uchime_cmd += log_file
                    uchime_cmd += " "
                uchime_cmd += self.usearch_cmd
                uchime_cmd += " -uchime_denovo "
                uchime_cmd += input_file_name
                uchime_cmd += " -uchimeout "
                uchime_cmd += output_file_name

                logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd))

                try:
                    logger.info("chimera denovo command: " + str(uchime_cmd))
                    #                 subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

                    self.utils.print_both("chimera denovo command: " +
                                          str(uchime_cmd))
                    #output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    output[idx_key] = subprocess.check_output(uchime_cmd,
                                                              shell=True)
                    self.utils.print_both("chimera denovo result: " +
                                          str(output[idx_key]))
                    #self.utils.print_both("output[idx_key] = %s" % output[idx_key])
                    #if idx_key in output and len(output[idx_key].split()) > 1:
                    #self.utils.print_both(output[idx_key].split()[2])
                    items = output[idx_key].split()
                    if len(items) > 2:
                        cluster_id_list.append(items[2])

                except OSError:
                    e = sys.exc_info()[1]
                    self.utils.print_both(
                        "Error: Problems with this command: %s" % (uchime_cmd))
                    if self.utils.is_local():
                        print >> sys.stderr, "Error: Execution of %s failed: %s" % (
                            uchime_cmd, e)
                    else:
                        print >> sys.stderr, "Error: Execution of %s failed: %s" % (
                            uchime_cmd, e)
                        self.utils.print_both(
                            "Error: Execution of %s failed: %s" %
                            (uchime_cmd, e))
                        raise

# ???
        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')

        # ???
#         for idx_key in output:
#             if len(output[idx_key]) > 50 or len(output[idx_key]) < 40:
#                 return ('ERROR','uchime ref may have broken or empty', idx_key)

# finally
        self.utils.print_both('Finished Chimera Denovo')
        if cluster_id_list:
            return ('SUCCESS',
                    'uchime denovo seems to have been submitted successfully',
                    cluster_id_list)
        else:
            return ('ERROR', 'uchime denovo returned no cluster IDs',
                    cluster_id_list)

    def chimera_reference(self):

        chimera_region_found = False
        output = {}
        cluster_id_list = []
        for idx_key in self.run_keys:

            dna_region = self.runobj.samples[idx_key].dna_region
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            else:
                if dna_region in C.regions_to_chimera_check:
                    chimera_region_found = True
                else:
                    logger.debug('region not checked: ' + dna_region)
                    continue

            #out_file_name = self.prefix[idx_key] + ".chimeras.db"
            input_file_name = os.path.join(self.indir, idx_key + '.abund.fa')
            if os.path.isfile(input_file_name):
                output_file_name = os.path.join(self.outdir,
                                                idx_key + ".chimera.ref")
                #open(output_file_name, 'a').close()  # make sure file exists
                log_file = os.path.join(self.outdir, idx_key + ".ref.log")
                logger.debug("OUT FILE NAME: " + output_file_name)
                # which ref db to use?
                ref_db = ''
                if dna_region.upper() == 'ITS':
                    logger.debug("got an ITS dna region so using refdb: " +
                                 self.its_refdb)
                    ref_db = self.its_refdb
                else:
                    logger.debug("using standard refdb: " + self.refdb)
                    ref_db = self.refdb

                uchime_cmd = ''
                if self.use_cluster:
                    uchime_cmd = C.clusterize_cmd
                    uchime_cmd += " "
                    uchime_cmd += " -log "
                    uchime_cmd += log_file
                    uchime_cmd += " "
                uchime_cmd += self.usearch_cmd
                uchime_cmd += " -uchime_ref "
                uchime_cmd += input_file_name
                uchime_cmd += " -uchimeout "
                uchime_cmd += output_file_name
                uchime_cmd += " -db "
                uchime_cmd += ref_db
                uchime_cmd += " -strand "
                uchime_cmd += "plus"

                logger.debug("uchime_ref_cmd = %s" % (uchime_cmd))

                try:

                    logger.info("vsearch version: " %
                                (self.utils.get_vsearch_version))
                    logger.info("chimera reference command: " +
                                str(uchime_cmd))
                    output[idx_key] = subprocess.check_output(uchime_cmd,
                                                              shell=True)
                    #logger.debug('outsplit',output[idx_key].split()[2])
                    cluster_id_list.append(output[idx_key].split()[2])
                    #logger.debug('Have %d bytes in output' % len(output))
                    #logger.debug('ref',idx_key,output,len(output))
                    if len(output[idx_key]) < 50 and len(output[idx_key]) > 40:
                        logger.debug(
                            idx_key +
                            " uchime ref seems to have been submitted successfully"
                        )
                    else:
                        if self.use_cluster:
                            print >> sys.stderr, "Error: uchime ref may be broke"
                            self.utils.print_both(
                                "Error: uchime ref may be broke")

                except OSError:
                    e = sys.exc_info()[1]
                    print >> sys.stderr, "Error: Execution of chimera_reference failed: %s" % (
                        uchime_cmd, e)
                    self.utils.print_both(
                        "Error: Execution of chimera_reference failed: %s" %
                        (uchime_cmd, e))
                    raise

        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')

        for idx_key in output:
            if (len(output[idx_key]) > 50
                    or len(output[idx_key]) < 40) and self.use_cluster:
                return ('ERROR', 'uchime ref output may be broken or empty',
                        idx_key)
        self.utils.print_both('Finished Chimera Reference')
        return ('SUCCESS',
                'uchime ref seems to have been submitted successfully',
                cluster_id_list)

    def write_chimeras_to_deleted_file(self):

        for idx_key in self.run_keys:
            # open  deleted file and append chimera to it
            # open and read both chimeras files: chimeras.db and chimeras.txt

            # hash to remove dupes
            chimera_deleted = {}
            denovo_file = os.path.join(self.outdir,
                                       idx_key + '.chimera.denovo')
            ref_file = os.path.join(self.outdir, idx_key + ".chimera.ref")
            # deleted file is in trimming dir for vampsuser
            deleted_file = os.path.join(self.indir, idx_key + ".deleted.txt")
            for file in [denovo_file, ref_file]:
                if os.path.isfile(file):
                    fh = open(file, "r")
                    # make a list of chimera deleted read_ids
                    for line in fh.readlines():
                        lst = line.strip().split()
                        id = lst[1].split(';')[0]
                        chimera_yesno = lst[-1]
                        if (chimera_yesno) == 'Y':
                            chimera_deleted[id] = 'chimera'
            # open to append as trimming deletions are already there
            fh_del = open(deleted_file, "a")
            for id in chimera_deleted:
                fh_del.write(id + "\tChimera\n")
            fh_del.close()
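
    # Illustrative sketch (columns abbreviated, values hypothetical): a uchimeout line
    # looks roughly like
    #   0.2843  seq_17;size=12  parentA;size=30  parentB;size=25  ...  Y
    # so lst[1].split(';')[0] recovers the read id ("seq_17") and lst[-1] is the Y/N/?
    # verdict; only ids flagged 'Y' are appended to the deleted file as
    #   seq_17<TAB>Chimera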