class IlluminaFiles:
    """
    0) from run create all dataset_lines names files in output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files
    3) process them through Meren's script
    4) result - files dataset_lane-PERFECT_reads.fa.unique with frequencies - to process with env454upload()
    """
    def __init__(self, runobj):
        """
        Keep the run object and prepare the working directories.

        out_files and id_dataset_idx start empty; they are filled later by
        open_dataset_files() and read1(). The analysis and reads-overlap
        directories are obtained through Dirs.check_dir().
        """
        self.utils          = PipelneUtils()
        self.runobj         = runobj
        self.out_files      = {}
        self.id_dataset_idx = {}
        self.in_file_path   = self.runobj.input_dir

        # User uploads carry a site and a "<user>_<run>" directory prefix.
        if self.runobj.vamps_user_upload:
            site       = self.runobj.site
            dir_prefix = "_".join([self.runobj.user, self.runobj.run])
        else:
            site       = ''
            dir_prefix = self.runobj.run

        # A missing/empty lane name is passed on as an empty string.
        lane_name = self.runobj.lane_name or ''

        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
        self.out_file_path = self.dirs.check_dir(self.dirs.analysis_dir)
        self.results_path  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.platform      = self.runobj.platform

    def split_files(self, compressed = False):
        """
        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """
#        logger.debug("compressed = %s" %       compressed)
#        compressed = ast.literal_eval(compressed)
        (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path)
#         correct_file_names = self.get_correct_file_names(in_files_r1)
        if (len(in_files_r1) > 0):
            self.read1(in_files_r1, compressed)
            self.read2(in_files_r2, compressed)
            self.create_inis()
        else:
#             logger.debug("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
#             logger.debug("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
            self.utils.print_both("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
        self.close_dataset_files()



#        self.perfect_reads()
#        self.uniq_fa()

    def open_dataset_files(self):
        """
        Open one fastq output per sample and read direction, plus "unknown".

        The outputs are stored in self.out_files under "<sample>_R1",
        "<sample>_R2" and "unknown"; all files live in self.out_file_path.
        """
        sample_names = list(self.runobj.samples.keys())
        # All R1 outputs are opened first, then all R2 outputs.
        for read_suffix in ("_R1", "_R2"):
            for sample_name in sample_names:
                f_name = sample_name + read_suffix
                fastq_path = os.path.join(self.out_file_path, f_name + ".fastq")
                self.out_files[f_name] = fq.FastQOutput(fastq_path)
        unknown_path = os.path.join(self.out_file_path, "unknown" + ".fastq")
        self.out_files["unknown"] = fq.FastQOutput(unknown_path)

    def close_dataset_files(self):
        [o_file[1].close() for o_file in self.out_files.items()]
        return

#     def perfect_reads(self):
#         self.utils.print_both("Extract perfect V6 reads:")
#         for idx_key in self.runobj.samples.keys():
#             file_name = os.path.join(self.out_file_path, idx_key + ".ini")
#             program_name = C.perfect_overlap_cmd
#             if self.utils.is_local():
#                 program_name = C.perfect_overlap_cmd_local
#             try:
#                 if self.runobj.samples[idx_key].primer_suite.lower().startswith('archaeal'):
#                     call([program_name, file_name, "--archaea"])
#                 else:
#                     call([program_name, file_name])
#             except:
#                 self.utils.print_both("Problems with program_name = %s, file_name = %s" % (program_name, file_name))
#                 raise
#
#     TODO: use from util
    def call_sh_script(self, script_name_w_path, where_to_run):
        """
        Make the script group-writable/executable (chmod 0774) and run it from where_to_run.

        Locally the script is run with bash; otherwise it is submitted with qsub.
        Any failure is reported through print_both and re-raised.
        """
        try:
            call(['chmod', '0774', script_name_w_path])
            if self.utils.is_local():
                self.utils.print_both("call(['qsub', script_name_w_path], cwd=(where_to_run))")
                runner = 'bash'
            else:
                runner = 'qsub'
            call([runner, script_name_w_path], cwd=where_to_run)
        except:
            self.utils.print_both("Problems with script_name = %s or qsub" % (script_name_w_path))
            raise

#     todo: combine and DRY with partial - it's the same command, different arguments
    def merge_perfect(self):
        """
        Create and launch a job-array script that merges perfectly overlapping reads.

        One task is created per "ini" file found in self.out_file_path; the
        script is written to and run from the analysis directory.
        Returns the script file name (without directory).
        """
        self.utils.print_both("merge perfect V6 reads:")
        overlap_cmd = C.perfect_overlap_cmd
        if self.utils.is_local():
            overlap_cmd = C.perfect_overlap_cmd_local
        command_line = overlap_cmd + " --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0"

        ini_files        = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        analysis_dir     = self.dirs.analysis_dir
        script_file_name = self.create_job_array_script(command_line, analysis_dir, ini_files)
        self.call_sh_script(os.path.join(analysis_dir, script_file_name), analysis_dir)
        return script_file_name

    def trim_primers_perfect(self):
        """
        Create and launch a job-array script that trims primers from "_MERGED" files.

        " --archaea" is appended to the trim command when any configured
        primer suite name starts with "archaeal" (case-insensitive).
        Returns the script file name (without directory).
        """
        self.utils.print_both("trim primers from perfect V6 reads:")

        overlap_dir  = self.dirs.reads_overlap_dir
        merged_files = self.dirs.get_all_files_by_ext(overlap_dir, "_MERGED")
        archaeal_flags = [suite.lower().startswith("archaeal") for suite in self.get_config_values('primer_suite')]
        archaea_arg  = " --archaea" if any(archaeal_flags) else ""
        trim_command = C.trim_primers_cmd + archaea_arg

        script_file_name = self.create_job_array_script(trim_command, overlap_dir, merged_files)
        self.call_sh_script(os.path.join(overlap_dir, script_file_name), overlap_dir)
        return script_file_name

    """
    def perfect_reads_cluster(self):
        '''
        iu-merge-pairs anna.ini --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0
​            Each flag is critical. ​marker-gene-stringent looks complete overlaps, retain-only-overlap gets rid of adapters, max-num-mismatches retains only perfect overlaps.
            This generates the test_MERGED file with all complete overlaps without any mismatches. But it has all the primers.
            Then we process this file with the new and shiny iu-analyze-v6-complete-overlaps script:
        iu-trim-V6-primers test_MERGED

        '''
        self.utils.print_both("Extract perfect V6 reads:")
        script_file_name      = self.merge_perfect()
        trim_script_file_name = self.trim_primers_perfect()

        return (script_file_name, trim_script_file_name)
    """

    def partial_overlap_reads_cluster(self):
        """
        Create and launch a job-array script that merges partially overlapping reads.

        "--marker-gene-stringent" is added when any configured dna_region is
        listed in C.marker_gene_stringent_regions. After launching the script
        the analysis directory permissions are updated with chmod_all().
        Returns the script file name (without directory).
        """
        self.utils.print_both("Extract partial_overlap reads (from partial_overlap_reads_cluster):")
        overlap_cmd = C.partial_overlap_cmd
        if self.utils.is_local():
            overlap_cmd = C.partial_overlap_cmd_local

        run_regions = self.get_config_values('dna_region')
        if set(C.marker_gene_stringent_regions).intersection(run_regions):
            stringent_arg = "--marker-gene-stringent"
        else:
            stringent_arg = ""

        # TODO: this part is the same in perfect overlap - move into a method
        analysis_dir     = self.dirs.analysis_dir
        command_line     = overlap_cmd + " --enforce-Q30-check " + stringent_arg
        ini_files        = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name = self.create_job_array_script(command_line, analysis_dir, ini_files)
        self.call_sh_script(os.path.join(analysis_dir, script_file_name), analysis_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (analysis_dir))
        self.dirs.chmod_all(analysis_dir)

        return script_file_name

    def partial_overlap_reads(self):
        """
        Run the partial-overlap merge program directly on every sample's ini file.

        "--marker-gene-stringent" is added for samples whose dna_region is
        listed in C.marker_gene_stringent_regions. On any failure the
        traceback and the program name are printed and the exception is
        re-raised.
        """
        self.utils.print_both("Extract partial_overlap reads (from partial_overlap_reads):")
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.partial_overlap_cmd
            if self.utils.is_local():
                program_name = C.partial_overlap_cmd_local
            try:
                dna_region = self.runobj.samples[idx_key].dna_region
                # A region given as a plain string must be compared as a whole
                # name: set("ITS1") would yield single characters and never
                # match a region name. Other iterables are treated as a
                # collection of region names.
                if isinstance(dna_region, str):
                    sample_regions = {dna_region}
                else:
                    sample_regions = set(dna_region)

                command = [program_name, "--enforce-Q30-check"]
                if set(C.marker_gene_stringent_regions) & sample_regions:
                    command.append("--marker-gene-stringent")
                command.append(ini_file_name)
                call(command)
            except Exception:
                self.utils.print_both(traceback.format_exc())
                self.utils.print_both("Problems with program_name = %s" % (program_name))
                raise

#             logger.debug("HERE: program_name = " % (program_name))
#             call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key])

    def get_config_values(self, key):
        config_path_data = [v for k, v in self.runobj.configPath.items()]
        return set([a[key] for a in config_path_data if key in a.keys()])

#     TODO: use from util
    def make_users_email(self):
        username = getpass.getuser()
        return username + "@mbl.edu"

#     TODO: use from util
#     Removed by Hilary's request:
#     # Send mail at job end (e); -m as sends abort, suspend.
#     #$ -m as
    def create_job_array_script(self, command_line, dir_to_run, files_list):
        """
        Write an SGE job-array bash script that runs command_line once per file.

        The script is named "<command basename>_<run>_<lane_name>.sh" and is
        written into dir_to_run. It declares one array task per entry of
        files_list; each task runs command_line on the file selected by
        $SGE_TASK_ID.

        Returns the script file name (without directory).
        """
        joined_files = " ".join(files_list)
        task_count   = len(files_list)
        command_base = os.path.basename(command_line.split(" ")[0])
        script_file_name = "_".join([command_base, self.runobj.run, self.runobj.lane_name]) + ".sh"
        log_file_name    = script_file_name + ".sge_script.sh.log"
        # Notifications go to the address configured in the constants module.
        email_mbl = C.email_mbl

        template = '''#!/bin/bash
#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
#$ -t 1-%s
# Now the script will iterate %s times.

  file_list=(%s)

  i=$(expr $SGE_TASK_ID - 1)
  # echo "i = $i"
  # . /etc/profile.d/modules.sh
  # . /xraid/bioware/bioware-loader.sh

  shopt -s expand_aliases # It will expand aliases that are loaded via modules
  . /xraid/bioware/Modules/etc/profile.modules
  module load bioware

  echo "%s ${file_list[$i]}"
  %s ${file_list[$i]}
'''
        text = template % (script_file_name, log_file_name, email_mbl, task_count, task_count, joined_files, command_line, command_line)
        self.open_write_close(os.path.join(dir_to_run, script_file_name), text)
        return script_file_name

    def filter_mismatches_cluster(self, max_mismatch = 3):
        """
        Create and launch a job-array script that filters "_MERGED" files by mismatches.

        max_mismatch is only used in the progress message here. After the
        script is launched the reads-overlap directory permissions are
        updated with chmod_all().
        Returns the script file name (without directory).
        """
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        filter_cmd = C.filter_mismatch_cmd
        if self.utils.is_local():
            filter_cmd = C.filter_mismatch_cmd_local

        overlap_dir      = self.dirs.reads_overlap_dir
        merged_files     = self.dirs.get_all_files_by_ext(overlap_dir, "_MERGED")
        script_file_name = self.create_job_array_script(filter_cmd, overlap_dir, merged_files)
        self.utils.call_sh_script(os.path.join(overlap_dir, script_file_name), overlap_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (overlap_dir))
        self.dirs.chmod_all(overlap_dir)

        return script_file_name

    def filter_mismatches(self, max_mismatch = 3):
        """
        Run the mismatch filter directly on every file whose base name ends with "_MERGED".

        max_mismatch is only used in the progress message here.
        """
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        for full_name, name_parts in self.dirs.get_all_files().items():
            if not name_parts[0].endswith('_MERGED'):
                continue
            filter_cmd = C.filter_mismatch_cmd
            if self.utils.is_local():
                filter_cmd = C.filter_mismatch_cmd_local
            call([filter_cmd, full_name])

    def uniq_fa_cluster(self):
        """
        Create and launch a job-array script that uniques fasta files.

        Input files are looked up in the reads-overlap directory by
        C.filtered_suffix first, then ".fa", then "MERGED_V6_PRIMERS_REMOVED";
        the first non-empty result is used. Afterwards the directory
        permissions are updated with chmod_all().
        Returns the script file name (without directory).
        """
        self.utils.print_both("Uniqueing fasta files")
        unique_cmd = C.fastaunique_cmd
        if self.utils.is_local():
            unique_cmd = C.fastaunique_cmd_local
        overlap_dir = self.dirs.reads_overlap_dir

        fasta_files = self.dirs.get_all_files_by_ext(overlap_dir, C.filtered_suffix)
        for fallback_ext in (".fa", "MERGED_V6_PRIMERS_REMOVED"):
            if len(fasta_files) != 0:
                break
            fasta_files = self.dirs.get_all_files_by_ext(overlap_dir, fallback_ext)

        script_file_name = self.create_job_array_script(unique_cmd, overlap_dir, fasta_files)
        self.call_sh_script(os.path.join(overlap_dir, script_file_name), overlap_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (overlap_dir))
        self.dirs.chmod_all(overlap_dir)
        return script_file_name

    def uniq_fa(self):
        """
        Run the fasta uniquing program directly on every matching file.

        A file matches when its extension is ".fa" or its base name ends
        with C.filtered_suffix.
        """
        self.utils.print_both("Uniqueing fasta files")
        for full_name, name_parts in self.dirs.get_all_files().items():
            wanted = name_parts[1] == ".fa" or name_parts[0].endswith(C.filtered_suffix)
            if not wanted:
                continue
            unique_cmd = C.fastaunique_cmd
            if self.utils.is_local():
                unique_cmd = C.fastaunique_cmd_local
            call([unique_cmd, full_name])

    def get_primers(self):
        """
        Return {idx_key: (proximal_primer, distal_primer)} for every sample.

        Primers are looked up in C.primers_dict by the lower-cased primer
        suite name of the sample. For an unknown suite an error message is
        printed and the sample gets a pair of empty strings.
        """
        primers = {}
        for idx_key in self.runobj.samples.keys():
            primer_suite = self.runobj.samples[idx_key].primer_suite.lower()

            # Start from empty primers for each sample so that an unknown
            # suite never inherits the primers of the previous sample.
            proximal_primer = ""
            distal_primer   = ""
            if primer_suite in C.primers_dict:
                proximal_primer = C.primers_dict[primer_suite]["proximal_primer"]
                distal_primer   = C.primers_dict[primer_suite]["distal_primer"]
            else:
                self.utils.print_both("ERROR! Something wrong with the primer suite name: %s. NB: For v6mod it suppose to be 'Archaeal V6mod Suite'\n" % (primer_suite))
            primers[idx_key] = (proximal_primer, distal_primer)

        return primers

    def create_inis(self):
        for idx_key in self.runobj.samples.keys():
            run_key = idx_key.split('_')[1].replace("N", ".");
            "todo: check if works w/o NNNN when there is a proper csv"
            email = self.runobj.samples[idx_key].email
            text = """[general]
project_name = %s
researcher_email = %s
input_directory = %s
output_directory = %s

[files]
pair_1 = %s
pair_2 = %s
""" % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            "That's for parital overlap (v4v5 and hapto miseq illumina)"
            if not self.runobj.do_perfect:
                primers = self.get_primers()
                # logger.debug("run_key = %s, idx_key = %s, primers[idx_key][0], primers[idx_key][1] = %s" (run_key, idx_key, primers[idx_key][0], primers[idx_key][1]))
                text += """
# following section is optional
[prefixes]
pair_1_prefix = ^""" + run_key + primers[idx_key][0] + "\npair_2_prefix = ^" + primers[idx_key][1]

            ini_file_name = os.path.join(self.out_file_path,  idx_key + ".ini")
            self.open_write_close(ini_file_name, text)

#     TODO: use from utils
    def open_write_close(self, script_file_name, text):
        ini_file = open(script_file_name, "w")
        ini_file.write(text)
        ini_file.close()

    def get_fastq_file_names(self, f_input_file_path):
        in_files_r1 = []
        in_files_r2 = []
        "TODO: exclude dir with new created files from the loop"
        for dirname, dirnames, filenames in os.walk(f_input_file_path):
            correct_file_names = self.get_correct_file_names(filenames)

            for filename in sorted(list(correct_file_names)):
                if filename.find('_R1_') > 0:
                    in_files_r1.append(os.path.join(dirname, filename))
                elif filename.find('_R2_') > 0:
                    in_files_r2.append(os.path.join(dirname, filename))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
        self.utils.print_both("FFF0: in_files_r1 %s\n, in_files_r2 %s" % (in_files_r1, in_files_r2))
        return (in_files_r1, in_files_r2)

    def get_correct_file_names(self, filenames):
        correct_file_names = [];
        for file1 in filenames:
            index_sequence = self.get_index(file1)
#             self.runobj.run_keys
#
            good_run_key_lane_names = [x for x in self.runobj.run_keys if x.startswith(index_sequence)]
            if len(good_run_key_lane_names) > 0:
                correct_file_names.append(file1)
        return set(correct_file_names)


    def get_run_key(self, e_sequence, has_ns = "True"):
        """
        Extract the run key from a read sequence.

        With a truthy has_ns (the default, the non-empty string "True") the
        key is "NNNN" followed by characters 4..8 of the sequence; otherwise
        it is the first five characters.
        """
        return ("NNNN" + e_sequence[4:9]) if has_ns else e_sequence[0:5]

    def get_ini_run_key(self, index_sequence, e):
        has_ns = any("NNNN" in s for s in self.runobj.run_keys)

        lane_number = e.lane_number
        if self.platform == "nextseq":
            lane_number = "1"
        return index_sequence + "_" + self.get_run_key(e.sequence, has_ns) + "_" + lane_number

    def read1(self, files_r1, compressed):
        """
        Distribute the entries of the R1 fastq files over the dataset files.

        For every entry with pair_no 1:
            1) the run key is derived from the entry, giving the dataset name;
            2) the entry is stored in "<dataset>_R1" if that output exists,
               otherwise in "unknown";
            3) for stored entries the id of the matching second read is
               remembered in self.id_dataset_idx for read2().
        Entries with another pair_no are not stored.
        """
        for file_r1 in files_r1:
            self.utils.print_both("====\nFFF1: file %s" % file_r1)
            f_input = fq.FastQSource(file_r1, compressed)
            index_sequence = self.get_index(file_r1)
            while f_input.next(trim_to = C.trimming_length):
                entry = f_input.entry
                # todo: a fork with or without NNNN, add an argument
                ini_run_key = self.get_ini_run_key(index_sequence, entry)
                if int(entry.pair_no) != 1:
                    continue

                r1_output_name = ini_run_key + "_R1"
                if r1_output_name not in self.out_files:
                    self.out_files["unknown"].store_entry(entry)
                    continue

                self.out_files[r1_output_name].store_entry(entry)
                # TODO: make a method:
                # Same header as this entry, but with the read number set to 2.
                header_fields = entry.header_line.split()
                read_info_tail = ":".join(header_fields[1].split(":")[1:])
                second_read_id = header_fields[0] + " 2:" + read_info_tail
                self.id_dataset_idx[second_read_id] = ini_run_key

    # def truncate_seq(self, seq):
    #     return seq[:C.trimming_length]

    def remove_end_ns_strip(self, e_sequence):
        """Return e_sequence without any trailing "N" characters."""
        return e_sequence.rstrip('N')

    def read2(self, files_r2, compressed):
        """
        Distribute the entries of the R2 fastq files over the dataset files.

        An entry with pair_no 2 whose header line was registered by read1()
        in self.id_dataset_idx is stored in "<dataset>_R2"; every other
        entry is stored in "unknown".
        """
        for file_r2 in files_r2:
            self.utils.print_both("FFF2: file %s" % file_r2)
            f_input = fq.FastQSource(file_r2, compressed)
            while f_input.next(trim_to = C.trimming_length):
                entry = f_input.entry

                output_name = "unknown"
                if (int(entry.pair_no) == 2) and (entry.header_line in self.id_dataset_idx):
                    output_name = self.id_dataset_idx[entry.header_line] + "_R2"
                self.out_files[output_name].store_entry(entry)

    def get_index(self, file_r1):
        """
        Return the index part of a fastq file name.

        The index is the first "_"-separated part of the base name, or the
        second part when the first one starts with "IDX".
        """
        name_parts = os.path.basename(file_r1).split("_")
        return name_parts[1] if name_parts[0].startswith("IDX") else name_parts[0]
class Chimera:
    """ Define here """
    def __init__(self, runobj=None):
        """
        Set up suffixes, directories, the chimera-checking command and the
        reference databases from the run object, then collect the names of
        the input files.

        NOTE(review): runobj defaults to None but is dereferenced right
        away, so a real run object is required in practice.
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.run_keys = self.runobj.run_keys
        self.rundate = self.runobj.run
        # use_cluster is optional on the run object; fall back to True.
        try:
            self.use_cluster = self.runobj.use_cluster
        except Exception:
            self.use_cluster = True
        self.chg_suffix = ".chg"
        self.chimeras_suffix = ".chimeras"
        self.ref_suffix = ".db"
        self.denovo_suffix = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix  #".nonchimeric.fa"
        self.chimeric_suffix = ".chimeric.fa"
        self.base_suffix = "unique" + self.chimeras_suffix

        self.cluster_slots = {
            "grendel": [12, 8],
            "cricket": [40],
            "cluster5": [32]
        }

        # lane_name is optional as well; a missing or empty one becomes ''.
        try:
            lane_name = self.runobj.lane_name or ''
        except Exception:
            lane_name = ''

        is_user_upload = self.runobj.vamps_user_upload
        if is_user_upload:
            # User uploads are submitted through SGE, so point the
            # environment at it.
            os.environ['SGE_ROOT'] = '/opt/sge'
            os.environ['SGE_CELL'] = 'grendel'
            path = os.environ['PATH']
            os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:' + path
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run

        self.dirs = Dirs(is_user_upload,
                         dir_prefix,
                         self.runobj.platform,
                         lane_name=lane_name,
                         site=site)

        if is_user_upload:
            # Read the trim status file and close it right away.
            with open(self.runobj.trim_status_file_name, "r") as status_file:
                trim_status = json.loads(status_file.read())
            self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
            self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
            self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
            self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)
        else:
            self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        self.usearch_cmd = C.usearch6_cmd
        if self.utils.is_local():
            self.usearch_cmd = C.usearch6_cmd_local
        self.refdb = C.chimera_checking_refdb
        # refdb_local is only defined when running locally.
        if self.utils.is_local():
            self.refdb_local = C.chimera_checking_refdb_local
        self.its_refdb = C.chimera_checking_its_refdb
        self.input_file_names = self.make_chimera_input_illumina_file_names()
#         self.output_file_names = self.make_chimera_output_illumina_file_names(self.input_file_names)

    def get_ref_db(self, dna_region):
        """
        Return the chimera-checking reference database for dna_region.

        "ITS" (case-insensitive) selects the ITS database; anything else
        selects the standard database, or its local variant when running
        locally. The choice is logged at debug level.
        """
        if dna_region.upper() == 'ITS':
            ref_db = C.chimera_checking_its_refdb
            logger.debug("got an ITS dna region so using refdb: " + ref_db)
            return ref_db

        ref_db = C.chimera_checking_refdb
        if self.utils.is_local():
            ref_db = C.chimera_checking_refdb_local
        logger.debug("using standard refdb: " + ref_db)
        return ref_db

    def make_chimera_input_illumina_file_names(self):
        """
        Map each run key to its uniqued input file name.

        The expected name is "<run key>_<C.filtered_suffix>.unique"; only
        files that exist in self.indir are included.
        """
        candidates = (
            (idx_key, idx_key + "_" + C.filtered_suffix + ".unique")
            for idx_key in self.run_keys
        )
        return {
            idx_key: file_name
            for idx_key, file_name in candidates
            if os.path.exists(os.path.join(self.indir, file_name))
        }

    def get_current_dirname(self, in_or_out=""):
        """Return self.indir for an empty in_or_out, self.outdir for any other value."""
        return self.indir if in_or_out == "" else self.outdir

    def is_chimera_check_file(self, filename):
        """
        Tell whether filename ends with one of the chimera-checking suffixes:
        denovo or reference result, chimeric or non-chimeric fasta.
        """
        chimera_endings = (
            self.chimeras_suffix + self.denovo_suffix,
            self.chimeras_suffix + self.ref_suffix,
            self.chimeric_suffix,
            self.nonchimeric_suffix,
        )
        return filename.endswith(chimera_endings)

    def get_current_filenames(self, cur_dirname):
        """
        Return the file names that belong to cur_dirname.

        For the input directory these are the values of
        self.input_file_names, for the output directory the chimera files
        found there, and for any other directory an empty list.
        """
        if cur_dirname == self.indir:
            return self.input_file_names.values()
        if cur_dirname == self.outdir:
            return self.get_chimera_file_names(self.outdir)
        return []

    def get_chimera_file_names(self, cur_dirname):
        cur_file_names = []
        for dirname, dirnames, filenames in os.walk(cur_dirname):
            cur_file_names = [
                filename for filename in filenames
                if (self.is_chimera_check_file(filename))
            ]
        return cur_file_names

    def read_file(self, source_name):
        with open(source_name, "r") as sources:
            return sources.readlines()

    def illumina_sed(self, lines, target_name, regex, replace, uppercase):
        with open(target_name, "w") as target:
            for line in lines:
                if line.startswith(">"):
                    line1 = regex.sub(replace, line)
                else:
                    if (uppercase):
                        line1 = line.upper()
                    else:
                        line1 = line
                target.write(line1)

    def call_illumina_sed(self, from_to):
        """
            from_to = from_frequency_to_size or from_size_to_frequency
        """
        sed_from_to = namedtuple(
            'sed_from_to',
            'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase'
        )

        from_frequency_to_size = sed_from_to(
            find="frequency:",
            replace=";size=",
            cur_dirname=self.indir,
            cur_file_names=self.get_current_filenames(self.indir),
            change_from_suffix="",
            change_to_suffix=self.chg_suffix,
            uppercase=True)

        from_size_to_frequency = sed_from_to(
            find=";size=",
            replace="frequency:",
            cur_dirname=self.outdir,
            cur_file_names=self.get_chimera_file_names(self.outdir),
            change_from_suffix="",
            change_to_suffix="",
            uppercase=False)

        if (from_to == "from_frequency_to_size"):
            tuple_name = from_frequency_to_size
        elif (from_to == "from_size_to_frequency"):
            tuple_name = from_size_to_frequency

        regex = re.compile(r"%s" % tuple_name.find)
        #         logger.debug("find = %s, replace = %s" % (find, replace))
        if (not tuple_name.cur_file_names) and (tuple_name
                                                == from_frequency_to_size):
            self.utils.print_both(
                'ERROR: Did not find uniqued files ("%s") in %s, please check if the previous step has finished. Exiting.\n'
                % (C.filtered_suffix + ".unique", self.indir))
            sys.exit()
        for cur_file_name in tuple_name.cur_file_names:
            file_name = os.path.join(tuple_name.cur_dirname, cur_file_name)
            source_name = file_name + tuple_name.change_from_suffix
            target_name = file_name + tuple_name.change_to_suffix
            lines = self.read_file(source_name)
            self.illumina_sed(lines, target_name, regex, tuple_name.replace,
                              tuple_name.uppercase)

    def illumina_freq_to_size_in_chg(self):
        find1 = "frequency:"
        replace1 = ";size="
        regex1 = re.compile(r"%s" % find1)

        #         logger.debug("cur_file_names: ")
        #         pprint(cur_file_names)
        cur_dirname = self.get_current_dirname()
        cur_file_names = self.get_current_filenames(cur_dirname)
        change_from_suffix = ""
        change_to_suffix = self.chg_suffix
        #         logger.debug("find = %s, replace = %s" % (find, replace))

        for cur_file_name in cur_file_names:
            file_name = os.path.join(cur_dirname, cur_file_name)
            lines = self.utils.read_file(file_name + change_from_suffix)
            with open(file_name + change_to_suffix, "w") as target:
                for line in lines:
                    if line.startswith(">"):
                        line1 = regex1.sub(replace1, line)
                    else:
                        line1 = line.upper()
#                     logger.debug(line1)
                    target.write(line1)

    def illumina_size_to_freq_in_chimer(self):
        find1 = ";size="
        replace1 = "frequency:"
        regex1 = re.compile(r"%s" % find1)

        cur_file_names = self.get_chimera_file_names(self.outdir)

        for file_chim in cur_file_names:
            file_chim_path = os.path.join(self.outdir, file_chim)
            lines = self.utils.read_file(file_chim_path)
            with open(file_chim_path, "w") as target:
                for line in lines:
                    line1 = regex1.sub(replace1, line)
                    target.write(line1)

    def illumina_rm_size_files(self):
        for idx_key in self.input_file_names:
            file_name = os.path.join(
                self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            if os.path.exists(file_name):
                pass
                # os.remove(file_name)

    def check_if_chimera_dir_empty(self):
        if not os.listdir(self.outdir):
            self.utils.print_both(
                'ERROR: Did not find files in %s, something is wrong. First check if you ran the command on a cluster. Exiting.\n'
                % self.outdir)
            sys.exit()

    def check_if_cluster_is_done(self, time_before):
        """
        Tell whether qstat no longer lists chimera-checking jobs.

        The qstat output is filtered by time_before and "chimera_ch" and the
        remaining lines are counted; True is returned when the count is 0.
        Any failure is reported through print_both and re-raised.
        """
        check_qstat_cmd_line = "qstat | grep \"%s\" | grep chimera_ch | wc -l" % time_before

        self.utils.print_both("check_qstat_cmd_line = %s" %
                              check_qstat_cmd_line)

        try:
            qstat_pipe = subprocess.Popen(check_qstat_cmd_line,
                                          stdout=subprocess.PIPE,
                                          shell=True)
            output = qstat_pipe.communicate()[0]
            num_proc = int(output)
            self.utils.print_both("qstat is running %s 'vsearch' processes" %
                                  num_proc)
            cluster_done = (num_proc == 0)
        except:
            self.utils.print_both(
                "Chimera checking can be done only on a cluster.")
            raise

        return cluster_done

    def create_chimera_cmd(self, ref_db):
        """
        Build the vsearch command templates for chimera detection: one entry
        per suffix (de novo, then reference).

        Every command contains the literal text "$filename_base"; it is not
        expanded here. ref_db is only used for the "-uchime_ref" command.
        Returns a list of command strings.

        Examples of the resulting commands once the base name is filled in:

        /usr/local/bin/vsearch
        -uchime_denovo
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg
        -uchimeout
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt
        -chimeras
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt.chimeric.fa
        -notrunclabels
        ---
        /usr/local/bin/vsearch
        -uchime_ref
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg
        -uchimeout
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db
        -chimeras
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db.chimeric.fa
        -notrunclabels
        -strand
        plus
        -db
        /groups/g454/blastdbs/rRNA16S.gold.fasta

        """
        uchime_option_by_suffix = {
            self.denovo_suffix: "-uchime_denovo",
            self.ref_suffix: "-uchime_ref"
        }
        # Each command ends with a newline followed by 12 spaces
        cmd_template = ("%s %s %s -uchimeout %s -chimeras %s%s -notrunclabels %s\n"
                        + " " * 12)

        commands = []
        for suffix in uchime_option_by_suffix:
            uchime_option = uchime_option_by_suffix[suffix]
            chg_path = self.indir + "/$filename_base" + self.chg_suffix
            chimeras_path = self.outdir + "/$filename_base" + self.chimeras_suffix + suffix

            if uchime_option == "-uchime_ref":
                db_options = "-strand plus -db %s" % ref_db
            else:
                db_options = ""

            one_cmd = cmd_template % (self.usearch_cmd, uchime_option,
                                      chg_path, chimeras_path, chimeras_path,
                                      self.chimeric_suffix, db_options)
            logger.debug("UUU = uchime_cmd = %s" % one_cmd)
            logger.debug("+++")
            commands.append(one_cmd)

        return commands

    def create_chimera_cmd_old(self,
                               input_file_name,
                               output_file_name,
                               ref_or_novo,
                               ref_db=""):
        """
        Assemble one complete uchime command line for a single input file.

        ref_or_novo selects the mode: "denovo" or "ref" (ref_db is only used
        for "ref"). Any other value is reported through print_both and the
        command is still built, without a mode option and with the output
        name left unchanged. Unless running locally the command is prefixed
        with C.clusterize_cmd. Returns the command string.

        Background, from http://www.drive5.com/usearch/manual/uchime_denovo.html
        and usearch -help:
        Chimera detection (UCHIME ref. db. mode):
          usearch -uchime q.fasta [-db db.fasta] [-chimeras ch.fasta]
            [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns]

        Chimera detection (UCHIME de novo mode):
          usearch -uchime amplicons.fasta [-chimeras ch.fasta] [-nonchimeras good.fasta]
             [-uchimeout results.uch] [-uchimealns results.alns]
          Input is estimated amplicons with integer abundances specified using ";size=N".
        usearch -uchime_denovo amplicons.fasta -uchimeout results.uchime
        """
        mode_option = ""
        db_option = ""
        strand_option = ""

        if ref_or_novo == "denovo":
            mode_option = " -uchime_denovo "
            output_file_name = output_file_name + self.chimeras_suffix + self.denovo_suffix
        elif ref_or_novo == "ref":
            mode_option = " -uchime_ref "
            output_file_name = output_file_name + self.chimeras_suffix + self.ref_suffix
            db_option = " -db " + ref_db
            strand_option = " -strand plus"
        else:
            self.utils.print_both(
                "Error: Incorrect method, should be \"denovo\" or \"ref\"")
        self.utils.print_both("output_file_name = %s" % output_file_name)

        # No clusterize prefix when running locally
        launcher = C.clusterize_cmd
        if self.utils.is_local():
            launcher = ""
        uchime_cmd = launcher + " " + self.usearch_cmd
        logger.debug("self.usearch_cmd FROM create_chimera_cmd = %s" %
                     (uchime_cmd))

        uchime_cmd += mode_option + input_file_name
        logger.debug("uchime_cmd_append FROM create_chimera_cmd = %s" %
                     (mode_option))

        uchime_cmd += db_option
        logger.debug("db_cmd_append FROM create_chimera_cmd = %s" %
                     (db_option))

        # if we need nonchimeric for denovo and db separate we might create them here
        uchime_cmd += "".join([
            " -uchimeout ", output_file_name,
            " -nonchimeras ", output_file_name + self.nonchimeric_suffix,
            " -chimeras ", output_file_name + self.chimeric_suffix,
            strand_option,
            " -notrunclabels",
        ])

        logger.debug("uchime_cmd FROM create_chimera_cmd = %s" % (uchime_cmd))
        return uchime_cmd

    def get_sge_cluster_name(self):
        """
        Take the cluster name from the first "hostname" token printed by
        "qstat -F".

        E.g. "qf:hostname=grendel-01.bpcservers.private" gives "grendel".
        Returns None when no token contains "hostname".
        """
        qstat = subprocess.run(['qstat', '-F'], stdout=subprocess.PIPE)
        for token in qstat.stdout.decode('utf-8').split():
            if "hostname" in token:
                # text after the first "=", up to the first "-"
                return token.split("=")[1].split("-")[0]
        return None

    def get_sge_slot_number(
            self
    ):  # doesn't work on cricket because: 	qc:slots=12 and qc:slots=8
        """
        Return the largest "qc:slots=<n>" value printed by "qstat -F slots".

        Raises ValueError (from max) when no such token is printed.
        """
        qstat = subprocess.run(['qstat', '-F', 'slots'],
                               stdout=subprocess.PIPE)
        tokens = qstat.stdout.decode('utf-8').split()
        slot_counts = {
            int(token.split("=")[-1])
            for token in tokens
            if token.startswith('qc:slots')
        }
        return max(slot_counts)

    # TODO: temp! take from util. change illumina-files to use util, too
    #   create_job_array_script(self, command_line, dir_to_run, files_list, runobj)
    # feb 25 2019 removed, because didn't work on grendel:
    #  Use the allslots pe and all available slots on that cluster
    # #$ -pe allslots %s
    def create_job_array_script(self, script_file_name_base, command_line,
                                dir_to_run, files_list):
        # sge_slot_number = self.get_sge_slot_number()
        sge_cluster_name = self.get_sge_cluster_name()
        sge_slot_number = self.cluster_slots[sge_cluster_name][0]
        logger.debug("sge_slot_number FROM create_job_array_script = %s" %
                     (sge_slot_number))

        files_string = " ".join(files_list)
        files_list_size = len(files_list)
        #         command_file_name = os.path.basename(command_line.split(" ")[0])
        script_file_name = script_file_name_base + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name = script_file_name + ".sge_script.sh.log"
        email_mbl = C.email_mbl
        # self.utils.make_users_email()
        text = (
            '''#!/bin/bash
#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
# Send mail at job end (e); -m as sends abort, suspend.
#$ -m as
# max_running_tasks
#$ -tc 15
-# Use the allslots pe and all available slots on that cluster
#$ -pe allslots %s
#$ -t 1-%s
# Now the script will iterate %s times.

  file_list=(%s)

  i=$(expr $SGE_TASK_ID - 1)
  echo "i = $i"
  . /bioware/root/Modules/etc/profile.modules
  module load bioware
  module load vsearch

  INFILE=${file_list[$i]}

  filename=$(basename $INFILE)
  echo "INFILE = $INFILE"
  filename_base="${filename%%.*}"
  echo "filename_base = $filename_base"
  echo "%s"
  echo "%s"
  %s
  %s
''' % (script_file_name, log_file_name, email_mbl, sge_slot_number,
        files_list_size, files_list_size, files_string, command_line[0],
        command_line[1], command_line[0], command_line[1])
            # ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line)
        )
        self.utils.open_write_close(script_file_name_full, text)
        return script_file_name

    def create_not_SGE_script(self, script_file_name_base, command_line,
                              dir_to_run, files_list):
        """
        Write a plain bash script (no SGE) that loops over files_list and
        runs two commands for each file.

        Args:
            script_file_name_base: prefix of the script name; run and lane
                name are appended, followed by ".sh".
            command_line: sequence whose first two items are the shell
                commands to run per file (each is echoed, then executed).
            dir_to_run: directory the script file is written to.
            files_list: input files to iterate over.

        Returns:
            The script file name (without the directory).
        """
        files_string = " ".join(files_list)
        script_file_name = script_file_name_base + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        # "%%" is required so the "%" of ${filename%.*} survives the Python
        # %-formatting; a bare "%.*" is parsed as a precision taken from the
        # arguments and raises TypeError.
        text = (
            '''#!/bin/bash

    file_list=(%s)

    . /bioware/root/Modules/etc/profile.modules
    module load bioware
    module load vsearch

    n=0
    for INFILE in "${file_list[@]}"
    do
    n=$[n + 1]
    echo $n
    echo "INFILE = $INFILE"
    filename=$(basename $INFILE)
    filename_base="${filename%%.*}"
    echo "filename_base = $filename_base"

    echo "%s"
    echo "%s"
    %s
    %s
    done
    ''' % (files_string, command_line[0], command_line[1], command_line[0],
           command_line[1])
        )
        self.utils.open_write_close(script_file_name_full, text)
        return script_file_name

    def chimera_checking(self):
        """
        Create the chimera-checking job-array script for all files in
        self.indir ending with self.chg_suffix, run it, and chmod self.indir.

        The dna region is taken from one of the samples listed in
        self.input_file_names. The script is created and called even when
        that region is not in C.regions_to_chimera_check.

        Returns the tuple ('NOREGION', message, '') when the region is not
        in that list, otherwise a plain string (not a tuple).
        """
        file_list = self.dirs.get_all_files_by_ext(self.indir, self.chg_suffix)
        logger.debug("FFF = file_list = %s" % (file_list))

        #         TODO: method
        regions = {
            self.runobj.samples[idx_key].dna_region
            for idx_key in self.input_file_names
        }
        dna_region = list(regions)[0]
        chimera_region_found = dna_region in C.regions_to_chimera_check
        if not chimera_region_found:
            logger.debug('region not checked: ' + dna_region)

        command_line = self.create_chimera_cmd(self.get_ref_db(dna_region))
        sh_script_file_name = self.create_job_array_script(
            "chimera_checking", command_line, self.indir, file_list)
        self.utils.call_sh_script(
            os.path.join(self.indir, sh_script_file_name), self.indir)

        self.utils.print_both("self.dirs.chmod_all(%s)" % (self.indir))
        self.dirs.chmod_all(self.indir)
        logger.debug('sh_script_file_name: ' + sh_script_file_name)

        if chimera_region_found:
            return ("The vsearch commands were created")
        return ('NOREGION', 'No regions found that need checking', '')

    def get_chimeric_ids(self):
        """
        Collect the sequence ids from the chimeric fasta files in self.outdir.

        Only files ending with self.chimeric_suffix are considered, and of
        those only the ones ending with the suffix chosen by
        get_chimeras_suffix for that file. Returns a set of ids.
        """
        chimeric_ids = set()
        candidate_names = self.get_chimera_file_names(self.outdir)
        file_ratio = self.check_chimeric_stats()

        for candidate in candidate_names:
            if not candidate.endswith(self.chimeric_suffix):
                continue
            wanted_suffix = self.get_chimeras_suffix(file_ratio, candidate)
            #                 TODO: run ones for each file_base = ".".join(file_name.split(".")[0:3]) (for txt and db)
            if not candidate.endswith(wanted_suffix):
                continue
            candidate_path = os.path.join(self.outdir, candidate)
            self.utils.print_both("Get ids from %s" % candidate_path)
            chimeric_ids.update(set(fa.ReadFasta(candidate_path).ids))
        return chimeric_ids

    def get_chimeras_suffix(self, file_ratio, file_name):
        """
        Choose which chimeric fasta suffix should be used for file_name.

        file_ratio maps a base name (the first three dot-separated parts of
        file_name) to a (percent_ref, ratio) pair, as produced by
        check_chimeric_stats. When the ref to de-novo ratio is greater than 3
        only the de-novo chimeric file suffix is returned
        (chimeras_suffix + denovo_suffix + chimeric_suffix); otherwise the
        plain chimeric_suffix is returned.
        """
        base_key = ".".join(file_name.split(".")[:3])
        (_percent_ref, ratio) = file_ratio[base_key]

        if ratio > 3:
            return self.chimeras_suffix + self.denovo_suffix + self.chimeric_suffix
        return self.chimeric_suffix

    def move_out_chimeric(self):
        """
        For every input fasta file write a copy with the nonchimeric suffix
        that contains only the sequences whose id is not in the chimeric ids.
        """
        ids_to_skip = self.get_chimeric_ids()
        for idx_key in self.input_file_names:
            source_path = os.path.join(self.indir,
                                       self.input_file_names[idx_key])
            # The file is opened with ReadFasta and closed again before use
            fa.ReadFasta(source_path).close()

            kept_output = fa.FastaOutput(source_path + self.nonchimeric_suffix)

            source = fa.SequenceSource(source_path, lazy_init=False)
            while source.next():
                if source.id in ids_to_skip:
                    continue
                kept_output.store(source, store_frequencies=False)
            kept_output.close()

    def check_chimeric_stats(self):
        """
        Compare reference and de-novo chimera counts for every base name
        found in self.outdir.

        Returns a dict: base name -> (percent_ref, ratio) where
          * (0, 0) is stored when there are no ref chimeras or no lines at all,
          * (percent_ref, percent_ref) is stored when there are no de-novo
            chimeras,
          * otherwise ratio is ref count / de-novo count.
        Details are printed for files with more than 15 percent ref chimeras
        (only when de-novo chimeras exist as well).
        """
        total_suffix = self.denovo_suffix  # ".txt" or ".db, doesn't matter"
        ref_fa_suffix = self.ref_suffix + self.chimeric_suffix  # ".db.chimeric.fa"
        denovo_fa_suffix = self.denovo_suffix + self.chimeric_suffix  # ".txt.chimeric.fa"

        file_ratio = {}
        for base in self.get_basenames(self.get_current_filenames(self.outdir)):
            total_path = os.path.join(self.outdir, base + total_suffix)
            ref_path = os.path.join(self.outdir, base + ref_fa_suffix)
            denovo_path = os.path.join(self.outdir, base + denovo_fa_suffix)

            all_lines = int(self.wccount(total_path) or 0)
            ref_lines = int(self.get_fa_lines_count(ref_path) or 0)
            denovo_lines = int(self.get_fa_lines_count(denovo_path) or 0)

            if ref_lines == 0 or all_lines == 0:
                file_ratio[base] = (0, 0)
                continue

            percent_ref = self.percent_count(all_lines, ref_lines)

            if denovo_lines == 0:
                # use ref instead of ratio, because we are actually looking for a huge
                # difference between ref and denovo (ref > 15 and denovo = 0)
                file_ratio[base] = (percent_ref, percent_ref)
                continue

            ratio = 0
            percent_denovo = 0
            if denovo_lines > 0:
                ratio = self.count_ratio(ref_lines, denovo_lines)
                percent_denovo = self.percent_count(all_lines, denovo_lines)
            file_ratio[base] = (percent_ref, ratio)

            if percent_ref > 15:
                report = self.utils.print_both
                report("=" * 50)
                report(base)
                report("all_lines = %s, ref_lines = %s, denovo_lines = %s" %
                       (all_lines, ref_lines, denovo_lines))
                report("ratio = %s" % ratio)
                report("percent_ref = %s, percent_denovo = %s" %
                       (percent_ref, percent_denovo))
        return file_ratio

    def get_basenames(self, filenames):
        """
        Return the set of base names (first three dot-separated parts of
        each file name) that end with self.base_suffix.
        """
        candidates = (".".join(name.split(".")[:3]) for name in filenames)
        return {
            candidate
            for candidate in candidates
            if candidate.endswith(self.base_suffix)
        }

    def wccount(self, filename):
        """Return the first field of the "wc -l <filename>" output (the line count)."""
        wc_output = subprocess.check_output(['wc', '-l', filename])
        fields = wc_output.split()
        return fields[0]

    def count_ratio(self, ref_num, denovo_num):
        """
        Return ref_num / denovo_num as a float (empty values count as 0),
        or None when the de-novo count is zero.
        """
        numerator = float(ref_num or 0)
        denominator = float(denovo_num or 0)
        if denominator == 0:
            # There are no denovo chimeras to count a ratio from
            return None
        return numerator / denominator

    def get_fa_lines_count(self, file_name):
        """
        Count the fasta header lines (lines starting with ">") in file_name.

        Returns:
            The number of header lines, or 0 when the file cannot be opened
            or read (the error is reported through print_both).
        """
        # todo: use fastalib to get cnt?
        # return fa.SequenceSource(file_name, lazy_init = False).total_seq
        try:
            # "with" closes the handle; iterating avoids loading the whole file
            with open(file_name) as fasta_file:
                return sum(1 for line in fasta_file if line.startswith('>'))
        except IOError as e:
            self.utils.print_both(e)
            return 0

    def percent_count(self, all_lines, chimeric_count):
        """
        Return chimeric_count as a percentage of all_lines (empty values
        count as 0), or None when all_lines is zero.
        """
        part = float(chimeric_count or 0) * 100
        whole = float(all_lines or 0)
        if whole == 0:
            return None
        return part / whole

    """
    -----------------------------------------------------------------------------
        For 454.
        not tested
    """

    def chimera_denovo(self):
        """
        Run "-uchime_denovo" for every index key in self.idx_keys that has a
        "<idx_key>.abund.fa" file in self.indir. (454 code path.)

        Samples whose dna region is not in C.regions_to_chimera_check are
        skipped unless this is a VAMPS user upload. With self.use_cluster
        the command is prefixed with C.clusterize_cmd and a log file option.

        Returns a 3-tuple (status, message, data):
            ('NOREGION', message, '') when no sample had a region to check,
            ('SUCCESS', message, cluster_id_list) when the third
                whitespace-separated item of at least one command output
                was collected,
            ('ERROR', message, cluster_id_list) otherwise.

        Raises:
            OSError: re-raised when the command cannot be executed and the
                pipeline is not running locally.
        """
        chimera_region_found = False
        output = {}
        cluster_id_list = []

        for idx_key in self.idx_keys:
            input_file_name = os.path.join(self.indir, idx_key + '.abund.fa')
            if not os.path.isfile(input_file_name):
                continue

            output_file_name = os.path.join(self.outdir,
                                            idx_key + '.chimera.denovo')
            log_file = os.path.join(self.outdir, idx_key + ".denovo.log")

            dna_region = self.runobj.samples[idx_key].dna_region
            logger.debug("dna_region = %s" % dna_region)
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            elif dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' + dna_region)
                continue

            self.utils.print_both(
                "input_file_name = %s \noutput_file_name = %s" %
                (input_file_name, output_file_name))

            uchime_cmd = ''
            if self.use_cluster:
                uchime_cmd += C.clusterize_cmd
                uchime_cmd += " "
                uchime_cmd += " -log "
                uchime_cmd += log_file
                uchime_cmd += " "
            uchime_cmd += self.usearch_cmd
            uchime_cmd += " -uchime_denovo "
            uchime_cmd += input_file_name
            uchime_cmd += " -uchimeout "
            uchime_cmd += output_file_name

            logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd))

            try:
                logger.info("chimera denovo command: " + str(uchime_cmd))
                self.utils.print_both("chimera denovo command: " +
                                      str(uchime_cmd))
                output[idx_key] = subprocess.check_output(uchime_cmd,
                                                          shell=True)
                self.utils.print_both("chimera denovo result: " +
                                      str(output[idx_key]))
                items = output[idx_key].split()
                if len(items) > 2:
                    cluster_id_list.append(items[2])

            except OSError as e:
                self.utils.print_both(
                    "Error: Problems with this command: %s" % (uchime_cmd))
                failure = "Error: Execution of %s failed: %s" % (uchime_cmd, e)
                running_locally = self.utils.is_local()
                # The former Python 2 "print >> sys.stderr, ..." statement
                # raises TypeError on Python 3 and hid the real error.
                sys.stderr.write(failure + "\n")
                if not running_locally:
                    self.utils.print_both(failure)
                    raise

        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')

        self.utils.print_both('Finished Chimera Denovo')
        if cluster_id_list:
            return ('SUCCESS',
                    'uchime ref seems to have been submitted successfully',
                    cluster_id_list)
        else:
            return ('ERROR', 'uchime ref returned no cluster IDs',
                    cluster_id_list)

    def chimera_reference(self):
        """
        Run "-uchime_ref" for every key in self.run_keys that has a
        "<idx_key>.abund.fa" file in self.indir. (454 code path.)

        Samples whose dna region is not in C.regions_to_chimera_check are
        skipped unless this is a VAMPS user upload. The ITS region uses
        self.its_refdb, every other region self.refdb. With self.use_cluster
        the command is prefixed with C.clusterize_cmd and a log file option.

        Returns a 3-tuple (status, message, data):
            ('NOREGION', message, '') when no sample had a region to check,
            ('ERROR', message, idx_key) when running on a cluster and a
                command output is not 40 to 50 characters long,
            ('SUCCESS', message, cluster_id_list) otherwise.

        Raises:
            OSError: re-raised when the command cannot be executed.
        """
        chimera_region_found = False
        output = {}
        cluster_id_list = []
        for idx_key in self.run_keys:

            dna_region = self.runobj.samples[idx_key].dna_region
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            elif dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' + dna_region)
                continue

            input_file_name = os.path.join(self.indir, idx_key + '.abund.fa')
            output_file_name = os.path.join(self.outdir,
                                            idx_key + ".chimera.ref")
            log_file = os.path.join(self.outdir, idx_key + ".ref.log")
            logger.debug("OUT FILE NAME: " + output_file_name)

            if not os.path.isfile(input_file_name):
                continue

            # which ref db to use?
            if dna_region.upper() == 'ITS':
                logger.debug("got an ITS dna region so using refdb: " +
                             self.its_refdb)
                ref_db = self.its_refdb
            else:
                logger.debug("using standard refdb: " + self.refdb)
                ref_db = self.refdb

            uchime_cmd = ''
            if self.use_cluster:
                uchime_cmd = C.clusterize_cmd
                uchime_cmd += " "
                uchime_cmd += " -log "
                uchime_cmd += log_file
                uchime_cmd += " "
            uchime_cmd += self.usearch_cmd
            uchime_cmd += " -uchime_ref "
            uchime_cmd += input_file_name
            uchime_cmd += " -uchimeout "
            uchime_cmd += output_file_name
            uchime_cmd += " -db "
            uchime_cmd += ref_db
            uchime_cmd += " -strand "
            uchime_cmd += "plus"

            logger.debug("uchime_ref_cmd = %s" % (uchime_cmd))

            try:
                # NOTE(review): get_vsearch_version was referenced without
                # being called; it is called here only if it is callable.
                vsearch_version = self.utils.get_vsearch_version
                if callable(vsearch_version):
                    vsearch_version = vsearch_version()
                # The format string used to lack a placeholder (TypeError)
                logger.info("vsearch version: %s" % (vsearch_version,))
                logger.info("chimera reference command: " +
                            str(uchime_cmd))
                output[idx_key] = subprocess.check_output(uchime_cmd,
                                                          shell=True)
                # Guarded like in chimera_denovo: short output has no third item
                items = output[idx_key].split()
                if len(items) > 2:
                    cluster_id_list.append(items[2])

                if 40 < len(output[idx_key]) < 50:
                    logger.debug(
                        idx_key +
                        " uchime ref seems to have been submitted successfully"
                    )
                elif self.use_cluster:
                    sys.stderr.write("Error: uchime ref may be broke\n")
                    self.utils.print_both(
                        "Error: uchime ref may be broke")

            except OSError as e:
                # Two values need two placeholders; the Python 2
                # "print >> sys.stderr" statement is replaced by a write.
                failure = ("Error: Execution of chimera_reference failed: %s (%s)"
                           % (uchime_cmd, e))
                sys.stderr.write(failure + "\n")
                self.utils.print_both(failure)
                raise

        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')

        for idx_key in output:
            if (len(output[idx_key]) > 50
                    or len(output[idx_key]) < 40) and self.use_cluster:
                return ('ERROR', 'uchime ref may have broken or empty',
                        idx_key)
        self.utils.print_both('Finished Chimera Reference')
        return ('SUCCESS',
                'uchime ref seems to have been submitted successfully',
                cluster_id_list)

    def write_chimeras_to_deleted_file(self):
        """
        Append the ids of chimeric reads to "<idx_key>.deleted.txt" in
        self.indir for every key in self.run_keys.

        Both "<idx_key>.chimera.denovo" and "<idx_key>.chimera.ref" in
        self.outdir are read if they exist. A line marks a chimera when its
        last whitespace-separated field is 'Y'; the id is the part of the
        second field before the first ';'. Each id is written once as
        "<id>\tChimera". Lines with fewer than two fields are skipped.
        """
        for idx_key in self.run_keys:
            # dict keys de-duplicate ids reported by both files
            chimera_deleted = {}
            denovo_file = os.path.join(self.outdir,
                                       idx_key + '.chimera.denovo')
            ref_file = os.path.join(self.outdir, idx_key + ".chimera.ref")
            # deleted file is in trimming dir for vampsuser
            deleted_file = os.path.join(self.indir, idx_key + ".deleted.txt")

            for chimera_file in (denovo_file, ref_file):
                if not os.path.isfile(chimera_file):
                    continue
                with open(chimera_file, "r") as chimera_fh:
                    for line in chimera_fh:
                        fields = line.strip().split()
                        if len(fields) < 2:
                            # blank or malformed line: no id to take
                            continue
                        if fields[-1] == 'Y':
                            read_id = fields[1].split(';')[0]
                            chimera_deleted[read_id] = 'chimera'

            # open to append as trimming deletions are already there
            with open(deleted_file, "a") as fh_del:
                for read_id in chimera_deleted:
                    fh_del.write(read_id + "\tChimera\n")