def check_for_missing_values(self, data):
        missing_key   = ''
        error = False
        warn = False
        for item in data:
            if item == 'general':
                for k,v in data[item].items():
                    if not k:
                        #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting")
                        logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing")
                        warn=True
                    if v == '':
                        logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                        warn=True

        for item in data:
            if item != 'general':
                for k,v in data[item].items():
                    if not k:
                        #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting")
                        logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing")
                        warn=True
                    if not v:
                        if (k == 'barcode' or k == 'adaptor'): #these could be empty
                            logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                        else:
                            logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                            error=True
        return (error,warn)
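A hedged usage sketch of the config shape this check walks (every key and value below is hypothetical; the real dict comes from the parsed csv/ini config):

# The 'general' section only warns on empty values; dataset sections log an
# error unless the empty key is 'barcode' or 'adaptor'.
sample_config = {
    'general': {'run': '20140201', 'platform': 'illumina'},
    '1_AGTCG': {'dataset': 'ds1', 'project': 'jdoe_2014_Bv6', 'barcode': ''},
}
# error, warn = self.check_for_missing_values(sample_config)
# error stays False here: the empty 'barcode' is tolerated with a warning.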
def write_status_to_vamps_db(site='vampsdev', id='0', status='Test', message=''):
    """
    This should be available to write status updates to vamps:vamps_upload_status.
    It is especially important for MoBeDAC uploads because the qiime site
    will 'see' and react to the message in the db.  <-- not true any longer 2014-02-01 AAV


    """
    import ConMySQL
    from pipeline.db_upload import MyConnection
    today   = str(datetime.date.today())
    if site == 'vamps':
        db_host    = 'vampsdb'
        db_name    = 'vamps'
        db_home = '/xraid2-2/vampsweb/vamps/'
    else:
        db_host    = 'bpcweb7'
        db_name    = 'vamps'
        db_home = '/xraid2-2/vampsweb/vampsdev/'
    #obj=ConMySQL.New(db_host, db_name, db_home)
    #my_conn = MyConnection(db_host, db_name)
    obj=ConMySQL.New(db_host, db_name, db_home)
    conn = obj.get_conn()
    cursor = conn.cursor()
    query = "update vamps_upload_status set status='%s', status_message='%s', date='%s' where id='%s'" % (status, message, today, id)
    try:
        cursor.execute(query)
        #print("executing",query)
    except:
        conn.rollback()
        logger.error("ERROR status update failed")
    else:
        conn.commit()
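The update above interpolates values straight into the SQL string. A minimal alternative sketch with driver-side parameter binding follows; it assumes the connection returned by ConMySQL.New() is a MySQLdb/pymysql-style DB-API connection and that the module-level logger is available (both assumptions):

def update_upload_status(conn, status, message, today, id):
    # Hedged sketch: same UPDATE as above, but with bound parameters.
    query = ("update vamps_upload_status "
             "set status=%s, status_message=%s, date=%s where id=%s")
    cursor = conn.cursor()
    try:
        cursor.execute(query, (status, message, today, id))
    except Exception:
        conn.rollback()
        logger.error("ERROR status update failed")
    else:
        conn.commit()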
Example #3
 def check_for_missing_values(self, data):
     missing_key   = ''
     error = False
     warn = False
     for item in data:
         if item == 'general':
             for k,v in data[item].iteritems():
                 if not k:
                     #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting")
                     logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing")
                     warn=True
                 if v == '':                        
                     logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                     warn=True
                         
     for item in data:
         if item != 'general':
             for k,v in data[item].iteritems():
                 if not k:
                     #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting")
                     logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing")
                     warn=True
                 if not v:
                     if (k == 'barcode' or k == 'adaptor'): #these could be empty
                         logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                     else:
                         logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                         error=True
     return (error,warn)
 def get_reference_databases(self,dna_region):
     
     #if dna region == v6v4(a) change it to v4v6
     # other reverse regions? 
     if dna_region == 'v6v4':
         dna_region = 'v4v6'
     if dna_region == 'v6v4a':
         dna_region = 'v4v6a'
     if C.use_full_length:
         if os.path.exists(os.path.join(self.refdb_dir, 'refssu.udb')):
             refdb = os.path.join(self.refdb_dir, 'refssu.udb')
             taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
         elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.fa')):
             refdb = os.path.join(self.refdb_dir, 'refssu.fa')
             taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
     else:
         if os.path.exists(os.path.join(self.refdb_dir, C.refdbs[dna_region])):
             refdb = os.path.join(self.refdb_dir, C.refdbs[dna_region])
             taxdb = os.path.join(self.refdb_dir, 'ref'+dna_region+'.tax')
         elif os.path.exists(os.path.join(self.refdb_dir, 'ref'+dna_region+'.fa')):
             refdb = os.path.join(self.refdb_dir, 'ref'+dna_region+'.fa')
             taxdb = os.path.join(self.refdb_dir, 'ref'+dna_region+'.tax')
         elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.udb')):
             refdb = os.path.join(self.refdb_dir, 'refssu.udb')
             taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
         elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.fa')):
             refdb = os.path.join(self.refdb_dir, 'refssu.fa')
             taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
         else:
             logger.error("Could not find reference database "+refdb+" Exiting")
             sys.exit()  
     
     logger.info('tax_file '+taxdb)
     logger.info('ref_file '+refdb)        
     return (refdb,taxdb)   
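For orientation, a sketch of the reference directory layout this lookup expects, inferred from the path handling above ('v6' is just a hypothetical region name):

# refdb_dir/
#     refv6.fa       (or whatever C.refdbs['v6'] names)   -> refdb
#     refv6.tax                                           -> taxdb
#     refssu.udb  or  refssu.fa  +  refssu.tax            -> full-length / fallback pair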
    def check_headers(self, headers):
        if self.general_config_dict['platform'] in C.illumina_list:
            pl = self.general_config_dict['platform']
            known_header_list = self.known_header_list[pl]
        elif self.general_config_dict['platform'] == '454':
            known_header_list = self.known_header_list['454']
        else:
            logger.error("in utils: check_headers - unknown platform")
        #print(  sorted(known_header_list))
        #print(sorted(headers))
        self.res_headers = headers
        if "env_sample_source" in headers:
            self.env_source_to_id(headers)

        if sorted(known_header_list) != sorted(self.res_headers):
            print("=" * 40)
            print("csv file header problem")
            print("%-20s %-20s" % ("REQUIRED", "YOUR CSV"))
            for i in sorted(known_header_list):
                if i in headers:
                    print("%-20s%-20s" % (i,i))
                else:
                    print("%-20s%-20s" % (i,"----------- <--- missing"))
            for i in headers:

                if i not in known_header_list:
                    print("%-20s%-20s" % (" ",i+" <--- extra"))
            print("=" * 40)
            sys.exit("ERROR : unknown or missing headers\n")
        else:
            return True
Example #6
 def check_headers(self, headers):
     if self.general_config_dict['platform']=='illumina':
         known_header_list= self.known_header_list['illumina']
     elif self.general_config_dict['platform'] == '454':
         known_header_list = self.known_header_list['454']
     else:
         logger.error("in utils: check_headers - unknown platform")
     #print   sorted(known_header_list)
     #print sorted(headers)
     self.res_headers = headers
     if "env_sample_source" in headers:
         self.env_source_to_id(headers)
         
     if sorted(known_header_list) != sorted(self.res_headers):
         print "=" * 40
         print "csv file header problem"
         print "%-20s %-20s" % ("REQUIRED", "YOUR CSV")
         for i in sorted(known_header_list):
             if i in headers:
                 print "%-20s%-20s" % (i,i)
             else:
                 print "%-20s%-20s" % (i,"----------- <--- missing")
         for i in headers:
             
             if i not in known_header_list:
                 print "%-20s%-20s" % (" ",i+" <--- extra")
         print "=" * 40
         sys.exit("ERROR : unknown or missing headers\n")
     else:
         return True
 def chmod_all(self, dir_name):
   try:
     call(['chmod', '-R', 'ug+w', dir_name])
   except Exception as e:
     logger.error("call(['chmod', '-R', 'ug+w', %s]) didn't work: \n" % (dir_name))
     logger.error(e)
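A hedged alternative sketch using check_call so a non-zero chmod exit status raises instead of being silently ignored (call() only returns the exit code); the helper name is made up and the module-level logger is assumed:

from subprocess import CalledProcessError, check_call

def chmod_all_checked(dir_name):
    # Same recursive user/group write bit as above, but surface failures.
    try:
        check_call(['chmod', '-R', 'ug+w', dir_name])
    except (OSError, CalledProcessError) as e:
        logger.error("chmod -R ug+w %s failed: %s" % (dir_name, e))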
 def write_seq_frequencies_in_file(self, out_file, fa_file_name,
                                   seq_in_file):
     try:
         with open(out_file, "a") as myfile:
             myfile.write(
                 str(fa_file_name) + ": " + str(seq_in_file) + "\n")
     except Exception as e:
         logger.error(e)
 def env_source_to_id(self, headers):
     logger.error("self.utils.is_local() LLL2 metadata")
     logger.error(self.utils.is_local())
     if self.utils.is_local():
         self.my_conn     = MyConnection(host = 'localhost', db="test_env454")
     else:
         self.my_conn = MyConnection(host='bpcdb1', db="env454")
     # self.my_conn     = MyConnection()
     my_sql       = """SELECT * FROM env_sample_source"""
     self.env     = self.my_conn.execute_fetch_select(my_sql)
     self.res_headers = ["env_sample_source_id" if x=="env_sample_source" else x for x in headers]
Example #11
 def check_for_datasets(self,data):
     error = False
     warn=False
     for item in data:
         if item != 'general':
             #print 'ds',data[item]['dataset']
             if not data[item]['dataset']:
             #if 'dataset' not in data[item]:
                 logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")")
                 error=True
     return (error,warn) 
def file_to_db_upload_seq(my_file_to_db_upload, filename, sequences):
    #     for filename in filenames:
    insert_seq_time_start = time.time()

    try:
        logger.debug("\n----------------\nfilename = %s" % filename)
        my_file_to_db_upload.seq.insert_seq(sequences)
        insert_seq_time = (time.time() - insert_seq_time_start)
        logger.debug("insert_seq() took %s sec to finish" % insert_seq_time)
    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:")  # handle unexpected exceptions
        logger.error(sys.exc_info()[0])  # info about curr exception (type,value,traceback)
        raise  # re-throw caught exception
def file_to_db_upload_all_but_seq(my_file_to_db_upload, filename, no_run_info_list, full_upload):
    total_time = 0

    try:
        my_file_to_db_upload.get_gast_result(os.path.basename(filename))

        filename_base_no_suff = get_filename_base_no_suff(filename)

        run_info_ill_id = my_file_to_db_upload.get_run_info_ill_id(filename_base_no_suff)
        if run_info_ill_id:
            my_file_to_db_upload.collect_project_ids(run_info_ill_id)
            seq_in_file = len(my_file_to_db_upload.seq.fasta_dict)
            my_file_to_db_upload.put_seq_statistics_in_file(filename, seq_in_file)
            total_time += seq_in_file

            start_fasta_next = time.time()

            start_insert_pdr_info_time = 0
            start_insert_pdr_info_time = time.time()

            my_file_to_db_upload.insert_pdr_info(run_info_ill_id)
            insert_pdr_info_time = (time.time() - start_insert_pdr_info_time)

            start_insert_taxonomy_time = 0
            start_insert_taxonomy_time = time.time()
            my_file_to_db_upload.insert_taxonomy()
            insert_taxonomy_time = (time.time() - start_insert_taxonomy_time)

            insert_sequence_uniq_info_time = 0
            start_insert_sequence_uniq_info_time = time.time()
            my_file_to_db_upload.insert_sequence_uniq_info()
            insert_sequence_uniq_info_time = (time.time() - start_insert_sequence_uniq_info_time)

            logger.debug("start_fasta_loop took %s sec to finish" % (time.time() - start_fasta_next))
            logger.debug("insert_pdf_info_query_time took %s sec to finish" % insert_pdr_info_time)
            logger.debug("start_insert_taxonomy_upload_time took %s sec to finish" % insert_taxonomy_time)
            logger.debug("insert_sequence_uniq_info_time took %s sec to finish" % insert_sequence_uniq_info_time)

            return total_time
        else:
            utils = PipelneUtils()

            no_run_info_list.append(filename_base_no_suff)
            utils.print_both(
                "ERROR: There is no run info for %s yet, please check if it's in the csv and uploaded to the db" % filename)
            return 0

    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:")  # handle unexpected exceptions
        logger.error(sys.exc_info()[0])  # info about curr exception (type,value,traceback)
        raise  # re-throw caught exception
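The repeated start/stop timers above could be collapsed into one helper. A minimal sketch follows; the name log_timing is hypothetical and it assumes the module-level logger used throughout:

import time
from contextlib import contextmanager

@contextmanager
def log_timing(label):
    # Log how long the wrapped block took, mirroring the manual timers above.
    start = time.time()
    try:
        yield
    finally:
        logger.debug("%s took %s sec to finish" % (label, time.time() - start))

# usage sketch:
# with log_timing("insert_pdr_info"):
#     my_file_to_db_upload.insert_pdr_info(run_info_ill_id)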
Example #15
def file_to_db_upload_seq(my_file_to_db_upload, filename, sequences):
    #     for filename in filenames:
    insert_seq_time_start = time.time()

    try:
        logger.debug("\n----------------\nfilename = %s" % filename)
        my_file_to_db_upload.seq.insert_seq(sequences)
        insert_seq_time = (time.time() - insert_seq_time_start)
        logger.debug("insert_seq() took %s sec to finish" % insert_seq_time)
    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:"
                     )  # handle unexpected exceptions
        logger.error(sys.exc_info()
                     [0])  # info about curr exception (type,value,traceback)
        raise  # re-throw caught exception
Example #16
 def check_dataset_name(self,data):
     """
     # CHECK: dataset name can be ONLY alphanumeric and underscore 
                 and cannot start with a number!
     """
     error   =False
     warn    =False
     for item in data:
         if item != 'general':
             dataset_name = data[item]['dataset']
             if not re.match("^[A-Za-z0-9_]*$", dataset_name):
                 logger.error("Dataset name has illeagal character(s): "+dataset_name+" (must be alphanumeric and underscore only)")
                 error = True
             #if  re.match("^[0-9]", dataset_name):
              #   logger.error("Dataset name cannot begin with a digit: "+dataset_name)
               #  error = True
             
     return (error,warn)   
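A quick illustration of what the pattern accepts; note that because the digit check above is commented out, a leading digit currently passes:

import re

pattern = "^[A-Za-z0-9_]*$"
assert re.match(pattern, "soil_A_01")        # letters, digits, underscore: accepted
assert not re.match(pattern, "soil A 01")    # spaces are rejected
assert re.match(pattern, "01_soil")          # leading digit still accepted (check is commented out above)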
    def check_if_array_job_is_done(self, job_name):
        cluster_done = False
        check_qstat_cmd_line = "qstat -r | grep %s | wc -l" % job_name
        logger.debug("check_qstat_cmd_line = %s" % check_qstat_cmd_line)
        try:
            p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True)
            (output, err) = p.communicate()
            num_proc = int(output)
            logger.debug("qstat is running %s '%s' processes" % (num_proc, job_name))
    #         pprint(p)

            if (num_proc == 0):
                cluster_done = True
    #         print("cluster_done from check_if_cluster_is_done = %s" % cluster_done)
        except:
            logger.error("%s can be done only on a cluster." % job_name)
            raise
        return cluster_done
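A hedged usage sketch that polls this check until the array job drains; the helper name and interval are made up:

import time

def wait_for_array_job(runner, job_name, poll_seconds=60):
    # `runner` is whatever object defines check_if_array_job_is_done();
    # poll qstat until no processes matching job_name remain.
    while not runner.check_if_array_job_is_done(job_name):
        time.sleep(poll_seconds)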
    def check_dataset_name(self,data):
        """
        # CHECK: dataset name can be ONLY alphanumeric and underscore
                    and cannot start with a number!
        """
        error   =False
        warn    =False
        for item in data:
            if item != 'general':
                dataset_name = data[item]['dataset']
                if not re.match("^[A-Za-z0-9_]*$", dataset_name):
                    logger.error("Dataset name has illeagal character(s): "+dataset_name+" (must be alphanumeric and underscore only)")
                    error = True
                #if  re.match("^[0-9]", dataset_name):
                 #   logger.error("Dataset name cannot begin with a digit: "+dataset_name)
                  #  error = True

        return (error, warn)
Example #19
 def check_project_name(self, data):
     """
     # CHECK: project name format: 3 parts; end with Bv6,Ev9,Av6 or something similar
     """
     error   =False
     warn    =False
     for item in data:
          if item != 'general':
              try:
                  (a,b,c) = data[item]['project'].split('_')
              except:
                  logger.error("project not in correct format: "+data[item]['project']+" - Exiting (key: "+item+")")
                  error=True
                  continue
              #if c[0] not in [i[0].upper() for i in domains]:
              #    sys.exit("ERROR : Project suffix has incorrect/non-existent domain: "+c)
              if (c[1:] not in self.dna_regions) and (c.lower() not in self.dna_regions):
                  logger.error("Project suffix has incorrect DNA region: "+c+" - Exiting (key: "+item+")")
                  error = True
      return (error,warn)
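A short sketch of the three-part project name this check expects (the value below is hypothetical):

# Expected format: <owner>_<label>_<domain+region>
project = "jdoe_2014_Bv6"
(a, b, c) = project.split('_')   # a == 'jdoe', b == '2014', c == 'Bv6'
# c[1:] ('v6') or c.lower() ('bv6') must then appear in self.dna_regions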
 def gast2tax(self): 
     
     for key in self.idx_keys:
         output_dir = os.path.join(self.basedir,key)
         gast_dir = os.path.join(output_dir,'gast')
         if key in self.runobj.samples:
             dna_region = self.runobj.samples[key].dna_region
         else:            
             dna_region = self.runobj.dna_region
         if not dna_region:
             logger.error("gast2tax: We have no DNA Region: Setting dna_region to 'unknown'")
             self.runobj.run_status_file_h.write("gast2tax: We have no DNA Region: Setting dna_region to 'unknown'")
             dna_region = 'unknown'
         
         (refdb,taxdb) = self.get_reference_databases(dna_region)
         
         
         #print tax_file
         max_distance = C.max_distance['default']
         if dna_region in C.max_distance:
             max_distance = C.max_distance[dna_region] 
         unique_file = 'Not Found'
         names_file  = 'Not Found'
         if self.runobj.platform == 'vamps':    
             unique_file = os.path.join(output_dir, key+'.unique.fa')
             names_file = os.path.join(output_dir,key+'.names')
         elif self.runobj.platform == 'illumina':
             file_prefix = self.runobj.samples[key].file_prefix
             unique_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique")
             names_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique.names")
         else:
             pass
         #usearch_filename= os.path.join(self.gast_dir, "uc")
         #uc_results = self.parse_uclust(usearch_filename)
         #print uc_results
         
         ref_taxa = self.load_reftaxa(taxdb)
         names_file  = os.path.join(output_dir, key+'.names')
         self.assign_taxonomy(gast_dir,dna_region,names_file, ref_taxa);
         
     return ("SUCCESS","gast2tax") 
 def check_domain_suite_region(self,data):
     error = False
     warn=False
     
     for item in data:
         
         if item != 'general':
             # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region"
             if data[item]['primer_suite'] not in self.primer_suites:
                 logger.error("Primer Suite not found: "+data[item]['primer_suite']+" - Exiting (key: "+item+")")
                 error=True
             #if dataset_items['domain'] not in domains:
             #   sys.exit("ERROR: Domain not found: "+dataset_items['domain'])
             if data[item]['dna_region'] not in self.dna_regions:
                 logger.error("DNA Region not found: "+data[item]['dna_region']+" - Exiting (key: "+item+")")
                 error=True
             # "Bacterial v6","BacterialV6Suite","v6"
             #if dataset_items['domain'][:6] != dataset_items['primer_suite'][:6]:
             #    sys.exit("ERROR: Domain ("+dataset_items['domain']+") -- Primer Suite ("+dataset_items['primer_suite']+") mismatch.")
             #if dataset_items['domain'][-2:].lower() != dataset_items['dna_region'].lower():
             #    sys.exit("ERROR: DNA Region ("+dataset_items['dna_region']+") -- Domain ("+dataset_items['domain']+") mismatch.")
             if data[item]['dna_region'] not in data[item]['primer_suite']:
                 logger.error("DNA Region ("+data[item]['dna_region']+") not found in Primer Suite ("+data[item]['primer_suite']+") - Exiting (key: "+item+")")
                 error=True
     return (error,warn)
Example #22
def vampsupload(runobj):
    """
    Upload data files to VAMPS database
    """
    # for vamps 'new_lane_keys' will be prefix
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    idx_keys = get_keys(runobj)

    #     if(runobj.vamps_user_upload):
    #         idx_keys = [runobj.user+runobj.runcode]
    #     else:
    #         idx_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"]

    # NOT NEEDED HERE: Find duplicate project names
    # if vamps user uploads this has already been done and this project is
    # already in vamps_upload_info table
    # if data from a csv file (illumina and 454) this also is not needed
    # as data is checked in metadata.py

    myvamps = Vamps(runobj, idx_keys)
    # Create files
    myvamps.create_vamps_files()
    # put files in db
    result_code = myvamps.load_vamps_db()

    if result_code[:5] == 'ERROR':
        logger.error("load_vamps_db failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST_ERROR",
                                     result_code)
        sys.exit("load_vamps_db failed")
    elif runobj.vamps_user_upload:
        logger.debug("Finished loading VAMPS data. %s" % result_code)
        write_status_to_vamps_db(runobj.site, runobj.run, 'GAST_SUCCESS',
                                 'Loading VAMPS Finished')
    def check_if_array_job_is_done(self, job_name):
        cluster_done = False
        check_qstat_cmd_line = "qstat -r | grep %s | wc -l" % job_name
        logger.debug("check_qstat_cmd_line = %s" % check_qstat_cmd_line)
        try:
            p = subprocess.Popen(check_qstat_cmd_line,
                                 stdout=subprocess.PIPE,
                                 shell=True)
            (output, err) = p.communicate()
            num_proc = int(output)
            logger.debug("qstat is running %s '%s' processes" %
                         (num_proc, job_name))
            #         pprint(p)

            if (num_proc == 0):
                cluster_done = True

    #         print("cluster_done from check_if_cluster_is_done = %s" % cluster_done)
        except:
            logger.error("%s can be done only on a cluster." % job_name)
            raise
        return cluster_done
Example #24
def vampsupload(runobj):
    """
    Upload data files to VAMPS database
    """
    # for vamps 'new_lane_keys' will be prefix 
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    idx_keys = get_keys(runobj)
    
#     if(runobj.vamps_user_upload):
#         idx_keys = [runobj.user+runobj.runcode]        
#     else:
#         idx_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"]
     
     # NOT NEEDED HERE: Find duplicate project names
     # if vamps user uploads this has already been done and this project is
     # already in vamps_upload_info table
     # if data from a csv file (illumina and 454) this also is not needed
     # as data is checked in metadata.py
    
     
    myvamps = Vamps(runobj, idx_keys)
    # Create files
    myvamps.create_vamps_files()
    # put files in db
    result_code = myvamps.load_vamps_db()
    
    if result_code[:5] == 'ERROR':
        logger.error("load_vamps_db failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db( runobj.site, runobj.run, "GAST_ERROR", result_code )
        sys.exit("load_vamps_db failed")
    elif runobj.vamps_user_upload:
        print "Finished loading VAMPS data",result_code
        write_status_to_vamps_db( runobj.site, runobj.run, 'GAST_SUCCESS', 'Loading VAMPS Finished' )
def write_status_to_vamps_db(site='vampsdev',
                             id='0',
                             status='Test',
                             message=''):
    """
    This should be available to write status updates to vamps:vamps_upload_status.
    It is especially important for MoBeDAC uploads because the qiime site
    will 'see' and react to the message in the db.  <-- not true any longer 2014-02-01 AAV


    """
    import ConMySQL
    from pipeline.db_upload import MyConnection
    today = str(datetime.date.today())
    if site == 'vamps':
        db_host = 'vampsdb'
        db_name = 'vamps'
        db_home = '/xraid2-2/vampsweb/vamps/'
    else:
        db_host = 'bpcweb7'
        db_name = 'vamps'
        db_home = '/xraid2-2/vampsweb/vampsdev/'
    #obj=ConMySQL.New(db_host, db_name, db_home)
    #my_conn = MyConnection(db_host, db_name)
    obj = ConMySQL.New(db_host, db_name, db_home)
    conn = obj.get_conn()
    cursor = conn.cursor()
    query = "update vamps_upload_status set status='%s', status_message='%s', date='%s' where id='%s'" % (
        status, message, today, id)
    try:
        cursor.execute(query)
        #print("executing",query)
    except:
        conn.rollback()
        logger.error("ERROR status update failed")
    else:
        conn.commit()
def gast(runobj):

    # for vamps 'new_lane_keys' will be prefix
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    # Should return a list not a string
    idx_keys = get_keys(runobj)

    # get GAST object
    mygast = Gast(runobj, idx_keys)

    # Check for unique files and create them if not there
    result_code = mygast.check_for_uniques_files(idx_keys)
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code[0] == "ERROR":
        logger.error("uniques not found failed")
        sys.exit("uniques not found failed")
    sleep(5)

    # CLUSTERGAST
    result_code = mygast.clustergast()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code[0] == "ERROR":
        logger.error("clutergast failed")
        sys.exit("clustergast failed")
    sleep(5)

    # GAST_CLEANUP
    result_code = mygast.gast_cleanup()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code[0] == "ERROR":
        logger.error("gast_cleanup failed")
        sys.exit("gast_cleanup failed")
    sleep(5)

    # GAST2TAX
    result_code = mygast.gast2tax()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code[0] == "ERROR":
        logger.error("gast2tax failed")
        sys.exit("gast2tax failed")
def extract_zipped_file(run_date, outdir, filename):
    """

    """
    # check if zipped
    assert os.path.isdir(outdir)
    archivename = os.path.join(outdir,run_date+'.zip')
    if zipfile.is_zipfile(archivename):
        zf = zipfile.ZipFile(archivename, 'r')

        try:
            data = zf.read(filename)
        except KeyError:
            logger.error('ERROR: Did not find %s in zip file' % filename)
        else:
            logger.error("%s :" % filename)
            logger.error(repr(data))
        print
        zf.close()
    else:
        logger.error("No zipfile archive found:",archivename)
    def check_domain_suite_region(self,data):
        error = False
        warn=False

        for item in data:

            if item != 'general':
                primer_suite = self.convert_primer_suites(data[item]['primer_suite'])
                dna_region   = self.convert_primer_suites(data[item]['dna_region'])

                # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region"
                if primer_suite not in self.primer_suites:
                    logger.error("Primer Suite not found: "+primer_suite+" - Exiting (key: "+item+")")
                    error=True
                if dna_region not in self.dna_regions:
                    logger.error("DNA Region not found: "+dna_region+" - Exiting (key: "+item+")")
                    error=True
                if dna_region not in primer_suite:
                    logger.error("DNA Region ("+dna_region+") not found in Primer Suite ("+primer_suite+") - Exiting (key: "+item+")")
                    error=True
        return (error, warn)
Example #30
 def check_domain_suite_region(self,data):
     error = False
     warn=False
     
     for item in data:
         
         if item != 'general':
             primer_suite = self.convert_primer_suites(data[item]['primer_suite'])
             dna_region   = self.convert_primer_suites(data[item]['dna_region'])
             
             # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region"
             if primer_suite not in self.primer_suites:
                 logger.error("Primer Suite not found: "+primer_suite+" - Exiting (key: "+item+")")
                 error=True
             if dna_region not in self.dna_regions:
                 logger.error("DNA Region not found: "+dna_region+" - Exiting (key: "+item+")")
                 error=True
             if dna_region not in primer_suite:
                 logger.error("DNA Region ("+dna_region+") not found in Primer Suite ("+primer_suite+") - Exiting (key: "+item+")")
                 error=True
     return (error, warn)
 def write_seq_frequencies_in_file(self, out_file, fa_file_name, seq_in_file):
     try:
         with open(out_file, "a") as myfile:
             myfile.write(str(fa_file_name) + ": " + str(seq_in_file) + "\n")
     except Exception:
         logger.error(Exception)
 def gast_cleanup(self):
     """
     gast_cleanup - follows clustergast, explodes the data and copies to gast_concat and gast files
     """
     logger.info("Starting GAST Cleanup")
     self.runobj.run_status_file_h.write("Starting gast_cleanup\n")
     for key in self.idx_keys:
         output_dir = os.path.join(self.basedir,key)
         gast_dir = os.path.join(output_dir,'gast')
         if key in self.runobj.samples:
             dna_region = self.runobj.samples[key].dna_region
         else:            
             dna_region = self.runobj.dna_region
         if not dna_region:
             logger.error("gast_cleanup: We have no DNA Region: Setting dna_region to 'unknown'")
             self.runobj.run_status_file_h.write("gast_cleanup: We have no DNA Region: Setting dna_region to 'unknown'\n")
             dna_region = 'unknown'
         # find gast_dir
         
         
         # for vamps user upload
         # basedir is like avoorhis_3453211
         # and outdir is like avoorhis_3453211/2012-06-25
         # for MBL pipeline
         # basedir is like 1_AGTCG
         # and outdir is like 1_AGTCG/2012-06-25
         unique_file = 'Not Found'
         names_file  = 'Not Found'
         if self.runobj.platform == 'vamps':    
             unique_file = os.path.join(output_dir, key+'.unique.fa')
             names_file = os.path.join(output_dir,key+'.names')
         elif self.runobj.platform == 'illumina':
             file_prefix = self.runobj.samples[key].file_prefix
             unique_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique")
             names_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique.names")
         else:
             pass
         print 'UNIQUE FILE',unique_file
         
         
         #print 'names file',names_file
         
         if not os.path.exists(gast_dir):
             logger.error("Could not find gast directory: "+gast_dir+" Exiting")
             sys.exit()
         clustergast_filename_single   = os.path.join(gast_dir, "gast"+dna_region)
         
         logger.debug('gast filesize:'+str(os.path.getsize(clustergast_filename_single)))
         
         gast_filename          = os.path.join(gast_dir, "gast")
         gastconcat_filename    = os.path.join(gast_dir, "gast_concat")  
         #dupes_filename    = os.path.join(gast_dir, "dupes") 
         #nonhits_filename    = os.path.join(gast_dir, "nonhits")   
         copies = {}
         nonhits = {}
         # open and read names file
         names_fh = open(names_file,'r')
         for line in names_fh:
             s = line.strip().split("\t")
             
             index_read = s[0]                
             copies[index_read] = s[1].split(',')
             
             if index_read in nonhits:
                 nonhits[index_read] += 1
             else:
                 nonhits[index_read] = 1
                 
             
             
         names_fh.close()            
         #print nonhits
         #print copies
         
         #######################################
         # 
         #  Insert records with valid gast hits into gast_file
         # 
         #######################################   
         # read the .gast file from clustergast            
         concat = {}
         gast_fh     = open(gast_filename,'w')
         if(os.path.exists(clustergast_filename_single)):
             in_gast_fh  = open(clustergast_filename_single,'r')
         else:
             print "No clustergast file found:",clustergast_filename_single,"\nExiting"
             self.runobj.run_status_file_h.write("No clustergast file found: "+clustergast_filename_single+" Exiting\n")
             sys.exit()
         for line in in_gast_fh:
             
             s = line.strip().split()
             if len(s) == 4:
                 read_id     = s[0]
                 refhvr_id   = s[1].split('|')[0]
                 distance    = s[2]
                 alignment   = s[3]
             #print read_id,refhvr_id
             # if this was in the gast table zero it out because it had a valid hit
             # so we don't insert them as non-hits later
             if read_id in nonhits:
                 del nonhits[read_id]
                 #print 'deleling',read_id
             #print 'nonhits',nonhits
             if read_id not in copies:
                 logger.info(read_id+' not in names file: Skipping')
                 continue
                 
             # give the same ref and dist for each duplicate
             for id in copies[read_id]:
                 
                 if id != read_id:
                     #print id,read_id,distance,refhvr_id  
                     gast_fh.write( id + "\t" + refhvr_id + "\t" + distance + "\t" + alignment + "\n" )
                     
                                            
         in_gast_fh.close()
          
         #######################################
         # 
         #  Insert a record for any valid sequence that had no blast hit and therefore no gast result
         #       into gast_filename
         # 
         #######################################   
         for read in sorted(nonhits.iterkeys()):                
             for d in copies[read]: 
                 gast_fh.write( d+"\t0\t1\t\n")
                 
                 
         gast_fh.close()
         
         # concatenate the two gast files
         clustergast_fh = open(clustergast_filename_single,'a')            
         shutil.copyfileobj(open(gast_filename,'rb'), clustergast_fh)
         clustergast_fh.close()
         #the open again and get data for gast concat
         concat = {}
         print clustergast_filename_single
         for line in open(clustergast_filename_single,'r'):
             data = line.strip().split("\t")
             id = data[0]
             refhvr_id = data[1].split('|')[0]
             distance = data[2]
             #print 'data',data
             if id in concat:
                 concat[id]['refhvrs'].append(refhvr_id)                        
             else:
                 concat[id] = {}
                 concat[id]['refhvrs'] = [refhvr_id]
             concat[id]['distance'] = distance     
             
         
         
         #######################################
         #
         # Insert records into gast_concat_filename
         #
         #######################################             
         # first we need to open the gast_filename
         gastconcat_fh     = open(gastconcat_filename,'w')
         for id, value in concat.iteritems():
             #print 'trying gastconcat', id,value
             gastconcat_fh.write( id + "\t" + concat[id]['distance'] + "\t" + ' '.join(concat[id]['refhvrs']) + "\n" )
         gastconcat_fh.close()
         
         
     print "Finished gast_cleanup"   
     logger.info("Finished gast_cleanup")
     return ("SUCCESS","gast_cleanup")
 def gather_files_per_key(self, key):
 
     file_collector={}
     out_gast_dir = os.path.join(self.global_gast_dir,key)  #directory
     file_collector['gast_concat_file'] = os.path.join(out_gast_dir,'gast_concat')
     file_collector['tagtax_file'] = os.path.join(out_gast_dir,'tagtax_terse')
     if not os.path.exists(file_collector['gast_concat_file']):
             logger.warning("Could not find gast_concat_file file: "+file_collector['gast_concat_file'])
              
     if not os.path.exists(file_collector['tagtax_file']):
         logger.warning("Could not find tagtax_file file: "+file_collector['tagtax_file'])
     #print key,self.runobj.platform
     
     if self.runobj.vamps_user_upload:
         
         file_collector['unique_file'] = os.path.join(out_gast_dir,'unique.fa')
         file_collector['original_fa_file'] = os.path.join(out_gast_dir,'fasta.fa')
         
         if self.runobj.fasta_file:
             grep_cmd = ['grep','-c','>',self.runobj.fasta_file]
         else:
             grep_cmd = ['grep','-c','>',file_collector['unique_file']]
     else:
         if self.runobj.platform == 'illumina':
             
             #unique_file = os.path.join(self.basedir,C.gast_dir),'unique.fa')
             reads_dir = dirs.check_dir(dirs.reads_overlap_dir)
             file_prefix = self.runobj.samples[key].file_prefix
             file_collector['unique_file'] = os.path.join(reads_dir,file_prefix+"-PERFECT_reads.fa.unique")
             # ANNA What is the correct file here:
             file_collector['original_fa_file'] = os.path.join(reads_dir,file_prefix+"-PERFECT_reads.fa.unique")
             grep_cmd = ['grep','-c','>',file_collector['unique_file']]
         elif self.runobj.platform == '454':
             pass
         else:
             sys.exit("no usable platform found")
         
     if not os.path.exists(file_collector['unique_file']):
         logger.error("Could not find unique_file: "+file_collector['unique_file'])
         
     # get dataset_count here from unique_file
     # the dataset_count should be from the non-unique file
     # but if we don't have that must use uniques
     
     try:
         dataset_count = subprocess.check_output(grep_cmd).strip()
     except:
         dataset_count = 0
     print key,": Sequence Count", dataset_count
     
     # output files to be created:            
     file_collector['taxes_file']                = os.path.join(out_gast_dir,'vamps_data_cube_uploads.txt')
     file_collector['summed_taxes_file']         = os.path.join(out_gast_dir,'vamps_junk_data_cube_pipe.txt')
     file_collector['distinct_taxes_file']       = os.path.join(out_gast_dir,'vamps_taxonomy_pipe.txt')
     file_collector['sequences_file']            = os.path.join(out_gast_dir,'vamps_sequences_pipe.txt')
     file_collector['export_file']               = os.path.join(out_gast_dir,'vamps_export_pipe.txt')
     file_collector['projects_datasets_file']    = os.path.join(out_gast_dir,'vamps_projects_datasets_pipe.txt')
     file_collector['project_info_file']         = os.path.join(out_gast_dir,'vamps_projects_info_pipe.txt')
 
 
     return (file_collector, dataset_count, out_gast_dir)
Example #34
def gast(runobj):  
    
    logger.info("STARTING GAST()")
#     logger.info("vsearch version: " % utils.get_vsearch_version)
    # for vamps 'new_lane_keys' will be prefix 
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    # Should return a list not a string
    idx_keys = get_keys(runobj)
    
    # get GAST object
    mygast = Gast(runobj, idx_keys)
    
    
    # Check for unique files and create them if not there
    result_code = mygast.check_for_unique_files(idx_keys)
    runobj.run_status_file_h.write(json.dumps(result_code)+"\n")
    if result_code['status'] == 'ERROR':
        logger.error("uniques not found failed")
        sys.exit("uniques not found failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db( runobj.site, runobj.run, "GAST ERROR", "uniques file not found - failed" )
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db( runobj.site, runobj.run, result_code['status'], result_code['message'] )
        
    sleep(5)
    
    # CLUSTERGAST
    result_code = mygast.clustergast()
    runobj.run_status_file_h.write(json.dumps(result_code)+"\n")
    if result_code['status'] == 'ERROR':
        logger.error("clutergast failed")
        sys.exit("clustergast failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db( runobj.site, runobj.run, "GAST ERROR", "clustergast failed" )
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db( runobj.site, runobj.run, result_code['status'], result_code['message'] )
        
    sleep(5)
    
    # GAST_CLEANUP
    result_code = mygast.gast_cleanup()
    runobj.run_status_file_h.write(json.dumps(result_code)+"\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast_cleanup failed")        
        sys.exit("gast_cleanup failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db( runobj.site, runobj.run, "GAST ERROR", "gast_cleanup failed" )
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db( runobj.site, runobj.run, result_code['status'], result_code['message'] )
        
    sleep(5)
    
    # GAST2TAX
    result_code = mygast.gast2tax()
    runobj.run_status_file_h.write(json.dumps(result_code)+"\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast2tax failed") 
        sys.exit("gast2tax failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db( runobj.site, runobj.run, "GAST ERROR", "gast2tax failed" )
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db( runobj.site, runobj.run, result_code['status'], result_code['message'] )
    def check_project_name(self, data):
        """
        # CHECK: project name format: 3 parts; end with Bv6,Ev9,Av6 or something similar
        """
        error   =False
        warn    =False
        for item in data:
            if item != 'general':
                try:
                    (a,b,c) = data[item]['project'].split('_')
                except:
                    logger.error("project not in correct format: ")
                    logger.error(data[item]['project'])
                    logger.error(" - Exiting (key: " + item + ")")
                    error=True
                    continue
                #if c[0] not in [i[0].upper() for i in domains]:
                #    sys.exit("ERROR : Project suffix has incorrect/non-existent domain: "+c)
                # logger.error("c[1:] = ")
                # logger.error(c[1:])
                # logger.error("c.lower() =")
                # logger.error(c.lower())
                # logger.error("self.dna_regions")
                # logger.error(self.dna_regions )

                if (c[1:].lower() not in self.dna_regions) and (c.lower() not in self.dna_regions):
                    logger.error("Project suffix has incorrect DNA region: ")
                    logger.error(c)
                    logger.error(" - Exiting (key: ")
                    logger.error(data[item])
                    error = True
        return (error, warn)
Example #36
    def validate_illumina_ini(self, analysis_dir):
        """
        The csv headers are checked earlier
        """
        
        print "Validating ini type Config File (may have been converted from csv)"
        new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
        print "New ini file location: "+new_ini_file
        return_code = False
        error_code  = False
        warn_code   = False
        msg = ''
        error=False
        warn=False
        #print 'configpath',self.general_config_dict['configPath']
        # configPath here is the new configPath
        self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])

        
        (error_code,warn_code) = self.check_for_missing_values(self.data_object)  
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_for_datasets(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_domain_suite_region(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_project_name(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_dataset_name(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_projects_and_datasets(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        #print self.data_object['input_dir']
        #print self.data_object['input_files']
 
 
        if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']:
            logger.warning("No input directory and no input files")        
            warn=True
        elif not os.path.isdir(self.data_object['general']['input_dir']):
            logger.error("That is not a directory: "+self.data_object['general']['input_dir'])
            error=True
        elif self.data_object['general']['input_file_format'] == 'fastq' and self.data_object['general']['platform'] == 'illumina':
                file_exists = False
    #            if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']:
                for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']):
    #                if not filenames:
                    for file_name in filenames:
                        if os.path.isfile(os.path.join(dirname, file_name)):
                            file_exists = True
                            break
                if not file_exists:
                    logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir'])
                    error=True
        elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']):
            logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir'])
            error=True
                        
        if error:
            sys.exit( """\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING 
            PLEASE CORRECT THEM AND START OVER.\033[0m\n
            To view the errors add ' --loglevel info' to the command line.\n""")
        elif warn: 
            msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n
                To view the warnings add ' --loglevel warning' to the command line.\n"""
            print "\033[92mCSV File Passed Vaidation! (with warnings)\033[0m"
        else:
            print "\033[92mCSV File Passed Vaidation!\033[0m"
        return msg
Example #37
def gast(runobj):
    logger.info("STARTING GAST()")
    #     logger.info("vsearch version: " % utils.get_vsearch_version)
    # for vamps 'new_lane_keys' will be prefix
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    # Should return a list not a string
    idx_keys = get_keys(runobj)

    # get GAST object
    mygast = Gast(runobj, idx_keys)

    # Check for unique files and create them if not there
    result_code = mygast.check_for_unique_files(idx_keys)
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("uniques not found failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR",
                                     "uniques file not found - failed")
        sys.exit("uniques not found failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])

    sleep(5)

    # CLUSTERGAST
    result_code = mygast.clustergast()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("clutergast failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR",
                                     "clustergast failed")
        sys.exit("clustergast failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])

    sleep(5)

    # GAST_CLEANUP
    result_code = mygast.gast_cleanup()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast_cleanup failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR",
                                     "gast_cleanup failed")
        sys.exit("gast_cleanup failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])

    sleep(5)

    # GAST2TAX
    result_code = mygast.gast2tax()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast2tax failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR",
                                     "gast2tax failed")
        sys.exit("gast2tax failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])
Example #38
def file_to_db_upload_all_but_seq(my_file_to_db_upload, filename,
                                  no_run_info_list, full_upload):
    total_time = 0

    try:
        my_file_to_db_upload.get_gast_result(os.path.basename(filename))

        filename_base_no_suff = get_filename_base_no_suff(filename)

        run_info_ill_id = my_file_to_db_upload.get_run_info_ill_id(
            filename_base_no_suff)
        if run_info_ill_id:
            my_file_to_db_upload.collect_project_ids(run_info_ill_id)
            seq_in_file = len(my_file_to_db_upload.seq.fasta_dict)
            my_file_to_db_upload.put_seq_statistics_in_file(
                filename, seq_in_file)
            total_time += seq_in_file

            start_fasta_next = time.time()

            start_insert_pdr_info_time = 0
            start_insert_pdr_info_time = time.time()

            my_file_to_db_upload.insert_pdr_info(run_info_ill_id)
            insert_pdr_info_time = (time.time() - start_insert_pdr_info_time)

            start_insert_taxonomy_time = 0
            start_insert_taxonomy_time = time.time()
            my_file_to_db_upload.insert_taxonomy()
            insert_taxonomy_time = (time.time() - start_insert_taxonomy_time)

            insert_sequence_uniq_info_time = 0
            start_insert_sequence_uniq_info_time = time.time()
            my_file_to_db_upload.insert_sequence_uniq_info()
            insert_sequence_uniq_info_time = (
                time.time() - start_insert_sequence_uniq_info_time)

            logger.debug("start_fasta_loop took %s sec to finish" %
                         (time.time() - start_fasta_next))
            logger.debug("insert_pdf_info_query_time took %s sec to finish" %
                         insert_pdr_info_time)
            logger.debug(
                "start_insert_taxonomy_upload_time took %s sec to finish" %
                insert_taxonomy_time)
            logger.debug(
                "insert_sequence_uniq_info_time took %s sec to finish" %
                insert_sequence_uniq_info_time)

            return total_time
        else:
            utils = PipelneUtils()

            no_run_info_list.append(filename_base_no_suff)
            utils.print_both(
                "ERROR: There is no run info for %s yet, please check if it's in the csv and uploaded to the db"
                % filename)
            return 0

    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:"
                     )  # handle unexpected exceptions
        logger.error(sys.exc_info()
                     [0])  # info about curr exception (type,value,traceback)
        raise  # re-throw caught exception
    def clustergast(self):
        """
        clustergast - runs the GAST pipeline on the cluster.
               GAST uses UClust to identify the best matches of a read sequence
               to reference sequences in a reference database.
               VAMPS: The uniques and names files have previously been created in trim_run.py.
               Illumina :
        """
        logger.info("Starting Clustergast")
        self.runobj.run_status_file_h.write("Starting clustergast\n")
        # Step1: create empty gast table in database: gast_<rundate>
        # Step2: Count the number of sequences so the job can be split for nodes
        # $facount = `grep -c \">\" $fasta_uniqs_filename`;
        # $calcs = `/bioware/seqinfo/bin/calcnodes -t $facount -n $nodes -f 1`;

        #   /bioware/seqinfo/bin/fastasampler -n $start,$end ${gastDir}/${fasta_uniqs_filename} $tmp_fasta_filename
        #   $usearch_binary --global --query $tmp_fasta_filename --iddef 3 --gapopen 6I/1E --db $refhvr_fa --uc $tmp_usearch_filename --maxaccepts $max_accepts --maxrejects $max_rejects --id $pctid_threshold
        #   # sort the results for valid hits, saving only the ids and pct identity
        #   grep -P \"^H\\t\" $tmp_usearch_filename | sed -e 's/|.*\$//' | awk '{print \$9 \"\\t\" \$4 \"\\t\" \$10 \"\\t\" \$8}' | sort -k1,1b -k2,2gr | clustergast_tophit > $gast_filename
        #   Submit the script
        #   /usr/local/sge/bin/lx24-amd64/qsub $qsub_priority $script_filename
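        # The get_fastasampler_cmd, get_usearch_cmd and get_grep_cmd helpers used below build
        # the equivalents of the legacy commands sketched above.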
 
        
        
        calcnodes = C.calcnodes_cmd
        sqlImportCommand = C.mysqlimport_cmd
        #qsub = '/usr/local/sge/bin/lx24-amd64/qsub'
        clusterize = C.clusterize_cmd
        


        ###################################################################
        # use fasta.uniques file
        # split into smaller files
        # usearch --cluster each
        #######################################
        #
        # Split the uniques fasta and run UClust per node
        #
        #######################################
        qsub_prefix = 'clustergast_sub_'
        gast_prefix = 'gast_'
        if self.use_cluster:
            logger.info("Using cluster for clustergast")
        else:
            logger.info("Not using cluster")
        counter=0
        for key in self.idx_keys:
            print key
            counter +=1
            print "\nFile:",str(counter)
            if counter >= self.limit:
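                # note: 'pass' means this limit check currently has no effect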
                pass
            
            cluster_nodes = C.cluster_nodes
            logger.info("Cluster nodes set to: "+str(cluster_nodes))
            output_dir = os.path.join(self.basedir,key)
            gast_dir = os.path.join(output_dir,'gast')
            # key looks like: SMPL1_3_NNNNCGCTC_3
            #print 'samples',key,self.runobj.samples
            if key in self.runobj.samples:
                dna_region = self.runobj.samples[key].dna_region
            else:            
                dna_region = self.runobj.dna_region
            if not dna_region:
                logger.error("clustergast: We have no DNA Region: Setting dna_region to 'unknown'")
                dna_region = 'unknown'
                
            (refdb,taxdb) = self.get_reference_databases(dna_region)
            #print 'DBs',refdb,taxdb
            
            # if no dna_region OR no refdb can be found then use
            # refssu
            #if refdb contains refssu
            #the add this to grep command
            #and change usearch to usearch64
            unique_file = 'Not Found'
            names_file  = 'Not Found'
            if self.runobj.platform == 'vamps':    
                unique_file = os.path.join(output_dir, key+'.unique.fa')
            elif self.runobj.platform == 'illumina':
                file_prefix = self.runobj.samples[key].file_prefix
                unique_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique")
                names_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique.names")
                
            else:
                pass
                
            print 'UNIQUE FILE',unique_file

            #print gast_dir
            #sys.exit("EXIT")
            
            
            i = 0
            if cluster_nodes:
                grep_cmd = ['grep','-c','>',unique_file]
                logger.debug( ' '.join(grep_cmd) )
                facount = subprocess.check_output(grep_cmd).strip()
                logger.debug( key+' count '+facount)
                calcnode_cmd = [calcnodes,'-t',str(facount),'-n',str(cluster_nodes),'-f','1']
                
                calcout = subprocess.check_output(calcnode_cmd).strip()
                logger.debug("calcout:\n"+calcout)
                #calcout:
                # node=1 start=1 end=1 rows=1
                # node=2 start=2 end=2 rows=1
                # node=3 start=3 end=3 rows=1           
                lines = calcout.split("\n")
                gast_file_list = []
                for line in lines:
                    i += 1
                    if i >= cluster_nodes:
                        continue
                    script_filename = os.path.join(gast_dir,qsub_prefix + str(i))
                    gast_filename   = os.path.join(gast_dir, gast_prefix + str(i))
                    fastasamp_filename = os.path.join(gast_dir, 'samp_' + str(i))
                    clustergast_filename   = os.path.join(gast_dir, key+".gast_" + str(i))
                    gast_file_list.append(clustergast_filename)
                    usearch_filename= os.path.join(gast_dir, "uc_" + str(i))
                    log_file = os.path.join(gast_dir, 'clustergast.log_' + str(i))
                    
                    data = line.split()
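                    # each calcnodes line looks like "node=1 start=1 end=250 rows=250";
                    # skip blank/malformed lines and pull start/end out of the tokens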
                    
                    if len(data) < 2:
                        continue
                    start = data[1].split('=')[1]
                    end  = data[2].split('=')[1]
                    
                    if self.use_cluster:
                        fh = open(script_filename,'w')
                        qstat_name = "gast" + key + '_' + self.runobj.run + "_" + str(i)
                        fh.write("#!/bin/csh\n")
                        fh.write("#$ -j y\n" )
                        fh.write("#$ -o " + log_file + "\n")
                        fh.write("#$ -N " + qstat_name + "\n\n")
                        #fh.write("source /xraid/bioware/Modules/etc/profile.modules\n");
                        #fh.write("module load bioware\n\n");
    
                        # setup environment
                        fh.write("source /xraid/bioware/Modules/etc/profile.modules\n")
                        fh.write("module load bioware\n\n")
                    
                    cmd1 = self.get_fastasampler_cmd(unique_file, fastasamp_filename,start,end)
                    

                    logger.debug("fastasampler command: "+cmd1)
                    
                    if self.use_cluster:
                        fh.write(cmd1 + "\n")
                    else:
                        subprocess.call(cmd1,shell=True)
                    
                    cmd2 = self.get_usearch_cmd(fastasamp_filename, refdb, usearch_filename)

                    logger.debug("usearch command: "+cmd2)
                    print 'usearch',cmd2
                    if self.use_cluster:
                        fh.write(cmd2 + "\n")
                    else:
                        subprocess.call(cmd2,shell=True)
                    
                    cmd3 = self.get_grep_cmd(usearch_filename, clustergast_filename)

                    logger.debug("grep command: "+cmd3)
                    if self.use_cluster:                
                        fh.write(cmd3 + "\n")
                        fh.close()
                        
                        # make script executable and run it
                        os.chmod(script_filename, stat.S_IRWXU)
                        # on vamps and vampsdev qsub cannot be run directly - it must be called from the
                        # cluster-aware directories /xraid2-2/vampsweb/vamps and /xraid2-2/vampsweb/vampsdev,
                        # so C.qsub_cmd is used here rather than the clusterize wrapper
                        #qsub_cmd = clusterize + " " + script_filename
                        qsub_cmd = C.qsub_cmd + " " + script_filename
                        logger.debug("qsub command: "+qsub_cmd)
                        
                        #subprocess.call(qsub_cmd, shell=True)
                        proc = subprocess.Popen(qsub_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        # proc.communicate will block - probably not what we want
                        #(stdout, stderr) = proc.communicate() #block the last onehere
                        #print stderr,stdout
    
                    else:
                        subprocess.call(cmd3,shell=True)
                        print cmd3
            
            else:
                #fastasamp_filename = os.path.join(gast_dir, 'samp')
                usearch_filename= os.path.join(gast_dir, "uc")
                clustergast_filename_single   = os.path.join(gast_dir, "gast"+dna_region)
                print usearch_filename,clustergast_filename_single
                cmd1 = self.get_usearch_cmd(unique_file,refdb,usearch_filename)
                print cmd1
                subprocess.call(cmd1,shell=True)
                cmd2 = self.get_grep_cmd(usearch_filename, clustergast_filename_single)
                print cmd2
                subprocess.call(cmd2,shell=True)
                
            if self.use_cluster:
                # wait here for all the clustergast scripts to finish
                temp_file_list = gast_file_list
            
                c = False
                maxwaittime = C.maxwaittime  # seconds
                sleeptime   = C.sleeptime    # seconds
                counter = 0
                while c == False:
                    counter += 1
                    if counter >= maxwaittime / sleeptime:
                        raise Exception("Max wait time exceeded in gast.py")
                    # rebuild the list rather than deleting by index while iterating,
                    # so finding several files in one pass cannot drop the wrong entry
                    still_waiting = []
                    for file in temp_file_list:
                        if os.path.exists(file) and os.path.getsize(file) > 0:
                            logger.debug("Found file now removing from list: "+file)
                        else:
                            still_waiting.append(file)
                    temp_file_list = still_waiting
                    
                    if temp_file_list:
                        logger.info("waiting for clustergast files to fill...")
                        logger.debug(' '.join(temp_file_list))
                        logger.info("\ttime: "+str(counter * sleeptime)+" | files left: "+str(len(temp_file_list)))
                        time.sleep(sleeptime)
                    else:
                        c = True
                    
            # now concatenate all the clustergast_files into one file (if they were split)
            if cluster_nodes:
                # gast file
                clustergast_filename_single   = os.path.join(gast_dir, "gast"+dna_region)
                clustergast_fh = open(clustergast_filename_single,'w')
                # have to turn off cluster above to be able to 'find' these files for concatenation
                for n in range(1,i-1):
                    #cmd = "cat "+ gast_dir + key+".gast_" + str(n) + " >> " + gast_dir + key+".gast"
                    file = os.path.join(gast_dir, key+".gast_" + str(n))
                    if(os.path.exists(file)):                    
                        shutil.copyfileobj(open(file,'rb'), clustergast_fh)
                    else:
                        logger.info( "Could not find file: "+os.path.basename(file)+" Skipping")

                clustergast_fh.flush()
                clustergast_fh.close()
            
        if not self.test:    
            # remove tmp files
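            # note: this cleanup runs after the per-key loop, so gast_dir/key/i refer to the last key processed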
            for n in range(i+1):
                #print "Trying to remove "+os.path.join(gast_dir,"uc_"+str(n))
                if os.path.exists(os.path.join(gast_dir,"uc_"+str(n))):
                    os.remove(os.path.join(gast_dir,"uc_"+str(n)))
                #print "Trying to remove "+os.path.join(gast_dir,"samp_"+str(n))
                if os.path.exists(os.path.join(gast_dir,"samp_"+str(n))):
                    os.remove(os.path.join(gast_dir,"samp_"+str(n)))
                #print "Trying to remove "+os.path.join(gast_dir,key+".gast_"+str(n))
                if os.path.exists(os.path.join(gast_dir,key+".gast_"+str(n))):
                    os.remove(os.path.join(gast_dir,key+".gast_"+str(n)))
                    
                    
        
        print "Finished clustergast"
        logger.info("Finished clustergast")
        return ("SUCCESS","Clustergast")
    def validate_illumina_ini(self, analysis_dir):
        """
        The csv headers are checked earlier
        """

        print("Validating ini type Config File (may have been converted from csv)")
        new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
        print("New ini file location: "+new_ini_file)
        return_code = False
        error_code  = False
        warn_code   = False
        msg = ''
        error=False
        warn=False
        #print('configpath',self.general_config_dict['configPath'])
        # configPath here is the new configPath
        self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])


        # each check_* method returns an (error, warn) tuple; any hit sets the overall error/warn flags
        for check in (self.check_for_missing_values,
                      self.check_for_datasets,
                      self.check_domain_suite_region,
                      self.check_project_name,
                      self.check_dataset_name,
                      self.check_projects_and_datasets):
            (error_code, warn_code) = check(self.data_object)
            if error_code: error = True
            if warn_code: warn = True
        #print(self.data_object['input_dir'])
        #print(self.data_object['input_files'])


        if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']:
            logger.warning("No input directory and no input files")
            warn=True
        elif not os.path.isdir(self.data_object['general']['input_dir']):
            logger.error("That is not a directory: "+self.data_object['general']['input_dir'])
            error=True
        elif self.data_object['general']['input_file_format'] == 'fastq' and self.data_object['general']['platform'] in C.illumina_list:
            file_exists = False
            #if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']:
            for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']):
                #if not filenames:
                for file_name in filenames:
                    if os.path.isfile(os.path.join(dirname, file_name)):
                        file_exists = True
                        break
            if not file_exists:
                logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir'])
                error=True
        elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']):
            logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir'])
            error=True

        if error:
            sys.exit( """\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING
            PLEASE CORRECT THEM AND START OVER.\033[0m\n
            To view the errors add ' --loglevel info' to the command line.\n""")
        elif warn:
            msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n
                To view the warnings add ' --loglevel warning' to the command line.\n"""
            print("\033[92mCSV File Passed Vaidation! (with warnings)\033[0m")
        else:
            print("\033[92mCSV File Passed Vaidation!\033[0m")
        return msg